diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,126433 @@ +{ + "best_metric": 1.5737380981445312, + "best_model_checkpoint": "fat5-fr-small_v1/checkpoint-1545000", + "epoch": 6.518048374103387, + "eval_steps": 1000, + "global_step": 1600000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00040737802338146166, + "grad_norm": 2.0072696208953857, + "learning_rate": 0.002512499999999994, + "loss": 24.7291, + "step": 100 + }, + { + "epoch": 0.0008147560467629233, + "grad_norm": 1.5881823301315308, + "learning_rate": 0.0025249999999999904, + "loss": 20.4769, + "step": 200 + }, + { + "epoch": 0.001222134070144385, + "grad_norm": 2.2315056324005127, + "learning_rate": 0.00253749999999999, + "loss": 19.0568, + "step": 300 + }, + { + "epoch": 0.0016295120935258466, + "grad_norm": 2.400899887084961, + "learning_rate": 0.0025499999999999885, + "loss": 18.3718, + "step": 400 + }, + { + "epoch": 0.0020368901169073085, + "grad_norm": 2.5054872035980225, + "learning_rate": 0.0025624999999999862, + "loss": 17.9085, + "step": 500 + }, + { + "epoch": 0.00244426814028877, + "grad_norm": 3.132996082305908, + "learning_rate": 0.0025749999999999827, + "loss": 17.5639, + "step": 600 + }, + { + "epoch": 0.0028516461636702317, + "grad_norm": 3.571537971496582, + "learning_rate": 0.0025874999999999813, + "loss": 17.2831, + "step": 700 + }, + { + "epoch": 0.0032590241870516933, + "grad_norm": 3.905449390411377, + "learning_rate": 0.0025999999999999795, + "loss": 16.9897, + "step": 800 + }, + { + "epoch": 0.003666402210433155, + "grad_norm": 3.9679980278015137, + "learning_rate": 0.0026124999999999764, + "loss": 16.8025, + "step": 900 + }, + { + "epoch": 0.004073780233814617, + "grad_norm": 6.748215675354004, + "learning_rate": 0.00262499999999997, + "loss": 16.5832, + "step": 1000 + }, + { + "epoch": 0.004073780233814617, + "eval_MaskedAccuracy": 0.21481029309245273, + "eval_loss": 3.4091806411743164, + "eval_runtime": 210.9409, + "eval_samples_per_second": 300.918, + "eval_steps_per_second": 1.176, + "step": 1000 + }, + { + "epoch": 0.0044811582571960785, + "grad_norm": 3.9621713161468506, + "learning_rate": 0.002637499999999968, + "loss": 16.4083, + "step": 1100 + }, + { + "epoch": 0.00488853628057754, + "grad_norm": 4.407429218292236, + "learning_rate": 0.002649999999999965, + "loss": 16.2311, + "step": 1200 + }, + { + "epoch": 0.005295914303959002, + "grad_norm": 3.0638349056243896, + "learning_rate": 0.0026624999999999635, + "loss": 16.0918, + "step": 1300 + }, + { + "epoch": 0.005703292327340463, + "grad_norm": 3.7296924591064453, + "learning_rate": 0.002674999999999961, + "loss": 15.9018, + "step": 1400 + }, + { + "epoch": 0.006110670350721925, + "grad_norm": 5.068863391876221, + "learning_rate": 0.0026874999999999564, + "loss": 15.721, + "step": 1500 + }, + { + "epoch": 0.0065180483741033865, + "grad_norm": 3.687300682067871, + "learning_rate": 0.002699999999999955, + "loss": 15.6485, + "step": 1600 + }, + { + "epoch": 0.006925426397484848, + "grad_norm": 3.4417190551757812, + "learning_rate": 0.002712499999999955, + "loss": 15.4687, + "step": 1700 + }, + { + "epoch": 0.00733280442086631, + "grad_norm": 3.9998209476470947, + "learning_rate": 0.0027249999999999514, + "loss": 15.3952, + "step": 1800 + }, + { + "epoch": 0.007740182444247771, + "grad_norm": 3.1600570678710938, + "learning_rate": 0.002737499999999953, + "loss": 15.2506, + "step": 1900 + }, + { + "epoch": 0.008147560467629234, + "grad_norm": 3.644062042236328, + "learning_rate": 0.0027499999999999495, + "loss": 15.1622, + "step": 2000 + }, + { + "epoch": 0.008147560467629234, + "eval_MaskedAccuracy": 0.24587377267058683, + "eval_loss": 3.1341230869293213, + "eval_runtime": 1685.5139, + "eval_samples_per_second": 37.66, + "eval_steps_per_second": 0.147, + "step": 2000 + }, + { + "epoch": 0.008554938491010695, + "grad_norm": 4.730564117431641, + "learning_rate": 0.0027624999999999425, + "loss": 15.0285, + "step": 2100 + }, + { + "epoch": 0.008962316514392157, + "grad_norm": 4.022442817687988, + "learning_rate": 0.0027749999999999394, + "loss": 14.9321, + "step": 2200 + }, + { + "epoch": 0.009369694537773619, + "grad_norm": 2.78922700881958, + "learning_rate": 0.0027874999999999363, + "loss": 14.8595, + "step": 2300 + }, + { + "epoch": 0.00977707256115508, + "grad_norm": 3.2824580669403076, + "learning_rate": 0.0027999999999999345, + "loss": 14.7518, + "step": 2400 + }, + { + "epoch": 0.010184450584536542, + "grad_norm": 2.8441643714904785, + "learning_rate": 0.0028124999999999322, + "loss": 14.6065, + "step": 2500 + }, + { + "epoch": 0.010591828607918003, + "grad_norm": 3.85910701751709, + "learning_rate": 0.00282499999999993, + "loss": 14.5693, + "step": 2600 + }, + { + "epoch": 0.010999206631299465, + "grad_norm": 3.6006782054901123, + "learning_rate": 0.0028374999999999286, + "loss": 14.5064, + "step": 2700 + }, + { + "epoch": 0.011406584654680927, + "grad_norm": 3.207846164703369, + "learning_rate": 0.002849999999999924, + "loss": 14.4506, + "step": 2800 + }, + { + "epoch": 0.011813962678062388, + "grad_norm": 4.087400436401367, + "learning_rate": 0.0028624999999999224, + "loss": 14.314, + "step": 2900 + }, + { + "epoch": 0.01222134070144385, + "grad_norm": 2.7313225269317627, + "learning_rate": 0.002874999999999916, + "loss": 14.2647, + "step": 3000 + }, + { + "epoch": 0.01222134070144385, + "eval_MaskedAccuracy": 0.2672389482733827, + "eval_loss": 2.9623844623565674, + "eval_runtime": 156.1547, + "eval_samples_per_second": 406.494, + "eval_steps_per_second": 1.588, + "step": 3000 + }, + { + "epoch": 0.012628718724825311, + "grad_norm": 2.6286871433258057, + "learning_rate": 0.0028874999999999123, + "loss": 14.2328, + "step": 3100 + }, + { + "epoch": 0.013036096748206773, + "grad_norm": 3.34944486618042, + "learning_rate": 0.0028999999999999113, + "loss": 14.0831, + "step": 3200 + }, + { + "epoch": 0.013443474771588235, + "grad_norm": 2.468932628631592, + "learning_rate": 0.002912499999999909, + "loss": 14.0768, + "step": 3300 + }, + { + "epoch": 0.013850852794969696, + "grad_norm": 2.6487114429473877, + "learning_rate": 0.0029249999999999042, + "loss": 13.9294, + "step": 3400 + }, + { + "epoch": 0.014258230818351158, + "grad_norm": 2.071166753768921, + "learning_rate": 0.002937499999999899, + "loss": 13.9091, + "step": 3500 + }, + { + "epoch": 0.01466560884173262, + "grad_norm": 3.003756284713745, + "learning_rate": 0.002949999999999894, + "loss": 13.7766, + "step": 3600 + }, + { + "epoch": 0.015072986865114081, + "grad_norm": 3.022969961166382, + "learning_rate": 0.002962499999999894, + "loss": 13.7965, + "step": 3700 + }, + { + "epoch": 0.015480364888495543, + "grad_norm": 2.5271995067596436, + "learning_rate": 0.002974999999999889, + "loss": 13.7016, + "step": 3800 + }, + { + "epoch": 0.015887742911877004, + "grad_norm": 2.9642605781555176, + "learning_rate": 0.002987499999999884, + "loss": 13.6264, + "step": 3900 + }, + { + "epoch": 0.016295120935258468, + "grad_norm": 2.866058111190796, + "learning_rate": 0.002999999999999876, + "loss": 13.5886, + "step": 4000 + }, + { + "epoch": 0.016295120935258468, + "eval_MaskedAccuracy": 0.2823220328091172, + "eval_loss": 2.8647966384887695, + "eval_runtime": 583.093, + "eval_samples_per_second": 108.861, + "eval_steps_per_second": 0.425, + "step": 4000 + }, + { + "epoch": 0.016702498958639928, + "grad_norm": 3.0533554553985596, + "learning_rate": 0.003012499999999872, + "loss": 13.51, + "step": 4100 + }, + { + "epoch": 0.01710987698202139, + "grad_norm": 3.9694809913635254, + "learning_rate": 0.0030249999999998685, + "loss": 13.4058, + "step": 4200 + }, + { + "epoch": 0.01751725500540285, + "grad_norm": 3.5330469608306885, + "learning_rate": 0.003037499999999864, + "loss": 13.3674, + "step": 4300 + }, + { + "epoch": 0.017924633028784314, + "grad_norm": 4.143810272216797, + "learning_rate": 0.0030499999999998627, + "loss": 13.2921, + "step": 4400 + }, + { + "epoch": 0.018332011052165774, + "grad_norm": 3.4098572731018066, + "learning_rate": 0.003062499999999858, + "loss": 13.2422, + "step": 4500 + }, + { + "epoch": 0.018739389075547237, + "grad_norm": 2.815377950668335, + "learning_rate": 0.0030749999999998565, + "loss": 13.1686, + "step": 4600 + }, + { + "epoch": 0.019146767098928697, + "grad_norm": 2.2003087997436523, + "learning_rate": 0.0030874999999998525, + "loss": 13.1174, + "step": 4700 + }, + { + "epoch": 0.01955414512231016, + "grad_norm": 3.600919723510742, + "learning_rate": 0.003099999999999849, + "loss": 13.0667, + "step": 4800 + }, + { + "epoch": 0.01996152314569162, + "grad_norm": 3.6641998291015625, + "learning_rate": 0.0031124999999998476, + "loss": 13.0321, + "step": 4900 + }, + { + "epoch": 0.020368901169073084, + "grad_norm": 2.7134268283843994, + "learning_rate": 0.003124999999999845, + "loss": 12.997, + "step": 5000 + }, + { + "epoch": 0.020368901169073084, + "eval_MaskedAccuracy": 0.297161426868168, + "eval_loss": 2.7633774280548096, + "eval_runtime": 497.1203, + "eval_samples_per_second": 127.687, + "eval_steps_per_second": 0.499, + "step": 5000 + }, + { + "epoch": 0.020776279192454544, + "grad_norm": 5.596516132354736, + "learning_rate": 0.003137499999999841, + "loss": 12.9323, + "step": 5100 + }, + { + "epoch": 0.021183657215836007, + "grad_norm": 6.993663311004639, + "learning_rate": 0.003149999999999838, + "loss": 12.8955, + "step": 5200 + }, + { + "epoch": 0.021591035239217467, + "grad_norm": 2.8680942058563232, + "learning_rate": 0.003162499999999835, + "loss": 12.8421, + "step": 5300 + }, + { + "epoch": 0.02199841326259893, + "grad_norm": 3.843177556991577, + "learning_rate": 0.003174999999999833, + "loss": 12.8343, + "step": 5400 + }, + { + "epoch": 0.02240579128598039, + "grad_norm": 10.011335372924805, + "learning_rate": 0.0031874999999998307, + "loss": 12.7531, + "step": 5500 + }, + { + "epoch": 0.022813169309361853, + "grad_norm": 3.5039470195770264, + "learning_rate": 0.003199999999999828, + "loss": 12.7002, + "step": 5600 + }, + { + "epoch": 0.023220547332743313, + "grad_norm": 3.037332773208618, + "learning_rate": 0.0032124999999998288, + "loss": 12.6757, + "step": 5700 + }, + { + "epoch": 0.023627925356124777, + "grad_norm": 3.4344160556793213, + "learning_rate": 0.0032249999999998287, + "loss": 12.6907, + "step": 5800 + }, + { + "epoch": 0.024035303379506236, + "grad_norm": 3.7940402030944824, + "learning_rate": 0.003237499999999828, + "loss": 12.6524, + "step": 5900 + }, + { + "epoch": 0.0244426814028877, + "grad_norm": 3.782240152359009, + "learning_rate": 0.0032499999999998216, + "loss": 12.5874, + "step": 6000 + }, + { + "epoch": 0.0244426814028877, + "eval_MaskedAccuracy": 0.3055268174984478, + "eval_loss": 2.7101807594299316, + "eval_runtime": 642.9482, + "eval_samples_per_second": 98.726, + "eval_steps_per_second": 0.386, + "step": 6000 + }, + { + "epoch": 0.02485005942626916, + "grad_norm": 3.1856234073638916, + "learning_rate": 0.0032624999999998194, + "loss": 12.5659, + "step": 6100 + }, + { + "epoch": 0.025257437449650623, + "grad_norm": 2.2912404537200928, + "learning_rate": 0.0032749999999998154, + "loss": 12.5519, + "step": 6200 + }, + { + "epoch": 0.025664815473032083, + "grad_norm": 2.196720838546753, + "learning_rate": 0.003287499999999813, + "loss": 12.4975, + "step": 6300 + }, + { + "epoch": 0.026072193496413546, + "grad_norm": 2.9122323989868164, + "learning_rate": 0.003299999999999811, + "loss": 12.4262, + "step": 6400 + }, + { + "epoch": 0.026479571519795006, + "grad_norm": 2.5234193801879883, + "learning_rate": 0.0033124999999998082, + "loss": 12.4096, + "step": 6500 + }, + { + "epoch": 0.02688694954317647, + "grad_norm": 4.733991622924805, + "learning_rate": 0.0033249999999998042, + "loss": 12.3823, + "step": 6600 + }, + { + "epoch": 0.02729432756655793, + "grad_norm": 2.9852294921875, + "learning_rate": 0.0033374999999997994, + "loss": 12.3526, + "step": 6700 + }, + { + "epoch": 0.027701705589939393, + "grad_norm": 3.347761392593384, + "learning_rate": 0.0033499999999997932, + "loss": 12.3387, + "step": 6800 + }, + { + "epoch": 0.028109083613320852, + "grad_norm": 2.850923538208008, + "learning_rate": 0.003362499999999787, + "loss": 12.3076, + "step": 6900 + }, + { + "epoch": 0.028516461636702316, + "grad_norm": 2.5441689491271973, + "learning_rate": 0.0033749999999997853, + "loss": 12.2344, + "step": 7000 + }, + { + "epoch": 0.028516461636702316, + "eval_MaskedAccuracy": 0.31737777984121823, + "eval_loss": 2.6147336959838867, + "eval_runtime": 523.4424, + "eval_samples_per_second": 121.266, + "eval_steps_per_second": 0.474, + "step": 7000 + }, + { + "epoch": 0.028923839660083776, + "grad_norm": 2.6782543659210205, + "learning_rate": 0.0033874999999997804, + "loss": 12.1975, + "step": 7100 + }, + { + "epoch": 0.02933121768346524, + "grad_norm": 4.042718410491943, + "learning_rate": 0.0033999999999997713, + "loss": 12.1332, + "step": 7200 + }, + { + "epoch": 0.029738595706846702, + "grad_norm": 2.6437010765075684, + "learning_rate": 0.0034124999999997647, + "loss": 12.0517, + "step": 7300 + }, + { + "epoch": 0.030145973730228162, + "grad_norm": 3.0328099727630615, + "learning_rate": 0.003424999999999764, + "loss": 11.9627, + "step": 7400 + }, + { + "epoch": 0.030553351753609626, + "grad_norm": 2.895012855529785, + "learning_rate": 0.0034374999999997606, + "loss": 11.9723, + "step": 7500 + }, + { + "epoch": 0.030960729776991085, + "grad_norm": 8.878241539001465, + "learning_rate": 0.003449999999999758, + "loss": 11.9015, + "step": 7600 + }, + { + "epoch": 0.031368107800372545, + "grad_norm": 2.7825474739074707, + "learning_rate": 0.003462499999999754, + "loss": 11.8462, + "step": 7700 + }, + { + "epoch": 0.03177548582375401, + "grad_norm": 2.184889316558838, + "learning_rate": 0.003474999999999749, + "loss": 11.7993, + "step": 7800 + }, + { + "epoch": 0.03218286384713547, + "grad_norm": 2.5624606609344482, + "learning_rate": 0.003487499999999749, + "loss": 11.7754, + "step": 7900 + }, + { + "epoch": 0.032590241870516935, + "grad_norm": 2.3846676349639893, + "learning_rate": 0.003499999999999745, + "loss": 11.7032, + "step": 8000 + }, + { + "epoch": 0.032590241870516935, + "eval_MaskedAccuracy": 0.3393313736809394, + "eval_loss": 2.4879164695739746, + "eval_runtime": 483.8698, + "eval_samples_per_second": 131.184, + "eval_steps_per_second": 0.513, + "step": 8000 + }, + { + "epoch": 0.03299761989389839, + "grad_norm": 3.5283403396606445, + "learning_rate": 0.0035124999999997415, + "loss": 11.6798, + "step": 8100 + }, + { + "epoch": 0.033404997917279855, + "grad_norm": 2.359555959701538, + "learning_rate": 0.003524999999999741, + "loss": 11.5984, + "step": 8200 + }, + { + "epoch": 0.03381237594066132, + "grad_norm": 1.9014986753463745, + "learning_rate": 0.0035374999999997353, + "loss": 11.5199, + "step": 8300 + }, + { + "epoch": 0.03421975396404278, + "grad_norm": 4.172168254852295, + "learning_rate": 0.0035499999999997313, + "loss": 11.4863, + "step": 8400 + }, + { + "epoch": 0.03462713198742424, + "grad_norm": 2.5342233180999756, + "learning_rate": 0.00356249999999973, + "loss": 11.4043, + "step": 8500 + }, + { + "epoch": 0.0350345100108057, + "grad_norm": 8.684643745422363, + "learning_rate": 0.0035749999999997255, + "loss": 11.3848, + "step": 8600 + }, + { + "epoch": 0.035441888034187165, + "grad_norm": 11.089255332946777, + "learning_rate": 0.0035874999999997237, + "loss": 11.3441, + "step": 8700 + }, + { + "epoch": 0.03584926605756863, + "grad_norm": 2.411904811859131, + "learning_rate": 0.0035999999999997236, + "loss": 11.3067, + "step": 8800 + }, + { + "epoch": 0.036256644080950085, + "grad_norm": 4.111713409423828, + "learning_rate": 0.003612499999999721, + "loss": 11.2171, + "step": 8900 + }, + { + "epoch": 0.03666402210433155, + "grad_norm": 5.4018354415893555, + "learning_rate": 0.00362499999999972, + "loss": 11.242, + "step": 9000 + }, + { + "epoch": 0.03666402210433155, + "eval_MaskedAccuracy": 0.36379460285274207, + "eval_loss": 2.403564929962158, + "eval_runtime": 502.7075, + "eval_samples_per_second": 126.268, + "eval_steps_per_second": 0.493, + "step": 9000 + }, + { + "epoch": 0.03707140012771301, + "grad_norm": 2.967698812484741, + "learning_rate": 0.0036374999999997113, + "loss": 11.1676, + "step": 9100 + }, + { + "epoch": 0.037478778151094475, + "grad_norm": 1.8247146606445312, + "learning_rate": 0.0036499999999997073, + "loss": 11.151, + "step": 9200 + }, + { + "epoch": 0.03788615617447593, + "grad_norm": 2.4903030395507812, + "learning_rate": 0.003662499999999711, + "loss": 11.1003, + "step": 9300 + }, + { + "epoch": 0.038293534197857394, + "grad_norm": 3.0097227096557617, + "learning_rate": 0.0036749999999997097, + "loss": 11.0943, + "step": 9400 + }, + { + "epoch": 0.03870091222123886, + "grad_norm": 5.106104373931885, + "learning_rate": 0.0036874999999997097, + "loss": 11.0692, + "step": 9500 + }, + { + "epoch": 0.03910829024462032, + "grad_norm": 2.8577446937561035, + "learning_rate": 0.0036999999999997118, + "loss": 11.02, + "step": 9600 + }, + { + "epoch": 0.03951566826800178, + "grad_norm": 2.462996244430542, + "learning_rate": 0.0037124999999997074, + "loss": 10.9984, + "step": 9700 + }, + { + "epoch": 0.03992304629138324, + "grad_norm": 3.696761131286621, + "learning_rate": 0.0037249999999997034, + "loss": 10.94, + "step": 9800 + }, + { + "epoch": 0.040330424314764704, + "grad_norm": 6.5250983238220215, + "learning_rate": 0.0037374999999996972, + "loss": 10.9396, + "step": 9900 + }, + { + "epoch": 0.04073780233814617, + "grad_norm": 1.9880354404449463, + "learning_rate": 0.003749999999999697, + "loss": 10.9231, + "step": 10000 + }, + { + "epoch": 0.04073780233814617, + "eval_MaskedAccuracy": 0.3768304916518788, + "eval_loss": 2.3210949897766113, + "eval_runtime": 560.8588, + "eval_samples_per_second": 113.176, + "eval_steps_per_second": 0.442, + "step": 10000 + }, + { + "epoch": 0.041145180361527624, + "grad_norm": 3.661884307861328, + "learning_rate": 0.003762499999999692, + "loss": 10.9047, + "step": 10100 + }, + { + "epoch": 0.04155255838490909, + "grad_norm": 4.004960060119629, + "learning_rate": 0.0037749999999996914, + "loss": 10.9156, + "step": 10200 + }, + { + "epoch": 0.04195993640829055, + "grad_norm": 2.3835198879241943, + "learning_rate": 0.003787499999999695, + "loss": 10.8281, + "step": 10300 + }, + { + "epoch": 0.042367314431672014, + "grad_norm": 5.180380821228027, + "learning_rate": 0.0037999999999996873, + "loss": 10.8489, + "step": 10400 + }, + { + "epoch": 0.04277469245505347, + "grad_norm": 3.258373975753784, + "learning_rate": 0.003812499999999687, + "loss": 10.8193, + "step": 10500 + }, + { + "epoch": 0.043182070478434934, + "grad_norm": 2.7809267044067383, + "learning_rate": 0.0038249999999996833, + "loss": 10.7944, + "step": 10600 + }, + { + "epoch": 0.0435894485018164, + "grad_norm": 3.132779121398926, + "learning_rate": 0.0038374999999996823, + "loss": 10.8097, + "step": 10700 + }, + { + "epoch": 0.04399682652519786, + "grad_norm": 3.2742056846618652, + "learning_rate": 0.0038499999999996775, + "loss": 10.7187, + "step": 10800 + }, + { + "epoch": 0.044404204548579324, + "grad_norm": 2.2164018154144287, + "learning_rate": 0.0038624999999996735, + "loss": 10.7667, + "step": 10900 + }, + { + "epoch": 0.04481158257196078, + "grad_norm": 2.96410870552063, + "learning_rate": 0.0038749999999996712, + "loss": 10.7232, + "step": 11000 + }, + { + "epoch": 0.04481158257196078, + "eval_MaskedAccuracy": 0.3850977976517477, + "eval_loss": 2.2640016078948975, + "eval_runtime": 545.8539, + "eval_samples_per_second": 116.288, + "eval_steps_per_second": 0.454, + "step": 11000 + }, + { + "epoch": 0.04521896059534224, + "grad_norm": 4.5663957595825195, + "learning_rate": 0.0038874999999996686, + "loss": 10.7144, + "step": 11100 + }, + { + "epoch": 0.04562633861872371, + "grad_norm": 2.1517741680145264, + "learning_rate": 0.003899999999999665, + "loss": 10.6831, + "step": 11200 + }, + { + "epoch": 0.04603371664210517, + "grad_norm": 2.739794969558716, + "learning_rate": 0.003912499999999662, + "loss": 10.6677, + "step": 11300 + }, + { + "epoch": 0.046441094665486626, + "grad_norm": 1.7963323593139648, + "learning_rate": 0.003924999999999659, + "loss": 10.6297, + "step": 11400 + }, + { + "epoch": 0.04684847268886809, + "grad_norm": 4.963115215301514, + "learning_rate": 0.0039374999999996566, + "loss": 10.6341, + "step": 11500 + }, + { + "epoch": 0.04725585071224955, + "grad_norm": 4.522215366363525, + "learning_rate": 0.0039499999999996595, + "loss": 10.5891, + "step": 11600 + }, + { + "epoch": 0.047663228735631016, + "grad_norm": 1.9597105979919434, + "learning_rate": 0.00396249999999966, + "loss": 10.572, + "step": 11700 + }, + { + "epoch": 0.04807060675901247, + "grad_norm": 3.4971799850463867, + "learning_rate": 0.003974999999999655, + "loss": 10.5531, + "step": 11800 + }, + { + "epoch": 0.048477984782393936, + "grad_norm": 3.4723501205444336, + "learning_rate": 0.003987499999999656, + "loss": 10.527, + "step": 11900 + }, + { + "epoch": 0.0488853628057754, + "grad_norm": 3.51145339012146, + "learning_rate": 0.003999999999999657, + "loss": 10.5652, + "step": 12000 + }, + { + "epoch": 0.0488853628057754, + "eval_MaskedAccuracy": 0.3894803210237721, + "eval_loss": 2.252936840057373, + "eval_runtime": 492.0122, + "eval_samples_per_second": 129.013, + "eval_steps_per_second": 0.504, + "step": 12000 + }, + { + "epoch": 0.04929274082915686, + "grad_norm": 4.758820056915283, + "learning_rate": 0.004012499999999649, + "loss": 10.5526, + "step": 12100 + }, + { + "epoch": 0.04970011885253832, + "grad_norm": 4.174615383148193, + "learning_rate": 0.00402499999999965, + "loss": 10.5421, + "step": 12200 + }, + { + "epoch": 0.05010749687591978, + "grad_norm": 3.07735013961792, + "learning_rate": 0.004037499999999652, + "loss": 10.4709, + "step": 12300 + }, + { + "epoch": 0.050514874899301246, + "grad_norm": 2.38267183303833, + "learning_rate": 0.00404999999999965, + "loss": 10.487, + "step": 12400 + }, + { + "epoch": 0.05092225292268271, + "grad_norm": 9.329970359802246, + "learning_rate": 0.004062499999999646, + "loss": 10.4827, + "step": 12500 + }, + { + "epoch": 0.051329630946064166, + "grad_norm": 1.8156367540359497, + "learning_rate": 0.004074999999999641, + "loss": 10.4427, + "step": 12600 + }, + { + "epoch": 0.05173700896944563, + "grad_norm": 2.2023582458496094, + "learning_rate": 0.004087499999999641, + "loss": 10.4732, + "step": 12700 + }, + { + "epoch": 0.05214438699282709, + "grad_norm": 6.574010372161865, + "learning_rate": 0.004099999999999645, + "loss": 10.4458, + "step": 12800 + }, + { + "epoch": 0.052551765016208556, + "grad_norm": 2.2283875942230225, + "learning_rate": 0.004112499999999646, + "loss": 10.4411, + "step": 12900 + }, + { + "epoch": 0.05295914303959001, + "grad_norm": 2.2073442935943604, + "learning_rate": 0.004124999999999645, + "loss": 10.4139, + "step": 13000 + }, + { + "epoch": 0.05295914303959001, + "eval_MaskedAccuracy": 0.3959563478200268, + "eval_loss": 2.1902289390563965, + "eval_runtime": 503.6059, + "eval_samples_per_second": 126.043, + "eval_steps_per_second": 0.492, + "step": 13000 + }, + { + "epoch": 0.053366521062971475, + "grad_norm": 1.9398807287216187, + "learning_rate": 0.0041374999999996415, + "loss": 10.3892, + "step": 13100 + }, + { + "epoch": 0.05377389908635294, + "grad_norm": 1.7884042263031006, + "learning_rate": 0.004149999999999641, + "loss": 10.4056, + "step": 13200 + }, + { + "epoch": 0.0541812771097344, + "grad_norm": 1.9226492643356323, + "learning_rate": 0.004162499999999636, + "loss": 10.3725, + "step": 13300 + }, + { + "epoch": 0.05458865513311586, + "grad_norm": 3.9229259490966797, + "learning_rate": 0.004174999999999633, + "loss": 10.3876, + "step": 13400 + }, + { + "epoch": 0.05499603315649732, + "grad_norm": 3.9717321395874023, + "learning_rate": 0.004187499999999628, + "loss": 10.3371, + "step": 13500 + }, + { + "epoch": 0.055403411179878785, + "grad_norm": 2.2701687812805176, + "learning_rate": 0.004199999999999622, + "loss": 10.3493, + "step": 13600 + }, + { + "epoch": 0.05581078920326025, + "grad_norm": 2.833211898803711, + "learning_rate": 0.004212499999999617, + "loss": 10.3324, + "step": 13700 + }, + { + "epoch": 0.056218167226641705, + "grad_norm": 2.876721143722534, + "learning_rate": 0.004224999999999617, + "loss": 10.3139, + "step": 13800 + }, + { + "epoch": 0.05662554525002317, + "grad_norm": 2.396615982055664, + "learning_rate": 0.004237499999999612, + "loss": 10.2732, + "step": 13900 + }, + { + "epoch": 0.05703292327340463, + "grad_norm": 2.2113237380981445, + "learning_rate": 0.004249999999999605, + "loss": 10.2791, + "step": 14000 + }, + { + "epoch": 0.05703292327340463, + "eval_MaskedAccuracy": 0.3979616627594577, + "eval_loss": 2.1914055347442627, + "eval_runtime": 428.8843, + "eval_samples_per_second": 148.003, + "eval_steps_per_second": 0.578, + "step": 14000 + }, + { + "epoch": 0.057440301296786095, + "grad_norm": 2.713885545730591, + "learning_rate": 0.004262499999999602, + "loss": 10.2823, + "step": 14100 + }, + { + "epoch": 0.05784767932016755, + "grad_norm": 6.887316703796387, + "learning_rate": 0.004274999999999597, + "loss": 10.3201, + "step": 14200 + }, + { + "epoch": 0.058255057343549015, + "grad_norm": 2.308656930923462, + "learning_rate": 0.004287499999999591, + "loss": 10.2849, + "step": 14300 + }, + { + "epoch": 0.05866243536693048, + "grad_norm": 3.651346206665039, + "learning_rate": 0.004299999999999582, + "loss": 10.2324, + "step": 14400 + }, + { + "epoch": 0.05906981339031194, + "grad_norm": 2.9673140048980713, + "learning_rate": 0.004312499999999577, + "loss": 10.2551, + "step": 14500 + }, + { + "epoch": 0.059477191413693405, + "grad_norm": 2.1455464363098145, + "learning_rate": 0.004324999999999574, + "loss": 10.2204, + "step": 14600 + }, + { + "epoch": 0.05988456943707486, + "grad_norm": 2.9998409748077393, + "learning_rate": 0.004337499999999567, + "loss": 10.1696, + "step": 14700 + }, + { + "epoch": 0.060291947460456324, + "grad_norm": 2.5606961250305176, + "learning_rate": 0.004349999999999561, + "loss": 10.1846, + "step": 14800 + }, + { + "epoch": 0.06069932548383779, + "grad_norm": 3.6551334857940674, + "learning_rate": 0.004362499999999559, + "loss": 10.1903, + "step": 14900 + }, + { + "epoch": 0.06110670350721925, + "grad_norm": 3.281557559967041, + "learning_rate": 0.0043749999999995555, + "loss": 10.1966, + "step": 15000 + }, + { + "epoch": 0.06110670350721925, + "eval_MaskedAccuracy": 0.40231697911507147, + "eval_loss": 2.163734197616577, + "eval_runtime": 525.3812, + "eval_samples_per_second": 120.819, + "eval_steps_per_second": 0.472, + "step": 15000 + }, + { + "epoch": 0.06151408153060071, + "grad_norm": 2.2004635334014893, + "learning_rate": 0.004387499999999551, + "loss": 10.1657, + "step": 15100 + }, + { + "epoch": 0.06192145955398217, + "grad_norm": 4.036062717437744, + "learning_rate": 0.004399999999999546, + "loss": 10.2044, + "step": 15200 + }, + { + "epoch": 0.062328837577363634, + "grad_norm": 3.3084022998809814, + "learning_rate": 0.004412499999999543, + "loss": 10.1388, + "step": 15300 + }, + { + "epoch": 0.06273621560074509, + "grad_norm": 5.075527191162109, + "learning_rate": 0.004424999999999538, + "loss": 10.1232, + "step": 15400 + }, + { + "epoch": 0.06314359362412655, + "grad_norm": 2.4572486877441406, + "learning_rate": 0.004437499999999531, + "loss": 10.1193, + "step": 15500 + }, + { + "epoch": 0.06355097164750802, + "grad_norm": 2.4803900718688965, + "learning_rate": 0.004449999999999525, + "loss": 10.1271, + "step": 15600 + }, + { + "epoch": 0.06395834967088948, + "grad_norm": 2.1690118312835693, + "learning_rate": 0.004462499999999522, + "loss": 10.089, + "step": 15700 + }, + { + "epoch": 0.06436572769427094, + "grad_norm": 2.363934278488159, + "learning_rate": 0.004474999999999518, + "loss": 10.0945, + "step": 15800 + }, + { + "epoch": 0.06477310571765241, + "grad_norm": 3.3021128177642822, + "learning_rate": 0.004487499999999511, + "loss": 10.1015, + "step": 15900 + }, + { + "epoch": 0.06518048374103387, + "grad_norm": 3.0676352977752686, + "learning_rate": 0.00449999999999951, + "loss": 10.055, + "step": 16000 + }, + { + "epoch": 0.06518048374103387, + "eval_MaskedAccuracy": 0.4079402003894328, + "eval_loss": 2.1536672115325928, + "eval_runtime": 538.1235, + "eval_samples_per_second": 117.958, + "eval_steps_per_second": 0.461, + "step": 16000 + }, + { + "epoch": 0.06558786176441532, + "grad_norm": 2.4352786540985107, + "learning_rate": 0.004512499999999503, + "loss": 10.0648, + "step": 16100 + }, + { + "epoch": 0.06599523978779678, + "grad_norm": 3.645604133605957, + "learning_rate": 0.004524999999999504, + "loss": 10.026, + "step": 16200 + }, + { + "epoch": 0.06640261781117825, + "grad_norm": 2.6965138912200928, + "learning_rate": 0.004537499999999495, + "loss": 10.0569, + "step": 16300 + }, + { + "epoch": 0.06680999583455971, + "grad_norm": 2.6536505222320557, + "learning_rate": 0.004549999999999481, + "loss": 10.0487, + "step": 16400 + }, + { + "epoch": 0.06721737385794117, + "grad_norm": 2.891232490539551, + "learning_rate": 0.0045624999999994715, + "loss": 10.0514, + "step": 16500 + }, + { + "epoch": 0.06762475188132264, + "grad_norm": 2.675612688064575, + "learning_rate": 0.0045749999999994675, + "loss": 10.016, + "step": 16600 + }, + { + "epoch": 0.0680321299047041, + "grad_norm": 4.82465934753418, + "learning_rate": 0.004587499999999466, + "loss": 10.0081, + "step": 16700 + }, + { + "epoch": 0.06843950792808556, + "grad_norm": 2.8379580974578857, + "learning_rate": 0.004599999999999462, + "loss": 10.0316, + "step": 16800 + }, + { + "epoch": 0.06884688595146701, + "grad_norm": 2.4882845878601074, + "learning_rate": 0.0046124999999994564, + "loss": 9.991, + "step": 16900 + }, + { + "epoch": 0.06925426397484848, + "grad_norm": 1.8132230043411255, + "learning_rate": 0.004624999999999449, + "loss": 9.9832, + "step": 17000 + }, + { + "epoch": 0.06925426397484848, + "eval_MaskedAccuracy": 0.41064153600548414, + "eval_loss": 2.115696430206299, + "eval_runtime": 591.8965, + "eval_samples_per_second": 107.242, + "eval_steps_per_second": 0.419, + "step": 17000 + }, + { + "epoch": 0.06966164199822994, + "grad_norm": 2.6878249645233154, + "learning_rate": 0.004637499999999442, + "loss": 9.9764, + "step": 17100 + }, + { + "epoch": 0.0700690200216114, + "grad_norm": 1.909752607345581, + "learning_rate": 0.00464999999999944, + "loss": 10.0211, + "step": 17200 + }, + { + "epoch": 0.07047639804499287, + "grad_norm": 2.974292516708374, + "learning_rate": 0.00466249999999944, + "loss": 9.9357, + "step": 17300 + }, + { + "epoch": 0.07088377606837433, + "grad_norm": 2.8411359786987305, + "learning_rate": 0.004674999999999435, + "loss": 9.9466, + "step": 17400 + }, + { + "epoch": 0.07129115409175579, + "grad_norm": 1.8513596057891846, + "learning_rate": 0.004687499999999431, + "loss": 9.9352, + "step": 17500 + }, + { + "epoch": 0.07169853211513726, + "grad_norm": 2.7194032669067383, + "learning_rate": 0.004699999999999424, + "loss": 9.9283, + "step": 17600 + }, + { + "epoch": 0.0721059101385187, + "grad_norm": 3.459972858428955, + "learning_rate": 0.004712499999999414, + "loss": 9.9118, + "step": 17700 + }, + { + "epoch": 0.07251328816190017, + "grad_norm": 1.906203031539917, + "learning_rate": 0.004724999999999409, + "loss": 9.9149, + "step": 17800 + }, + { + "epoch": 0.07292066618528163, + "grad_norm": 2.151893377304077, + "learning_rate": 0.0047374999999993984, + "loss": 9.9155, + "step": 17900 + }, + { + "epoch": 0.0733280442086631, + "grad_norm": 3.4184906482696533, + "learning_rate": 0.004749999999999393, + "loss": 9.8929, + "step": 18000 + }, + { + "epoch": 0.0733280442086631, + "eval_MaskedAccuracy": 0.41359918387247874, + "eval_loss": 2.0913889408111572, + "eval_runtime": 477.7729, + "eval_samples_per_second": 132.858, + "eval_steps_per_second": 0.519, + "step": 18000 + }, + { + "epoch": 0.07373542223204456, + "grad_norm": 3.0248520374298096, + "learning_rate": 0.004762499999999384, + "loss": 9.9187, + "step": 18100 + }, + { + "epoch": 0.07414280025542602, + "grad_norm": 3.433152675628662, + "learning_rate": 0.004774999999999384, + "loss": 9.9119, + "step": 18200 + }, + { + "epoch": 0.07455017827880749, + "grad_norm": 1.9969996213912964, + "learning_rate": 0.00478749999999938, + "loss": 9.8964, + "step": 18300 + }, + { + "epoch": 0.07495755630218895, + "grad_norm": 2.542823553085327, + "learning_rate": 0.004799999999999372, + "loss": 9.9127, + "step": 18400 + }, + { + "epoch": 0.07536493432557041, + "grad_norm": 2.1811461448669434, + "learning_rate": 0.004812499999999362, + "loss": 9.9061, + "step": 18500 + }, + { + "epoch": 0.07577231234895186, + "grad_norm": 2.4389021396636963, + "learning_rate": 0.004824999999999358, + "loss": 9.8594, + "step": 18600 + }, + { + "epoch": 0.07617969037233333, + "grad_norm": 1.6460280418395996, + "learning_rate": 0.004837499999999354, + "loss": 9.8418, + "step": 18700 + }, + { + "epoch": 0.07658706839571479, + "grad_norm": 3.33437442779541, + "learning_rate": 0.00484999999999935, + "loss": 9.8456, + "step": 18800 + }, + { + "epoch": 0.07699444641909625, + "grad_norm": 2.103670120239258, + "learning_rate": 0.004862499999999345, + "loss": 9.8474, + "step": 18900 + }, + { + "epoch": 0.07740182444247772, + "grad_norm": 19.984230041503906, + "learning_rate": 0.004874999999999335, + "loss": 9.8543, + "step": 19000 + }, + { + "epoch": 0.07740182444247772, + "eval_MaskedAccuracy": 0.41617825892081933, + "eval_loss": 2.082752227783203, + "eval_runtime": 497.0849, + "eval_samples_per_second": 127.697, + "eval_steps_per_second": 0.499, + "step": 19000 + }, + { + "epoch": 0.07780920246585918, + "grad_norm": 3.1915500164031982, + "learning_rate": 0.004887499999999324, + "loss": 9.8487, + "step": 19100 + }, + { + "epoch": 0.07821658048924064, + "grad_norm": 3.868563652038574, + "learning_rate": 0.004899999999999312, + "loss": 9.8177, + "step": 19200 + }, + { + "epoch": 0.0786239585126221, + "grad_norm": 1.9810203313827515, + "learning_rate": 0.0049124999999993054, + "loss": 9.8262, + "step": 19300 + }, + { + "epoch": 0.07903133653600355, + "grad_norm": 8.838945388793945, + "learning_rate": 0.004924999999999296, + "loss": 9.8182, + "step": 19400 + }, + { + "epoch": 0.07943871455938502, + "grad_norm": 2.2315778732299805, + "learning_rate": 0.004937499999999295, + "loss": 9.8063, + "step": 19500 + }, + { + "epoch": 0.07984609258276648, + "grad_norm": 3.6076674461364746, + "learning_rate": 0.004949999999999293, + "loss": 9.7827, + "step": 19600 + }, + { + "epoch": 0.08025347060614794, + "grad_norm": 1.9593068361282349, + "learning_rate": 0.004962499999999282, + "loss": 9.772, + "step": 19700 + }, + { + "epoch": 0.08066084862952941, + "grad_norm": 5.378314018249512, + "learning_rate": 0.004974999999999273, + "loss": 9.8082, + "step": 19800 + }, + { + "epoch": 0.08106822665291087, + "grad_norm": 1.7450096607208252, + "learning_rate": 0.004987499999999261, + "loss": 9.7298, + "step": 19900 + }, + { + "epoch": 0.08147560467629233, + "grad_norm": 2.121546983718872, + "learning_rate": 0.005, + "loss": 9.7705, + "step": 20000 + }, + { + "epoch": 0.08147560467629233, + "eval_MaskedAccuracy": 0.4192546410315609, + "eval_loss": 2.068751573562622, + "eval_runtime": 537.5828, + "eval_samples_per_second": 118.077, + "eval_steps_per_second": 0.461, + "step": 20000 + }, + { + "epoch": 0.0818829826996738, + "grad_norm": 3.6601402759552, + "learning_rate": 0.004999999950679656, + "loss": 9.7526, + "step": 20100 + }, + { + "epoch": 0.08229036072305525, + "grad_norm": 4.677398204803467, + "learning_rate": 0.004999999802718617, + "loss": 9.7548, + "step": 20200 + }, + { + "epoch": 0.08269773874643671, + "grad_norm": 1.8637826442718506, + "learning_rate": 0.0049999995561169075, + "loss": 9.7575, + "step": 20300 + }, + { + "epoch": 0.08310511676981817, + "grad_norm": 2.215644598007202, + "learning_rate": 0.004999999210874492, + "loss": 9.7453, + "step": 20400 + }, + { + "epoch": 0.08351249479319964, + "grad_norm": 3.032287359237671, + "learning_rate": 0.004999998766991425, + "loss": 9.7357, + "step": 20500 + }, + { + "epoch": 0.0839198728165811, + "grad_norm": 4.440647125244141, + "learning_rate": 0.004999998224467713, + "loss": 9.7288, + "step": 20600 + }, + { + "epoch": 0.08432725083996256, + "grad_norm": 2.7516491413116455, + "learning_rate": 0.004999997583303374, + "loss": 9.718, + "step": 20700 + }, + { + "epoch": 0.08473462886334403, + "grad_norm": 3.037583112716675, + "learning_rate": 0.004999996843498439, + "loss": 9.6802, + "step": 20800 + }, + { + "epoch": 0.08514200688672549, + "grad_norm": 2.4326248168945312, + "learning_rate": 0.004999996005052938, + "loss": 9.7056, + "step": 20900 + }, + { + "epoch": 0.08554938491010694, + "grad_norm": 2.9642679691314697, + "learning_rate": 0.0049999950679669, + "loss": 9.6847, + "step": 21000 + }, + { + "epoch": 0.08554938491010694, + "eval_MaskedAccuracy": 0.41856565694952075, + "eval_loss": 2.056820869445801, + "eval_runtime": 591.4303, + "eval_samples_per_second": 107.326, + "eval_steps_per_second": 0.419, + "step": 21000 + }, + { + "epoch": 0.0859567629334884, + "grad_norm": 5.9169816970825195, + "learning_rate": 0.004999994032240362, + "loss": 9.7047, + "step": 21100 + }, + { + "epoch": 0.08636414095686987, + "grad_norm": 3.5219688415527344, + "learning_rate": 0.004999992897873365, + "loss": 9.6912, + "step": 21200 + }, + { + "epoch": 0.08677151898025133, + "grad_norm": 2.1443097591400146, + "learning_rate": 0.004999991664865955, + "loss": 9.6986, + "step": 21300 + }, + { + "epoch": 0.0871788970036328, + "grad_norm": 2.3986363410949707, + "learning_rate": 0.004999990333218182, + "loss": 9.6317, + "step": 21400 + }, + { + "epoch": 0.08758627502701426, + "grad_norm": 3.069018602371216, + "learning_rate": 0.004999988902930097, + "loss": 9.6689, + "step": 21500 + }, + { + "epoch": 0.08799365305039572, + "grad_norm": 2.816075325012207, + "learning_rate": 0.004999987374001755, + "loss": 9.6185, + "step": 21600 + }, + { + "epoch": 0.08840103107377718, + "grad_norm": 3.1795783042907715, + "learning_rate": 0.004999985746433214, + "loss": 9.6014, + "step": 21700 + }, + { + "epoch": 0.08880840909715865, + "grad_norm": 2.872256278991699, + "learning_rate": 0.004999984020224545, + "loss": 9.648, + "step": 21800 + }, + { + "epoch": 0.0892157871205401, + "grad_norm": 1.5614632368087769, + "learning_rate": 0.004999982195375815, + "loss": 9.6818, + "step": 21900 + }, + { + "epoch": 0.08962316514392156, + "grad_norm": 10.637242317199707, + "learning_rate": 0.004999980271887092, + "loss": 9.6328, + "step": 22000 + }, + { + "epoch": 0.08962316514392156, + "eval_MaskedAccuracy": 0.4240338106169538, + "eval_loss": 2.0506956577301025, + "eval_runtime": 454.9073, + "eval_samples_per_second": 139.536, + "eval_steps_per_second": 0.545, + "step": 22000 + }, + { + "epoch": 0.09003054316730302, + "grad_norm": 3.350553512573242, + "learning_rate": 0.004999978249758461, + "loss": 9.6098, + "step": 22100 + }, + { + "epoch": 0.09043792119068449, + "grad_norm": 7.316722869873047, + "learning_rate": 0.004999976128989992, + "loss": 9.6121, + "step": 22200 + }, + { + "epoch": 0.09084529921406595, + "grad_norm": 2.7911298274993896, + "learning_rate": 0.00499997390958177, + "loss": 9.6148, + "step": 22300 + }, + { + "epoch": 0.09125267723744741, + "grad_norm": 2.023620843887329, + "learning_rate": 0.004999971591533885, + "loss": 9.5944, + "step": 22400 + }, + { + "epoch": 0.09166005526082888, + "grad_norm": 3.6603286266326904, + "learning_rate": 0.004999969174846427, + "loss": 9.597, + "step": 22500 + }, + { + "epoch": 0.09206743328421034, + "grad_norm": 1.7879774570465088, + "learning_rate": 0.004999966659519497, + "loss": 9.6179, + "step": 22600 + }, + { + "epoch": 0.09247481130759179, + "grad_norm": 2.120997428894043, + "learning_rate": 0.004999964045553194, + "loss": 9.5787, + "step": 22700 + }, + { + "epoch": 0.09288218933097325, + "grad_norm": 3.384791851043701, + "learning_rate": 0.00499996133294762, + "loss": 9.5839, + "step": 22800 + }, + { + "epoch": 0.09328956735435472, + "grad_norm": 3.9101996421813965, + "learning_rate": 0.004999958521702876, + "loss": 9.593, + "step": 22900 + }, + { + "epoch": 0.09369694537773618, + "grad_norm": 3.653648614883423, + "learning_rate": 0.00499995561181908, + "loss": 9.5407, + "step": 23000 + }, + { + "epoch": 0.09369694537773618, + "eval_MaskedAccuracy": 0.42734800606093365, + "eval_loss": 2.0217700004577637, + "eval_runtime": 553.2274, + "eval_samples_per_second": 114.738, + "eval_steps_per_second": 0.448, + "step": 23000 + }, + { + "epoch": 0.09410432340111764, + "grad_norm": 1.577781319618225, + "learning_rate": 0.004999952603296345, + "loss": 9.5353, + "step": 23100 + }, + { + "epoch": 0.0945117014244991, + "grad_norm": 4.443587303161621, + "learning_rate": 0.004999949496134789, + "loss": 9.5974, + "step": 23200 + }, + { + "epoch": 0.09491907944788057, + "grad_norm": 3.1883552074432373, + "learning_rate": 0.004999946290334532, + "loss": 9.5262, + "step": 23300 + }, + { + "epoch": 0.09532645747126203, + "grad_norm": 3.181093215942383, + "learning_rate": 0.004999942985895707, + "loss": 9.5158, + "step": 23400 + }, + { + "epoch": 0.09573383549464348, + "grad_norm": 1.8355048894882202, + "learning_rate": 0.004999939582818444, + "loss": 9.5269, + "step": 23500 + }, + { + "epoch": 0.09614121351802495, + "grad_norm": 2.4568188190460205, + "learning_rate": 0.004999936081102877, + "loss": 9.5686, + "step": 23600 + }, + { + "epoch": 0.09654859154140641, + "grad_norm": 1.7274566888809204, + "learning_rate": 0.004999932480749139, + "loss": 9.5021, + "step": 23700 + }, + { + "epoch": 0.09695596956478787, + "grad_norm": 1.916005253791809, + "learning_rate": 0.004999928781757373, + "loss": 9.4656, + "step": 23800 + }, + { + "epoch": 0.09736334758816934, + "grad_norm": 2.241408109664917, + "learning_rate": 0.004999924984127735, + "loss": 9.5168, + "step": 23900 + }, + { + "epoch": 0.0977707256115508, + "grad_norm": 1.5999352931976318, + "learning_rate": 0.004999921087860366, + "loss": 9.539, + "step": 24000 + }, + { + "epoch": 0.0977707256115508, + "eval_MaskedAccuracy": 0.42926830999883303, + "eval_loss": 2.0102176666259766, + "eval_runtime": 525.098, + "eval_samples_per_second": 120.884, + "eval_steps_per_second": 0.472, + "step": 24000 + }, + { + "epoch": 0.09817810363493226, + "grad_norm": 3.433403253555298, + "learning_rate": 0.004999917092955428, + "loss": 9.5043, + "step": 24100 + }, + { + "epoch": 0.09858548165831373, + "grad_norm": 1.3292396068572998, + "learning_rate": 0.004999912999413068, + "loss": 9.4909, + "step": 24200 + }, + { + "epoch": 0.09899285968169518, + "grad_norm": 1.9488959312438965, + "learning_rate": 0.004999908807233453, + "loss": 9.5018, + "step": 24300 + }, + { + "epoch": 0.09940023770507664, + "grad_norm": 2.3496742248535156, + "learning_rate": 0.004999904516416743, + "loss": 9.5013, + "step": 24400 + }, + { + "epoch": 0.0998076157284581, + "grad_norm": 3.0133869647979736, + "learning_rate": 0.0049999001269631195, + "loss": 9.483, + "step": 24500 + }, + { + "epoch": 0.10021499375183957, + "grad_norm": 2.9716012477874756, + "learning_rate": 0.004999895638872754, + "loss": 9.4936, + "step": 24600 + }, + { + "epoch": 0.10062237177522103, + "grad_norm": 1.4587233066558838, + "learning_rate": 0.004999891052145823, + "loss": 9.4348, + "step": 24700 + }, + { + "epoch": 0.10102974979860249, + "grad_norm": 3.042893171310425, + "learning_rate": 0.004999886366782502, + "loss": 9.4609, + "step": 24800 + }, + { + "epoch": 0.10143712782198396, + "grad_norm": 4.598676681518555, + "learning_rate": 0.004999881582782976, + "loss": 9.4539, + "step": 24900 + }, + { + "epoch": 0.10184450584536542, + "grad_norm": 7.61506986618042, + "learning_rate": 0.0049998767001474395, + "loss": 9.4283, + "step": 25000 + }, + { + "epoch": 0.10184450584536542, + "eval_MaskedAccuracy": 0.4277697153857394, + "eval_loss": 2.019444465637207, + "eval_runtime": 599.1003, + "eval_samples_per_second": 105.952, + "eval_steps_per_second": 0.414, + "step": 25000 + }, + { + "epoch": 0.10225188386874687, + "grad_norm": 2.0222227573394775, + "learning_rate": 0.00499987171887608, + "loss": 9.4448, + "step": 25100 + }, + { + "epoch": 0.10265926189212833, + "grad_norm": 4.220450401306152, + "learning_rate": 0.004999866638969104, + "loss": 9.4704, + "step": 25200 + }, + { + "epoch": 0.1030666399155098, + "grad_norm": 3.21671199798584, + "learning_rate": 0.004999861460426701, + "loss": 9.4621, + "step": 25300 + }, + { + "epoch": 0.10347401793889126, + "grad_norm": 3.3255972862243652, + "learning_rate": 0.004999856183249082, + "loss": 9.4166, + "step": 25400 + }, + { + "epoch": 0.10388139596227272, + "grad_norm": 2.6986167430877686, + "learning_rate": 0.004999850807436452, + "loss": 9.4351, + "step": 25500 + }, + { + "epoch": 0.10428877398565418, + "grad_norm": 2.148319959640503, + "learning_rate": 0.004999845332989027, + "loss": 9.4018, + "step": 25600 + }, + { + "epoch": 0.10469615200903565, + "grad_norm": 3.1325294971466064, + "learning_rate": 0.004999839759907026, + "loss": 9.4271, + "step": 25700 + }, + { + "epoch": 0.10510353003241711, + "grad_norm": 2.4521164894104004, + "learning_rate": 0.0049998340881906665, + "loss": 9.3909, + "step": 25800 + }, + { + "epoch": 0.10551090805579857, + "grad_norm": 4.3904571533203125, + "learning_rate": 0.004999828317840176, + "loss": 9.431, + "step": 25900 + }, + { + "epoch": 0.10591828607918002, + "grad_norm": 3.555058717727661, + "learning_rate": 0.004999822448855773, + "loss": 9.3742, + "step": 26000 + }, + { + "epoch": 0.10591828607918002, + "eval_MaskedAccuracy": 0.43347176834705675, + "eval_loss": 1.9825259447097778, + "eval_runtime": 484.8715, + "eval_samples_per_second": 130.913, + "eval_steps_per_second": 0.511, + "step": 26000 + }, + { + "epoch": 0.10632566410256149, + "grad_norm": 2.8984215259552, + "learning_rate": 0.004999816481237698, + "loss": 9.3632, + "step": 26100 + }, + { + "epoch": 0.10673304212594295, + "grad_norm": 3.2958881855010986, + "learning_rate": 0.0049998104149861855, + "loss": 9.433, + "step": 26200 + }, + { + "epoch": 0.10714042014932441, + "grad_norm": 3.2342183589935303, + "learning_rate": 0.004999804250101472, + "loss": 9.3908, + "step": 26300 + }, + { + "epoch": 0.10754779817270588, + "grad_norm": 2.292680501937866, + "learning_rate": 0.004999797986583799, + "loss": 9.385, + "step": 26400 + }, + { + "epoch": 0.10795517619608734, + "grad_norm": 1.9405272006988525, + "learning_rate": 0.004999791624433422, + "loss": 9.4612, + "step": 26500 + }, + { + "epoch": 0.1083625542194688, + "grad_norm": 5.097912788391113, + "learning_rate": 0.004999785163650587, + "loss": 9.4082, + "step": 26600 + }, + { + "epoch": 0.10876993224285027, + "grad_norm": 3.781742572784424, + "learning_rate": 0.004999778604235553, + "loss": 9.4121, + "step": 26700 + }, + { + "epoch": 0.10917731026623172, + "grad_norm": 2.784597158432007, + "learning_rate": 0.004999771946188575, + "loss": 9.3863, + "step": 26800 + }, + { + "epoch": 0.10958468828961318, + "grad_norm": 2.1151978969573975, + "learning_rate": 0.004999765189509921, + "loss": 9.341, + "step": 26900 + }, + { + "epoch": 0.10999206631299464, + "grad_norm": 3.7644362449645996, + "learning_rate": 0.004999758334199855, + "loss": 9.37, + "step": 27000 + }, + { + "epoch": 0.10999206631299464, + "eval_MaskedAccuracy": 0.4307816741389779, + "eval_loss": 1.9972769021987915, + "eval_runtime": 427.1236, + "eval_samples_per_second": 148.613, + "eval_steps_per_second": 0.581, + "step": 27000 + }, + { + "epoch": 0.1103994443363761, + "grad_norm": 4.518658638000488, + "learning_rate": 0.004999751380258652, + "loss": 9.4088, + "step": 27100 + }, + { + "epoch": 0.11080682235975757, + "grad_norm": 3.079120397567749, + "learning_rate": 0.004999744327686584, + "loss": 9.3893, + "step": 27200 + }, + { + "epoch": 0.11121420038313903, + "grad_norm": 3.1319916248321533, + "learning_rate": 0.0049997371764839245, + "loss": 9.3885, + "step": 27300 + }, + { + "epoch": 0.1116215784065205, + "grad_norm": 2.0859529972076416, + "learning_rate": 0.004999729926650963, + "loss": 9.3566, + "step": 27400 + }, + { + "epoch": 0.11202895642990196, + "grad_norm": 4.668200492858887, + "learning_rate": 0.004999722578187986, + "loss": 9.3839, + "step": 27500 + }, + { + "epoch": 0.11243633445328341, + "grad_norm": 3.367041826248169, + "learning_rate": 0.00499971513109529, + "loss": 9.3382, + "step": 27600 + }, + { + "epoch": 0.11284371247666487, + "grad_norm": 3.040536880493164, + "learning_rate": 0.004999707585373157, + "loss": 9.3642, + "step": 27700 + }, + { + "epoch": 0.11325109050004634, + "grad_norm": 2.1498825550079346, + "learning_rate": 0.004999699941021894, + "loss": 9.3099, + "step": 27800 + }, + { + "epoch": 0.1136584685234278, + "grad_norm": 1.7926315069198608, + "learning_rate": 0.004999692198041797, + "loss": 9.3113, + "step": 27900 + }, + { + "epoch": 0.11406584654680926, + "grad_norm": 1.7881391048431396, + "learning_rate": 0.004999684356433179, + "loss": 9.4188, + "step": 28000 + }, + { + "epoch": 0.11406584654680926, + "eval_MaskedAccuracy": 0.4331004890207451, + "eval_loss": 1.993469476699829, + "eval_runtime": 520.9301, + "eval_samples_per_second": 121.851, + "eval_steps_per_second": 0.476, + "step": 28000 + }, + { + "epoch": 0.11447322457019073, + "grad_norm": 4.676214218139648, + "learning_rate": 0.0049996764161963385, + "loss": 9.3526, + "step": 28100 + }, + { + "epoch": 0.11488060259357219, + "grad_norm": 3.5479514598846436, + "learning_rate": 0.004999668377331597, + "loss": 9.3614, + "step": 28200 + }, + { + "epoch": 0.11528798061695365, + "grad_norm": 3.582327365875244, + "learning_rate": 0.004999660239839272, + "loss": 9.3217, + "step": 28300 + }, + { + "epoch": 0.1156953586403351, + "grad_norm": 1.5663789510726929, + "learning_rate": 0.004999652003719687, + "loss": 9.2903, + "step": 28400 + }, + { + "epoch": 0.11610273666371657, + "grad_norm": 2.083988904953003, + "learning_rate": 0.004999643668973165, + "loss": 9.3266, + "step": 28500 + }, + { + "epoch": 0.11651011468709803, + "grad_norm": 4.117608547210693, + "learning_rate": 0.0049996352356000365, + "loss": 9.307, + "step": 28600 + }, + { + "epoch": 0.11691749271047949, + "grad_norm": 1.3780686855316162, + "learning_rate": 0.0049996267036006336, + "loss": 9.3073, + "step": 28700 + }, + { + "epoch": 0.11732487073386096, + "grad_norm": 11.629694938659668, + "learning_rate": 0.004999618072975303, + "loss": 9.3441, + "step": 28800 + }, + { + "epoch": 0.11773224875724242, + "grad_norm": 2.022599220275879, + "learning_rate": 0.004999609343724368, + "loss": 9.2835, + "step": 28900 + }, + { + "epoch": 0.11813962678062388, + "grad_norm": 3.4412121772766113, + "learning_rate": 0.004999600515848186, + "loss": 9.2739, + "step": 29000 + }, + { + "epoch": 0.11813962678062388, + "eval_MaskedAccuracy": 0.43849306787282605, + "eval_loss": 1.9530013799667358, + "eval_runtime": 507.2801, + "eval_samples_per_second": 125.13, + "eval_steps_per_second": 0.489, + "step": 29000 + }, + { + "epoch": 0.11854700480400535, + "grad_norm": 3.3755948543548584, + "learning_rate": 0.004999591589347103, + "loss": 9.2399, + "step": 29100 + }, + { + "epoch": 0.11895438282738681, + "grad_norm": 1.9202914237976074, + "learning_rate": 0.004999582564221479, + "loss": 9.2946, + "step": 29200 + }, + { + "epoch": 0.11936176085076826, + "grad_norm": 3.1575686931610107, + "learning_rate": 0.004999573440471659, + "loss": 9.3542, + "step": 29300 + }, + { + "epoch": 0.11976913887414972, + "grad_norm": 5.3306732177734375, + "learning_rate": 0.0049995642180980125, + "loss": 9.3686, + "step": 29400 + }, + { + "epoch": 0.12017651689753119, + "grad_norm": 3.7781152725219727, + "learning_rate": 0.0049995548971008925, + "loss": 9.3227, + "step": 29500 + }, + { + "epoch": 0.12058389492091265, + "grad_norm": 3.726844549179077, + "learning_rate": 0.004999545477480681, + "loss": 9.3034, + "step": 29600 + }, + { + "epoch": 0.12099127294429411, + "grad_norm": 2.921781301498413, + "learning_rate": 0.004999535959237747, + "loss": 9.2338, + "step": 29700 + }, + { + "epoch": 0.12139865096767558, + "grad_norm": 2.6174159049987793, + "learning_rate": 0.004999526342372459, + "loss": 9.2237, + "step": 29800 + }, + { + "epoch": 0.12180602899105704, + "grad_norm": 3.727325201034546, + "learning_rate": 0.004999516626885207, + "loss": 9.2219, + "step": 29900 + }, + { + "epoch": 0.1222134070144385, + "grad_norm": 3.3698318004608154, + "learning_rate": 0.004999506812776362, + "loss": 9.2405, + "step": 30000 + }, + { + "epoch": 0.1222134070144385, + "eval_MaskedAccuracy": 0.4407026905200056, + "eval_loss": 1.9574971199035645, + "eval_runtime": 588.8466, + "eval_samples_per_second": 107.797, + "eval_steps_per_second": 0.421, + "step": 30000 + }, + { + "epoch": 0.12262078503781995, + "grad_norm": 1.5510843992233276, + "learning_rate": 0.004999496900046327, + "loss": 9.2626, + "step": 30100 + }, + { + "epoch": 0.12302816306120141, + "grad_norm": 1.7259398698806763, + "learning_rate": 0.00499948688869549, + "loss": 9.2664, + "step": 30200 + }, + { + "epoch": 0.12343554108458288, + "grad_norm": 3.4563167095184326, + "learning_rate": 0.004999476778724248, + "loss": 9.2955, + "step": 30300 + }, + { + "epoch": 0.12384291910796434, + "grad_norm": 1.9410121440887451, + "learning_rate": 0.004999466570132995, + "loss": 9.2675, + "step": 30400 + }, + { + "epoch": 0.1242502971313458, + "grad_norm": 2.845703601837158, + "learning_rate": 0.004999456262922131, + "loss": 9.2495, + "step": 30500 + }, + { + "epoch": 0.12465767515472727, + "grad_norm": 2.4379312992095947, + "learning_rate": 0.004999445857092077, + "loss": 9.278, + "step": 30600 + }, + { + "epoch": 0.12506505317810873, + "grad_norm": 3.158965587615967, + "learning_rate": 0.004999435352643232, + "loss": 9.189, + "step": 30700 + }, + { + "epoch": 0.12547243120149018, + "grad_norm": 3.224649667739868, + "learning_rate": 0.004999424749576019, + "loss": 9.2487, + "step": 30800 + }, + { + "epoch": 0.12587980922487166, + "grad_norm": 3.579566240310669, + "learning_rate": 0.004999414047890854, + "loss": 9.2058, + "step": 30900 + }, + { + "epoch": 0.1262871872482531, + "grad_norm": 1.4517889022827148, + "learning_rate": 0.004999403247588162, + "loss": 9.1853, + "step": 31000 + }, + { + "epoch": 0.1262871872482531, + "eval_MaskedAccuracy": 0.44081633697535333, + "eval_loss": 1.9580498933792114, + "eval_runtime": 589.0477, + "eval_samples_per_second": 107.76, + "eval_steps_per_second": 0.421, + "step": 31000 + }, + { + "epoch": 0.12669456527163458, + "grad_norm": 3.9054057598114014, + "learning_rate": 0.004999392348668368, + "loss": 9.2766, + "step": 31100 + }, + { + "epoch": 0.12710194329501603, + "grad_norm": 2.204495906829834, + "learning_rate": 0.004999381351131904, + "loss": 9.2299, + "step": 31200 + }, + { + "epoch": 0.12750932131839748, + "grad_norm": 2.9038403034210205, + "learning_rate": 0.004999370254979204, + "loss": 9.1957, + "step": 31300 + }, + { + "epoch": 0.12791669934177896, + "grad_norm": 9.680912017822266, + "learning_rate": 0.004999359060210702, + "loss": 9.2389, + "step": 31400 + }, + { + "epoch": 0.1283240773651604, + "grad_norm": 1.493072748184204, + "learning_rate": 0.004999347766826851, + "loss": 9.2887, + "step": 31500 + }, + { + "epoch": 0.1287314553885419, + "grad_norm": 3.562403678894043, + "learning_rate": 0.004999336374828095, + "loss": 9.2845, + "step": 31600 + }, + { + "epoch": 0.12913883341192334, + "grad_norm": 3.176454544067383, + "learning_rate": 0.004999324884214878, + "loss": 9.2753, + "step": 31700 + }, + { + "epoch": 0.12954621143530481, + "grad_norm": 3.772649049758911, + "learning_rate": 0.004999313294987656, + "loss": 9.2608, + "step": 31800 + }, + { + "epoch": 0.12995358945868626, + "grad_norm": 3.224039316177368, + "learning_rate": 0.00499930160714689, + "loss": 9.182, + "step": 31900 + }, + { + "epoch": 0.13036096748206774, + "grad_norm": 3.68146014213562, + "learning_rate": 0.004999289820693044, + "loss": 9.1812, + "step": 32000 + }, + { + "epoch": 0.13036096748206774, + "eval_MaskedAccuracy": 0.4414966716646933, + "eval_loss": 1.9461902379989624, + "eval_runtime": 575.5966, + "eval_samples_per_second": 110.279, + "eval_steps_per_second": 0.431, + "step": 32000 + }, + { + "epoch": 0.1307683455054492, + "grad_norm": 3.2047019004821777, + "learning_rate": 0.004999277935626578, + "loss": 9.1784, + "step": 32100 + }, + { + "epoch": 0.13117572352883064, + "grad_norm": 3.677278518676758, + "learning_rate": 0.004999265951947974, + "loss": 9.157, + "step": 32200 + }, + { + "epoch": 0.13158310155221212, + "grad_norm": 3.956178665161133, + "learning_rate": 0.004999253869657687, + "loss": 9.1224, + "step": 32300 + }, + { + "epoch": 0.13199047957559357, + "grad_norm": 1.5574619770050049, + "learning_rate": 0.004999241688756209, + "loss": 9.2461, + "step": 32400 + }, + { + "epoch": 0.13239785759897504, + "grad_norm": 2.0814273357391357, + "learning_rate": 0.004999229409244015, + "loss": 9.2823, + "step": 32500 + }, + { + "epoch": 0.1328052356223565, + "grad_norm": 3.6148457527160645, + "learning_rate": 0.004999217031121593, + "loss": 9.1491, + "step": 32600 + }, + { + "epoch": 0.13321261364573797, + "grad_norm": 3.312826633453369, + "learning_rate": 0.00499920455438943, + "loss": 9.212, + "step": 32700 + }, + { + "epoch": 0.13361999166911942, + "grad_norm": 1.8035812377929688, + "learning_rate": 0.004999191979048022, + "loss": 9.1641, + "step": 32800 + }, + { + "epoch": 0.1340273696925009, + "grad_norm": 4.133431434631348, + "learning_rate": 0.004999179305097862, + "loss": 9.2661, + "step": 32900 + }, + { + "epoch": 0.13443474771588235, + "grad_norm": 3.4171595573425293, + "learning_rate": 0.004999166532539454, + "loss": 9.154, + "step": 33000 + }, + { + "epoch": 0.13443474771588235, + "eval_MaskedAccuracy": 0.444123885179477, + "eval_loss": 1.9266562461853027, + "eval_runtime": 444.8407, + "eval_samples_per_second": 142.694, + "eval_steps_per_second": 0.558, + "step": 33000 + }, + { + "epoch": 0.1348421257392638, + "grad_norm": 4.883106231689453, + "learning_rate": 0.004999153661373301, + "loss": 9.1874, + "step": 33100 + }, + { + "epoch": 0.13524950376264527, + "grad_norm": 3.6476919651031494, + "learning_rate": 0.0049991406915999145, + "loss": 9.163, + "step": 33200 + }, + { + "epoch": 0.13565688178602672, + "grad_norm": 2.720723867416382, + "learning_rate": 0.004999127623219813, + "loss": 9.1493, + "step": 33300 + }, + { + "epoch": 0.1360642598094082, + "grad_norm": 3.7319369316101074, + "learning_rate": 0.004999114456233506, + "loss": 9.1741, + "step": 33400 + }, + { + "epoch": 0.13647163783278965, + "grad_norm": 3.6861164569854736, + "learning_rate": 0.004999101190641518, + "loss": 9.1107, + "step": 33500 + }, + { + "epoch": 0.13687901585617113, + "grad_norm": 2.780924081802368, + "learning_rate": 0.004999087826444369, + "loss": 9.1452, + "step": 33600 + }, + { + "epoch": 0.13728639387955258, + "grad_norm": 2.3631842136383057, + "learning_rate": 0.004999074363642589, + "loss": 9.1763, + "step": 33700 + }, + { + "epoch": 0.13769377190293403, + "grad_norm": 3.5727689266204834, + "learning_rate": 0.004999060802236713, + "loss": 9.1375, + "step": 33800 + }, + { + "epoch": 0.1381011499263155, + "grad_norm": 2.269345998764038, + "learning_rate": 0.004999047142227271, + "loss": 9.143, + "step": 33900 + }, + { + "epoch": 0.13850852794969695, + "grad_norm": 7.34450101852417, + "learning_rate": 0.004999033383614803, + "loss": 9.162, + "step": 34000 + }, + { + "epoch": 0.13850852794969695, + "eval_MaskedAccuracy": 0.43828248619245724, + "eval_loss": 1.9571964740753174, + "eval_runtime": 495.8873, + "eval_samples_per_second": 128.005, + "eval_steps_per_second": 0.5, + "step": 34000 + }, + { + "epoch": 0.13891590597307843, + "grad_norm": 4.355815410614014, + "learning_rate": 0.004999019526399854, + "loss": 9.1878, + "step": 34100 + }, + { + "epoch": 0.13932328399645988, + "grad_norm": 4.175578594207764, + "learning_rate": 0.004999005570582981, + "loss": 9.132, + "step": 34200 + }, + { + "epoch": 0.13973066201984136, + "grad_norm": 3.2906596660614014, + "learning_rate": 0.004998991516164727, + "loss": 9.1537, + "step": 34300 + }, + { + "epoch": 0.1401380400432228, + "grad_norm": 3.8736915588378906, + "learning_rate": 0.0049989773631456515, + "loss": 9.1409, + "step": 34400 + }, + { + "epoch": 0.14054541806660428, + "grad_norm": 3.4141225814819336, + "learning_rate": 0.004998963111526308, + "loss": 9.1178, + "step": 34500 + }, + { + "epoch": 0.14095279608998573, + "grad_norm": 3.814049482345581, + "learning_rate": 0.004998948761307268, + "loss": 9.0791, + "step": 34600 + }, + { + "epoch": 0.14136017411336718, + "grad_norm": 3.3079888820648193, + "learning_rate": 0.004998934312489091, + "loss": 9.1364, + "step": 34700 + }, + { + "epoch": 0.14176755213674866, + "grad_norm": 1.326229453086853, + "learning_rate": 0.004998919765072354, + "loss": 9.199, + "step": 34800 + }, + { + "epoch": 0.1421749301601301, + "grad_norm": 2.8008034229278564, + "learning_rate": 0.004998905119057631, + "loss": 9.1457, + "step": 34900 + }, + { + "epoch": 0.14258230818351159, + "grad_norm": 4.717260837554932, + "learning_rate": 0.0049988903744454995, + "loss": 9.0929, + "step": 35000 + }, + { + "epoch": 0.14258230818351159, + "eval_MaskedAccuracy": 0.44587848230103844, + "eval_loss": 1.9244799613952637, + "eval_runtime": 654.16, + "eval_samples_per_second": 97.034, + "eval_steps_per_second": 0.379, + "step": 35000 + }, + { + "epoch": 0.14298968620689304, + "grad_norm": 2.6231470108032227, + "learning_rate": 0.004998875531236537, + "loss": 9.1243, + "step": 35100 + }, + { + "epoch": 0.1433970642302745, + "grad_norm": 3.066685199737549, + "learning_rate": 0.004998860589431345, + "loss": 9.0725, + "step": 35200 + }, + { + "epoch": 0.14380444225365596, + "grad_norm": 3.0163044929504395, + "learning_rate": 0.0049988455490305025, + "loss": 9.0527, + "step": 35300 + }, + { + "epoch": 0.1442118202770374, + "grad_norm": 3.432121515274048, + "learning_rate": 0.00499883041003461, + "loss": 9.0402, + "step": 35400 + }, + { + "epoch": 0.1446191983004189, + "grad_norm": 1.467442274093628, + "learning_rate": 0.0049988151724442605, + "loss": 9.0789, + "step": 35500 + }, + { + "epoch": 0.14502657632380034, + "grad_norm": 4.966704368591309, + "learning_rate": 0.004998799836260063, + "loss": 9.1025, + "step": 35600 + }, + { + "epoch": 0.14543395434718182, + "grad_norm": 4.940292835235596, + "learning_rate": 0.004998784401482619, + "loss": 9.0479, + "step": 35700 + }, + { + "epoch": 0.14584133237056326, + "grad_norm": 5.558600425720215, + "learning_rate": 0.004998768868112538, + "loss": 9.1459, + "step": 35800 + }, + { + "epoch": 0.14624871039394474, + "grad_norm": 1.7292475700378418, + "learning_rate": 0.00499875323615044, + "loss": 9.1045, + "step": 35900 + }, + { + "epoch": 0.1466560884173262, + "grad_norm": 1.3726286888122559, + "learning_rate": 0.00499873750559693, + "loss": 9.1893, + "step": 36000 + }, + { + "epoch": 0.1466560884173262, + "eval_MaskedAccuracy": 0.4421415294091426, + "eval_loss": 1.929854154586792, + "eval_runtime": 512.033, + "eval_samples_per_second": 123.969, + "eval_steps_per_second": 0.484, + "step": 36000 + }, + { + "epoch": 0.14706346644070767, + "grad_norm": 6.443113803863525, + "learning_rate": 0.004998721676452642, + "loss": 9.1884, + "step": 36100 + }, + { + "epoch": 0.14747084446408912, + "grad_norm": 5.186279773712158, + "learning_rate": 0.0049987057487181985, + "loss": 9.0583, + "step": 36200 + }, + { + "epoch": 0.14787822248747057, + "grad_norm": 2.67094349861145, + "learning_rate": 0.0049986897223942344, + "loss": 9.0881, + "step": 36300 + }, + { + "epoch": 0.14828560051085204, + "grad_norm": 3.5158534049987793, + "learning_rate": 0.004998673597481375, + "loss": 9.063, + "step": 36400 + }, + { + "epoch": 0.1486929785342335, + "grad_norm": 4.55438232421875, + "learning_rate": 0.004998657373980268, + "loss": 9.1245, + "step": 36500 + }, + { + "epoch": 0.14910035655761497, + "grad_norm": 3.9908504486083984, + "learning_rate": 0.004998641051891542, + "loss": 9.0492, + "step": 36600 + }, + { + "epoch": 0.14950773458099642, + "grad_norm": 2.482670307159424, + "learning_rate": 0.004998624631215852, + "loss": 9.1073, + "step": 36700 + }, + { + "epoch": 0.1499151126043779, + "grad_norm": 3.83728289604187, + "learning_rate": 0.004998608111953842, + "loss": 9.1892, + "step": 36800 + }, + { + "epoch": 0.15032249062775935, + "grad_norm": 4.212090969085693, + "learning_rate": 0.004998591494106167, + "loss": 9.1035, + "step": 36900 + }, + { + "epoch": 0.15072986865114082, + "grad_norm": 3.91371488571167, + "learning_rate": 0.004998574777673482, + "loss": 9.0584, + "step": 37000 + }, + { + "epoch": 0.15072986865114082, + "eval_MaskedAccuracy": 0.4479879046411305, + "eval_loss": 1.9146876335144043, + "eval_runtime": 589.2359, + "eval_samples_per_second": 107.726, + "eval_steps_per_second": 0.421, + "step": 37000 + }, + { + "epoch": 0.15113724667452227, + "grad_norm": 3.9847943782806396, + "learning_rate": 0.004998557962656451, + "loss": 9.1373, + "step": 37100 + }, + { + "epoch": 0.15154462469790372, + "grad_norm": 2.304649591445923, + "learning_rate": 0.004998541049055731, + "loss": 9.1021, + "step": 37200 + }, + { + "epoch": 0.1519520027212852, + "grad_norm": 2.305466890335083, + "learning_rate": 0.004998524036872005, + "loss": 9.0259, + "step": 37300 + }, + { + "epoch": 0.15235938074466665, + "grad_norm": 5.452589511871338, + "learning_rate": 0.004998506926105931, + "loss": 9.0077, + "step": 37400 + }, + { + "epoch": 0.15276675876804813, + "grad_norm": 3.1669082641601562, + "learning_rate": 0.004998489716758196, + "loss": 9.0085, + "step": 37500 + }, + { + "epoch": 0.15317413679142958, + "grad_norm": 2.483513355255127, + "learning_rate": 0.004998472408829474, + "loss": 9.0579, + "step": 37600 + }, + { + "epoch": 0.15358151481481105, + "grad_norm": 4.417145729064941, + "learning_rate": 0.004998455002320451, + "loss": 9.0336, + "step": 37700 + }, + { + "epoch": 0.1539888928381925, + "grad_norm": 3.7182934284210205, + "learning_rate": 0.004998437497231818, + "loss": 9.0433, + "step": 37800 + }, + { + "epoch": 0.15439627086157395, + "grad_norm": 3.135267734527588, + "learning_rate": 0.00499841989356426, + "loss": 9.0677, + "step": 37900 + }, + { + "epoch": 0.15480364888495543, + "grad_norm": 2.6014578342437744, + "learning_rate": 0.0049984021913184835, + "loss": 9.1458, + "step": 38000 + }, + { + "epoch": 0.15480364888495543, + "eval_MaskedAccuracy": 0.44533889755137634, + "eval_loss": 1.9252251386642456, + "eval_runtime": 479.8735, + "eval_samples_per_second": 132.277, + "eval_steps_per_second": 0.517, + "step": 38000 + }, + { + "epoch": 0.15521102690833688, + "grad_norm": 4.145866870880127, + "learning_rate": 0.0049983843904951855, + "loss": 9.0325, + "step": 38100 + }, + { + "epoch": 0.15561840493171836, + "grad_norm": 2.6572349071502686, + "learning_rate": 0.00499836649109506, + "loss": 8.9937, + "step": 38200 + }, + { + "epoch": 0.1560257829550998, + "grad_norm": 4.326454162597656, + "learning_rate": 0.004998348493118833, + "loss": 9.005, + "step": 38300 + }, + { + "epoch": 0.15643316097848128, + "grad_norm": 2.5977251529693604, + "learning_rate": 0.004998330396567195, + "loss": 9.0657, + "step": 38400 + }, + { + "epoch": 0.15684053900186273, + "grad_norm": 7.225991249084473, + "learning_rate": 0.004998312201440868, + "loss": 9.1711, + "step": 38500 + }, + { + "epoch": 0.1572479170252442, + "grad_norm": 1.0980727672576904, + "learning_rate": 0.0049982939077405815, + "loss": 9.1604, + "step": 38600 + }, + { + "epoch": 0.15765529504862566, + "grad_norm": 4.309670448303223, + "learning_rate": 0.004998275515467049, + "loss": 9.131, + "step": 38700 + }, + { + "epoch": 0.1580626730720071, + "grad_norm": 4.189974308013916, + "learning_rate": 0.004998257024621001, + "loss": 9.0594, + "step": 38800 + }, + { + "epoch": 0.1584700510953886, + "grad_norm": 3.872173309326172, + "learning_rate": 0.004998238435203172, + "loss": 8.999, + "step": 38900 + }, + { + "epoch": 0.15887742911877004, + "grad_norm": 8.60802936553955, + "learning_rate": 0.0049982197472142895, + "loss": 9.0032, + "step": 39000 + }, + { + "epoch": 0.15887742911877004, + "eval_MaskedAccuracy": 0.4432360188291969, + "eval_loss": 1.9270706176757812, + "eval_runtime": 453.8616, + "eval_samples_per_second": 139.858, + "eval_steps_per_second": 0.546, + "step": 39000 + }, + { + "epoch": 0.1592848071421515, + "grad_norm": 4.140707969665527, + "learning_rate": 0.004998200960655094, + "loss": 9.1444, + "step": 39100 + }, + { + "epoch": 0.15969218516553296, + "grad_norm": 3.9461541175842285, + "learning_rate": 0.0049981820755263286, + "loss": 9.0186, + "step": 39200 + }, + { + "epoch": 0.16009956318891444, + "grad_norm": 5.20798397064209, + "learning_rate": 0.004998163091828741, + "loss": 8.9638, + "step": 39300 + }, + { + "epoch": 0.1605069412122959, + "grad_norm": 1.8364992141723633, + "learning_rate": 0.00499814400956308, + "loss": 8.9612, + "step": 39400 + }, + { + "epoch": 0.16091431923567734, + "grad_norm": 4.967864990234375, + "learning_rate": 0.004998124828730113, + "loss": 9.0387, + "step": 39500 + }, + { + "epoch": 0.16132169725905882, + "grad_norm": 2.861976146697998, + "learning_rate": 0.004998105549330576, + "loss": 8.9825, + "step": 39600 + }, + { + "epoch": 0.16172907528244027, + "grad_norm": 25.292007446289062, + "learning_rate": 0.004998086171365244, + "loss": 9.0026, + "step": 39700 + }, + { + "epoch": 0.16213645330582174, + "grad_norm": 2.862931251525879, + "learning_rate": 0.004998066694834885, + "loss": 9.0044, + "step": 39800 + }, + { + "epoch": 0.1625438313292032, + "grad_norm": 4.256959438323975, + "learning_rate": 0.004998047119740263, + "loss": 8.9882, + "step": 39900 + }, + { + "epoch": 0.16295120935258467, + "grad_norm": 4.4340362548828125, + "learning_rate": 0.004998027446082152, + "loss": 9.0686, + "step": 40000 + }, + { + "epoch": 0.16295120935258467, + "eval_MaskedAccuracy": 0.44911529133323386, + "eval_loss": 1.9079149961471558, + "eval_runtime": 533.8636, + "eval_samples_per_second": 118.899, + "eval_steps_per_second": 0.465, + "step": 40000 + }, + { + "epoch": 0.16335858737596612, + "grad_norm": 4.338792324066162, + "learning_rate": 0.004998007673861338, + "loss": 8.9793, + "step": 40100 + }, + { + "epoch": 0.1637659653993476, + "grad_norm": 7.50832462310791, + "learning_rate": 0.004997987803078598, + "loss": 8.9996, + "step": 40200 + }, + { + "epoch": 0.16417334342272905, + "grad_norm": 5.61702823638916, + "learning_rate": 0.004997967833734717, + "loss": 9.0622, + "step": 40300 + }, + { + "epoch": 0.1645807214461105, + "grad_norm": 1.6239995956420898, + "learning_rate": 0.004997947765830481, + "loss": 9.1193, + "step": 40400 + }, + { + "epoch": 0.16498809946949197, + "grad_norm": 3.3413801193237305, + "learning_rate": 0.0049979275993666935, + "loss": 9.0051, + "step": 40500 + }, + { + "epoch": 0.16539547749287342, + "grad_norm": 4.474870681762695, + "learning_rate": 0.004997907334344139, + "loss": 8.9585, + "step": 40600 + }, + { + "epoch": 0.1658028555162549, + "grad_norm": 4.810618877410889, + "learning_rate": 0.004997886970763626, + "loss": 9.0324, + "step": 40700 + }, + { + "epoch": 0.16621023353963635, + "grad_norm": 3.7743003368377686, + "learning_rate": 0.0049978665086259574, + "loss": 9.0539, + "step": 40800 + }, + { + "epoch": 0.16661761156301783, + "grad_norm": 3.4350481033325195, + "learning_rate": 0.004997845947931944, + "loss": 8.9873, + "step": 40900 + }, + { + "epoch": 0.16702498958639928, + "grad_norm": 4.730776786804199, + "learning_rate": 0.004997825288682397, + "loss": 8.9994, + "step": 41000 + }, + { + "epoch": 0.16702498958639928, + "eval_MaskedAccuracy": 0.4477450902474816, + "eval_loss": 1.90780770778656, + "eval_runtime": 582.4452, + "eval_samples_per_second": 108.982, + "eval_steps_per_second": 0.426, + "step": 41000 + }, + { + "epoch": 0.16743236760978075, + "grad_norm": 5.645813941955566, + "learning_rate": 0.004997804530878132, + "loss": 8.9852, + "step": 41100 + }, + { + "epoch": 0.1678397456331622, + "grad_norm": 7.603764533996582, + "learning_rate": 0.004997783674519973, + "loss": 8.9917, + "step": 41200 + }, + { + "epoch": 0.16824712365654365, + "grad_norm": 3.657362461090088, + "learning_rate": 0.004997762719608744, + "loss": 9.0294, + "step": 41300 + }, + { + "epoch": 0.16865450167992513, + "grad_norm": 4.409435749053955, + "learning_rate": 0.00499774166614527, + "loss": 8.9524, + "step": 41400 + }, + { + "epoch": 0.16906187970330658, + "grad_norm": 4.743533611297607, + "learning_rate": 0.00499772051413039, + "loss": 8.9682, + "step": 41500 + }, + { + "epoch": 0.16946925772668806, + "grad_norm": 5.022209644317627, + "learning_rate": 0.004997699263564937, + "loss": 8.9024, + "step": 41600 + }, + { + "epoch": 0.1698766357500695, + "grad_norm": 4.124392032623291, + "learning_rate": 0.004997677914449744, + "loss": 8.9699, + "step": 41700 + }, + { + "epoch": 0.17028401377345098, + "grad_norm": 2.79866361618042, + "learning_rate": 0.004997656466785669, + "loss": 8.9582, + "step": 41800 + }, + { + "epoch": 0.17069139179683243, + "grad_norm": 4.4310221672058105, + "learning_rate": 0.00499763492057355, + "loss": 8.9343, + "step": 41900 + }, + { + "epoch": 0.17109876982021388, + "grad_norm": 1.807766079902649, + "learning_rate": 0.004997613275814238, + "loss": 9.0676, + "step": 42000 + }, + { + "epoch": 0.17109876982021388, + "eval_MaskedAccuracy": 0.44673300940251104, + "eval_loss": 1.9179065227508545, + "eval_runtime": 480.8601, + "eval_samples_per_second": 132.005, + "eval_steps_per_second": 0.516, + "step": 42000 + }, + { + "epoch": 0.17150614784359536, + "grad_norm": 3.888913154602051, + "learning_rate": 0.0049975915325085945, + "loss": 9.0499, + "step": 42100 + }, + { + "epoch": 0.1719135258669768, + "grad_norm": 5.673905849456787, + "learning_rate": 0.0049975696906574725, + "loss": 8.9339, + "step": 42200 + }, + { + "epoch": 0.17232090389035828, + "grad_norm": 8.747493743896484, + "learning_rate": 0.004997547750261739, + "loss": 8.9281, + "step": 42300 + }, + { + "epoch": 0.17272828191373973, + "grad_norm": 7.244274139404297, + "learning_rate": 0.0049975257113222624, + "loss": 9.0767, + "step": 42400 + }, + { + "epoch": 0.1731356599371212, + "grad_norm": 3.129075765609741, + "learning_rate": 0.004997503573839917, + "loss": 9.1039, + "step": 42500 + }, + { + "epoch": 0.17354303796050266, + "grad_norm": 3.9406702518463135, + "learning_rate": 0.004997481337815575, + "loss": 8.9543, + "step": 42600 + }, + { + "epoch": 0.17395041598388414, + "grad_norm": 2.617393732070923, + "learning_rate": 0.004997459003250109, + "loss": 8.9232, + "step": 42700 + }, + { + "epoch": 0.1743577940072656, + "grad_norm": 5.305755138397217, + "learning_rate": 0.004997436570144415, + "loss": 8.9391, + "step": 42800 + }, + { + "epoch": 0.17476517203064704, + "grad_norm": 5.731649398803711, + "learning_rate": 0.004997414038499373, + "loss": 9.0504, + "step": 42900 + }, + { + "epoch": 0.17517255005402851, + "grad_norm": 5.849124431610107, + "learning_rate": 0.004997391408315867, + "loss": 8.9588, + "step": 43000 + }, + { + "epoch": 0.17517255005402851, + "eval_MaskedAccuracy": 0.4417444098255367, + "eval_loss": 1.950426697731018, + "eval_runtime": 468.5442, + "eval_samples_per_second": 135.475, + "eval_steps_per_second": 0.529, + "step": 43000 + }, + { + "epoch": 0.17557992807740996, + "grad_norm": 4.830824851989746, + "learning_rate": 0.0049973686795948, + "loss": 9.0612, + "step": 43100 + }, + { + "epoch": 0.17598730610079144, + "grad_norm": 4.50791597366333, + "learning_rate": 0.004997345852337069, + "loss": 8.9523, + "step": 43200 + }, + { + "epoch": 0.1763946841241729, + "grad_norm": 4.371399402618408, + "learning_rate": 0.004997322926543576, + "loss": 8.8987, + "step": 43300 + }, + { + "epoch": 0.17680206214755437, + "grad_norm": 3.6514639854431152, + "learning_rate": 0.004997299902215233, + "loss": 8.9056, + "step": 43400 + }, + { + "epoch": 0.17720944017093582, + "grad_norm": 5.054758071899414, + "learning_rate": 0.004997276779352933, + "loss": 8.875, + "step": 43500 + }, + { + "epoch": 0.1776168181943173, + "grad_norm": 6.961836814880371, + "learning_rate": 0.004997253557957605, + "loss": 8.9972, + "step": 43600 + }, + { + "epoch": 0.17802419621769874, + "grad_norm": 1.7716807126998901, + "learning_rate": 0.004997230238030173, + "loss": 9.0609, + "step": 43700 + }, + { + "epoch": 0.1784315742410802, + "grad_norm": 4.531676769256592, + "learning_rate": 0.004997206819571546, + "loss": 9.0159, + "step": 43800 + }, + { + "epoch": 0.17883895226446167, + "grad_norm": 6.115238666534424, + "learning_rate": 0.0049971833025826565, + "loss": 8.9111, + "step": 43900 + }, + { + "epoch": 0.17924633028784312, + "grad_norm": 3.474392890930176, + "learning_rate": 0.004997159687064435, + "loss": 8.9168, + "step": 44000 + }, + { + "epoch": 0.17924633028784312, + "eval_MaskedAccuracy": 0.45380778193670457, + "eval_loss": 1.8783373832702637, + "eval_runtime": 510.3313, + "eval_samples_per_second": 124.382, + "eval_steps_per_second": 0.486, + "step": 44000 + }, + { + "epoch": 0.1796537083112246, + "grad_norm": 4.088130474090576, + "learning_rate": 0.004997135973017803, + "loss": 8.8733, + "step": 44100 + }, + { + "epoch": 0.18006108633460605, + "grad_norm": 5.331000328063965, + "learning_rate": 0.0049971121604437145, + "loss": 8.9507, + "step": 44200 + }, + { + "epoch": 0.18046846435798752, + "grad_norm": 3.348855495452881, + "learning_rate": 0.004997088249343102, + "loss": 8.9441, + "step": 44300 + }, + { + "epoch": 0.18087584238136897, + "grad_norm": 5.045528888702393, + "learning_rate": 0.004997064239716911, + "loss": 8.9874, + "step": 44400 + }, + { + "epoch": 0.18128322040475042, + "grad_norm": 2.4286417961120605, + "learning_rate": 0.004997040131566088, + "loss": 8.9286, + "step": 44500 + }, + { + "epoch": 0.1816905984281319, + "grad_norm": 4.78425407409668, + "learning_rate": 0.004997015924891591, + "loss": 8.8713, + "step": 44600 + }, + { + "epoch": 0.18209797645151335, + "grad_norm": 1.8142859935760498, + "learning_rate": 0.004996991619694378, + "loss": 8.9267, + "step": 44700 + }, + { + "epoch": 0.18250535447489483, + "grad_norm": 3.1893045902252197, + "learning_rate": 0.004996967215975412, + "loss": 8.9035, + "step": 44800 + }, + { + "epoch": 0.18291273249827628, + "grad_norm": 6.472596645355225, + "learning_rate": 0.00499694271373565, + "loss": 8.9118, + "step": 44900 + }, + { + "epoch": 0.18332011052165775, + "grad_norm": 4.773984909057617, + "learning_rate": 0.004996918112976068, + "loss": 9.0264, + "step": 45000 + }, + { + "epoch": 0.18332011052165775, + "eval_MaskedAccuracy": 0.4526010996035226, + "eval_loss": 1.8884249925613403, + "eval_runtime": 1591.7003, + "eval_samples_per_second": 39.879, + "eval_steps_per_second": 0.156, + "step": 45000 + }, + { + "epoch": 0.1837274885450392, + "grad_norm": 3.957942485809326, + "learning_rate": 0.004996893413697636, + "loss": 8.9141, + "step": 45100 + }, + { + "epoch": 0.18413486656842068, + "grad_norm": 4.710155010223389, + "learning_rate": 0.004996868615901332, + "loss": 8.8604, + "step": 45200 + }, + { + "epoch": 0.18454224459180213, + "grad_norm": 9.699567794799805, + "learning_rate": 0.004996843719588128, + "loss": 8.8876, + "step": 45300 + }, + { + "epoch": 0.18494962261518358, + "grad_norm": 2.785186529159546, + "learning_rate": 0.0049968187247590124, + "loss": 8.9351, + "step": 45400 + }, + { + "epoch": 0.18535700063856506, + "grad_norm": 5.596234321594238, + "learning_rate": 0.004996793631414983, + "loss": 8.9158, + "step": 45500 + }, + { + "epoch": 0.1857643786619465, + "grad_norm": 4.283908367156982, + "learning_rate": 0.00499676843955702, + "loss": 8.9738, + "step": 45600 + }, + { + "epoch": 0.18617175668532798, + "grad_norm": 6.712310791015625, + "learning_rate": 0.004996743149186122, + "loss": 8.8925, + "step": 45700 + }, + { + "epoch": 0.18657913470870943, + "grad_norm": 4.3184895515441895, + "learning_rate": 0.004996717760303289, + "loss": 8.8861, + "step": 45800 + }, + { + "epoch": 0.1869865127320909, + "grad_norm": 2.1695480346679688, + "learning_rate": 0.004996692272909525, + "loss": 8.8591, + "step": 45900 + }, + { + "epoch": 0.18739389075547236, + "grad_norm": 5.650905609130859, + "learning_rate": 0.004996666687005845, + "loss": 9.07, + "step": 46000 + }, + { + "epoch": 0.18739389075547236, + "eval_MaskedAccuracy": 0.4467761955244632, + "eval_loss": 1.9052798748016357, + "eval_runtime": 1276.7049, + "eval_samples_per_second": 49.719, + "eval_steps_per_second": 0.194, + "step": 46000 + }, + { + "epoch": 0.1878012687788538, + "grad_norm": 4.653480052947998, + "learning_rate": 0.004996641002593249, + "loss": 8.9139, + "step": 46100 + }, + { + "epoch": 0.18820864680223529, + "grad_norm": 3.2729389667510986, + "learning_rate": 0.004996615219672769, + "loss": 8.8715, + "step": 46200 + }, + { + "epoch": 0.18861602482561673, + "grad_norm": 1.0644035339355469, + "learning_rate": 0.004996589338245408, + "loss": 8.8974, + "step": 46300 + }, + { + "epoch": 0.1890234028489982, + "grad_norm": 4.8218770027160645, + "learning_rate": 0.004996563358312191, + "loss": 9.0405, + "step": 46400 + }, + { + "epoch": 0.18943078087237966, + "grad_norm": 3.1921133995056152, + "learning_rate": 0.0049965372798741504, + "loss": 8.8825, + "step": 46500 + }, + { + "epoch": 0.18983815889576114, + "grad_norm": 3.8239128589630127, + "learning_rate": 0.004996511102932316, + "loss": 8.8928, + "step": 46600 + }, + { + "epoch": 0.1902455369191426, + "grad_norm": 1.8124943971633911, + "learning_rate": 0.0049964848274877205, + "loss": 8.8667, + "step": 46700 + }, + { + "epoch": 0.19065291494252407, + "grad_norm": 6.511573314666748, + "learning_rate": 0.004996458453541405, + "loss": 8.9185, + "step": 46800 + }, + { + "epoch": 0.19106029296590551, + "grad_norm": 5.872045516967773, + "learning_rate": 0.004996431981094414, + "loss": 8.8477, + "step": 46900 + }, + { + "epoch": 0.19146767098928696, + "grad_norm": 4.800133228302002, + "learning_rate": 0.004996405410147791, + "loss": 8.849, + "step": 47000 + }, + { + "epoch": 0.19146767098928696, + "eval_MaskedAccuracy": 0.45543538868908684, + "eval_loss": 1.8698941469192505, + "eval_runtime": 1367.0173, + "eval_samples_per_second": 46.434, + "eval_steps_per_second": 0.181, + "step": 47000 + }, + { + "epoch": 0.19187504901266844, + "grad_norm": 3.088144302368164, + "learning_rate": 0.0049963787407025844, + "loss": 8.9498, + "step": 47100 + }, + { + "epoch": 0.1922824270360499, + "grad_norm": 3.6458115577697754, + "learning_rate": 0.004996351972759855, + "loss": 9.0039, + "step": 47200 + }, + { + "epoch": 0.19268980505943137, + "grad_norm": 6.0658369064331055, + "learning_rate": 0.004996325106320655, + "loss": 8.8997, + "step": 47300 + }, + { + "epoch": 0.19309718308281282, + "grad_norm": 5.804576873779297, + "learning_rate": 0.004996298141386048, + "loss": 9.0567, + "step": 47400 + }, + { + "epoch": 0.1935045611061943, + "grad_norm": 4.082156658172607, + "learning_rate": 0.004996271077957102, + "loss": 8.8726, + "step": 47500 + }, + { + "epoch": 0.19391193912957574, + "grad_norm": 3.362668752670288, + "learning_rate": 0.004996243916034889, + "loss": 8.8642, + "step": 47600 + }, + { + "epoch": 0.19431931715295722, + "grad_norm": 1.1706517934799194, + "learning_rate": 0.0049962166556204776, + "loss": 8.8542, + "step": 47700 + }, + { + "epoch": 0.19472669517633867, + "grad_norm": 6.859764099121094, + "learning_rate": 0.0049961892967149485, + "loss": 9.0527, + "step": 47800 + }, + { + "epoch": 0.19513407319972012, + "grad_norm": 4.326186656951904, + "learning_rate": 0.004996161839319382, + "loss": 8.9689, + "step": 47900 + }, + { + "epoch": 0.1955414512231016, + "grad_norm": 6.308228969573975, + "learning_rate": 0.004996134283434865, + "loss": 8.8275, + "step": 48000 + }, + { + "epoch": 0.1955414512231016, + "eval_MaskedAccuracy": 0.4556454833718163, + "eval_loss": 1.8734564781188965, + "eval_runtime": 1654.1594, + "eval_samples_per_second": 38.374, + "eval_steps_per_second": 0.15, + "step": 48000 + }, + { + "epoch": 0.19594882924648305, + "grad_norm": 4.332017421722412, + "learning_rate": 0.004996106629062483, + "loss": 8.8454, + "step": 48100 + }, + { + "epoch": 0.19635620726986452, + "grad_norm": 6.285112380981445, + "learning_rate": 0.004996078876203332, + "loss": 8.8178, + "step": 48200 + }, + { + "epoch": 0.19676358529324597, + "grad_norm": 3.0181238651275635, + "learning_rate": 0.00499605102485851, + "loss": 8.9695, + "step": 48300 + }, + { + "epoch": 0.19717096331662745, + "grad_norm": 5.3587446212768555, + "learning_rate": 0.004996023075029118, + "loss": 8.857, + "step": 48400 + }, + { + "epoch": 0.1975783413400089, + "grad_norm": 4.083210468292236, + "learning_rate": 0.004995995026716264, + "loss": 8.8019, + "step": 48500 + }, + { + "epoch": 0.19798571936339035, + "grad_norm": 1.806215763092041, + "learning_rate": 0.004995966879921056, + "loss": 8.9378, + "step": 48600 + }, + { + "epoch": 0.19839309738677183, + "grad_norm": 4.553736686706543, + "learning_rate": 0.004995938634644601, + "loss": 8.8551, + "step": 48700 + }, + { + "epoch": 0.19880047541015328, + "grad_norm": 6.222628593444824, + "learning_rate": 0.004995910290888019, + "loss": 8.828, + "step": 48800 + }, + { + "epoch": 0.19920785343353475, + "grad_norm": 3.1822943687438965, + "learning_rate": 0.004995881848652431, + "loss": 8.8323, + "step": 48900 + }, + { + "epoch": 0.1996152314569162, + "grad_norm": 3.611772298812866, + "learning_rate": 0.004995853307938964, + "loss": 8.8966, + "step": 49000 + }, + { + "epoch": 0.1996152314569162, + "eval_MaskedAccuracy": 0.45529773088340925, + "eval_loss": 1.8736827373504639, + "eval_runtime": 1125.2104, + "eval_samples_per_second": 56.413, + "eval_steps_per_second": 0.22, + "step": 49000 + }, + { + "epoch": 0.20002260948029768, + "grad_norm": 5.808901786804199, + "learning_rate": 0.004995824668748749, + "loss": 8.7982, + "step": 49100 + }, + { + "epoch": 0.20042998750367913, + "grad_norm": 4.918834209442139, + "learning_rate": 0.004995795931082909, + "loss": 8.806, + "step": 49200 + }, + { + "epoch": 0.2008373655270606, + "grad_norm": 23.904325485229492, + "learning_rate": 0.004995767094942582, + "loss": 8.9147, + "step": 49300 + }, + { + "epoch": 0.20124474355044206, + "grad_norm": 4.423624038696289, + "learning_rate": 0.004995738160328911, + "loss": 9.0289, + "step": 49400 + }, + { + "epoch": 0.2016521215738235, + "grad_norm": 3.2176241874694824, + "learning_rate": 0.004995709127243038, + "loss": 8.9621, + "step": 49500 + }, + { + "epoch": 0.20205949959720498, + "grad_norm": 4.553608417510986, + "learning_rate": 0.004995679995686104, + "loss": 8.8743, + "step": 49600 + }, + { + "epoch": 0.20246687762058643, + "grad_norm": 4.142465114593506, + "learning_rate": 0.004995650765659282, + "loss": 8.816, + "step": 49700 + }, + { + "epoch": 0.2028742556439679, + "grad_norm": 5.3594794273376465, + "learning_rate": 0.00499562143716371, + "loss": 8.7982, + "step": 49800 + }, + { + "epoch": 0.20328163366734936, + "grad_norm": 3.36631441116333, + "learning_rate": 0.004995592010200553, + "loss": 8.8225, + "step": 49900 + }, + { + "epoch": 0.20368901169073084, + "grad_norm": 1.200137972831726, + "learning_rate": 0.004995562484770975, + "loss": 8.833, + "step": 50000 + }, + { + "epoch": 0.20368901169073084, + "eval_MaskedAccuracy": 0.45342948136380307, + "eval_loss": 1.8760101795196533, + "eval_runtime": 1721.9536, + "eval_samples_per_second": 36.863, + "eval_steps_per_second": 0.144, + "step": 50000 + }, + { + "epoch": 0.2040963897141123, + "grad_norm": 2.7836992740631104, + "learning_rate": 0.0049955328608761364, + "loss": 8.9665, + "step": 50100 + }, + { + "epoch": 0.20450376773749374, + "grad_norm": 4.889101982116699, + "learning_rate": 0.004995503138517217, + "loss": 8.9262, + "step": 50200 + }, + { + "epoch": 0.2049111457608752, + "grad_norm": 5.656112194061279, + "learning_rate": 0.004995473317695388, + "loss": 8.826, + "step": 50300 + }, + { + "epoch": 0.20531852378425666, + "grad_norm": 9.625349044799805, + "learning_rate": 0.00499544339841183, + "loss": 8.7877, + "step": 50400 + }, + { + "epoch": 0.20572590180763814, + "grad_norm": 5.409666061401367, + "learning_rate": 0.004995413380667721, + "loss": 8.939, + "step": 50500 + }, + { + "epoch": 0.2061332798310196, + "grad_norm": 6.664834022521973, + "learning_rate": 0.00499538326446425, + "loss": 8.7978, + "step": 50600 + }, + { + "epoch": 0.20654065785440107, + "grad_norm": 6.404662609100342, + "learning_rate": 0.004995353049802609, + "loss": 8.8049, + "step": 50700 + }, + { + "epoch": 0.20694803587778252, + "grad_norm": 7.683322429656982, + "learning_rate": 0.00499532273668399, + "loss": 8.8133, + "step": 50800 + }, + { + "epoch": 0.207355413901164, + "grad_norm": 3.8872005939483643, + "learning_rate": 0.004995292325109595, + "loss": 8.8927, + "step": 50900 + }, + { + "epoch": 0.20776279192454544, + "grad_norm": 4.828040599822998, + "learning_rate": 0.004995261815080626, + "loss": 8.8109, + "step": 51000 + }, + { + "epoch": 0.20776279192454544, + "eval_MaskedAccuracy": 0.4566938346297085, + "eval_loss": 1.8743842840194702, + "eval_runtime": 1101.6221, + "eval_samples_per_second": 57.62, + "eval_steps_per_second": 0.225, + "step": 51000 + }, + { + "epoch": 0.2081701699479269, + "grad_norm": 5.939586162567139, + "learning_rate": 0.00499523120659829, + "loss": 8.8242, + "step": 51100 + }, + { + "epoch": 0.20857754797130837, + "grad_norm": 27.979389190673828, + "learning_rate": 0.004995200499663799, + "loss": 8.7949, + "step": 51200 + }, + { + "epoch": 0.20898492599468982, + "grad_norm": 8.024907112121582, + "learning_rate": 0.004995169694278358, + "loss": 9.0259, + "step": 51300 + }, + { + "epoch": 0.2093923040180713, + "grad_norm": 5.7522807121276855, + "learning_rate": 0.0049951387904431915, + "loss": 8.8851, + "step": 51400 + }, + { + "epoch": 0.20979968204145275, + "grad_norm": 4.299382209777832, + "learning_rate": 0.00499510778815952, + "loss": 8.8128, + "step": 51500 + }, + { + "epoch": 0.21020706006483422, + "grad_norm": 4.014888763427734, + "learning_rate": 0.004995076687428578, + "loss": 8.8239, + "step": 51600 + }, + { + "epoch": 0.21061443808821567, + "grad_norm": 1.072213053703308, + "learning_rate": 0.004995045488251584, + "loss": 8.8771, + "step": 51700 + }, + { + "epoch": 0.21102181611159715, + "grad_norm": 4.117224216461182, + "learning_rate": 0.0049950141906297775, + "loss": 8.9155, + "step": 51800 + }, + { + "epoch": 0.2114291941349786, + "grad_norm": 5.043557643890381, + "learning_rate": 0.004994982794564388, + "loss": 8.7899, + "step": 51900 + }, + { + "epoch": 0.21183657215836005, + "grad_norm": 5.4120683670043945, + "learning_rate": 0.004994951300056659, + "loss": 8.782, + "step": 52000 + }, + { + "epoch": 0.21183657215836005, + "eval_MaskedAccuracy": 0.45812245160372533, + "eval_loss": 1.8528733253479004, + "eval_runtime": 1573.3328, + "eval_samples_per_second": 40.345, + "eval_steps_per_second": 0.158, + "step": 52000 + }, + { + "epoch": 0.21224395018174153, + "grad_norm": 6.679693698883057, + "learning_rate": 0.004994919707107843, + "loss": 8.7559, + "step": 52100 + }, + { + "epoch": 0.21265132820512297, + "grad_norm": 6.865571022033691, + "learning_rate": 0.004994888015719179, + "loss": 8.883, + "step": 52200 + }, + { + "epoch": 0.21305870622850445, + "grad_norm": 6.2120561599731445, + "learning_rate": 0.004994856225891923, + "loss": 8.9445, + "step": 52300 + }, + { + "epoch": 0.2134660842518859, + "grad_norm": 6.802185535430908, + "learning_rate": 0.004994824337627339, + "loss": 8.8191, + "step": 52400 + }, + { + "epoch": 0.21387346227526738, + "grad_norm": 4.90602970123291, + "learning_rate": 0.004994792350926673, + "loss": 8.8454, + "step": 52500 + }, + { + "epoch": 0.21428084029864883, + "grad_norm": 5.3922953605651855, + "learning_rate": 0.004994760265791208, + "loss": 8.9565, + "step": 52600 + }, + { + "epoch": 0.21468821832203028, + "grad_norm": 3.9712696075439453, + "learning_rate": 0.0049947280822222025, + "loss": 8.8104, + "step": 52700 + }, + { + "epoch": 0.21509559634541175, + "grad_norm": 3.618863821029663, + "learning_rate": 0.004994695800220931, + "loss": 8.8186, + "step": 52800 + }, + { + "epoch": 0.2155029743687932, + "grad_norm": 3.201653003692627, + "learning_rate": 0.004994663419788666, + "loss": 8.7529, + "step": 52900 + }, + { + "epoch": 0.21591035239217468, + "grad_norm": 6.746595859527588, + "learning_rate": 0.004994630940926693, + "loss": 8.871, + "step": 53000 + }, + { + "epoch": 0.21591035239217468, + "eval_MaskedAccuracy": 0.45650221757496223, + "eval_loss": 1.8506487607955933, + "eval_runtime": 1421.674, + "eval_samples_per_second": 44.649, + "eval_steps_per_second": 0.174, + "step": 53000 + }, + { + "epoch": 0.21631773041555613, + "grad_norm": 5.726940631866455, + "learning_rate": 0.0049945983636362915, + "loss": 8.7594, + "step": 53100 + }, + { + "epoch": 0.2167251084389376, + "grad_norm": 9.217852592468262, + "learning_rate": 0.00499456568791875, + "loss": 8.8618, + "step": 53200 + }, + { + "epoch": 0.21713248646231906, + "grad_norm": 5.963167667388916, + "learning_rate": 0.004994532913775363, + "loss": 8.9854, + "step": 53300 + }, + { + "epoch": 0.21753986448570053, + "grad_norm": 3.00286865234375, + "learning_rate": 0.004994500041207422, + "loss": 8.8253, + "step": 53400 + }, + { + "epoch": 0.21794724250908198, + "grad_norm": 6.345184326171875, + "learning_rate": 0.004994467070216233, + "loss": 8.7499, + "step": 53500 + }, + { + "epoch": 0.21835462053246343, + "grad_norm": 8.022476196289062, + "learning_rate": 0.004994434000803093, + "loss": 8.762, + "step": 53600 + }, + { + "epoch": 0.2187619985558449, + "grad_norm": 6.483421802520752, + "learning_rate": 0.004994400832969317, + "loss": 8.7591, + "step": 53700 + }, + { + "epoch": 0.21916937657922636, + "grad_norm": 2.510129451751709, + "learning_rate": 0.004994367566716207, + "loss": 8.9242, + "step": 53800 + }, + { + "epoch": 0.21957675460260784, + "grad_norm": 1.064130425453186, + "learning_rate": 0.0049943342020450825, + "loss": 8.979, + "step": 53900 + }, + { + "epoch": 0.2199841326259893, + "grad_norm": 6.173582077026367, + "learning_rate": 0.004994300738957267, + "loss": 8.9965, + "step": 54000 + }, + { + "epoch": 0.2199841326259893, + "eval_MaskedAccuracy": 0.4541979935100313, + "eval_loss": 1.8765358924865723, + "eval_runtime": 1215.8881, + "eval_samples_per_second": 52.205, + "eval_steps_per_second": 0.204, + "step": 54000 + }, + { + "epoch": 0.22039151064937076, + "grad_norm": 5.219405651092529, + "learning_rate": 0.004994267177454082, + "loss": 8.8305, + "step": 54100 + }, + { + "epoch": 0.2207988886727522, + "grad_norm": 6.215470790863037, + "learning_rate": 0.00499423351753685, + "loss": 8.7512, + "step": 54200 + }, + { + "epoch": 0.22120626669613366, + "grad_norm": 11.539061546325684, + "learning_rate": 0.004994199759206904, + "loss": 8.7085, + "step": 54300 + }, + { + "epoch": 0.22161364471951514, + "grad_norm": 2.4697351455688477, + "learning_rate": 0.004994165902465575, + "loss": 8.8035, + "step": 54400 + }, + { + "epoch": 0.2220210227428966, + "grad_norm": 4.639867305755615, + "learning_rate": 0.004994131947314205, + "loss": 8.8734, + "step": 54500 + }, + { + "epoch": 0.22242840076627807, + "grad_norm": 7.649751663208008, + "learning_rate": 0.0049940978937541404, + "loss": 8.8174, + "step": 54600 + }, + { + "epoch": 0.22283577878965952, + "grad_norm": 6.019172191619873, + "learning_rate": 0.004994063741786718, + "loss": 8.7393, + "step": 54700 + }, + { + "epoch": 0.223243156813041, + "grad_norm": 4.9053425788879395, + "learning_rate": 0.0049940294914133, + "loss": 8.7257, + "step": 54800 + }, + { + "epoch": 0.22365053483642244, + "grad_norm": 6.808137893676758, + "learning_rate": 0.004993995142635232, + "loss": 8.7289, + "step": 54900 + }, + { + "epoch": 0.22405791285980392, + "grad_norm": 10.978095054626465, + "learning_rate": 0.00499396069545387, + "loss": 8.755, + "step": 55000 + }, + { + "epoch": 0.22405791285980392, + "eval_MaskedAccuracy": 0.4493424909929303, + "eval_loss": 1.8973885774612427, + "eval_runtime": 1872.0602, + "eval_samples_per_second": 33.907, + "eval_steps_per_second": 0.132, + "step": 55000 + }, + { + "epoch": 0.22446529088318537, + "grad_norm": 4.419609069824219, + "learning_rate": 0.004993926149870585, + "loss": 8.9114, + "step": 55100 + }, + { + "epoch": 0.22487266890656682, + "grad_norm": 5.7764458656311035, + "learning_rate": 0.004993891505886738, + "loss": 8.8835, + "step": 55200 + }, + { + "epoch": 0.2252800469299483, + "grad_norm": 7.038376331329346, + "learning_rate": 0.004993856763503692, + "loss": 8.7756, + "step": 55300 + }, + { + "epoch": 0.22568742495332975, + "grad_norm": 2.246647357940674, + "learning_rate": 0.004993821922722831, + "loss": 8.7382, + "step": 55400 + }, + { + "epoch": 0.22609480297671122, + "grad_norm": 2.824277400970459, + "learning_rate": 0.004993786983545529, + "loss": 8.94, + "step": 55500 + }, + { + "epoch": 0.22650218100009267, + "grad_norm": 2.477095603942871, + "learning_rate": 0.004993751945973163, + "loss": 8.9129, + "step": 55600 + }, + { + "epoch": 0.22690955902347415, + "grad_norm": 8.029402732849121, + "learning_rate": 0.004993716810007125, + "loss": 8.786, + "step": 55700 + }, + { + "epoch": 0.2273169370468556, + "grad_norm": 6.447448253631592, + "learning_rate": 0.004993681575648798, + "loss": 8.7651, + "step": 55800 + }, + { + "epoch": 0.22772431507023708, + "grad_norm": 4.86328125, + "learning_rate": 0.004993646242899573, + "loss": 8.8977, + "step": 55900 + }, + { + "epoch": 0.22813169309361853, + "grad_norm": 5.140337944030762, + "learning_rate": 0.004993610811760857, + "loss": 8.756, + "step": 56000 + }, + { + "epoch": 0.22813169309361853, + "eval_MaskedAccuracy": 0.45951580848885953, + "eval_loss": 1.8519104719161987, + "eval_runtime": 1576.2602, + "eval_samples_per_second": 40.27, + "eval_steps_per_second": 0.157, + "step": 56000 + }, + { + "epoch": 0.22853907111699998, + "grad_norm": 8.257657051086426, + "learning_rate": 0.004993575282234048, + "loss": 8.7562, + "step": 56100 + }, + { + "epoch": 0.22894644914038145, + "grad_norm": 3.961374521255493, + "learning_rate": 0.004993539654320549, + "loss": 8.9159, + "step": 56200 + }, + { + "epoch": 0.2293538271637629, + "grad_norm": 4.514026165008545, + "learning_rate": 0.004993503928021772, + "loss": 8.9226, + "step": 56300 + }, + { + "epoch": 0.22976120518714438, + "grad_norm": 4.246365547180176, + "learning_rate": 0.004993468103339117, + "loss": 8.9462, + "step": 56400 + }, + { + "epoch": 0.23016858321052583, + "grad_norm": 7.508754253387451, + "learning_rate": 0.004993432180274019, + "loss": 8.9351, + "step": 56500 + }, + { + "epoch": 0.2305759612339073, + "grad_norm": 2.7310702800750732, + "learning_rate": 0.004993396158827879, + "loss": 8.8479, + "step": 56600 + }, + { + "epoch": 0.23098333925728876, + "grad_norm": 6.079079627990723, + "learning_rate": 0.004993360039002134, + "loss": 8.8298, + "step": 56700 + }, + { + "epoch": 0.2313907172806702, + "grad_norm": 5.1133809089660645, + "learning_rate": 0.004993323820798205, + "loss": 8.81, + "step": 56800 + }, + { + "epoch": 0.23179809530405168, + "grad_norm": 3.1408135890960693, + "learning_rate": 0.004993287504217525, + "loss": 8.7568, + "step": 56900 + }, + { + "epoch": 0.23220547332743313, + "grad_norm": 5.984044551849365, + "learning_rate": 0.004993251089261535, + "loss": 8.7266, + "step": 57000 + }, + { + "epoch": 0.23220547332743313, + "eval_MaskedAccuracy": 0.4607020510876605, + "eval_loss": 1.8500642776489258, + "eval_runtime": 1521.2445, + "eval_samples_per_second": 41.726, + "eval_steps_per_second": 0.163, + "step": 57000 + }, + { + "epoch": 0.2326128513508146, + "grad_norm": 7.032481670379639, + "learning_rate": 0.004993214575931667, + "loss": 8.698, + "step": 57100 + }, + { + "epoch": 0.23302022937419606, + "grad_norm": 3.9696929454803467, + "learning_rate": 0.0049931779642293705, + "loss": 8.6799, + "step": 57200 + }, + { + "epoch": 0.23342760739757754, + "grad_norm": 5.725532531738281, + "learning_rate": 0.004993141254156091, + "loss": 8.7401, + "step": 57300 + }, + { + "epoch": 0.23383498542095899, + "grad_norm": 3.45983624458313, + "learning_rate": 0.004993104445713283, + "loss": 8.9454, + "step": 57400 + }, + { + "epoch": 0.23424236344434046, + "grad_norm": 3.7529096603393555, + "learning_rate": 0.0049930675389024, + "loss": 8.8428, + "step": 57500 + }, + { + "epoch": 0.2346497414677219, + "grad_norm": 4.558876991271973, + "learning_rate": 0.004993030533724895, + "loss": 8.7754, + "step": 57600 + }, + { + "epoch": 0.23505711949110336, + "grad_norm": 7.477844715118408, + "learning_rate": 0.00499299343018224, + "loss": 8.6963, + "step": 57700 + }, + { + "epoch": 0.23546449751448484, + "grad_norm": 7.134383678436279, + "learning_rate": 0.004992956228275898, + "loss": 8.7056, + "step": 57800 + }, + { + "epoch": 0.2358718755378663, + "grad_norm": 4.128894805908203, + "learning_rate": 0.004992918928007337, + "loss": 8.8289, + "step": 57900 + }, + { + "epoch": 0.23627925356124777, + "grad_norm": 5.839402675628662, + "learning_rate": 0.004992881529378037, + "loss": 8.8334, + "step": 58000 + }, + { + "epoch": 0.23627925356124777, + "eval_MaskedAccuracy": 0.4585491223611061, + "eval_loss": 1.8525534868240356, + "eval_runtime": 1401.9315, + "eval_samples_per_second": 45.278, + "eval_steps_per_second": 0.177, + "step": 58000 + }, + { + "epoch": 0.23668663158462921, + "grad_norm": 3.721687078475952, + "learning_rate": 0.00499284403238947, + "loss": 8.7689, + "step": 58100 + }, + { + "epoch": 0.2370940096080107, + "grad_norm": 4.26967716217041, + "learning_rate": 0.004992806437043121, + "loss": 8.8639, + "step": 58200 + }, + { + "epoch": 0.23750138763139214, + "grad_norm": 3.958195924758911, + "learning_rate": 0.004992768743340482, + "loss": 8.9223, + "step": 58300 + }, + { + "epoch": 0.23790876565477362, + "grad_norm": 2.781923532485962, + "learning_rate": 0.00499273095128304, + "loss": 8.7765, + "step": 58400 + }, + { + "epoch": 0.23831614367815507, + "grad_norm": 8.125414848327637, + "learning_rate": 0.004992693060872287, + "loss": 8.76, + "step": 58500 + }, + { + "epoch": 0.23872352170153652, + "grad_norm": 8.391327857971191, + "learning_rate": 0.00499265507210972, + "loss": 8.7101, + "step": 58600 + }, + { + "epoch": 0.239130899724918, + "grad_norm": 6.260959148406982, + "learning_rate": 0.0049926169849968515, + "loss": 8.7667, + "step": 58700 + }, + { + "epoch": 0.23953827774829944, + "grad_norm": 5.657060146331787, + "learning_rate": 0.004992578799535172, + "loss": 8.8167, + "step": 58800 + }, + { + "epoch": 0.23994565577168092, + "grad_norm": 5.699919700622559, + "learning_rate": 0.004992540515726205, + "loss": 8.6987, + "step": 58900 + }, + { + "epoch": 0.24035303379506237, + "grad_norm": 5.916398525238037, + "learning_rate": 0.004992502133571444, + "loss": 8.6812, + "step": 59000 + }, + { + "epoch": 0.24035303379506237, + "eval_MaskedAccuracy": 0.46217061996832853, + "eval_loss": 1.8404490947723389, + "eval_runtime": 581.3056, + "eval_samples_per_second": 109.196, + "eval_steps_per_second": 0.427, + "step": 59000 + }, + { + "epoch": 0.24076041181844385, + "grad_norm": 8.000349998474121, + "learning_rate": 0.004992463653072431, + "loss": 8.7485, + "step": 59100 + }, + { + "epoch": 0.2411677898418253, + "grad_norm": 6.2110490798950195, + "learning_rate": 0.004992425074230673, + "loss": 8.6799, + "step": 59200 + }, + { + "epoch": 0.24157516786520675, + "grad_norm": 1.3083547353744507, + "learning_rate": 0.0049923863970476965, + "loss": 8.792, + "step": 59300 + }, + { + "epoch": 0.24198254588858822, + "grad_norm": 3.161991834640503, + "learning_rate": 0.0049923476215250305, + "loss": 8.9828, + "step": 59400 + }, + { + "epoch": 0.24238992391196967, + "grad_norm": 4.440457820892334, + "learning_rate": 0.004992308747664208, + "loss": 8.983, + "step": 59500 + }, + { + "epoch": 0.24279730193535115, + "grad_norm": 4.067751407623291, + "learning_rate": 0.004992269775466766, + "loss": 8.8012, + "step": 59600 + }, + { + "epoch": 0.2432046799587326, + "grad_norm": 6.714059352874756, + "learning_rate": 0.004992230704934249, + "loss": 8.7334, + "step": 59700 + }, + { + "epoch": 0.24361205798211408, + "grad_norm": 6.335543155670166, + "learning_rate": 0.004992191536068201, + "loss": 8.7069, + "step": 59800 + }, + { + "epoch": 0.24401943600549553, + "grad_norm": 5.72700309753418, + "learning_rate": 0.004992152268870167, + "loss": 8.6779, + "step": 59900 + }, + { + "epoch": 0.244426814028877, + "grad_norm": 18.419189453125, + "learning_rate": 0.004992112903341708, + "loss": 8.7899, + "step": 60000 + }, + { + "epoch": 0.244426814028877, + "eval_MaskedAccuracy": 0.4408497008426111, + "eval_loss": 1.9382939338684082, + "eval_runtime": 373.1181, + "eval_samples_per_second": 170.123, + "eval_steps_per_second": 0.665, + "step": 60000 + }, + { + "epoch": 0.24483419205225845, + "grad_norm": 5.117299556732178, + "learning_rate": 0.004992073439484375, + "loss": 8.8255, + "step": 60100 + }, + { + "epoch": 0.2452415700756399, + "grad_norm": 3.9339141845703125, + "learning_rate": 0.004992033877299723, + "loss": 8.7172, + "step": 60200 + }, + { + "epoch": 0.24564894809902138, + "grad_norm": 6.391584873199463, + "learning_rate": 0.004991994216789326, + "loss": 8.6556, + "step": 60300 + }, + { + "epoch": 0.24605632612240283, + "grad_norm": 7.434480667114258, + "learning_rate": 0.004991954457954737, + "loss": 8.7071, + "step": 60400 + }, + { + "epoch": 0.2464637041457843, + "grad_norm": 2.5101046562194824, + "learning_rate": 0.004991914600797545, + "loss": 8.6685, + "step": 60500 + }, + { + "epoch": 0.24687108216916576, + "grad_norm": 6.6819939613342285, + "learning_rate": 0.004991874645319314, + "loss": 8.7207, + "step": 60600 + }, + { + "epoch": 0.24727846019254723, + "grad_norm": 6.217319965362549, + "learning_rate": 0.004991834591521631, + "loss": 8.7524, + "step": 60700 + }, + { + "epoch": 0.24768583821592868, + "grad_norm": 3.4094226360321045, + "learning_rate": 0.004991794439406076, + "loss": 8.9701, + "step": 60800 + }, + { + "epoch": 0.24809321623931013, + "grad_norm": 6.1427903175354, + "learning_rate": 0.004991754188974236, + "loss": 8.8112, + "step": 60900 + }, + { + "epoch": 0.2485005942626916, + "grad_norm": 6.913832187652588, + "learning_rate": 0.004991713840227701, + "loss": 8.7124, + "step": 61000 + }, + { + "epoch": 0.2485005942626916, + "eval_MaskedAccuracy": 0.4621929598472666, + "eval_loss": 1.8318604230880737, + "eval_runtime": 527.4568, + "eval_samples_per_second": 120.343, + "eval_steps_per_second": 0.47, + "step": 61000 + }, + { + "epoch": 0.24890797228607306, + "grad_norm": 6.938937187194824, + "learning_rate": 0.004991673393168064, + "loss": 8.7164, + "step": 61100 + }, + { + "epoch": 0.24931535030945454, + "grad_norm": 4.936263084411621, + "learning_rate": 0.004991632847796928, + "loss": 8.6831, + "step": 61200 + }, + { + "epoch": 0.24972272833283599, + "grad_norm": 3.5639712810516357, + "learning_rate": 0.004991592204115897, + "loss": 8.6581, + "step": 61300 + }, + { + "epoch": 0.25013010635621746, + "grad_norm": 4.962512493133545, + "learning_rate": 0.004991551462126576, + "loss": 8.8954, + "step": 61400 + }, + { + "epoch": 0.25053748437959894, + "grad_norm": 5.8719987869262695, + "learning_rate": 0.004991510621830578, + "loss": 8.7383, + "step": 61500 + }, + { + "epoch": 0.25094486240298036, + "grad_norm": 5.943603515625, + "learning_rate": 0.004991469683229517, + "loss": 8.7059, + "step": 61600 + }, + { + "epoch": 0.25135224042636184, + "grad_norm": 7.711507320404053, + "learning_rate": 0.00499142864632501, + "loss": 8.6766, + "step": 61700 + }, + { + "epoch": 0.2517596184497433, + "grad_norm": 6.331603527069092, + "learning_rate": 0.00499138751111868, + "loss": 8.6879, + "step": 61800 + }, + { + "epoch": 0.25216699647312474, + "grad_norm": 2.5456717014312744, + "learning_rate": 0.004991346277612157, + "loss": 8.6847, + "step": 61900 + }, + { + "epoch": 0.2525743744965062, + "grad_norm": 1.222424030303955, + "learning_rate": 0.004991304945807071, + "loss": 8.6913, + "step": 62000 + }, + { + "epoch": 0.2525743744965062, + "eval_MaskedAccuracy": 0.45581933930365287, + "eval_loss": 1.8417388200759888, + "eval_runtime": 538.808, + "eval_samples_per_second": 117.808, + "eval_steps_per_second": 0.46, + "step": 62000 + }, + { + "epoch": 0.2529817525198877, + "grad_norm": 7.086363315582275, + "learning_rate": 0.004991263515705049, + "loss": 8.832, + "step": 62100 + }, + { + "epoch": 0.25338913054326917, + "grad_norm": 6.7353949546813965, + "learning_rate": 0.004991221987307737, + "loss": 8.7419, + "step": 62200 + }, + { + "epoch": 0.2537965085666506, + "grad_norm": 1.0387322902679443, + "learning_rate": 0.004991180360616769, + "loss": 8.7183, + "step": 62300 + }, + { + "epoch": 0.25420388659003207, + "grad_norm": 6.057994842529297, + "learning_rate": 0.00499113863563379, + "loss": 8.8768, + "step": 62400 + }, + { + "epoch": 0.25461126461341355, + "grad_norm": 6.997557163238525, + "learning_rate": 0.00499109681236046, + "loss": 8.737, + "step": 62500 + }, + { + "epoch": 0.25501864263679497, + "grad_norm": 4.8636555671691895, + "learning_rate": 0.004991054890798418, + "loss": 8.7255, + "step": 62600 + }, + { + "epoch": 0.25542602066017644, + "grad_norm": 6.203702926635742, + "learning_rate": 0.004991012870949338, + "loss": 8.6889, + "step": 62700 + }, + { + "epoch": 0.2558333986835579, + "grad_norm": 4.030526638031006, + "learning_rate": 0.004990970752814874, + "loss": 8.6639, + "step": 62800 + }, + { + "epoch": 0.2562407767069394, + "grad_norm": 11.469111442565918, + "learning_rate": 0.004990928536396685, + "loss": 8.91, + "step": 62900 + }, + { + "epoch": 0.2566481547303208, + "grad_norm": 6.844756126403809, + "learning_rate": 0.004990886221696451, + "loss": 8.9484, + "step": 63000 + }, + { + "epoch": 0.2566481547303208, + "eval_MaskedAccuracy": 0.45651751724645373, + "eval_loss": 1.866458773612976, + "eval_runtime": 543.3437, + "eval_samples_per_second": 116.825, + "eval_steps_per_second": 0.456, + "step": 63000 + }, + { + "epoch": 0.2570555327537023, + "grad_norm": 8.265596389770508, + "learning_rate": 0.004990843808715836, + "loss": 8.7548, + "step": 63100 + }, + { + "epoch": 0.2574629107770838, + "grad_norm": 8.50532054901123, + "learning_rate": 0.0049908012974565225, + "loss": 8.6947, + "step": 63200 + }, + { + "epoch": 0.25787028880046525, + "grad_norm": 11.911187171936035, + "learning_rate": 0.004990758687920184, + "loss": 8.7065, + "step": 63300 + }, + { + "epoch": 0.2582776668238467, + "grad_norm": 7.431310653686523, + "learning_rate": 0.004990715980108504, + "loss": 8.6872, + "step": 63400 + }, + { + "epoch": 0.25868504484722815, + "grad_norm": 6.245077610015869, + "learning_rate": 0.004990673174023182, + "loss": 8.6888, + "step": 63500 + }, + { + "epoch": 0.25909242287060963, + "grad_norm": 4.772067070007324, + "learning_rate": 0.004990630269665909, + "loss": 8.6529, + "step": 63600 + }, + { + "epoch": 0.25949980089399105, + "grad_norm": 8.589908599853516, + "learning_rate": 0.004990587267038376, + "loss": 8.7132, + "step": 63700 + }, + { + "epoch": 0.25990717891737253, + "grad_norm": 4.484492301940918, + "learning_rate": 0.004990544166142284, + "loss": 8.8358, + "step": 63800 + }, + { + "epoch": 0.260314556940754, + "grad_norm": 4.792771339416504, + "learning_rate": 0.004990500966979338, + "loss": 8.8237, + "step": 63900 + }, + { + "epoch": 0.2607219349641355, + "grad_norm": 9.162529945373535, + "learning_rate": 0.004990457669551245, + "loss": 8.7504, + "step": 64000 + }, + { + "epoch": 0.2607219349641355, + "eval_MaskedAccuracy": 0.46101908946857945, + "eval_loss": 1.8516390323638916, + "eval_runtime": 524.9316, + "eval_samples_per_second": 120.922, + "eval_steps_per_second": 0.472, + "step": 64000 + }, + { + "epoch": 0.2611293129875169, + "grad_norm": 7.857509613037109, + "learning_rate": 0.004990414273859712, + "loss": 8.687, + "step": 64100 + }, + { + "epoch": 0.2615366910108984, + "grad_norm": 8.928559303283691, + "learning_rate": 0.004990370779906467, + "loss": 8.8045, + "step": 64200 + }, + { + "epoch": 0.26194406903427986, + "grad_norm": 16.3757266998291, + "learning_rate": 0.00499032718769322, + "loss": 8.9208, + "step": 64300 + }, + { + "epoch": 0.2623514470576613, + "grad_norm": 6.15322732925415, + "learning_rate": 0.0049902834972216925, + "loss": 8.908, + "step": 64400 + }, + { + "epoch": 0.26275882508104276, + "grad_norm": 5.626424312591553, + "learning_rate": 0.004990239708493618, + "loss": 8.7201, + "step": 64500 + }, + { + "epoch": 0.26316620310442423, + "grad_norm": 6.347898960113525, + "learning_rate": 0.004990195821510728, + "loss": 8.6724, + "step": 64600 + }, + { + "epoch": 0.2635735811278057, + "grad_norm": 5.051441669464111, + "learning_rate": 0.004990151836274756, + "loss": 8.6605, + "step": 64700 + }, + { + "epoch": 0.26398095915118713, + "grad_norm": 2.98770809173584, + "learning_rate": 0.004990107752787442, + "loss": 8.6744, + "step": 64800 + }, + { + "epoch": 0.2643883371745686, + "grad_norm": 3.0073490142822266, + "learning_rate": 0.004990063571050524, + "loss": 8.8322, + "step": 64900 + }, + { + "epoch": 0.2647957151979501, + "grad_norm": 6.210570335388184, + "learning_rate": 0.004990019291065754, + "loss": 8.7453, + "step": 65000 + }, + { + "epoch": 0.2647957151979501, + "eval_MaskedAccuracy": 0.46210706583991695, + "eval_loss": 1.822614073753357, + "eval_runtime": 547.6703, + "eval_samples_per_second": 115.902, + "eval_steps_per_second": 0.453, + "step": 65000 + }, + { + "epoch": 0.2652030932213315, + "grad_norm": 8.379491806030273, + "learning_rate": 0.00498997491283488, + "loss": 8.6684, + "step": 65100 + }, + { + "epoch": 0.265610471244713, + "grad_norm": 9.686884880065918, + "learning_rate": 0.00498993043635965, + "loss": 8.681, + "step": 65200 + }, + { + "epoch": 0.26601784926809446, + "grad_norm": 7.550168991088867, + "learning_rate": 0.004989885861641834, + "loss": 8.6959, + "step": 65300 + }, + { + "epoch": 0.26642522729147594, + "grad_norm": 3.529690742492676, + "learning_rate": 0.00498984118868319, + "loss": 8.6661, + "step": 65400 + }, + { + "epoch": 0.26683260531485736, + "grad_norm": 3.6193811893463135, + "learning_rate": 0.004989796417485486, + "loss": 8.8082, + "step": 65500 + }, + { + "epoch": 0.26723998333823884, + "grad_norm": 7.745871543884277, + "learning_rate": 0.004989751548050491, + "loss": 8.8334, + "step": 65600 + }, + { + "epoch": 0.2676473613616203, + "grad_norm": 10.003741264343262, + "learning_rate": 0.004989706580379977, + "loss": 8.7687, + "step": 65700 + }, + { + "epoch": 0.2680547393850018, + "grad_norm": 9.515006065368652, + "learning_rate": 0.0049896615144757244, + "loss": 8.7927, + "step": 65800 + }, + { + "epoch": 0.2684621174083832, + "grad_norm": 6.813485622406006, + "learning_rate": 0.004989616350339516, + "loss": 8.7092, + "step": 65900 + }, + { + "epoch": 0.2688694954317647, + "grad_norm": 7.490106582641602, + "learning_rate": 0.004989571087973132, + "loss": 8.6178, + "step": 66000 + }, + { + "epoch": 0.2688694954317647, + "eval_MaskedAccuracy": 0.46375460599596263, + "eval_loss": 1.8165909051895142, + "eval_runtime": 566.3043, + "eval_samples_per_second": 112.088, + "eval_steps_per_second": 0.438, + "step": 66000 + }, + { + "epoch": 0.26927687345514617, + "grad_norm": 9.557563781738281, + "learning_rate": 0.004989525727378363, + "loss": 8.647, + "step": 66100 + }, + { + "epoch": 0.2696842514785276, + "grad_norm": 4.538211345672607, + "learning_rate": 0.004989480268557006, + "loss": 8.663, + "step": 66200 + }, + { + "epoch": 0.27009162950190907, + "grad_norm": 8.971290588378906, + "learning_rate": 0.004989434711510853, + "loss": 8.6516, + "step": 66300 + }, + { + "epoch": 0.27049900752529055, + "grad_norm": 5.639621257781982, + "learning_rate": 0.004989389056241712, + "loss": 8.5922, + "step": 66400 + }, + { + "epoch": 0.270906385548672, + "grad_norm": 7.516700267791748, + "learning_rate": 0.004989343302751382, + "loss": 8.6842, + "step": 66500 + }, + { + "epoch": 0.27131376357205345, + "grad_norm": 18.569461822509766, + "learning_rate": 0.00498929745104168, + "loss": 8.8898, + "step": 66600 + }, + { + "epoch": 0.2717211415954349, + "grad_norm": 7.216564655303955, + "learning_rate": 0.004989251501114402, + "loss": 8.852, + "step": 66700 + }, + { + "epoch": 0.2721285196188164, + "grad_norm": 4.8928632736206055, + "learning_rate": 0.004989205452971374, + "loss": 8.6904, + "step": 66800 + }, + { + "epoch": 0.2725358976421978, + "grad_norm": 9.051252365112305, + "learning_rate": 0.004989159306614425, + "loss": 8.6597, + "step": 66900 + }, + { + "epoch": 0.2729432756655793, + "grad_norm": 8.281542778015137, + "learning_rate": 0.004989113062045374, + "loss": 8.7904, + "step": 67000 + }, + { + "epoch": 0.2729432756655793, + "eval_MaskedAccuracy": 0.45793338315066245, + "eval_loss": 1.8609939813613892, + "eval_runtime": 1394.6305, + "eval_samples_per_second": 45.515, + "eval_steps_per_second": 0.178, + "step": 67000 + }, + { + "epoch": 0.2733506536889608, + "grad_norm": 4.64543342590332, + "learning_rate": 0.00498906671926605, + "loss": 8.7046, + "step": 67100 + }, + { + "epoch": 0.27375803171234225, + "grad_norm": 5.933874607086182, + "learning_rate": 0.004989020278278275, + "loss": 8.6247, + "step": 67200 + }, + { + "epoch": 0.2741654097357237, + "grad_norm": 8.769102096557617, + "learning_rate": 0.004988973739083899, + "loss": 8.6526, + "step": 67300 + }, + { + "epoch": 0.27457278775910515, + "grad_norm": 6.038580894470215, + "learning_rate": 0.004988927101684754, + "loss": 8.6191, + "step": 67400 + }, + { + "epoch": 0.27498016578248663, + "grad_norm": 4.009898662567139, + "learning_rate": 0.004988880366082679, + "loss": 8.6574, + "step": 67500 + }, + { + "epoch": 0.27538754380586805, + "grad_norm": 14.188034057617188, + "learning_rate": 0.004988833532279532, + "loss": 8.8855, + "step": 67600 + }, + { + "epoch": 0.27579492182924953, + "grad_norm": 8.408965110778809, + "learning_rate": 0.004988786600277165, + "loss": 8.8519, + "step": 67700 + }, + { + "epoch": 0.276202299852631, + "grad_norm": 8.768186569213867, + "learning_rate": 0.004988739570077424, + "loss": 8.6812, + "step": 67800 + }, + { + "epoch": 0.2766096778760125, + "grad_norm": 11.160076141357422, + "learning_rate": 0.004988692441682174, + "loss": 8.6409, + "step": 67900 + }, + { + "epoch": 0.2770170558993939, + "grad_norm": 5.295092582702637, + "learning_rate": 0.004988645215093282, + "loss": 8.6146, + "step": 68000 + }, + { + "epoch": 0.2770170558993939, + "eval_MaskedAccuracy": 0.4649399672145674, + "eval_loss": 1.8226548433303833, + "eval_runtime": 1226.5993, + "eval_samples_per_second": 51.75, + "eval_steps_per_second": 0.202, + "step": 68000 + }, + { + "epoch": 0.2774244339227754, + "grad_norm": 6.942083835601807, + "learning_rate": 0.004988597890312612, + "loss": 8.6312, + "step": 68100 + }, + { + "epoch": 0.27783181194615686, + "grad_norm": 6.901147842407227, + "learning_rate": 0.004988550467342032, + "loss": 8.6041, + "step": 68200 + }, + { + "epoch": 0.2782391899695383, + "grad_norm": 2.903517961502075, + "learning_rate": 0.0049885029461834195, + "loss": 8.5955, + "step": 68300 + }, + { + "epoch": 0.27864656799291976, + "grad_norm": 5.638279438018799, + "learning_rate": 0.004988455326838654, + "loss": 8.7353, + "step": 68400 + }, + { + "epoch": 0.27905394601630124, + "grad_norm": 6.898649215698242, + "learning_rate": 0.004988407609309608, + "loss": 8.7646, + "step": 68500 + }, + { + "epoch": 0.2794613240396827, + "grad_norm": 4.677414894104004, + "learning_rate": 0.004988359793598184, + "loss": 8.6375, + "step": 68600 + }, + { + "epoch": 0.27986870206306413, + "grad_norm": 4.530789852142334, + "learning_rate": 0.004988311879706264, + "loss": 8.5985, + "step": 68700 + }, + { + "epoch": 0.2802760800864456, + "grad_norm": 7.5350661277771, + "learning_rate": 0.004988263867635745, + "loss": 8.6005, + "step": 68800 + }, + { + "epoch": 0.2806834581098271, + "grad_norm": 8.78704833984375, + "learning_rate": 0.004988215757388528, + "loss": 8.6037, + "step": 68900 + }, + { + "epoch": 0.28109083613320857, + "grad_norm": 4.651316165924072, + "learning_rate": 0.004988167548966506, + "loss": 8.683, + "step": 69000 + }, + { + "epoch": 0.28109083613320857, + "eval_MaskedAccuracy": 0.4591831419850631, + "eval_loss": 1.8432483673095703, + "eval_runtime": 945.5524, + "eval_samples_per_second": 67.131, + "eval_steps_per_second": 0.262, + "step": 69000 + }, + { + "epoch": 0.28149821415659, + "grad_norm": 4.739138603210449, + "learning_rate": 0.004988119242371587, + "loss": 8.6377, + "step": 69100 + }, + { + "epoch": 0.28190559217997146, + "grad_norm": 5.9082818031311035, + "learning_rate": 0.004988070837605688, + "loss": 8.6987, + "step": 69200 + }, + { + "epoch": 0.28231297020335294, + "grad_norm": 4.415525436401367, + "learning_rate": 0.0049880223346707225, + "loss": 8.8657, + "step": 69300 + }, + { + "epoch": 0.28272034822673436, + "grad_norm": 3.6525769233703613, + "learning_rate": 0.0049879737335686005, + "loss": 8.7889, + "step": 69400 + }, + { + "epoch": 0.28312772625011584, + "grad_norm": 1.016323447227478, + "learning_rate": 0.004987925034301244, + "loss": 8.7423, + "step": 69500 + }, + { + "epoch": 0.2835351042734973, + "grad_norm": 4.312750339508057, + "learning_rate": 0.004987876236870586, + "loss": 8.8335, + "step": 69600 + }, + { + "epoch": 0.2839424822968788, + "grad_norm": 8.026240348815918, + "learning_rate": 0.00498782734127855, + "loss": 8.6856, + "step": 69700 + }, + { + "epoch": 0.2843498603202602, + "grad_norm": 1.7114349603652954, + "learning_rate": 0.004987778347527068, + "loss": 8.7786, + "step": 69800 + }, + { + "epoch": 0.2847572383436417, + "grad_norm": 4.468791961669922, + "learning_rate": 0.004987729255618083, + "loss": 8.7087, + "step": 69900 + }, + { + "epoch": 0.28516461636702317, + "grad_norm": 5.468358516693115, + "learning_rate": 0.004987680065553528, + "loss": 8.7154, + "step": 70000 + }, + { + "epoch": 0.28516461636702317, + "eval_MaskedAccuracy": 0.46311365588665654, + "eval_loss": 1.8235976696014404, + "eval_runtime": 975.8884, + "eval_samples_per_second": 65.044, + "eval_steps_per_second": 0.254, + "step": 70000 + }, + { + "epoch": 0.2855719943904046, + "grad_norm": 4.024952411651611, + "learning_rate": 0.004987630777335353, + "loss": 8.6081, + "step": 70100 + }, + { + "epoch": 0.28597937241378607, + "grad_norm": 10.850265502929688, + "learning_rate": 0.0049875813909655, + "loss": 8.6298, + "step": 70200 + }, + { + "epoch": 0.28638675043716755, + "grad_norm": 6.1410441398620605, + "learning_rate": 0.004987531906445936, + "loss": 8.7084, + "step": 70300 + }, + { + "epoch": 0.286794128460549, + "grad_norm": 4.831986427307129, + "learning_rate": 0.004987482323778607, + "loss": 8.8194, + "step": 70400 + }, + { + "epoch": 0.28720150648393045, + "grad_norm": 3.9731411933898926, + "learning_rate": 0.004987432642965471, + "loss": 8.7009, + "step": 70500 + }, + { + "epoch": 0.2876088845073119, + "grad_norm": 5.253785610198975, + "learning_rate": 0.00498738286400849, + "loss": 8.7323, + "step": 70600 + }, + { + "epoch": 0.2880162625306934, + "grad_norm": 5.914802551269531, + "learning_rate": 0.004987332986909641, + "loss": 8.682, + "step": 70700 + }, + { + "epoch": 0.2884236405540748, + "grad_norm": 9.221420288085938, + "learning_rate": 0.004987283011670893, + "loss": 8.6853, + "step": 70800 + }, + { + "epoch": 0.2888310185774563, + "grad_norm": 6.638362884521484, + "learning_rate": 0.004987232938294225, + "loss": 8.7717, + "step": 70900 + }, + { + "epoch": 0.2892383966008378, + "grad_norm": 8.450394630432129, + "learning_rate": 0.004987182766781605, + "loss": 8.6537, + "step": 71000 + }, + { + "epoch": 0.2892383966008378, + "eval_MaskedAccuracy": 0.46501880714079386, + "eval_loss": 1.813057541847229, + "eval_runtime": 1393.548, + "eval_samples_per_second": 45.55, + "eval_steps_per_second": 0.178, + "step": 71000 + }, + { + "epoch": 0.28964577462421925, + "grad_norm": 6.9521942138671875, + "learning_rate": 0.004987132497135028, + "loss": 8.6107, + "step": 71100 + }, + { + "epoch": 0.2900531526476007, + "grad_norm": 1.0710653066635132, + "learning_rate": 0.004987082129356481, + "loss": 8.6578, + "step": 71200 + }, + { + "epoch": 0.29046053067098215, + "grad_norm": 8.927386283874512, + "learning_rate": 0.004987031663447951, + "loss": 8.7727, + "step": 71300 + }, + { + "epoch": 0.29086790869436363, + "grad_norm": 35.31528854370117, + "learning_rate": 0.00498698109941143, + "loss": 8.6934, + "step": 71400 + }, + { + "epoch": 0.2912752867177451, + "grad_norm": 1.607964038848877, + "learning_rate": 0.004986930437248926, + "loss": 8.7996, + "step": 71500 + }, + { + "epoch": 0.29168266474112653, + "grad_norm": 6.970991611480713, + "learning_rate": 0.0049868796769624415, + "loss": 8.8118, + "step": 71600 + }, + { + "epoch": 0.292090042764508, + "grad_norm": 9.207018852233887, + "learning_rate": 0.004986828818553978, + "loss": 8.8205, + "step": 71700 + }, + { + "epoch": 0.2924974207878895, + "grad_norm": 4.5547332763671875, + "learning_rate": 0.004986777862025549, + "loss": 8.7335, + "step": 71800 + }, + { + "epoch": 0.2929047988112709, + "grad_norm": 3.7256174087524414, + "learning_rate": 0.004986726807379168, + "loss": 8.6877, + "step": 71900 + }, + { + "epoch": 0.2933121768346524, + "grad_norm": 5.260006904602051, + "learning_rate": 0.0049866756546168515, + "loss": 8.7415, + "step": 72000 + }, + { + "epoch": 0.2933121768346524, + "eval_MaskedAccuracy": 0.46264094166115644, + "eval_loss": 1.828948736190796, + "eval_runtime": 598.5044, + "eval_samples_per_second": 106.058, + "eval_steps_per_second": 0.414, + "step": 72000 + }, + { + "epoch": 0.29371955485803386, + "grad_norm": 7.642179012298584, + "learning_rate": 0.004986624403740618, + "loss": 8.6125, + "step": 72100 + }, + { + "epoch": 0.29412693288141534, + "grad_norm": 8.284234046936035, + "learning_rate": 0.004986573054752507, + "loss": 8.6121, + "step": 72200 + }, + { + "epoch": 0.29453431090479676, + "grad_norm": 7.844073295593262, + "learning_rate": 0.0049865216076545385, + "loss": 8.5973, + "step": 72300 + }, + { + "epoch": 0.29494168892817824, + "grad_norm": 8.898551940917969, + "learning_rate": 0.004986470062448749, + "loss": 8.5608, + "step": 72400 + }, + { + "epoch": 0.2953490669515597, + "grad_norm": 11.39210319519043, + "learning_rate": 0.004986418419137175, + "loss": 8.5659, + "step": 72500 + }, + { + "epoch": 0.29575644497494114, + "grad_norm": 8.301942825317383, + "learning_rate": 0.004986366677721854, + "loss": 8.5845, + "step": 72600 + }, + { + "epoch": 0.2961638229983226, + "grad_norm": 6.694902420043945, + "learning_rate": 0.004986314838204841, + "loss": 8.5574, + "step": 72700 + }, + { + "epoch": 0.2965712010217041, + "grad_norm": 9.159902572631836, + "learning_rate": 0.00498626290058818, + "loss": 8.5525, + "step": 72800 + }, + { + "epoch": 0.29697857904508557, + "grad_norm": 6.356729507446289, + "learning_rate": 0.004986210864873929, + "loss": 8.5978, + "step": 72900 + }, + { + "epoch": 0.297385957068467, + "grad_norm": 8.628966331481934, + "learning_rate": 0.004986158731064144, + "loss": 8.559, + "step": 73000 + }, + { + "epoch": 0.297385957068467, + "eval_MaskedAccuracy": 0.46742987719858947, + "eval_loss": 1.810995101928711, + "eval_runtime": 515.208, + "eval_samples_per_second": 123.205, + "eval_steps_per_second": 0.481, + "step": 73000 + }, + { + "epoch": 0.29779333509184847, + "grad_norm": 9.128714561462402, + "learning_rate": 0.004986106499160881, + "loss": 8.62, + "step": 73100 + }, + { + "epoch": 0.29820071311522994, + "grad_norm": 5.086336612701416, + "learning_rate": 0.004986054169166205, + "loss": 8.8034, + "step": 73200 + }, + { + "epoch": 0.29860809113861136, + "grad_norm": 8.508166313171387, + "learning_rate": 0.004986001741082194, + "loss": 8.6297, + "step": 73300 + }, + { + "epoch": 0.29901546916199284, + "grad_norm": 4.634451866149902, + "learning_rate": 0.004985949214910917, + "loss": 8.6497, + "step": 73400 + }, + { + "epoch": 0.2994228471853743, + "grad_norm": 6.528726100921631, + "learning_rate": 0.0049858965906544464, + "loss": 8.5753, + "step": 73500 + }, + { + "epoch": 0.2998302252087558, + "grad_norm": 5.7112555503845215, + "learning_rate": 0.004985843868314864, + "loss": 8.5338, + "step": 73600 + }, + { + "epoch": 0.3002376032321372, + "grad_norm": 2.9792256355285645, + "learning_rate": 0.00498579104789425, + "loss": 8.5698, + "step": 73700 + }, + { + "epoch": 0.3006449812555187, + "grad_norm": 3.0966081619262695, + "learning_rate": 0.0049857381293947015, + "loss": 8.87, + "step": 73800 + }, + { + "epoch": 0.3010523592789002, + "grad_norm": 4.575057029724121, + "learning_rate": 0.004985685112818309, + "loss": 8.6763, + "step": 73900 + }, + { + "epoch": 0.30145973730228165, + "grad_norm": 7.030023097991943, + "learning_rate": 0.004985631998167159, + "loss": 8.601, + "step": 74000 + }, + { + "epoch": 0.30145973730228165, + "eval_MaskedAccuracy": 0.46606080791487003, + "eval_loss": 1.8151373863220215, + "eval_runtime": 682.5644, + "eval_samples_per_second": 92.996, + "eval_steps_per_second": 0.363, + "step": 74000 + }, + { + "epoch": 0.30186711532566307, + "grad_norm": 9.47155475616455, + "learning_rate": 0.004985578785443366, + "loss": 8.582, + "step": 74100 + }, + { + "epoch": 0.30227449334904455, + "grad_norm": 8.0533447265625, + "learning_rate": 0.004985525474649029, + "loss": 8.6827, + "step": 74200 + }, + { + "epoch": 0.302681871372426, + "grad_norm": 5.068324565887451, + "learning_rate": 0.004985472065786245, + "loss": 8.5732, + "step": 74300 + }, + { + "epoch": 0.30308924939580745, + "grad_norm": 6.542651176452637, + "learning_rate": 0.004985418558857134, + "loss": 8.5886, + "step": 74400 + }, + { + "epoch": 0.3034966274191889, + "grad_norm": 5.114233493804932, + "learning_rate": 0.00498536495386381, + "loss": 8.5641, + "step": 74500 + }, + { + "epoch": 0.3039040054425704, + "grad_norm": 1.2797762155532837, + "learning_rate": 0.004985311250808395, + "loss": 8.6368, + "step": 74600 + }, + { + "epoch": 0.3043113834659519, + "grad_norm": 9.820923805236816, + "learning_rate": 0.0049852574496930145, + "loss": 8.903, + "step": 74700 + }, + { + "epoch": 0.3047187614893333, + "grad_norm": 2.4579174518585205, + "learning_rate": 0.004985203550519789, + "loss": 8.8724, + "step": 74800 + }, + { + "epoch": 0.3051261395127148, + "grad_norm": 6.749464511871338, + "learning_rate": 0.004985149553290851, + "loss": 8.7658, + "step": 74900 + }, + { + "epoch": 0.30553351753609626, + "grad_norm": 2.1830623149871826, + "learning_rate": 0.0049850954580083415, + "loss": 8.6716, + "step": 75000 + }, + { + "epoch": 0.30553351753609626, + "eval_MaskedAccuracy": 0.45829476062798746, + "eval_loss": 1.8497532606124878, + "eval_runtime": 556.5347, + "eval_samples_per_second": 114.056, + "eval_steps_per_second": 0.446, + "step": 75000 + }, + { + "epoch": 0.3059408955594777, + "grad_norm": 8.475727081298828, + "learning_rate": 0.004985041264674393, + "loss": 8.8036, + "step": 75100 + }, + { + "epoch": 0.30634827358285915, + "grad_norm": 7.727541923522949, + "learning_rate": 0.0049849869732911485, + "loss": 8.7284, + "step": 75200 + }, + { + "epoch": 0.30675565160624063, + "grad_norm": 6.423786163330078, + "learning_rate": 0.004984932583860751, + "loss": 8.6141, + "step": 75300 + }, + { + "epoch": 0.3071630296296221, + "grad_norm": 7.17932653427124, + "learning_rate": 0.004984878096385355, + "loss": 8.5934, + "step": 75400 + }, + { + "epoch": 0.30757040765300353, + "grad_norm": 5.807397365570068, + "learning_rate": 0.0049848235108671205, + "loss": 8.5996, + "step": 75500 + }, + { + "epoch": 0.307977785676385, + "grad_norm": 9.838018417358398, + "learning_rate": 0.004984768827308196, + "loss": 8.5559, + "step": 75600 + }, + { + "epoch": 0.3083851636997665, + "grad_norm": 14.574153900146484, + "learning_rate": 0.004984714045710747, + "loss": 8.6299, + "step": 75700 + }, + { + "epoch": 0.3087925417231479, + "grad_norm": 5.1707658767700195, + "learning_rate": 0.004984659166076941, + "loss": 8.7743, + "step": 75800 + }, + { + "epoch": 0.3091999197465294, + "grad_norm": 5.780153751373291, + "learning_rate": 0.004984604188408943, + "loss": 8.7648, + "step": 75900 + }, + { + "epoch": 0.30960729776991086, + "grad_norm": 7.285679817199707, + "learning_rate": 0.004984549112708933, + "loss": 8.6375, + "step": 76000 + }, + { + "epoch": 0.30960729776991086, + "eval_MaskedAccuracy": 0.4661069620890094, + "eval_loss": 1.8157120943069458, + "eval_runtime": 640.5674, + "eval_samples_per_second": 99.093, + "eval_steps_per_second": 0.387, + "step": 76000 + }, + { + "epoch": 0.31001467579329234, + "grad_norm": 6.567680835723877, + "learning_rate": 0.0049844939389790816, + "loss": 8.5938, + "step": 76100 + }, + { + "epoch": 0.31042205381667376, + "grad_norm": 4.858399868011475, + "learning_rate": 0.00498443866722158, + "loss": 8.5711, + "step": 76200 + }, + { + "epoch": 0.31082943184005524, + "grad_norm": 5.496742248535156, + "learning_rate": 0.004984383297438603, + "loss": 8.578, + "step": 76300 + }, + { + "epoch": 0.3112368098634367, + "grad_norm": 5.734500885009766, + "learning_rate": 0.004984327829632341, + "loss": 8.5659, + "step": 76400 + }, + { + "epoch": 0.3116441878868182, + "grad_norm": 4.251106262207031, + "learning_rate": 0.004984272263804993, + "loss": 8.5555, + "step": 76500 + }, + { + "epoch": 0.3120515659101996, + "grad_norm": 4.126178741455078, + "learning_rate": 0.0049842165999587525, + "loss": 8.5562, + "step": 76600 + }, + { + "epoch": 0.3124589439335811, + "grad_norm": 8.147299766540527, + "learning_rate": 0.004984160838095821, + "loss": 8.5382, + "step": 76700 + }, + { + "epoch": 0.31286632195696257, + "grad_norm": 4.034787654876709, + "learning_rate": 0.004984104978218403, + "loss": 8.69, + "step": 76800 + }, + { + "epoch": 0.313273699980344, + "grad_norm": 1.8307220935821533, + "learning_rate": 0.004984049020328701, + "loss": 8.7546, + "step": 76900 + }, + { + "epoch": 0.31368107800372547, + "grad_norm": 6.058050632476807, + "learning_rate": 0.004983992964428932, + "loss": 8.6887, + "step": 77000 + }, + { + "epoch": 0.31368107800372547, + "eval_MaskedAccuracy": 0.45887611000533407, + "eval_loss": 1.8508230447769165, + "eval_runtime": 513.6801, + "eval_samples_per_second": 123.571, + "eval_steps_per_second": 0.483, + "step": 77000 + }, + { + "epoch": 0.31408845602710694, + "grad_norm": 8.168664932250977, + "learning_rate": 0.0049839368105213154, + "loss": 8.7375, + "step": 77100 + }, + { + "epoch": 0.3144958340504884, + "grad_norm": 4.794949054718018, + "learning_rate": 0.0049838805586080665, + "loss": 8.6035, + "step": 77200 + }, + { + "epoch": 0.31490321207386984, + "grad_norm": 3.7873361110687256, + "learning_rate": 0.004983824208691406, + "loss": 8.5769, + "step": 77300 + }, + { + "epoch": 0.3153105900972513, + "grad_norm": 9.073785781860352, + "learning_rate": 0.004983767760773574, + "loss": 8.568, + "step": 77400 + }, + { + "epoch": 0.3157179681206328, + "grad_norm": 8.142977714538574, + "learning_rate": 0.004983711214856787, + "loss": 8.5618, + "step": 77500 + }, + { + "epoch": 0.3161253461440142, + "grad_norm": 6.451813220977783, + "learning_rate": 0.0049836545709432975, + "loss": 8.5512, + "step": 77600 + }, + { + "epoch": 0.3165327241673957, + "grad_norm": 10.494230270385742, + "learning_rate": 0.004983597829035338, + "loss": 8.5219, + "step": 77700 + }, + { + "epoch": 0.3169401021907772, + "grad_norm": 3.895531177520752, + "learning_rate": 0.004983540989135144, + "loss": 8.6625, + "step": 77800 + }, + { + "epoch": 0.31734748021415865, + "grad_norm": 8.601829528808594, + "learning_rate": 0.004983484051244962, + "loss": 8.59, + "step": 77900 + }, + { + "epoch": 0.31775485823754007, + "grad_norm": 6.697279453277588, + "learning_rate": 0.004983427015367055, + "loss": 8.5592, + "step": 78000 + }, + { + "epoch": 0.31775485823754007, + "eval_MaskedAccuracy": 0.46543396075038707, + "eval_loss": 1.8149820566177368, + "eval_runtime": 577.5418, + "eval_samples_per_second": 109.907, + "eval_steps_per_second": 0.429, + "step": 78000 + }, + { + "epoch": 0.31816223626092155, + "grad_norm": 8.476865768432617, + "learning_rate": 0.004983369881503679, + "loss": 8.7323, + "step": 78100 + }, + { + "epoch": 0.318569614284303, + "grad_norm": 5.520491123199463, + "learning_rate": 0.004983312649657079, + "loss": 8.5741, + "step": 78200 + }, + { + "epoch": 0.31897699230768445, + "grad_norm": 8.3047513961792, + "learning_rate": 0.004983255319829533, + "loss": 8.5857, + "step": 78300 + }, + { + "epoch": 0.3193843703310659, + "grad_norm": 10.212699890136719, + "learning_rate": 0.004983197892023291, + "loss": 8.5407, + "step": 78400 + }, + { + "epoch": 0.3197917483544474, + "grad_norm": 6.702394962310791, + "learning_rate": 0.004983140366240634, + "loss": 8.5472, + "step": 78500 + }, + { + "epoch": 0.3201991263778289, + "grad_norm": 3.3476109504699707, + "learning_rate": 0.0049830827424838416, + "loss": 8.6871, + "step": 78600 + }, + { + "epoch": 0.3206065044012103, + "grad_norm": 4.181885242462158, + "learning_rate": 0.0049830250207551765, + "loss": 8.8476, + "step": 78700 + }, + { + "epoch": 0.3210138824245918, + "grad_norm": 6.616336345672607, + "learning_rate": 0.004982967201056927, + "loss": 8.8723, + "step": 78800 + }, + { + "epoch": 0.32142126044797326, + "grad_norm": 5.890897750854492, + "learning_rate": 0.0049829092833913865, + "loss": 8.6229, + "step": 78900 + }, + { + "epoch": 0.3218286384713547, + "grad_norm": 8.629895210266113, + "learning_rate": 0.004982851267760835, + "loss": 8.5627, + "step": 79000 + }, + { + "epoch": 0.3218286384713547, + "eval_MaskedAccuracy": 0.4663628527517574, + "eval_loss": 1.8107352256774902, + "eval_runtime": 485.2463, + "eval_samples_per_second": 130.812, + "eval_steps_per_second": 0.511, + "step": 79000 + }, + { + "epoch": 0.32223601649473615, + "grad_norm": 4.689969539642334, + "learning_rate": 0.0049827931541675695, + "loss": 8.5525, + "step": 79100 + }, + { + "epoch": 0.32264339451811763, + "grad_norm": 7.801525592803955, + "learning_rate": 0.004982734942613896, + "loss": 8.5295, + "step": 79200 + }, + { + "epoch": 0.3230507725414991, + "grad_norm": 5.901822566986084, + "learning_rate": 0.00498267663310211, + "loss": 8.5588, + "step": 79300 + }, + { + "epoch": 0.32345815056488053, + "grad_norm": 4.35959529876709, + "learning_rate": 0.004982618225634512, + "loss": 8.6651, + "step": 79400 + }, + { + "epoch": 0.323865528588262, + "grad_norm": 6.286835670471191, + "learning_rate": 0.004982559720213415, + "loss": 8.5631, + "step": 79500 + }, + { + "epoch": 0.3242729066116435, + "grad_norm": 5.67689847946167, + "learning_rate": 0.004982501116841123, + "loss": 8.5648, + "step": 79600 + }, + { + "epoch": 0.32468028463502496, + "grad_norm": 7.631079196929932, + "learning_rate": 0.0049824424155199614, + "loss": 8.5563, + "step": 79700 + }, + { + "epoch": 0.3250876626584064, + "grad_norm": 9.412803649902344, + "learning_rate": 0.00498238361625226, + "loss": 8.5585, + "step": 79800 + }, + { + "epoch": 0.32549504068178786, + "grad_norm": 8.735962867736816, + "learning_rate": 0.004982324719040331, + "loss": 8.5384, + "step": 79900 + }, + { + "epoch": 0.32590241870516934, + "grad_norm": 14.457242965698242, + "learning_rate": 0.004982265723886508, + "loss": 8.5336, + "step": 80000 + }, + { + "epoch": 0.32590241870516934, + "eval_MaskedAccuracy": 0.4618274738562086, + "eval_loss": 1.8267515897750854, + "eval_runtime": 545.2404, + "eval_samples_per_second": 116.418, + "eval_steps_per_second": 0.455, + "step": 80000 + }, + { + "epoch": 0.32630979672855076, + "grad_norm": 5.96212100982666, + "learning_rate": 0.004982206630793117, + "loss": 8.8503, + "step": 80100 + }, + { + "epoch": 0.32671717475193224, + "grad_norm": 18.13055992126465, + "learning_rate": 0.004982147439762501, + "loss": 8.842, + "step": 80200 + }, + { + "epoch": 0.3271245527753137, + "grad_norm": 20.262971878051758, + "learning_rate": 0.004982088150796999, + "loss": 8.8396, + "step": 80300 + }, + { + "epoch": 0.3275319307986952, + "grad_norm": 9.294783592224121, + "learning_rate": 0.004982028763898953, + "loss": 8.7368, + "step": 80400 + }, + { + "epoch": 0.3279393088220766, + "grad_norm": 5.545958518981934, + "learning_rate": 0.00498196927907071, + "loss": 8.5956, + "step": 80500 + }, + { + "epoch": 0.3283466868454581, + "grad_norm": 5.721169948577881, + "learning_rate": 0.004981909696314625, + "loss": 8.5734, + "step": 80600 + }, + { + "epoch": 0.32875406486883957, + "grad_norm": 5.6723761558532715, + "learning_rate": 0.004981850015633058, + "loss": 8.5697, + "step": 80700 + }, + { + "epoch": 0.329161442892221, + "grad_norm": 6.1140031814575195, + "learning_rate": 0.0049817902370283594, + "loss": 8.5231, + "step": 80800 + }, + { + "epoch": 0.32956882091560247, + "grad_norm": 5.004663944244385, + "learning_rate": 0.004981730360502891, + "loss": 8.6437, + "step": 80900 + }, + { + "epoch": 0.32997619893898394, + "grad_norm": 9.134260177612305, + "learning_rate": 0.00498167038605903, + "loss": 8.7893, + "step": 81000 + }, + { + "epoch": 0.32997619893898394, + "eval_MaskedAccuracy": 0.4559564923524422, + "eval_loss": 1.8662664890289307, + "eval_runtime": 580.9179, + "eval_samples_per_second": 109.268, + "eval_steps_per_second": 0.427, + "step": 81000 + }, + { + "epoch": 0.3303835769623654, + "grad_norm": 7.398022174835205, + "learning_rate": 0.004981610313699138, + "loss": 8.7318, + "step": 81100 + }, + { + "epoch": 0.33079095498574684, + "grad_norm": 10.874109268188477, + "learning_rate": 0.0049815501434255964, + "loss": 8.6043, + "step": 81200 + }, + { + "epoch": 0.3311983330091283, + "grad_norm": 7.2064690589904785, + "learning_rate": 0.0049814898752407796, + "loss": 8.5529, + "step": 81300 + }, + { + "epoch": 0.3316057110325098, + "grad_norm": 13.496102333068848, + "learning_rate": 0.004981429509147076, + "loss": 8.5061, + "step": 81400 + }, + { + "epoch": 0.3320130890558912, + "grad_norm": 7.787205696105957, + "learning_rate": 0.004981369045146868, + "loss": 8.5397, + "step": 81500 + }, + { + "epoch": 0.3324204670792727, + "grad_norm": 5.602849006652832, + "learning_rate": 0.004981308483242548, + "loss": 8.4998, + "step": 81600 + }, + { + "epoch": 0.3328278451026542, + "grad_norm": 8.219766616821289, + "learning_rate": 0.0049812478234365085, + "loss": 8.5309, + "step": 81700 + }, + { + "epoch": 0.33323522312603565, + "grad_norm": 6.64454984664917, + "learning_rate": 0.004981187065731154, + "loss": 8.6075, + "step": 81800 + }, + { + "epoch": 0.3336426011494171, + "grad_norm": 7.264410018920898, + "learning_rate": 0.004981126210128876, + "loss": 8.5623, + "step": 81900 + }, + { + "epoch": 0.33404997917279855, + "grad_norm": 9.636812210083008, + "learning_rate": 0.004981065256632084, + "loss": 8.5134, + "step": 82000 + }, + { + "epoch": 0.33404997917279855, + "eval_MaskedAccuracy": 0.4689584496984937, + "eval_loss": 1.8067864179611206, + "eval_runtime": 597.0935, + "eval_samples_per_second": 106.308, + "eval_steps_per_second": 0.415, + "step": 82000 + }, + { + "epoch": 0.33445735719618, + "grad_norm": 8.728602409362793, + "learning_rate": 0.0049810042052431955, + "loss": 8.5174, + "step": 82100 + }, + { + "epoch": 0.3348647352195615, + "grad_norm": 7.863314628601074, + "learning_rate": 0.004980943055964616, + "loss": 8.7261, + "step": 82200 + }, + { + "epoch": 0.3352721132429429, + "grad_norm": 5.752391338348389, + "learning_rate": 0.004980881808798764, + "loss": 8.7496, + "step": 82300 + }, + { + "epoch": 0.3356794912663244, + "grad_norm": 6.728067874908447, + "learning_rate": 0.004980820463748064, + "loss": 8.5905, + "step": 82400 + }, + { + "epoch": 0.3360868692897059, + "grad_norm": 4.3324995040893555, + "learning_rate": 0.004980759020814935, + "loss": 8.5585, + "step": 82500 + }, + { + "epoch": 0.3364942473130873, + "grad_norm": 3.047788619995117, + "learning_rate": 0.004980697480001814, + "loss": 8.5441, + "step": 82600 + }, + { + "epoch": 0.3369016253364688, + "grad_norm": 7.090338706970215, + "learning_rate": 0.004980635841311131, + "loss": 8.6671, + "step": 82700 + }, + { + "epoch": 0.33730900335985026, + "grad_norm": 5.892022132873535, + "learning_rate": 0.004980574104745324, + "loss": 8.5548, + "step": 82800 + }, + { + "epoch": 0.33771638138323173, + "grad_norm": 8.809314727783203, + "learning_rate": 0.004980512270306831, + "loss": 8.5253, + "step": 82900 + }, + { + "epoch": 0.33812375940661316, + "grad_norm": 8.679837226867676, + "learning_rate": 0.004980450337998095, + "loss": 8.5425, + "step": 83000 + }, + { + "epoch": 0.33812375940661316, + "eval_MaskedAccuracy": 0.46861824620965875, + "eval_loss": 1.7987737655639648, + "eval_runtime": 643.4937, + "eval_samples_per_second": 98.643, + "eval_steps_per_second": 0.385, + "step": 83000 + }, + { + "epoch": 0.33853113742999463, + "grad_norm": 24.381132125854492, + "learning_rate": 0.0049803883078215734, + "loss": 8.6264, + "step": 83100 + }, + { + "epoch": 0.3389385154533761, + "grad_norm": 5.536754131317139, + "learning_rate": 0.0049803261797797095, + "loss": 8.7931, + "step": 83200 + }, + { + "epoch": 0.33934589347675753, + "grad_norm": 8.775206565856934, + "learning_rate": 0.004980263953874957, + "loss": 8.7717, + "step": 83300 + }, + { + "epoch": 0.339753271500139, + "grad_norm": 3.58613920211792, + "learning_rate": 0.004980201630109785, + "loss": 8.725, + "step": 83400 + }, + { + "epoch": 0.3401606495235205, + "grad_norm": 5.676400184631348, + "learning_rate": 0.004980139208486654, + "loss": 8.6465, + "step": 83500 + }, + { + "epoch": 0.34056802754690196, + "grad_norm": 8.117194175720215, + "learning_rate": 0.004980076689008034, + "loss": 8.5568, + "step": 83600 + }, + { + "epoch": 0.3409754055702834, + "grad_norm": 8.87285327911377, + "learning_rate": 0.004980014071676394, + "loss": 8.5427, + "step": 83700 + }, + { + "epoch": 0.34138278359366486, + "grad_norm": 6.456417560577393, + "learning_rate": 0.004979951356494205, + "loss": 8.5264, + "step": 83800 + }, + { + "epoch": 0.34179016161704634, + "grad_norm": 5.127048969268799, + "learning_rate": 0.004979888543463963, + "loss": 8.6865, + "step": 83900 + }, + { + "epoch": 0.34219753964042776, + "grad_norm": 6.285412788391113, + "learning_rate": 0.004979825632588136, + "loss": 8.6639, + "step": 84000 + }, + { + "epoch": 0.34219753964042776, + "eval_MaskedAccuracy": 0.46654122265143805, + "eval_loss": 1.8142414093017578, + "eval_runtime": 591.5507, + "eval_samples_per_second": 107.304, + "eval_steps_per_second": 0.419, + "step": 84000 + }, + { + "epoch": 0.34260491766380924, + "grad_norm": 6.347151279449463, + "learning_rate": 0.004979762623869219, + "loss": 8.567, + "step": 84100 + }, + { + "epoch": 0.3430122956871907, + "grad_norm": 6.459830284118652, + "learning_rate": 0.0049796995173096974, + "loss": 8.5622, + "step": 84200 + }, + { + "epoch": 0.3434196737105722, + "grad_norm": 7.853311061859131, + "learning_rate": 0.004979636312912063, + "loss": 8.5209, + "step": 84300 + }, + { + "epoch": 0.3438270517339536, + "grad_norm": 7.669728755950928, + "learning_rate": 0.0049795730106788234, + "loss": 8.4813, + "step": 84400 + }, + { + "epoch": 0.3442344297573351, + "grad_norm": 10.450735092163086, + "learning_rate": 0.004979509610612482, + "loss": 8.6282, + "step": 84500 + }, + { + "epoch": 0.34464180778071657, + "grad_norm": 7.543275356292725, + "learning_rate": 0.004979446112715538, + "loss": 8.6334, + "step": 84600 + }, + { + "epoch": 0.34504918580409805, + "grad_norm": 4.0467023849487305, + "learning_rate": 0.004979382516990501, + "loss": 8.7694, + "step": 84700 + }, + { + "epoch": 0.34545656382747947, + "grad_norm": 8.46611213684082, + "learning_rate": 0.0049793188234398865, + "loss": 8.6023, + "step": 84800 + }, + { + "epoch": 0.34586394185086095, + "grad_norm": 6.743054389953613, + "learning_rate": 0.004979255032066221, + "loss": 8.5541, + "step": 84900 + }, + { + "epoch": 0.3462713198742424, + "grad_norm": 7.6256585121154785, + "learning_rate": 0.004979191142872017, + "loss": 8.5375, + "step": 85000 + }, + { + "epoch": 0.3462713198742424, + "eval_MaskedAccuracy": 0.4691629318915052, + "eval_loss": 1.7983323335647583, + "eval_runtime": 544.0842, + "eval_samples_per_second": 116.666, + "eval_steps_per_second": 0.456, + "step": 85000 + }, + { + "epoch": 0.34667869789762384, + "grad_norm": 1.0273075103759766, + "learning_rate": 0.004979127155859808, + "loss": 8.5629, + "step": 85100 + }, + { + "epoch": 0.3470860759210053, + "grad_norm": 9.683170318603516, + "learning_rate": 0.004979063071032122, + "loss": 8.7399, + "step": 85200 + }, + { + "epoch": 0.3474934539443868, + "grad_norm": 7.662843704223633, + "learning_rate": 0.004978998888391494, + "loss": 8.5619, + "step": 85300 + }, + { + "epoch": 0.3479008319677683, + "grad_norm": 12.199478149414062, + "learning_rate": 0.004978934607940453, + "loss": 8.5566, + "step": 85400 + }, + { + "epoch": 0.3483082099911497, + "grad_norm": 7.151951789855957, + "learning_rate": 0.0049788702296815444, + "loss": 8.5126, + "step": 85500 + }, + { + "epoch": 0.3487155880145312, + "grad_norm": 8.834843635559082, + "learning_rate": 0.004978805753617314, + "loss": 8.5281, + "step": 85600 + }, + { + "epoch": 0.34912296603791265, + "grad_norm": 7.378878593444824, + "learning_rate": 0.004978741179750311, + "loss": 8.5081, + "step": 85700 + }, + { + "epoch": 0.3495303440612941, + "grad_norm": 5.579213619232178, + "learning_rate": 0.004978676508083091, + "loss": 8.48, + "step": 85800 + }, + { + "epoch": 0.34993772208467555, + "grad_norm": 5.404116153717041, + "learning_rate": 0.004978611738618207, + "loss": 8.7612, + "step": 85900 + }, + { + "epoch": 0.35034510010805703, + "grad_norm": 7.545348167419434, + "learning_rate": 0.0049785468713582255, + "loss": 8.5953, + "step": 86000 + }, + { + "epoch": 0.35034510010805703, + "eval_MaskedAccuracy": 0.4679827090386452, + "eval_loss": 1.8021892309188843, + "eval_runtime": 607.8016, + "eval_samples_per_second": 104.435, + "eval_steps_per_second": 0.408, + "step": 86000 + }, + { + "epoch": 0.3507524781314385, + "grad_norm": 7.222796440124512, + "learning_rate": 0.004978481906305703, + "loss": 8.5486, + "step": 86100 + }, + { + "epoch": 0.3511598561548199, + "grad_norm": 4.052384853363037, + "learning_rate": 0.00497841684346321, + "loss": 8.511, + "step": 86200 + }, + { + "epoch": 0.3515672341782014, + "grad_norm": 13.966856002807617, + "learning_rate": 0.004978351682833326, + "loss": 8.5318, + "step": 86300 + }, + { + "epoch": 0.3519746122015829, + "grad_norm": 1.1165910959243774, + "learning_rate": 0.004978286424418617, + "loss": 8.7596, + "step": 86400 + }, + { + "epoch": 0.3523819902249643, + "grad_norm": 1.760972023010254, + "learning_rate": 0.004978221068221663, + "loss": 8.7927, + "step": 86500 + }, + { + "epoch": 0.3527893682483458, + "grad_norm": 4.525177955627441, + "learning_rate": 0.004978155614245055, + "loss": 8.8414, + "step": 86600 + }, + { + "epoch": 0.35319674627172726, + "grad_norm": 5.550790786743164, + "learning_rate": 0.00497809006249138, + "loss": 8.5977, + "step": 86700 + }, + { + "epoch": 0.35360412429510873, + "grad_norm": 7.4307451248168945, + "learning_rate": 0.00497802441296323, + "loss": 8.631, + "step": 86800 + }, + { + "epoch": 0.35401150231849016, + "grad_norm": 1.908689260482788, + "learning_rate": 0.004977958665663198, + "loss": 8.5632, + "step": 86900 + }, + { + "epoch": 0.35441888034187163, + "grad_norm": 7.362282752990723, + "learning_rate": 0.004977892820593886, + "loss": 8.6296, + "step": 87000 + }, + { + "epoch": 0.35441888034187163, + "eval_MaskedAccuracy": 0.46437557347380914, + "eval_loss": 1.8164002895355225, + "eval_runtime": 503.8685, + "eval_samples_per_second": 125.977, + "eval_steps_per_second": 0.492, + "step": 87000 + }, + { + "epoch": 0.3548262583652531, + "grad_norm": 3.9842371940612793, + "learning_rate": 0.004977826877757894, + "loss": 8.5993, + "step": 87100 + }, + { + "epoch": 0.3552336363886346, + "grad_norm": 6.476138591766357, + "learning_rate": 0.004977760837157826, + "loss": 8.6622, + "step": 87200 + }, + { + "epoch": 0.355641014412016, + "grad_norm": 9.106497764587402, + "learning_rate": 0.0049776946987962955, + "loss": 8.5143, + "step": 87300 + }, + { + "epoch": 0.3560483924353975, + "grad_norm": 5.475508689880371, + "learning_rate": 0.004977628462675919, + "loss": 8.5169, + "step": 87400 + }, + { + "epoch": 0.35645577045877896, + "grad_norm": 9.564685821533203, + "learning_rate": 0.004977562128799318, + "loss": 8.5448, + "step": 87500 + }, + { + "epoch": 0.3568631484821604, + "grad_norm": 3.261197328567505, + "learning_rate": 0.004977495697169109, + "loss": 8.6543, + "step": 87600 + }, + { + "epoch": 0.35727052650554186, + "grad_norm": 11.773866653442383, + "learning_rate": 0.004977429167787918, + "loss": 8.6109, + "step": 87700 + }, + { + "epoch": 0.35767790452892334, + "grad_norm": 5.849395275115967, + "learning_rate": 0.004977362540658388, + "loss": 8.7551, + "step": 87800 + }, + { + "epoch": 0.3580852825523048, + "grad_norm": 6.478888511657715, + "learning_rate": 0.00497729581578314, + "loss": 8.6222, + "step": 87900 + }, + { + "epoch": 0.35849266057568624, + "grad_norm": 10.449518203735352, + "learning_rate": 0.004977228993164813, + "loss": 8.5137, + "step": 88000 + }, + { + "epoch": 0.35849266057568624, + "eval_MaskedAccuracy": 0.46903022098579256, + "eval_loss": 1.7992993593215942, + "eval_runtime": 596.9731, + "eval_samples_per_second": 106.33, + "eval_steps_per_second": 0.415, + "step": 88000 + }, + { + "epoch": 0.3589000385990677, + "grad_norm": 4.7867302894592285, + "learning_rate": 0.004977162072806052, + "loss": 8.5145, + "step": 88100 + }, + { + "epoch": 0.3593074166224492, + "grad_norm": 9.004382133483887, + "learning_rate": 0.004977095054709505, + "loss": 8.4894, + "step": 88200 + }, + { + "epoch": 0.3597147946458306, + "grad_norm": 9.401776313781738, + "learning_rate": 0.0049770279388778186, + "loss": 8.4734, + "step": 88300 + }, + { + "epoch": 0.3601221726692121, + "grad_norm": 9.095968246459961, + "learning_rate": 0.004976960725313647, + "loss": 8.4613, + "step": 88400 + }, + { + "epoch": 0.36052955069259357, + "grad_norm": 9.433191299438477, + "learning_rate": 0.004976893414019646, + "loss": 8.4729, + "step": 88500 + }, + { + "epoch": 0.36093692871597505, + "grad_norm": 7.030093193054199, + "learning_rate": 0.004976826004998478, + "loss": 8.4404, + "step": 88600 + }, + { + "epoch": 0.36134430673935647, + "grad_norm": 4.911047458648682, + "learning_rate": 0.004976758498252802, + "loss": 8.5508, + "step": 88700 + }, + { + "epoch": 0.36175168476273795, + "grad_norm": 12.30935287475586, + "learning_rate": 0.004976690893785297, + "loss": 8.5833, + "step": 88800 + }, + { + "epoch": 0.3621590627861194, + "grad_norm": 12.394389152526855, + "learning_rate": 0.004976623191598629, + "loss": 8.5081, + "step": 88900 + }, + { + "epoch": 0.36256644080950085, + "grad_norm": 11.784280776977539, + "learning_rate": 0.004976555391695481, + "loss": 8.4756, + "step": 89000 + }, + { + "epoch": 0.36256644080950085, + "eval_MaskedAccuracy": 0.47060521979681946, + "eval_loss": 1.7932844161987305, + "eval_runtime": 467.7312, + "eval_samples_per_second": 135.71, + "eval_steps_per_second": 0.53, + "step": 89000 + }, + { + "epoch": 0.3629738188328823, + "grad_norm": 11.131914138793945, + "learning_rate": 0.004976487494078528, + "loss": 8.4545, + "step": 89100 + }, + { + "epoch": 0.3633811968562638, + "grad_norm": 8.43490219116211, + "learning_rate": 0.0049764194987504526, + "loss": 8.4633, + "step": 89200 + }, + { + "epoch": 0.3637885748796453, + "grad_norm": 3.71230149269104, + "learning_rate": 0.004976351405713957, + "loss": 8.4831, + "step": 89300 + }, + { + "epoch": 0.3641959529030267, + "grad_norm": 11.018815994262695, + "learning_rate": 0.004976283214971711, + "loss": 8.6283, + "step": 89400 + }, + { + "epoch": 0.3646033309264082, + "grad_norm": 7.818960666656494, + "learning_rate": 0.00497621492652643, + "loss": 8.5877, + "step": 89500 + }, + { + "epoch": 0.36501070894978965, + "grad_norm": 7.818572044372559, + "learning_rate": 0.004976146540380799, + "loss": 8.7524, + "step": 89600 + }, + { + "epoch": 0.3654180869731711, + "grad_norm": 5.199432373046875, + "learning_rate": 0.004976078056537536, + "loss": 8.7144, + "step": 89700 + }, + { + "epoch": 0.36582546499655255, + "grad_norm": 8.510984420776367, + "learning_rate": 0.004976009474999339, + "loss": 8.5381, + "step": 89800 + }, + { + "epoch": 0.36623284301993403, + "grad_norm": 8.88054370880127, + "learning_rate": 0.004975940795768925, + "loss": 8.5168, + "step": 89900 + }, + { + "epoch": 0.3666402210433155, + "grad_norm": 8.716156005859375, + "learning_rate": 0.004975872018849006, + "loss": 8.522, + "step": 90000 + }, + { + "epoch": 0.3666402210433155, + "eval_MaskedAccuracy": 0.4706358157887873, + "eval_loss": 1.7848535776138306, + "eval_runtime": 594.3953, + "eval_samples_per_second": 106.791, + "eval_steps_per_second": 0.417, + "step": 90000 + }, + { + "epoch": 0.36704759906669693, + "grad_norm": 10.945036888122559, + "learning_rate": 0.0049758031442422935, + "loss": 8.4675, + "step": 90100 + }, + { + "epoch": 0.3674549770900784, + "grad_norm": 4.423859119415283, + "learning_rate": 0.004975734171951515, + "loss": 8.583, + "step": 90200 + }, + { + "epoch": 0.3678623551134599, + "grad_norm": 10.427994728088379, + "learning_rate": 0.004975665101979403, + "loss": 8.7836, + "step": 90300 + }, + { + "epoch": 0.36826973313684136, + "grad_norm": 9.24545669555664, + "learning_rate": 0.004975595934328689, + "loss": 8.759, + "step": 90400 + }, + { + "epoch": 0.3686771111602228, + "grad_norm": 8.96117877960205, + "learning_rate": 0.004975526669002103, + "loss": 8.7391, + "step": 90500 + }, + { + "epoch": 0.36908448918360426, + "grad_norm": 3.695530891418457, + "learning_rate": 0.004975457306002383, + "loss": 8.5696, + "step": 90600 + }, + { + "epoch": 0.36949186720698574, + "grad_norm": 8.851283073425293, + "learning_rate": 0.004975387845332268, + "loss": 8.6098, + "step": 90700 + }, + { + "epoch": 0.36989924523036716, + "grad_norm": 8.128582000732422, + "learning_rate": 0.004975318286994518, + "loss": 8.5255, + "step": 90800 + }, + { + "epoch": 0.37030662325374863, + "grad_norm": 7.524702072143555, + "learning_rate": 0.004975248630991871, + "loss": 8.4705, + "step": 90900 + }, + { + "epoch": 0.3707140012771301, + "grad_norm": 10.401105880737305, + "learning_rate": 0.004975178877327081, + "loss": 8.4252, + "step": 91000 + }, + { + "epoch": 0.3707140012771301, + "eval_MaskedAccuracy": 0.47044726839432044, + "eval_loss": 1.7901692390441895, + "eval_runtime": 552.0048, + "eval_samples_per_second": 114.992, + "eval_steps_per_second": 0.449, + "step": 91000 + }, + { + "epoch": 0.3711213793005116, + "grad_norm": 7.144312381744385, + "learning_rate": 0.004975109026002911, + "loss": 8.46, + "step": 91100 + }, + { + "epoch": 0.371528757323893, + "grad_norm": 7.993409156799316, + "learning_rate": 0.00497503907702212, + "loss": 8.5332, + "step": 91200 + }, + { + "epoch": 0.3719361353472745, + "grad_norm": 3.0928328037261963, + "learning_rate": 0.004974969030387474, + "loss": 8.7197, + "step": 91300 + }, + { + "epoch": 0.37234351337065597, + "grad_norm": 11.095446586608887, + "learning_rate": 0.004974898886101745, + "loss": 8.5891, + "step": 91400 + }, + { + "epoch": 0.3727508913940374, + "grad_norm": 10.600836753845215, + "learning_rate": 0.004974828644167699, + "loss": 8.6775, + "step": 91500 + }, + { + "epoch": 0.37315826941741886, + "grad_norm": 9.994548797607422, + "learning_rate": 0.0049747583045881205, + "loss": 8.5214, + "step": 91600 + }, + { + "epoch": 0.37356564744080034, + "grad_norm": 9.244461059570312, + "learning_rate": 0.004974687867365784, + "loss": 8.4631, + "step": 91700 + }, + { + "epoch": 0.3739730254641818, + "grad_norm": 8.088507652282715, + "learning_rate": 0.004974617332503477, + "loss": 8.4735, + "step": 91800 + }, + { + "epoch": 0.37438040348756324, + "grad_norm": 7.182349681854248, + "learning_rate": 0.0049745467000039895, + "loss": 8.4874, + "step": 91900 + }, + { + "epoch": 0.3747877815109447, + "grad_norm": 3.551943302154541, + "learning_rate": 0.004974475969870113, + "loss": 8.4846, + "step": 92000 + }, + { + "epoch": 0.3747877815109447, + "eval_MaskedAccuracy": 0.469990103072361, + "eval_loss": 1.7921059131622314, + "eval_runtime": 584.8413, + "eval_samples_per_second": 108.535, + "eval_steps_per_second": 0.424, + "step": 92000 + }, + { + "epoch": 0.3751951595343262, + "grad_norm": 4.345943927764893, + "learning_rate": 0.004974405142104643, + "loss": 8.7141, + "step": 92100 + }, + { + "epoch": 0.3756025375577076, + "grad_norm": 2.48052978515625, + "learning_rate": 0.004974334216710381, + "loss": 8.6144, + "step": 92200 + }, + { + "epoch": 0.3760099155810891, + "grad_norm": 9.305281639099121, + "learning_rate": 0.00497426319369013, + "loss": 8.703, + "step": 92300 + }, + { + "epoch": 0.37641729360447057, + "grad_norm": 8.072779655456543, + "learning_rate": 0.0049741920730466994, + "loss": 8.815, + "step": 92400 + }, + { + "epoch": 0.37682467162785205, + "grad_norm": 7.126327991485596, + "learning_rate": 0.004974120854782898, + "loss": 8.5336, + "step": 92500 + }, + { + "epoch": 0.37723204965123347, + "grad_norm": 10.760848999023438, + "learning_rate": 0.004974049538901545, + "loss": 8.5313, + "step": 92600 + }, + { + "epoch": 0.37763942767461495, + "grad_norm": 8.534592628479004, + "learning_rate": 0.004973978125405459, + "loss": 8.4804, + "step": 92700 + }, + { + "epoch": 0.3780468056979964, + "grad_norm": 6.255151748657227, + "learning_rate": 0.004973906614297455, + "loss": 8.4687, + "step": 92800 + }, + { + "epoch": 0.3784541837213779, + "grad_norm": 10.237187385559082, + "learning_rate": 0.004973835005580372, + "loss": 8.4838, + "step": 92900 + }, + { + "epoch": 0.3788615617447593, + "grad_norm": 3.644022226333618, + "learning_rate": 0.004973763299257041, + "loss": 8.5314, + "step": 93000 + }, + { + "epoch": 0.3788615617447593, + "eval_MaskedAccuracy": 0.4678807769183222, + "eval_loss": 1.7888926267623901, + "eval_runtime": 535.7103, + "eval_samples_per_second": 118.489, + "eval_steps_per_second": 0.463, + "step": 93000 + }, + { + "epoch": 0.3792689397681408, + "grad_norm": 8.058805465698242, + "learning_rate": 0.0049736914953302895, + "loss": 8.814, + "step": 93100 + }, + { + "epoch": 0.3796763177915223, + "grad_norm": 4.705103874206543, + "learning_rate": 0.0049736195938029585, + "loss": 8.5721, + "step": 93200 + }, + { + "epoch": 0.3800836958149037, + "grad_norm": 10.882928848266602, + "learning_rate": 0.004973547594677897, + "loss": 8.6773, + "step": 93300 + }, + { + "epoch": 0.3804910738382852, + "grad_norm": 5.981499195098877, + "learning_rate": 0.0049734754979579455, + "loss": 8.551, + "step": 93400 + }, + { + "epoch": 0.38089845186166665, + "grad_norm": 3.3413658142089844, + "learning_rate": 0.004973403303645947, + "loss": 8.6139, + "step": 93500 + }, + { + "epoch": 0.38130582988504813, + "grad_norm": 3.6258671283721924, + "learning_rate": 0.004973331011744769, + "loss": 8.6114, + "step": 93600 + }, + { + "epoch": 0.38171320790842955, + "grad_norm": 7.946436882019043, + "learning_rate": 0.004973258622257264, + "loss": 8.6811, + "step": 93700 + }, + { + "epoch": 0.38212058593181103, + "grad_norm": 6.5823974609375, + "learning_rate": 0.004973186135186295, + "loss": 8.5271, + "step": 93800 + }, + { + "epoch": 0.3825279639551925, + "grad_norm": 10.669353485107422, + "learning_rate": 0.004973113550534733, + "loss": 8.5442, + "step": 93900 + }, + { + "epoch": 0.38293534197857393, + "grad_norm": 10.489116668701172, + "learning_rate": 0.00497304086830544, + "loss": 8.7497, + "step": 94000 + }, + { + "epoch": 0.38293534197857393, + "eval_MaskedAccuracy": 0.45569368019319123, + "eval_loss": 1.8612487316131592, + "eval_runtime": 590.2393, + "eval_samples_per_second": 107.543, + "eval_steps_per_second": 0.42, + "step": 94000 + }, + { + "epoch": 0.3833427200019554, + "grad_norm": 3.391709566116333, + "learning_rate": 0.004972968088501293, + "loss": 8.7032, + "step": 94100 + }, + { + "epoch": 0.3837500980253369, + "grad_norm": 7.481471061706543, + "learning_rate": 0.004972895211125167, + "loss": 8.5595, + "step": 94200 + }, + { + "epoch": 0.38415747604871836, + "grad_norm": 9.863663673400879, + "learning_rate": 0.004972822236179941, + "loss": 8.5347, + "step": 94300 + }, + { + "epoch": 0.3845648540720998, + "grad_norm": 8.590025901794434, + "learning_rate": 0.004972749163668501, + "loss": 8.6051, + "step": 94400 + }, + { + "epoch": 0.38497223209548126, + "grad_norm": 5.421779632568359, + "learning_rate": 0.00497267599359374, + "loss": 8.6653, + "step": 94500 + }, + { + "epoch": 0.38537961011886274, + "grad_norm": 5.929932594299316, + "learning_rate": 0.004972602725958551, + "loss": 8.514, + "step": 94600 + }, + { + "epoch": 0.38578698814224416, + "grad_norm": 5.623597145080566, + "learning_rate": 0.0049725293607658224, + "loss": 8.5508, + "step": 94700 + }, + { + "epoch": 0.38619436616562564, + "grad_norm": 6.509292125701904, + "learning_rate": 0.004972455898018459, + "loss": 8.5169, + "step": 94800 + }, + { + "epoch": 0.3866017441890071, + "grad_norm": 7.418100357055664, + "learning_rate": 0.004972382337719371, + "loss": 8.4773, + "step": 94900 + }, + { + "epoch": 0.3870091222123886, + "grad_norm": 9.678910255432129, + "learning_rate": 0.004972308679871465, + "loss": 8.4783, + "step": 95000 + }, + { + "epoch": 0.3870091222123886, + "eval_MaskedAccuracy": 0.47081274182536137, + "eval_loss": 1.7964165210723877, + "eval_runtime": 606.0193, + "eval_samples_per_second": 104.743, + "eval_steps_per_second": 0.409, + "step": 95000 + }, + { + "epoch": 0.38741650023577, + "grad_norm": 5.7913994789123535, + "learning_rate": 0.0049722349244776505, + "loss": 8.4547, + "step": 95100 + }, + { + "epoch": 0.3878238782591515, + "grad_norm": 8.161794662475586, + "learning_rate": 0.0049721610715408445, + "loss": 8.4734, + "step": 95200 + }, + { + "epoch": 0.38823125628253297, + "grad_norm": 6.818782806396484, + "learning_rate": 0.004972087121063967, + "loss": 8.471, + "step": 95300 + }, + { + "epoch": 0.38863863430591444, + "grad_norm": 7.340078830718994, + "learning_rate": 0.004972013073049942, + "loss": 8.4317, + "step": 95400 + }, + { + "epoch": 0.38904601232929586, + "grad_norm": 6.700837135314941, + "learning_rate": 0.004971938927501688, + "loss": 8.4477, + "step": 95500 + }, + { + "epoch": 0.38945339035267734, + "grad_norm": 13.21121883392334, + "learning_rate": 0.004971864684422152, + "loss": 8.4427, + "step": 95600 + }, + { + "epoch": 0.3898607683760588, + "grad_norm": 11.779099464416504, + "learning_rate": 0.00497179034381426, + "loss": 8.664, + "step": 95700 + }, + { + "epoch": 0.39026814639944024, + "grad_norm": 3.3817567825317383, + "learning_rate": 0.0049717159056809545, + "loss": 8.7076, + "step": 95800 + }, + { + "epoch": 0.3906755244228217, + "grad_norm": 8.147451400756836, + "learning_rate": 0.004971641370025169, + "loss": 8.5892, + "step": 95900 + }, + { + "epoch": 0.3910829024462032, + "grad_norm": 12.229639053344727, + "learning_rate": 0.00497156673684986, + "loss": 8.4967, + "step": 96000 + }, + { + "epoch": 0.3910829024462032, + "eval_MaskedAccuracy": 0.47064134862438395, + "eval_loss": 1.8006526231765747, + "eval_runtime": 587.9392, + "eval_samples_per_second": 107.964, + "eval_steps_per_second": 0.422, + "step": 96000 + }, + { + "epoch": 0.3914902804695847, + "grad_norm": 8.995532989501953, + "learning_rate": 0.004971492006157979, + "loss": 8.463, + "step": 96100 + }, + { + "epoch": 0.3918976584929661, + "grad_norm": 7.136062145233154, + "learning_rate": 0.0049714171779524745, + "loss": 8.4514, + "step": 96200 + }, + { + "epoch": 0.39230503651634757, + "grad_norm": 5.498950481414795, + "learning_rate": 0.004971342252236309, + "loss": 8.4614, + "step": 96300 + }, + { + "epoch": 0.39271241453972905, + "grad_norm": 7.159139633178711, + "learning_rate": 0.004971267229012441, + "loss": 8.4478, + "step": 96400 + }, + { + "epoch": 0.39311979256311047, + "grad_norm": 10.60985279083252, + "learning_rate": 0.0049711921082838395, + "loss": 8.4169, + "step": 96500 + }, + { + "epoch": 0.39352717058649195, + "grad_norm": 7.3362812995910645, + "learning_rate": 0.004971116890053479, + "loss": 8.4131, + "step": 96600 + }, + { + "epoch": 0.3939345486098734, + "grad_norm": 7.946305274963379, + "learning_rate": 0.0049710415743243214, + "loss": 8.46, + "step": 96700 + }, + { + "epoch": 0.3943419266332549, + "grad_norm": 2.3922622203826904, + "learning_rate": 0.0049709661610993485, + "loss": 8.659, + "step": 96800 + }, + { + "epoch": 0.3947493046566363, + "grad_norm": 10.708338737487793, + "learning_rate": 0.0049708906503815545, + "loss": 8.8707, + "step": 96900 + }, + { + "epoch": 0.3951566826800178, + "grad_norm": 1.006100058555603, + "learning_rate": 0.004970815042173913, + "loss": 8.6811, + "step": 97000 + }, + { + "epoch": 0.3951566826800178, + "eval_MaskedAccuracy": 0.4597566056094459, + "eval_loss": 1.8534563779830933, + "eval_runtime": 628.0543, + "eval_samples_per_second": 101.068, + "eval_steps_per_second": 0.395, + "step": 97000 + }, + { + "epoch": 0.3955640607033993, + "grad_norm": 5.042269706726074, + "learning_rate": 0.004970739336479416, + "loss": 8.6412, + "step": 97100 + }, + { + "epoch": 0.3959714387267807, + "grad_norm": 6.670810699462891, + "learning_rate": 0.004970663533301056, + "loss": 8.5012, + "step": 97200 + }, + { + "epoch": 0.3963788167501622, + "grad_norm": 7.702014923095703, + "learning_rate": 0.004970587632641828, + "loss": 8.4667, + "step": 97300 + }, + { + "epoch": 0.39678619477354365, + "grad_norm": 6.220357418060303, + "learning_rate": 0.004970511634504726, + "loss": 8.4563, + "step": 97400 + }, + { + "epoch": 0.39719357279692513, + "grad_norm": 2.774712085723877, + "learning_rate": 0.004970435538892763, + "loss": 8.5194, + "step": 97500 + }, + { + "epoch": 0.39760095082030655, + "grad_norm": 2.297208786010742, + "learning_rate": 0.004970359345808957, + "loss": 8.8205, + "step": 97600 + }, + { + "epoch": 0.39800832884368803, + "grad_norm": 7.010386943817139, + "learning_rate": 0.004970283055256305, + "loss": 8.6058, + "step": 97700 + }, + { + "epoch": 0.3984157068670695, + "grad_norm": 5.929687023162842, + "learning_rate": 0.004970206667237826, + "loss": 8.5004, + "step": 97800 + }, + { + "epoch": 0.39882308489045093, + "grad_norm": 9.975981712341309, + "learning_rate": 0.004970130181756547, + "loss": 8.4581, + "step": 97900 + }, + { + "epoch": 0.3992304629138324, + "grad_norm": 11.034873008728027, + "learning_rate": 0.0049700535988154845, + "loss": 8.4762, + "step": 98000 + }, + { + "epoch": 0.3992304629138324, + "eval_MaskedAccuracy": 0.4724678743496642, + "eval_loss": 1.784359335899353, + "eval_runtime": 562.0174, + "eval_samples_per_second": 112.943, + "eval_steps_per_second": 0.441, + "step": 98000 + }, + { + "epoch": 0.3996378409372139, + "grad_norm": 9.151312828063965, + "learning_rate": 0.004969976918417675, + "loss": 8.455, + "step": 98100 + }, + { + "epoch": 0.40004521896059536, + "grad_norm": 7.673705577850342, + "learning_rate": 0.0049699001405661405, + "loss": 8.4271, + "step": 98200 + }, + { + "epoch": 0.4004525969839768, + "grad_norm": 7.930306911468506, + "learning_rate": 0.004969823265263923, + "loss": 8.4594, + "step": 98300 + }, + { + "epoch": 0.40085997500735826, + "grad_norm": 5.796153545379639, + "learning_rate": 0.00496974629251405, + "loss": 8.4188, + "step": 98400 + }, + { + "epoch": 0.40126735303073974, + "grad_norm": 6.536154270172119, + "learning_rate": 0.004969669222319581, + "loss": 8.4624, + "step": 98500 + }, + { + "epoch": 0.4016747310541212, + "grad_norm": 3.913081645965576, + "learning_rate": 0.004969592054683553, + "loss": 8.4665, + "step": 98600 + }, + { + "epoch": 0.40208210907750264, + "grad_norm": 7.269486427307129, + "learning_rate": 0.004969514789609019, + "loss": 8.4192, + "step": 98700 + }, + { + "epoch": 0.4024894871008841, + "grad_norm": 10.048105239868164, + "learning_rate": 0.004969437427099035, + "loss": 8.3996, + "step": 98800 + }, + { + "epoch": 0.4028968651242656, + "grad_norm": 11.666584968566895, + "learning_rate": 0.004969359967156659, + "loss": 8.4127, + "step": 98900 + }, + { + "epoch": 0.403304243147647, + "grad_norm": 3.249028205871582, + "learning_rate": 0.004969282409784957, + "loss": 8.4762, + "step": 99000 + }, + { + "epoch": 0.403304243147647, + "eval_MaskedAccuracy": 0.46012254295805927, + "eval_loss": 1.8292229175567627, + "eval_runtime": 602.0341, + "eval_samples_per_second": 105.436, + "eval_steps_per_second": 0.412, + "step": 99000 + }, + { + "epoch": 0.4037116211710285, + "grad_norm": 1.0890861749649048, + "learning_rate": 0.004969204754986983, + "loss": 8.8252, + "step": 99100 + }, + { + "epoch": 0.40411899919440997, + "grad_norm": 10.644725799560547, + "learning_rate": 0.004969127002765818, + "loss": 8.7852, + "step": 99200 + }, + { + "epoch": 0.40452637721779144, + "grad_norm": 7.145195007324219, + "learning_rate": 0.004969049153124536, + "loss": 8.6053, + "step": 99300 + }, + { + "epoch": 0.40493375524117287, + "grad_norm": 10.985939979553223, + "learning_rate": 0.004968971206066211, + "loss": 8.6446, + "step": 99400 + }, + { + "epoch": 0.40534113326455434, + "grad_norm": 3.1606171131134033, + "learning_rate": 0.004968893161593926, + "loss": 8.5559, + "step": 99500 + }, + { + "epoch": 0.4057485112879358, + "grad_norm": 4.031942844390869, + "learning_rate": 0.004968815019710773, + "loss": 8.4871, + "step": 99600 + }, + { + "epoch": 0.40615588931131724, + "grad_norm": 6.07538366317749, + "learning_rate": 0.004968736780419826, + "loss": 8.5672, + "step": 99700 + }, + { + "epoch": 0.4065632673346987, + "grad_norm": 12.900278091430664, + "learning_rate": 0.004968658443724192, + "loss": 8.5097, + "step": 99800 + }, + { + "epoch": 0.4069706453580802, + "grad_norm": 6.876996994018555, + "learning_rate": 0.004968580009626964, + "loss": 8.4321, + "step": 99900 + }, + { + "epoch": 0.4073780233814617, + "grad_norm": 8.868197441101074, + "learning_rate": 0.004968501478131237, + "loss": 8.4285, + "step": 100000 + }, + { + "epoch": 0.4073780233814617, + "eval_MaskedAccuracy": 0.4721042950935722, + "eval_loss": 1.777195692062378, + "eval_runtime": 495.6412, + "eval_samples_per_second": 128.068, + "eval_steps_per_second": 0.5, + "step": 100000 + }, + { + "epoch": 0.4077854014048431, + "grad_norm": 5.752632141113281, + "learning_rate": 0.004968422849240128, + "loss": 8.3971, + "step": 100100 + }, + { + "epoch": 0.4081927794282246, + "grad_norm": 6.129387378692627, + "learning_rate": 0.004968344122956736, + "loss": 8.4651, + "step": 100200 + }, + { + "epoch": 0.40860015745160605, + "grad_norm": 4.563933849334717, + "learning_rate": 0.004968265299284173, + "loss": 8.7484, + "step": 100300 + }, + { + "epoch": 0.40900753547498747, + "grad_norm": 4.392648220062256, + "learning_rate": 0.004968186378225553, + "loss": 8.697, + "step": 100400 + }, + { + "epoch": 0.40941491349836895, + "grad_norm": 4.389554977416992, + "learning_rate": 0.004968107359784003, + "loss": 8.4855, + "step": 100500 + }, + { + "epoch": 0.4098222915217504, + "grad_norm": 14.288431167602539, + "learning_rate": 0.004968028243962654, + "loss": 8.5336, + "step": 100600 + }, + { + "epoch": 0.4102296695451319, + "grad_norm": 0.7195647358894348, + "learning_rate": 0.004967949030764624, + "loss": 8.705, + "step": 100700 + }, + { + "epoch": 0.4106370475685133, + "grad_norm": 9.668696403503418, + "learning_rate": 0.0049678697201930464, + "loss": 8.5831, + "step": 100800 + }, + { + "epoch": 0.4110444255918948, + "grad_norm": 10.904997825622559, + "learning_rate": 0.004967790312251054, + "loss": 8.4528, + "step": 100900 + }, + { + "epoch": 0.4114518036152763, + "grad_norm": 9.388179779052734, + "learning_rate": 0.00496771080694179, + "loss": 8.4365, + "step": 101000 + }, + { + "epoch": 0.4114518036152763, + "eval_MaskedAccuracy": 0.47232041490235255, + "eval_loss": 1.7801474332809448, + "eval_runtime": 579.6663, + "eval_samples_per_second": 109.504, + "eval_steps_per_second": 0.428, + "step": 101000 + }, + { + "epoch": 0.41185918163865776, + "grad_norm": 10.938289642333984, + "learning_rate": 0.004967631204268392, + "loss": 8.4674, + "step": 101100 + }, + { + "epoch": 0.4122665596620392, + "grad_norm": 8.339547157287598, + "learning_rate": 0.004967551504234018, + "loss": 8.426, + "step": 101200 + }, + { + "epoch": 0.41267393768542066, + "grad_norm": 9.121721267700195, + "learning_rate": 0.00496747170684181, + "loss": 8.4499, + "step": 101300 + }, + { + "epoch": 0.41308131570880213, + "grad_norm": 11.89256477355957, + "learning_rate": 0.004967391812094921, + "loss": 8.4271, + "step": 101400 + }, + { + "epoch": 0.41348869373218355, + "grad_norm": 15.700182914733887, + "learning_rate": 0.004967311819996515, + "loss": 8.426, + "step": 101500 + }, + { + "epoch": 0.41389607175556503, + "grad_norm": 11.21250057220459, + "learning_rate": 0.004967231730549757, + "loss": 8.4552, + "step": 101600 + }, + { + "epoch": 0.4143034497789465, + "grad_norm": 15.058488845825195, + "learning_rate": 0.00496715154375781, + "loss": 8.4029, + "step": 101700 + }, + { + "epoch": 0.414710827802328, + "grad_norm": 9.321598052978516, + "learning_rate": 0.0049670712596238415, + "loss": 8.4129, + "step": 101800 + }, + { + "epoch": 0.4151182058257094, + "grad_norm": 12.426863670349121, + "learning_rate": 0.004966990878151028, + "loss": 8.4271, + "step": 101900 + }, + { + "epoch": 0.4155255838490909, + "grad_norm": 2.9854533672332764, + "learning_rate": 0.004966910399342545, + "loss": 8.5209, + "step": 102000 + }, + { + "epoch": 0.4155255838490909, + "eval_MaskedAccuracy": 0.46132029272790565, + "eval_loss": 1.8232041597366333, + "eval_runtime": 639.7025, + "eval_samples_per_second": 99.227, + "eval_steps_per_second": 0.388, + "step": 102000 + }, + { + "epoch": 0.41593296187247236, + "grad_norm": 6.4361395835876465, + "learning_rate": 0.004966829823201583, + "loss": 8.7334, + "step": 102100 + }, + { + "epoch": 0.4163403398958538, + "grad_norm": 7.562118053436279, + "learning_rate": 0.004966749149731316, + "loss": 8.6135, + "step": 102200 + }, + { + "epoch": 0.41674771791923526, + "grad_norm": 6.0594964027404785, + "learning_rate": 0.004966668378934941, + "loss": 8.6054, + "step": 102300 + }, + { + "epoch": 0.41715509594261674, + "grad_norm": 3.716524600982666, + "learning_rate": 0.004966587510815647, + "loss": 8.779, + "step": 102400 + }, + { + "epoch": 0.4175624739659982, + "grad_norm": 5.204922676086426, + "learning_rate": 0.0049665065453766335, + "loss": 8.7758, + "step": 102500 + }, + { + "epoch": 0.41796985198937964, + "grad_norm": 6.669337749481201, + "learning_rate": 0.004966425482621107, + "loss": 8.541, + "step": 102600 + }, + { + "epoch": 0.4183772300127611, + "grad_norm": 4.090687274932861, + "learning_rate": 0.00496634432255226, + "loss": 8.5421, + "step": 102700 + }, + { + "epoch": 0.4187846080361426, + "grad_norm": 10.983367919921875, + "learning_rate": 0.00496626306517331, + "loss": 8.6436, + "step": 102800 + }, + { + "epoch": 0.419191986059524, + "grad_norm": 9.434021949768066, + "learning_rate": 0.004966181710487462, + "loss": 8.6067, + "step": 102900 + }, + { + "epoch": 0.4195993640829055, + "grad_norm": 8.15808391571045, + "learning_rate": 0.004966100258497939, + "loss": 8.512, + "step": 103000 + }, + { + "epoch": 0.4195993640829055, + "eval_MaskedAccuracy": 0.47153795500267925, + "eval_loss": 1.798658847808838, + "eval_runtime": 577.6914, + "eval_samples_per_second": 109.879, + "eval_steps_per_second": 0.429, + "step": 103000 + }, + { + "epoch": 0.42000674210628697, + "grad_norm": 9.656765937805176, + "learning_rate": 0.004966018709207965, + "loss": 8.4555, + "step": 103100 + }, + { + "epoch": 0.42041412012966844, + "grad_norm": 11.301863670349121, + "learning_rate": 0.0049659370626207585, + "loss": 8.4537, + "step": 103200 + }, + { + "epoch": 0.42082149815304987, + "grad_norm": 2.013126850128174, + "learning_rate": 0.004965855318739549, + "loss": 8.4458, + "step": 103300 + }, + { + "epoch": 0.42122887617643134, + "grad_norm": 6.856706142425537, + "learning_rate": 0.004965773477567567, + "loss": 8.5382, + "step": 103400 + }, + { + "epoch": 0.4216362541998128, + "grad_norm": 7.913936614990234, + "learning_rate": 0.004965691539108045, + "loss": 8.4526, + "step": 103500 + }, + { + "epoch": 0.4220436322231943, + "grad_norm": 6.275994300842285, + "learning_rate": 0.004965609503364228, + "loss": 8.4138, + "step": 103600 + }, + { + "epoch": 0.4224510102465757, + "grad_norm": 9.040349006652832, + "learning_rate": 0.004965527370339356, + "loss": 8.4321, + "step": 103700 + }, + { + "epoch": 0.4228583882699572, + "grad_norm": 6.3520307540893555, + "learning_rate": 0.0049654451400366825, + "loss": 8.4265, + "step": 103800 + }, + { + "epoch": 0.4232657662933387, + "grad_norm": 5.726953029632568, + "learning_rate": 0.0049653628124594435, + "loss": 8.5499, + "step": 103900 + }, + { + "epoch": 0.4236731443167201, + "grad_norm": 3.7572128772735596, + "learning_rate": 0.004965280387610907, + "loss": 8.6229, + "step": 104000 + }, + { + "epoch": 0.4236731443167201, + "eval_MaskedAccuracy": 0.46590958911815616, + "eval_loss": 1.8005009889602661, + "eval_runtime": 605.7562, + "eval_samples_per_second": 104.788, + "eval_steps_per_second": 0.409, + "step": 104000 + }, + { + "epoch": 0.4240805223401016, + "grad_norm": 7.807491779327393, + "learning_rate": 0.004965197865494326, + "loss": 8.5348, + "step": 104100 + }, + { + "epoch": 0.42448790036348305, + "grad_norm": 5.640869617462158, + "learning_rate": 0.004965115246112967, + "loss": 8.5162, + "step": 104200 + }, + { + "epoch": 0.42489527838686453, + "grad_norm": 3.8039796352386475, + "learning_rate": 0.0049650325294700965, + "loss": 8.6223, + "step": 104300 + }, + { + "epoch": 0.42530265641024595, + "grad_norm": 6.969115257263184, + "learning_rate": 0.004964949715568982, + "loss": 8.5224, + "step": 104400 + }, + { + "epoch": 0.4257100344336274, + "grad_norm": 6.870429515838623, + "learning_rate": 0.004964866804412895, + "loss": 8.5626, + "step": 104500 + }, + { + "epoch": 0.4261174124570089, + "grad_norm": 1.0034383535385132, + "learning_rate": 0.004964783796005118, + "loss": 8.7603, + "step": 104600 + }, + { + "epoch": 0.4265247904803903, + "grad_norm": 4.922119140625, + "learning_rate": 0.0049647006903489315, + "loss": 8.6798, + "step": 104700 + }, + { + "epoch": 0.4269321685037718, + "grad_norm": 5.869476795196533, + "learning_rate": 0.004964617487447619, + "loss": 8.4977, + "step": 104800 + }, + { + "epoch": 0.4273395465271533, + "grad_norm": 6.778334140777588, + "learning_rate": 0.004964534187304481, + "loss": 8.4221, + "step": 104900 + }, + { + "epoch": 0.42774692455053476, + "grad_norm": 5.279184341430664, + "learning_rate": 0.004964450789922798, + "loss": 8.4347, + "step": 105000 + }, + { + "epoch": 0.42774692455053476, + "eval_MaskedAccuracy": 0.4724774508604719, + "eval_loss": 1.787004828453064, + "eval_runtime": 599.3399, + "eval_samples_per_second": 105.91, + "eval_steps_per_second": 0.414, + "step": 105000 + }, + { + "epoch": 0.4281543025739162, + "grad_norm": 6.847875595092773, + "learning_rate": 0.004964367295305874, + "loss": 8.4475, + "step": 105100 + }, + { + "epoch": 0.42856168059729766, + "grad_norm": 7.180721759796143, + "learning_rate": 0.004964283703457004, + "loss": 8.4224, + "step": 105200 + }, + { + "epoch": 0.42896905862067913, + "grad_norm": 8.042956352233887, + "learning_rate": 0.004964200014379491, + "loss": 8.3657, + "step": 105300 + }, + { + "epoch": 0.42937643664406056, + "grad_norm": 6.460809230804443, + "learning_rate": 0.004964116228076652, + "loss": 8.4149, + "step": 105400 + }, + { + "epoch": 0.42978381466744203, + "grad_norm": 2.5339605808258057, + "learning_rate": 0.004964032344551802, + "loss": 8.3877, + "step": 105500 + }, + { + "epoch": 0.4301911926908235, + "grad_norm": 8.963078498840332, + "learning_rate": 0.004963948363808248, + "loss": 8.4022, + "step": 105600 + }, + { + "epoch": 0.430598570714205, + "grad_norm": 5.23846960067749, + "learning_rate": 0.004963864285849314, + "loss": 8.3939, + "step": 105700 + }, + { + "epoch": 0.4310059487375864, + "grad_norm": 3.0979650020599365, + "learning_rate": 0.00496378011067832, + "loss": 8.3992, + "step": 105800 + }, + { + "epoch": 0.4314133267609679, + "grad_norm": 4.946394443511963, + "learning_rate": 0.004963695838298602, + "loss": 8.4295, + "step": 105900 + }, + { + "epoch": 0.43182070478434936, + "grad_norm": 5.411014080047607, + "learning_rate": 0.004963611468713486, + "loss": 8.372, + "step": 106000 + }, + { + "epoch": 0.43182070478434936, + "eval_MaskedAccuracy": 0.4736338868371074, + "eval_loss": 1.7651697397232056, + "eval_runtime": 607.1531, + "eval_samples_per_second": 104.547, + "eval_steps_per_second": 0.408, + "step": 106000 + }, + { + "epoch": 0.43222808280773084, + "grad_norm": 4.002007484436035, + "learning_rate": 0.004963527001926312, + "loss": 8.4388, + "step": 106100 + }, + { + "epoch": 0.43263546083111226, + "grad_norm": 5.016679763793945, + "learning_rate": 0.004963442437940414, + "loss": 8.513, + "step": 106200 + }, + { + "epoch": 0.43304283885449374, + "grad_norm": 4.849783420562744, + "learning_rate": 0.004963357776759134, + "loss": 8.4713, + "step": 106300 + }, + { + "epoch": 0.4334502168778752, + "grad_norm": 5.830073356628418, + "learning_rate": 0.004963273018385826, + "loss": 8.6484, + "step": 106400 + }, + { + "epoch": 0.43385759490125664, + "grad_norm": 9.937734603881836, + "learning_rate": 0.004963188162823837, + "loss": 8.4738, + "step": 106500 + }, + { + "epoch": 0.4342649729246381, + "grad_norm": 6.670325756072998, + "learning_rate": 0.004963103210076525, + "loss": 8.4172, + "step": 106600 + }, + { + "epoch": 0.4346723509480196, + "grad_norm": 10.503084182739258, + "learning_rate": 0.004963018160147245, + "loss": 8.4091, + "step": 106700 + }, + { + "epoch": 0.43507972897140107, + "grad_norm": 9.465533256530762, + "learning_rate": 0.004962933013039362, + "loss": 8.4091, + "step": 106800 + }, + { + "epoch": 0.4354871069947825, + "grad_norm": 10.29710578918457, + "learning_rate": 0.004962847768756245, + "loss": 8.4316, + "step": 106900 + }, + { + "epoch": 0.43589448501816397, + "grad_norm": 12.24212646484375, + "learning_rate": 0.004962762427301256, + "loss": 8.405, + "step": 107000 + }, + { + "epoch": 0.43589448501816397, + "eval_MaskedAccuracy": 0.4743431114096333, + "eval_loss": 1.7652925252914429, + "eval_runtime": 570.9129, + "eval_samples_per_second": 111.183, + "eval_steps_per_second": 0.434, + "step": 107000 + }, + { + "epoch": 0.43630186304154545, + "grad_norm": 9.348734855651855, + "learning_rate": 0.0049626769886777725, + "loss": 8.6629, + "step": 107100 + }, + { + "epoch": 0.43670924106492687, + "grad_norm": 4.134520053863525, + "learning_rate": 0.0049625914528891755, + "loss": 8.5898, + "step": 107200 + }, + { + "epoch": 0.43711661908830834, + "grad_norm": 9.459831237792969, + "learning_rate": 0.004962505819938845, + "loss": 8.4488, + "step": 107300 + }, + { + "epoch": 0.4375239971116898, + "grad_norm": 8.720632553100586, + "learning_rate": 0.004962420089830158, + "loss": 8.5787, + "step": 107400 + }, + { + "epoch": 0.4379313751350713, + "grad_norm": 9.35346794128418, + "learning_rate": 0.004962334262566517, + "loss": 8.5486, + "step": 107500 + }, + { + "epoch": 0.4383387531584527, + "grad_norm": 13.627753257751465, + "learning_rate": 0.00496224833815131, + "loss": 8.614, + "step": 107600 + }, + { + "epoch": 0.4387461311818342, + "grad_norm": 5.3978166580200195, + "learning_rate": 0.0049621623165879295, + "loss": 8.506, + "step": 107700 + }, + { + "epoch": 0.4391535092052157, + "grad_norm": 7.623312950134277, + "learning_rate": 0.00496207619787979, + "loss": 8.7496, + "step": 107800 + }, + { + "epoch": 0.4395608872285971, + "grad_norm": 2.9135639667510986, + "learning_rate": 0.00496198998203028, + "loss": 8.7128, + "step": 107900 + }, + { + "epoch": 0.4399682652519786, + "grad_norm": 6.541151523590088, + "learning_rate": 0.004961903669042818, + "loss": 8.5844, + "step": 108000 + }, + { + "epoch": 0.4399682652519786, + "eval_MaskedAccuracy": 0.47024074789879794, + "eval_loss": 1.7921466827392578, + "eval_runtime": 654.1479, + "eval_samples_per_second": 97.036, + "eval_steps_per_second": 0.379, + "step": 108000 + }, + { + "epoch": 0.44037564327536005, + "grad_norm": 7.330739498138428, + "learning_rate": 0.004961817258920815, + "loss": 8.4422, + "step": 108100 + }, + { + "epoch": 0.44078302129874153, + "grad_norm": 9.475118637084961, + "learning_rate": 0.0049617307516676866, + "loss": 8.4373, + "step": 108200 + }, + { + "epoch": 0.44119039932212295, + "grad_norm": 8.402446746826172, + "learning_rate": 0.004961644147286852, + "loss": 8.4133, + "step": 108300 + }, + { + "epoch": 0.4415977773455044, + "grad_norm": 6.884156227111816, + "learning_rate": 0.00496155744578173, + "loss": 8.4033, + "step": 108400 + }, + { + "epoch": 0.4420051553688859, + "grad_norm": 9.831668853759766, + "learning_rate": 0.004961470647155754, + "loss": 8.4261, + "step": 108500 + }, + { + "epoch": 0.4424125333922673, + "grad_norm": 7.965442657470703, + "learning_rate": 0.0049613837514123574, + "loss": 8.3904, + "step": 108600 + }, + { + "epoch": 0.4428199114156488, + "grad_norm": 1.3925095796585083, + "learning_rate": 0.004961296758554973, + "loss": 8.4134, + "step": 108700 + }, + { + "epoch": 0.4432272894390303, + "grad_norm": 8.52515983581543, + "learning_rate": 0.004961209668587038, + "loss": 8.6539, + "step": 108800 + }, + { + "epoch": 0.44363466746241176, + "grad_norm": 2.4925291538238525, + "learning_rate": 0.004961122481512003, + "loss": 8.4962, + "step": 108900 + }, + { + "epoch": 0.4440420454857932, + "grad_norm": 1.4443053007125854, + "learning_rate": 0.004961035197333306, + "loss": 8.6677, + "step": 109000 + }, + { + "epoch": 0.4440420454857932, + "eval_MaskedAccuracy": 0.4628872529788418, + "eval_loss": 1.829740047454834, + "eval_runtime": 602.3877, + "eval_samples_per_second": 105.374, + "eval_steps_per_second": 0.412, + "step": 109000 + }, + { + "epoch": 0.44444942350917466, + "grad_norm": 7.29392671585083, + "learning_rate": 0.004960947816054404, + "loss": 8.5859, + "step": 109100 + }, + { + "epoch": 0.44485680153255613, + "grad_norm": 7.652862071990967, + "learning_rate": 0.004960860337678751, + "loss": 8.458, + "step": 109200 + }, + { + "epoch": 0.4452641795559376, + "grad_norm": 8.172975540161133, + "learning_rate": 0.004960772762209805, + "loss": 8.426, + "step": 109300 + }, + { + "epoch": 0.44567155757931903, + "grad_norm": 5.396576404571533, + "learning_rate": 0.004960685089651025, + "loss": 8.4052, + "step": 109400 + }, + { + "epoch": 0.4460789356027005, + "grad_norm": 6.155834674835205, + "learning_rate": 0.0049605973200058774, + "loss": 8.4008, + "step": 109500 + }, + { + "epoch": 0.446486313626082, + "grad_norm": 6.791605472564697, + "learning_rate": 0.004960509453277837, + "loss": 8.4069, + "step": 109600 + }, + { + "epoch": 0.4468936916494634, + "grad_norm": 8.964988708496094, + "learning_rate": 0.00496042148947037, + "loss": 8.366, + "step": 109700 + }, + { + "epoch": 0.4473010696728449, + "grad_norm": 5.947779178619385, + "learning_rate": 0.0049603334285869645, + "loss": 8.4173, + "step": 109800 + }, + { + "epoch": 0.44770844769622636, + "grad_norm": 5.3750176429748535, + "learning_rate": 0.004960245270631098, + "loss": 8.4632, + "step": 109900 + }, + { + "epoch": 0.44811582571960784, + "grad_norm": 8.55760669708252, + "learning_rate": 0.004960157015606248, + "loss": 8.4325, + "step": 110000 + }, + { + "epoch": 0.44811582571960784, + "eval_MaskedAccuracy": 0.47331954585294894, + "eval_loss": 1.7828290462493896, + "eval_runtime": 598.6767, + "eval_samples_per_second": 106.027, + "eval_steps_per_second": 0.414, + "step": 110000 + }, + { + "epoch": 0.44852320374298926, + "grad_norm": 7.122706413269043, + "learning_rate": 0.00496006866351591, + "loss": 8.374, + "step": 110100 + }, + { + "epoch": 0.44893058176637074, + "grad_norm": 8.731146812438965, + "learning_rate": 0.004959980214363583, + "loss": 8.4295, + "step": 110200 + }, + { + "epoch": 0.4493379597897522, + "grad_norm": 7.568539619445801, + "learning_rate": 0.004959891668152759, + "loss": 8.3725, + "step": 110300 + }, + { + "epoch": 0.44974533781313364, + "grad_norm": 7.53127384185791, + "learning_rate": 0.004959803024886933, + "loss": 8.4216, + "step": 110400 + }, + { + "epoch": 0.4501527158365151, + "grad_norm": 2.6870458126068115, + "learning_rate": 0.00495971428456961, + "loss": 8.4128, + "step": 110500 + }, + { + "epoch": 0.4505600938598966, + "grad_norm": 15.371285438537598, + "learning_rate": 0.004959625447204312, + "loss": 8.6718, + "step": 110600 + }, + { + "epoch": 0.45096747188327807, + "grad_norm": 2.650428533554077, + "learning_rate": 0.004959536512794538, + "loss": 8.77, + "step": 110700 + }, + { + "epoch": 0.4513748499066595, + "grad_norm": 1.186808466911316, + "learning_rate": 0.004959447481343806, + "loss": 8.7843, + "step": 110800 + }, + { + "epoch": 0.45178222793004097, + "grad_norm": 2.998349905014038, + "learning_rate": 0.004959358352855635, + "loss": 8.7435, + "step": 110900 + }, + { + "epoch": 0.45218960595342245, + "grad_norm": 4.527252674102783, + "learning_rate": 0.004959269127333554, + "loss": 8.6732, + "step": 111000 + }, + { + "epoch": 0.45218960595342245, + "eval_MaskedAccuracy": 0.46516753932012583, + "eval_loss": 1.809565782546997, + "eval_runtime": 579.2909, + "eval_samples_per_second": 109.575, + "eval_steps_per_second": 0.428, + "step": 111000 + }, + { + "epoch": 0.45259698397680387, + "grad_norm": 4.419383525848389, + "learning_rate": 0.00495917980478109, + "loss": 8.5257, + "step": 111100 + }, + { + "epoch": 0.45300436200018535, + "grad_norm": 4.69866943359375, + "learning_rate": 0.004959090385201771, + "loss": 8.4745, + "step": 111200 + }, + { + "epoch": 0.4534117400235668, + "grad_norm": 4.82834005355835, + "learning_rate": 0.004959000868599137, + "loss": 8.5087, + "step": 111300 + }, + { + "epoch": 0.4538191180469483, + "grad_norm": 9.863975524902344, + "learning_rate": 0.004958911254976722, + "loss": 8.5017, + "step": 111400 + }, + { + "epoch": 0.4542264960703297, + "grad_norm": 5.013560771942139, + "learning_rate": 0.004958821544338068, + "loss": 8.4531, + "step": 111500 + }, + { + "epoch": 0.4546338740937112, + "grad_norm": 1.191077709197998, + "learning_rate": 0.004958731736686727, + "loss": 8.5021, + "step": 111600 + }, + { + "epoch": 0.4550412521170927, + "grad_norm": 5.876702785491943, + "learning_rate": 0.004958641832026253, + "loss": 8.618, + "step": 111700 + }, + { + "epoch": 0.45544863014047415, + "grad_norm": 5.5669779777526855, + "learning_rate": 0.0049585518303601845, + "loss": 8.5249, + "step": 111800 + }, + { + "epoch": 0.4558560081638556, + "grad_norm": 6.915269374847412, + "learning_rate": 0.004958461731692095, + "loss": 8.4397, + "step": 111900 + }, + { + "epoch": 0.45626338618723705, + "grad_norm": 8.481354713439941, + "learning_rate": 0.004958371536025539, + "loss": 8.6175, + "step": 112000 + }, + { + "epoch": 0.45626338618723705, + "eval_MaskedAccuracy": 0.46605112787424247, + "eval_loss": 1.8209140300750732, + "eval_runtime": 605.9854, + "eval_samples_per_second": 104.748, + "eval_steps_per_second": 0.409, + "step": 112000 + }, + { + "epoch": 0.45667076421061853, + "grad_norm": 7.431014537811279, + "learning_rate": 0.004958281243364088, + "loss": 8.5776, + "step": 112100 + }, + { + "epoch": 0.45707814223399995, + "grad_norm": 6.935708522796631, + "learning_rate": 0.004958190853711309, + "loss": 8.4614, + "step": 112200 + }, + { + "epoch": 0.45748552025738143, + "grad_norm": 4.441507816314697, + "learning_rate": 0.004958100367070768, + "loss": 8.4313, + "step": 112300 + }, + { + "epoch": 0.4578928982807629, + "grad_norm": 1.8756330013275146, + "learning_rate": 0.004958009783446055, + "loss": 8.5578, + "step": 112400 + }, + { + "epoch": 0.4583002763041444, + "grad_norm": 3.120640993118286, + "learning_rate": 0.004957919102840741, + "loss": 8.6338, + "step": 112500 + }, + { + "epoch": 0.4587076543275258, + "grad_norm": 1.1729477643966675, + "learning_rate": 0.004957828325258414, + "loss": 8.5977, + "step": 112600 + }, + { + "epoch": 0.4591150323509073, + "grad_norm": 0.5741099715232849, + "learning_rate": 0.004957737450702669, + "loss": 8.7102, + "step": 112700 + }, + { + "epoch": 0.45952241037428876, + "grad_norm": 8.047289848327637, + "learning_rate": 0.004957646479177092, + "loss": 8.5728, + "step": 112800 + }, + { + "epoch": 0.4599297883976702, + "grad_norm": 9.095126152038574, + "learning_rate": 0.0049575554106852816, + "loss": 8.4705, + "step": 112900 + }, + { + "epoch": 0.46033716642105166, + "grad_norm": 9.453558921813965, + "learning_rate": 0.004957464245230838, + "loss": 8.4057, + "step": 113000 + }, + { + "epoch": 0.46033716642105166, + "eval_MaskedAccuracy": 0.4738057493999801, + "eval_loss": 1.781148076057434, + "eval_runtime": 540.2794, + "eval_samples_per_second": 117.487, + "eval_steps_per_second": 0.459, + "step": 113000 + }, + { + "epoch": 0.46074454444443314, + "grad_norm": 8.071161270141602, + "learning_rate": 0.004957372982817367, + "loss": 8.4066, + "step": 113100 + }, + { + "epoch": 0.4611519224678146, + "grad_norm": 10.719406127929688, + "learning_rate": 0.004957281623448478, + "loss": 8.4156, + "step": 113200 + }, + { + "epoch": 0.46155930049119603, + "grad_norm": 7.539982795715332, + "learning_rate": 0.004957190167127779, + "loss": 8.3838, + "step": 113300 + }, + { + "epoch": 0.4619666785145775, + "grad_norm": 10.179490089416504, + "learning_rate": 0.004957098613858882, + "loss": 8.3715, + "step": 113400 + }, + { + "epoch": 0.462374056537959, + "grad_norm": 9.369685173034668, + "learning_rate": 0.0049570069636454194, + "loss": 8.342, + "step": 113500 + }, + { + "epoch": 0.4627814345613404, + "grad_norm": 9.911388397216797, + "learning_rate": 0.004956915216491005, + "loss": 8.5081, + "step": 113600 + }, + { + "epoch": 0.4631888125847219, + "grad_norm": 11.996129035949707, + "learning_rate": 0.004956823372399269, + "loss": 8.4799, + "step": 113700 + }, + { + "epoch": 0.46359619060810336, + "grad_norm": 4.705869197845459, + "learning_rate": 0.004956731431373847, + "loss": 8.3913, + "step": 113800 + }, + { + "epoch": 0.46400356863148484, + "grad_norm": 2.45719575881958, + "learning_rate": 0.004956639393418368, + "loss": 8.5583, + "step": 113900 + }, + { + "epoch": 0.46441094665486626, + "grad_norm": 1.0450843572616577, + "learning_rate": 0.004956547258536471, + "loss": 8.619, + "step": 114000 + }, + { + "epoch": 0.46441094665486626, + "eval_MaskedAccuracy": 0.46584400777593554, + "eval_loss": 1.818083643913269, + "eval_runtime": 601.3887, + "eval_samples_per_second": 105.549, + "eval_steps_per_second": 0.412, + "step": 114000 + }, + { + "epoch": 0.46481832467824774, + "grad_norm": 2.7218010425567627, + "learning_rate": 0.004956455026731795, + "loss": 8.6125, + "step": 114100 + }, + { + "epoch": 0.4652257027016292, + "grad_norm": 3.7615129947662354, + "learning_rate": 0.004956362698007991, + "loss": 8.6228, + "step": 114200 + }, + { + "epoch": 0.4656330807250107, + "grad_norm": 4.030871391296387, + "learning_rate": 0.004956270272368709, + "loss": 8.546, + "step": 114300 + }, + { + "epoch": 0.4660404587483921, + "grad_norm": 5.961259365081787, + "learning_rate": 0.004956177749817598, + "loss": 8.4686, + "step": 114400 + }, + { + "epoch": 0.4664478367717736, + "grad_norm": 9.751999855041504, + "learning_rate": 0.004956085130358327, + "loss": 8.4324, + "step": 114500 + }, + { + "epoch": 0.46685521479515507, + "grad_norm": 9.5776948928833, + "learning_rate": 0.004955992413994554, + "loss": 8.5815, + "step": 114600 + }, + { + "epoch": 0.4672625928185365, + "grad_norm": 5.737105369567871, + "learning_rate": 0.0049558996007299395, + "loss": 8.5721, + "step": 114700 + }, + { + "epoch": 0.46766997084191797, + "grad_norm": 8.12684440612793, + "learning_rate": 0.004955806690568156, + "loss": 8.451, + "step": 114800 + }, + { + "epoch": 0.46807734886529945, + "grad_norm": 3.4034082889556885, + "learning_rate": 0.004955713683512877, + "loss": 8.4247, + "step": 114900 + }, + { + "epoch": 0.4684847268886809, + "grad_norm": 4.736820220947266, + "learning_rate": 0.004955620579567771, + "loss": 8.4251, + "step": 115000 + }, + { + "epoch": 0.4684847268886809, + "eval_MaskedAccuracy": 0.4688468475901898, + "eval_loss": 1.8014590740203857, + "eval_runtime": 562.4603, + "eval_samples_per_second": 112.854, + "eval_steps_per_second": 0.441, + "step": 115000 + }, + { + "epoch": 0.46889210491206235, + "grad_norm": 10.829277992248535, + "learning_rate": 0.004955527378736533, + "loss": 8.5794, + "step": 115100 + }, + { + "epoch": 0.4692994829354438, + "grad_norm": 1.7777608633041382, + "learning_rate": 0.004955434081022844, + "loss": 8.635, + "step": 115200 + }, + { + "epoch": 0.4697068609588253, + "grad_norm": 17.929372787475586, + "learning_rate": 0.0049553406864303855, + "loss": 8.5905, + "step": 115300 + }, + { + "epoch": 0.4701142389822067, + "grad_norm": 3.8750357627868652, + "learning_rate": 0.004955247194962858, + "loss": 8.5732, + "step": 115400 + }, + { + "epoch": 0.4705216170055882, + "grad_norm": 3.1751935482025146, + "learning_rate": 0.004955153606623948, + "loss": 8.4471, + "step": 115500 + }, + { + "epoch": 0.4709289950289697, + "grad_norm": 11.84594440460205, + "learning_rate": 0.0049550599214173595, + "loss": 8.4343, + "step": 115600 + }, + { + "epoch": 0.47133637305235115, + "grad_norm": 6.070246696472168, + "learning_rate": 0.0049549661393468, + "loss": 8.4336, + "step": 115700 + }, + { + "epoch": 0.4717437510757326, + "grad_norm": 3.038576602935791, + "learning_rate": 0.004954872260415978, + "loss": 8.5264, + "step": 115800 + }, + { + "epoch": 0.47215112909911405, + "grad_norm": 1.0032246112823486, + "learning_rate": 0.0049547782846286045, + "loss": 8.5696, + "step": 115900 + }, + { + "epoch": 0.47255850712249553, + "grad_norm": 5.927453994750977, + "learning_rate": 0.004954684211988393, + "loss": 8.6381, + "step": 116000 + }, + { + "epoch": 0.47255850712249553, + "eval_MaskedAccuracy": 0.4656466126507873, + "eval_loss": 1.8105868101119995, + "eval_runtime": 617.0844, + "eval_samples_per_second": 102.864, + "eval_steps_per_second": 0.402, + "step": 116000 + }, + { + "epoch": 0.47296588514587695, + "grad_norm": 4.60734748840332, + "learning_rate": 0.004954590042499063, + "loss": 8.6123, + "step": 116100 + }, + { + "epoch": 0.47337326316925843, + "grad_norm": 7.652487754821777, + "learning_rate": 0.004954495776164337, + "loss": 8.5055, + "step": 116200 + }, + { + "epoch": 0.4737806411926399, + "grad_norm": 5.244068145751953, + "learning_rate": 0.004954401412987941, + "loss": 8.464, + "step": 116300 + }, + { + "epoch": 0.4741880192160214, + "grad_norm": 12.130002975463867, + "learning_rate": 0.004954306952973607, + "loss": 8.4014, + "step": 116400 + }, + { + "epoch": 0.4745953972394028, + "grad_norm": 7.4617919921875, + "learning_rate": 0.004954212396125075, + "loss": 8.5934, + "step": 116500 + }, + { + "epoch": 0.4750027752627843, + "grad_norm": 3.1740314960479736, + "learning_rate": 0.004954117742446068, + "loss": 8.5771, + "step": 116600 + }, + { + "epoch": 0.47541015328616576, + "grad_norm": 4.737682819366455, + "learning_rate": 0.004954022991940346, + "loss": 8.4457, + "step": 116700 + }, + { + "epoch": 0.47581753130954724, + "grad_norm": 11.205411911010742, + "learning_rate": 0.004953928144611644, + "loss": 8.5085, + "step": 116800 + }, + { + "epoch": 0.47622490933292866, + "grad_norm": 3.0664117336273193, + "learning_rate": 0.004953833200463713, + "loss": 8.5006, + "step": 116900 + }, + { + "epoch": 0.47663228735631014, + "grad_norm": 11.607182502746582, + "learning_rate": 0.004953738159500306, + "loss": 8.4609, + "step": 117000 + }, + { + "epoch": 0.47663228735631014, + "eval_MaskedAccuracy": 0.4652727326260346, + "eval_loss": 1.8133021593093872, + "eval_runtime": 639.7971, + "eval_samples_per_second": 99.213, + "eval_steps_per_second": 0.388, + "step": 117000 + }, + { + "epoch": 0.4770396653796916, + "grad_norm": 4.19486665725708, + "learning_rate": 0.00495364302172519, + "loss": 8.5505, + "step": 117100 + }, + { + "epoch": 0.47744704340307303, + "grad_norm": 1.922994613647461, + "learning_rate": 0.004953547787142112, + "loss": 8.5968, + "step": 117200 + }, + { + "epoch": 0.4778544214264545, + "grad_norm": 5.716039657592773, + "learning_rate": 0.004953452455754846, + "loss": 8.525, + "step": 117300 + }, + { + "epoch": 0.478261799449836, + "grad_norm": 8.691774368286133, + "learning_rate": 0.004953357027567162, + "loss": 8.6127, + "step": 117400 + }, + { + "epoch": 0.47866917747321747, + "grad_norm": 5.32814884185791, + "learning_rate": 0.004953261502582827, + "loss": 8.5103, + "step": 117500 + }, + { + "epoch": 0.4790765554965989, + "grad_norm": 7.465363025665283, + "learning_rate": 0.00495316588080562, + "loss": 8.4228, + "step": 117600 + }, + { + "epoch": 0.47948393351998037, + "grad_norm": 10.480701446533203, + "learning_rate": 0.00495307016223933, + "loss": 8.392, + "step": 117700 + }, + { + "epoch": 0.47989131154336184, + "grad_norm": 6.020634174346924, + "learning_rate": 0.004952974346887732, + "loss": 8.4383, + "step": 117800 + }, + { + "epoch": 0.48029868956674326, + "grad_norm": 9.17965030670166, + "learning_rate": 0.00495287843475461, + "loss": 8.4746, + "step": 117900 + }, + { + "epoch": 0.48070606759012474, + "grad_norm": 6.490881443023682, + "learning_rate": 0.004952782425843763, + "loss": 8.4617, + "step": 118000 + }, + { + "epoch": 0.48070606759012474, + "eval_MaskedAccuracy": 0.46867127279022996, + "eval_loss": 1.8012827634811401, + "eval_runtime": 540.5029, + "eval_samples_per_second": 117.439, + "eval_steps_per_second": 0.459, + "step": 118000 + }, + { + "epoch": 0.4811134456135062, + "grad_norm": 8.385427474975586, + "learning_rate": 0.004952686320158984, + "loss": 8.4601, + "step": 118100 + }, + { + "epoch": 0.4815208236368877, + "grad_norm": 7.8449506759643555, + "learning_rate": 0.0049525901177040685, + "loss": 8.4005, + "step": 118200 + }, + { + "epoch": 0.4819282016602691, + "grad_norm": 5.87675666809082, + "learning_rate": 0.004952493818482826, + "loss": 8.3504, + "step": 118300 + }, + { + "epoch": 0.4823355796836506, + "grad_norm": 8.414234161376953, + "learning_rate": 0.00495239742249907, + "loss": 8.3437, + "step": 118400 + }, + { + "epoch": 0.48274295770703207, + "grad_norm": 8.45103931427002, + "learning_rate": 0.004952300929756606, + "loss": 8.3787, + "step": 118500 + }, + { + "epoch": 0.4831503357304135, + "grad_norm": 4.254857063293457, + "learning_rate": 0.004952204340259244, + "loss": 8.3363, + "step": 118600 + }, + { + "epoch": 0.48355771375379497, + "grad_norm": 8.870880126953125, + "learning_rate": 0.0049521076540108054, + "loss": 8.3965, + "step": 118700 + }, + { + "epoch": 0.48396509177717645, + "grad_norm": 7.823821067810059, + "learning_rate": 0.004952010871015116, + "loss": 8.3472, + "step": 118800 + }, + { + "epoch": 0.4843724698005579, + "grad_norm": 8.193089485168457, + "learning_rate": 0.004951913991275995, + "loss": 8.3609, + "step": 118900 + }, + { + "epoch": 0.48477984782393935, + "grad_norm": 8.305012702941895, + "learning_rate": 0.004951817014797274, + "loss": 8.3527, + "step": 119000 + }, + { + "epoch": 0.48477984782393935, + "eval_MaskedAccuracy": 0.4767895611606481, + "eval_loss": 1.7638946771621704, + "eval_runtime": 648.6472, + "eval_samples_per_second": 97.859, + "eval_steps_per_second": 0.382, + "step": 119000 + }, + { + "epoch": 0.4851872258473208, + "grad_norm": 8.314849853515625, + "learning_rate": 0.0049517199415827955, + "loss": 8.3497, + "step": 119100 + }, + { + "epoch": 0.4855946038707023, + "grad_norm": 3.263856887817383, + "learning_rate": 0.004951622771636393, + "loss": 8.4411, + "step": 119200 + }, + { + "epoch": 0.4860019818940837, + "grad_norm": 2.1813316345214844, + "learning_rate": 0.004951525504961904, + "loss": 8.5163, + "step": 119300 + }, + { + "epoch": 0.4864093599174652, + "grad_norm": 9.317235946655273, + "learning_rate": 0.004951428141563178, + "loss": 8.5087, + "step": 119400 + }, + { + "epoch": 0.4868167379408467, + "grad_norm": 4.322655200958252, + "learning_rate": 0.004951330681444059, + "loss": 8.604, + "step": 119500 + }, + { + "epoch": 0.48722411596422815, + "grad_norm": 7.114287376403809, + "learning_rate": 0.0049512331246084, + "loss": 8.6363, + "step": 119600 + }, + { + "epoch": 0.4876314939876096, + "grad_norm": 6.147729396820068, + "learning_rate": 0.00495113547106007, + "loss": 8.4465, + "step": 119700 + }, + { + "epoch": 0.48803887201099105, + "grad_norm": 6.949401378631592, + "learning_rate": 0.004951037720802924, + "loss": 8.3767, + "step": 119800 + }, + { + "epoch": 0.48844625003437253, + "grad_norm": 7.660586833953857, + "learning_rate": 0.004950939873840821, + "loss": 8.408, + "step": 119900 + }, + { + "epoch": 0.488853628057754, + "grad_norm": 8.358684539794922, + "learning_rate": 0.004950841930177634, + "loss": 8.3833, + "step": 120000 + }, + { + "epoch": 0.488853628057754, + "eval_MaskedAccuracy": 0.47606467814721937, + "eval_loss": 1.7586658000946045, + "eval_runtime": 593.8693, + "eval_samples_per_second": 106.885, + "eval_steps_per_second": 0.418, + "step": 120000 + }, + { + "epoch": 0.48926100608113543, + "grad_norm": 9.096843719482422, + "learning_rate": 0.004950743889817234, + "loss": 8.3418, + "step": 120100 + }, + { + "epoch": 0.4896683841045169, + "grad_norm": 8.729216575622559, + "learning_rate": 0.004950645752763492, + "loss": 8.381, + "step": 120200 + }, + { + "epoch": 0.4900757621278984, + "grad_norm": 7.4013447761535645, + "learning_rate": 0.004950547519020298, + "loss": 8.3483, + "step": 120300 + }, + { + "epoch": 0.4904831401512798, + "grad_norm": 7.02334451675415, + "learning_rate": 0.004950449188591533, + "loss": 8.3802, + "step": 120400 + }, + { + "epoch": 0.4908905181746613, + "grad_norm": 3.4499237537384033, + "learning_rate": 0.004950350761481079, + "loss": 8.3577, + "step": 120500 + }, + { + "epoch": 0.49129789619804276, + "grad_norm": 4.924149036407471, + "learning_rate": 0.004950252237692828, + "loss": 8.3735, + "step": 120600 + }, + { + "epoch": 0.49170527422142424, + "grad_norm": 6.146267414093018, + "learning_rate": 0.0049501536172306786, + "loss": 8.363, + "step": 120700 + }, + { + "epoch": 0.49211265224480566, + "grad_norm": 7.329911231994629, + "learning_rate": 0.004950054900098534, + "loss": 8.3391, + "step": 120800 + }, + { + "epoch": 0.49252003026818714, + "grad_norm": 2.1442506313323975, + "learning_rate": 0.004949956086300295, + "loss": 8.5251, + "step": 120900 + }, + { + "epoch": 0.4929274082915686, + "grad_norm": 9.27274227142334, + "learning_rate": 0.004949857175839865, + "loss": 8.5657, + "step": 121000 + }, + { + "epoch": 0.4929274082915686, + "eval_MaskedAccuracy": 0.47242173123299347, + "eval_loss": 1.7904921770095825, + "eval_runtime": 611.4048, + "eval_samples_per_second": 103.82, + "eval_steps_per_second": 0.406, + "step": 121000 + }, + { + "epoch": 0.49333478631495004, + "grad_norm": 7.481986045837402, + "learning_rate": 0.0049497581687211484, + "loss": 8.3722, + "step": 121100 + }, + { + "epoch": 0.4937421643383315, + "grad_norm": 7.524771213531494, + "learning_rate": 0.004949659064948069, + "loss": 8.378, + "step": 121200 + }, + { + "epoch": 0.494149542361713, + "grad_norm": 9.83425235748291, + "learning_rate": 0.004949559864524543, + "loss": 8.3516, + "step": 121300 + }, + { + "epoch": 0.49455692038509447, + "grad_norm": 8.452327728271484, + "learning_rate": 0.004949460567454488, + "loss": 8.3802, + "step": 121400 + }, + { + "epoch": 0.4949642984084759, + "grad_norm": 4.382816791534424, + "learning_rate": 0.004949361173741836, + "loss": 8.4591, + "step": 121500 + }, + { + "epoch": 0.49537167643185737, + "grad_norm": 6.4604172706604, + "learning_rate": 0.004949261683390515, + "loss": 8.5378, + "step": 121600 + }, + { + "epoch": 0.49577905445523884, + "grad_norm": 6.597474575042725, + "learning_rate": 0.00494916209640446, + "loss": 8.5965, + "step": 121700 + }, + { + "epoch": 0.49618643247862027, + "grad_norm": 8.31852912902832, + "learning_rate": 0.0049490624127876, + "loss": 8.4832, + "step": 121800 + }, + { + "epoch": 0.49659381050200174, + "grad_norm": 5.9900641441345215, + "learning_rate": 0.004948962632543879, + "loss": 8.6186, + "step": 121900 + }, + { + "epoch": 0.4970011885253832, + "grad_norm": 6.915224552154541, + "learning_rate": 0.004948862755677247, + "loss": 8.454, + "step": 122000 + }, + { + "epoch": 0.4970011885253832, + "eval_MaskedAccuracy": 0.472195087355659, + "eval_loss": 1.7736320495605469, + "eval_runtime": 404.842, + "eval_samples_per_second": 156.792, + "eval_steps_per_second": 0.613, + "step": 122000 + }, + { + "epoch": 0.4974085665487647, + "grad_norm": 8.118802070617676, + "learning_rate": 0.0049487627821916454, + "loss": 8.3735, + "step": 122100 + }, + { + "epoch": 0.4978159445721461, + "grad_norm": 9.109051704406738, + "learning_rate": 0.004948662712091032, + "loss": 8.358, + "step": 122200 + }, + { + "epoch": 0.4982233225955276, + "grad_norm": 7.688298225402832, + "learning_rate": 0.004948562545379363, + "loss": 8.357, + "step": 122300 + }, + { + "epoch": 0.4986307006189091, + "grad_norm": 5.265169620513916, + "learning_rate": 0.004948462282060591, + "loss": 8.3677, + "step": 122400 + }, + { + "epoch": 0.49903807864229055, + "grad_norm": 5.895035743713379, + "learning_rate": 0.004948361922138693, + "loss": 8.3613, + "step": 122500 + }, + { + "epoch": 0.49944545666567197, + "grad_norm": 3.273195743560791, + "learning_rate": 0.004948261465617631, + "loss": 8.3804, + "step": 122600 + }, + { + "epoch": 0.49985283468905345, + "grad_norm": 5.652338981628418, + "learning_rate": 0.004948160912501374, + "loss": 8.5127, + "step": 122700 + }, + { + "epoch": 0.5002602127124349, + "grad_norm": 10.528982162475586, + "learning_rate": 0.0049480602627939, + "loss": 8.3952, + "step": 122800 + }, + { + "epoch": 0.5006675907358163, + "grad_norm": 7.678232192993164, + "learning_rate": 0.00494795951649919, + "loss": 8.3522, + "step": 122900 + }, + { + "epoch": 0.5010749687591979, + "grad_norm": 6.645458698272705, + "learning_rate": 0.004947858673621225, + "loss": 8.3805, + "step": 123000 + }, + { + "epoch": 0.5010749687591979, + "eval_MaskedAccuracy": 0.4759519419330508, + "eval_loss": 1.7681282758712769, + "eval_runtime": 571.6199, + "eval_samples_per_second": 111.046, + "eval_steps_per_second": 0.434, + "step": 123000 + }, + { + "epoch": 0.5014823467825793, + "grad_norm": 6.954684734344482, + "learning_rate": 0.004947757734163989, + "loss": 8.3092, + "step": 123100 + }, + { + "epoch": 0.5018897248059607, + "grad_norm": 8.839340209960938, + "learning_rate": 0.004947656698131469, + "loss": 8.3486, + "step": 123200 + }, + { + "epoch": 0.5022971028293423, + "grad_norm": 6.953580856323242, + "learning_rate": 0.004947555565527669, + "loss": 8.3612, + "step": 123300 + }, + { + "epoch": 0.5027044808527237, + "grad_norm": 9.655802726745605, + "learning_rate": 0.004947454336356585, + "loss": 8.3551, + "step": 123400 + }, + { + "epoch": 0.5031118588761051, + "grad_norm": 0.8042750954627991, + "learning_rate": 0.00494735301062222, + "loss": 8.7023, + "step": 123500 + }, + { + "epoch": 0.5035192368994866, + "grad_norm": 10.146775245666504, + "learning_rate": 0.004947251588328577, + "loss": 8.6243, + "step": 123600 + }, + { + "epoch": 0.503926614922868, + "grad_norm": 6.121560096740723, + "learning_rate": 0.004947150069479665, + "loss": 8.4713, + "step": 123700 + }, + { + "epoch": 0.5043339929462495, + "grad_norm": 6.796733379364014, + "learning_rate": 0.004947048454079505, + "loss": 8.4259, + "step": 123800 + }, + { + "epoch": 0.504741370969631, + "grad_norm": 11.119961738586426, + "learning_rate": 0.004946946742132101, + "loss": 8.5861, + "step": 123900 + }, + { + "epoch": 0.5051487489930124, + "grad_norm": 4.724853038787842, + "learning_rate": 0.004946844933641488, + "loss": 8.5123, + "step": 124000 + }, + { + "epoch": 0.5051487489930124, + "eval_MaskedAccuracy": 0.4723671499722319, + "eval_loss": 1.7843869924545288, + "eval_runtime": 619.0846, + "eval_samples_per_second": 102.532, + "eval_steps_per_second": 0.401, + "step": 124000 + }, + { + "epoch": 0.505556127016394, + "grad_norm": 4.339019298553467, + "learning_rate": 0.004946743028611682, + "loss": 8.5424, + "step": 124100 + }, + { + "epoch": 0.5059635050397754, + "grad_norm": 0.9369896054267883, + "learning_rate": 0.004946641027046716, + "loss": 8.597, + "step": 124200 + }, + { + "epoch": 0.5063708830631568, + "grad_norm": 9.629310607910156, + "learning_rate": 0.004946538928950624, + "loss": 8.515, + "step": 124300 + }, + { + "epoch": 0.5067782610865383, + "grad_norm": 7.729372501373291, + "learning_rate": 0.004946436734327439, + "loss": 8.4293, + "step": 124400 + }, + { + "epoch": 0.5071856391099198, + "grad_norm": 2.7494254112243652, + "learning_rate": 0.004946334443181198, + "loss": 8.3897, + "step": 124500 + }, + { + "epoch": 0.5075930171333012, + "grad_norm": 1.270338773727417, + "learning_rate": 0.004946232055515947, + "loss": 8.6036, + "step": 124600 + }, + { + "epoch": 0.5080003951566827, + "grad_norm": 10.06462574005127, + "learning_rate": 0.004946129571335741, + "loss": 8.5373, + "step": 124700 + }, + { + "epoch": 0.5084077731800641, + "grad_norm": 2.283895969390869, + "learning_rate": 0.004946026990644626, + "loss": 8.4413, + "step": 124800 + }, + { + "epoch": 0.5088151512034456, + "grad_norm": 6.6044111251831055, + "learning_rate": 0.004945924313446655, + "loss": 8.4213, + "step": 124900 + }, + { + "epoch": 0.5092225292268271, + "grad_norm": 2.018730640411377, + "learning_rate": 0.004945821539745899, + "loss": 8.4662, + "step": 125000 + }, + { + "epoch": 0.5092225292268271, + "eval_MaskedAccuracy": 0.46727423518335215, + "eval_loss": 1.8027355670928955, + "eval_runtime": 586.8485, + "eval_samples_per_second": 108.164, + "eval_steps_per_second": 0.423, + "step": 125000 + }, + { + "epoch": 0.5096299072502085, + "grad_norm": 8.053539276123047, + "learning_rate": 0.00494571866954641, + "loss": 8.5339, + "step": 125100 + }, + { + "epoch": 0.5100372852735899, + "grad_norm": 5.74146842956543, + "learning_rate": 0.00494561570285225, + "loss": 8.4135, + "step": 125200 + }, + { + "epoch": 0.5104446632969715, + "grad_norm": 7.048740386962891, + "learning_rate": 0.004945512639667504, + "loss": 8.4548, + "step": 125300 + }, + { + "epoch": 0.5108520413203529, + "grad_norm": 7.538445472717285, + "learning_rate": 0.004945409479996239, + "loss": 8.4078, + "step": 125400 + }, + { + "epoch": 0.5112594193437344, + "grad_norm": 10.846882820129395, + "learning_rate": 0.004945306223842535, + "loss": 8.3321, + "step": 125500 + }, + { + "epoch": 0.5116667973671158, + "grad_norm": 6.417740821838379, + "learning_rate": 0.004945202871210477, + "loss": 8.3654, + "step": 125600 + }, + { + "epoch": 0.5120741753904973, + "grad_norm": 9.360198974609375, + "learning_rate": 0.004945099422104144, + "loss": 8.3416, + "step": 125700 + }, + { + "epoch": 0.5124815534138788, + "grad_norm": 4.266257286071777, + "learning_rate": 0.004944995876527626, + "loss": 8.3595, + "step": 125800 + }, + { + "epoch": 0.5128889314372602, + "grad_norm": 4.841434955596924, + "learning_rate": 0.004944892234485027, + "loss": 8.5323, + "step": 125900 + }, + { + "epoch": 0.5132963094606416, + "grad_norm": 11.480585098266602, + "learning_rate": 0.004944788495980438, + "loss": 8.3728, + "step": 126000 + }, + { + "epoch": 0.5132963094606416, + "eval_MaskedAccuracy": 0.4755163862779088, + "eval_loss": 1.7594823837280273, + "eval_runtime": 648.4406, + "eval_samples_per_second": 97.89, + "eval_steps_per_second": 0.382, + "step": 126000 + }, + { + "epoch": 0.5137036874840232, + "grad_norm": 5.7990403175354, + "learning_rate": 0.004944684661017963, + "loss": 8.3259, + "step": 126100 + }, + { + "epoch": 0.5141110655074046, + "grad_norm": 1.583735704421997, + "learning_rate": 0.004944580729601707, + "loss": 8.3592, + "step": 126200 + }, + { + "epoch": 0.514518443530786, + "grad_norm": 12.780102729797363, + "learning_rate": 0.0049444767017357725, + "loss": 8.43, + "step": 126300 + }, + { + "epoch": 0.5149258215541676, + "grad_norm": 5.658169269561768, + "learning_rate": 0.0049443725774242775, + "loss": 8.6245, + "step": 126400 + }, + { + "epoch": 0.515333199577549, + "grad_norm": 7.621128082275391, + "learning_rate": 0.004944268356671341, + "loss": 8.4524, + "step": 126500 + }, + { + "epoch": 0.5157405776009305, + "grad_norm": 9.25407600402832, + "learning_rate": 0.004944164039481081, + "loss": 8.4178, + "step": 126600 + }, + { + "epoch": 0.5161479556243119, + "grad_norm": 5.097949028015137, + "learning_rate": 0.004944059625857619, + "loss": 8.6409, + "step": 126700 + }, + { + "epoch": 0.5165553336476933, + "grad_norm": 9.274227142333984, + "learning_rate": 0.004943955115805082, + "loss": 8.6079, + "step": 126800 + }, + { + "epoch": 0.5169627116710749, + "grad_norm": 5.212336540222168, + "learning_rate": 0.004943850509327609, + "loss": 8.4844, + "step": 126900 + }, + { + "epoch": 0.5173700896944563, + "grad_norm": 10.940645217895508, + "learning_rate": 0.004943745806429329, + "loss": 8.4035, + "step": 127000 + }, + { + "epoch": 0.5173700896944563, + "eval_MaskedAccuracy": 0.4748734666305473, + "eval_loss": 1.7772326469421387, + "eval_runtime": 616.2331, + "eval_samples_per_second": 103.006, + "eval_steps_per_second": 0.402, + "step": 127000 + }, + { + "epoch": 0.5177774677178377, + "grad_norm": 5.6482696533203125, + "learning_rate": 0.004943641007114383, + "loss": 8.3536, + "step": 127100 + }, + { + "epoch": 0.5181848457412193, + "grad_norm": 8.932602882385254, + "learning_rate": 0.004943536111386913, + "loss": 8.3414, + "step": 127200 + }, + { + "epoch": 0.5185922237646007, + "grad_norm": 13.874133110046387, + "learning_rate": 0.004943431119251073, + "loss": 8.3514, + "step": 127300 + }, + { + "epoch": 0.5189996017879821, + "grad_norm": 8.456324577331543, + "learning_rate": 0.0049433260307110115, + "loss": 8.3177, + "step": 127400 + }, + { + "epoch": 0.5194069798113636, + "grad_norm": 8.561285972595215, + "learning_rate": 0.004943220845770884, + "loss": 8.3386, + "step": 127500 + }, + { + "epoch": 0.5198143578347451, + "grad_norm": 8.178915023803711, + "learning_rate": 0.004943115564434837, + "loss": 8.3262, + "step": 127600 + }, + { + "epoch": 0.5202217358581265, + "grad_norm": 4.443258285522461, + "learning_rate": 0.004943010186707044, + "loss": 8.3302, + "step": 127700 + }, + { + "epoch": 0.520629113881508, + "grad_norm": 6.866542816162109, + "learning_rate": 0.00494290471259167, + "loss": 8.4339, + "step": 127800 + }, + { + "epoch": 0.5210364919048894, + "grad_norm": 11.089356422424316, + "learning_rate": 0.004942799142092884, + "loss": 8.6393, + "step": 127900 + }, + { + "epoch": 0.521443869928271, + "grad_norm": 1.7169142961502075, + "learning_rate": 0.004942693475214859, + "loss": 8.6402, + "step": 128000 + }, + { + "epoch": 0.521443869928271, + "eval_MaskedAccuracy": 0.4648368387713934, + "eval_loss": 1.8243728876113892, + "eval_runtime": 559.7984, + "eval_samples_per_second": 113.391, + "eval_steps_per_second": 0.443, + "step": 128000 + }, + { + "epoch": 0.5218512479516524, + "grad_norm": 7.574516773223877, + "learning_rate": 0.0049425877119617725, + "loss": 8.5531, + "step": 128100 + }, + { + "epoch": 0.5222586259750338, + "grad_norm": 4.418700695037842, + "learning_rate": 0.004942481852337807, + "loss": 8.4875, + "step": 128200 + }, + { + "epoch": 0.5226660039984153, + "grad_norm": 8.742793083190918, + "learning_rate": 0.004942375896347151, + "loss": 8.5511, + "step": 128300 + }, + { + "epoch": 0.5230733820217968, + "grad_norm": 11.146100044250488, + "learning_rate": 0.004942269843993984, + "loss": 8.4107, + "step": 128400 + }, + { + "epoch": 0.5234807600451782, + "grad_norm": 9.998716354370117, + "learning_rate": 0.004942163695282504, + "loss": 8.4421, + "step": 128500 + }, + { + "epoch": 0.5238881380685597, + "grad_norm": 2.967787027359009, + "learning_rate": 0.004942057450216913, + "loss": 8.4074, + "step": 128600 + }, + { + "epoch": 0.5242955160919411, + "grad_norm": 6.486566543579102, + "learning_rate": 0.004941951108801405, + "loss": 8.4151, + "step": 128700 + }, + { + "epoch": 0.5247028941153226, + "grad_norm": 2.534282922744751, + "learning_rate": 0.004941844671040186, + "loss": 8.3743, + "step": 128800 + }, + { + "epoch": 0.5251102721387041, + "grad_norm": 5.772778511047363, + "learning_rate": 0.0049417381369374624, + "loss": 8.5069, + "step": 128900 + }, + { + "epoch": 0.5255176501620855, + "grad_norm": 7.692689895629883, + "learning_rate": 0.00494163150649745, + "loss": 8.3617, + "step": 129000 + }, + { + "epoch": 0.5255176501620855, + "eval_MaskedAccuracy": 0.47622878724681494, + "eval_loss": 1.7545971870422363, + "eval_runtime": 686.8219, + "eval_samples_per_second": 92.42, + "eval_steps_per_second": 0.361, + "step": 129000 + }, + { + "epoch": 0.525925028185467, + "grad_norm": 7.9084577560424805, + "learning_rate": 0.004941524779724357, + "loss": 8.2893, + "step": 129100 + }, + { + "epoch": 0.5263324062088485, + "grad_norm": 5.903682708740234, + "learning_rate": 0.004941417956622409, + "loss": 8.3253, + "step": 129200 + }, + { + "epoch": 0.5267397842322299, + "grad_norm": 10.002944946289062, + "learning_rate": 0.004941311037195834, + "loss": 8.3331, + "step": 129300 + }, + { + "epoch": 0.5271471622556114, + "grad_norm": 5.873917102813721, + "learning_rate": 0.004941204021448854, + "loss": 8.3173, + "step": 129400 + }, + { + "epoch": 0.5275545402789928, + "grad_norm": 7.145143508911133, + "learning_rate": 0.0049410969093856985, + "loss": 8.3968, + "step": 129500 + }, + { + "epoch": 0.5279619183023743, + "grad_norm": 4.4543328285217285, + "learning_rate": 0.004940989701010604, + "loss": 8.449, + "step": 129600 + }, + { + "epoch": 0.5283692963257558, + "grad_norm": 7.055647850036621, + "learning_rate": 0.004940882396327806, + "loss": 8.3819, + "step": 129700 + }, + { + "epoch": 0.5287766743491372, + "grad_norm": 8.206925392150879, + "learning_rate": 0.004940774995341549, + "loss": 8.3501, + "step": 129800 + }, + { + "epoch": 0.5291840523725186, + "grad_norm": 10.196109771728516, + "learning_rate": 0.004940667498056074, + "loss": 8.2884, + "step": 129900 + }, + { + "epoch": 0.5295914303959002, + "grad_norm": 10.326079368591309, + "learning_rate": 0.004940559904475633, + "loss": 8.3304, + "step": 130000 + }, + { + "epoch": 0.5295914303959002, + "eval_MaskedAccuracy": 0.4778809703901036, + "eval_loss": 1.7554049491882324, + "eval_runtime": 432.0501, + "eval_samples_per_second": 146.918, + "eval_steps_per_second": 0.574, + "step": 130000 + }, + { + "epoch": 0.5299988084192816, + "grad_norm": 7.801156997680664, + "learning_rate": 0.004940452214604491, + "loss": 8.3286, + "step": 130100 + }, + { + "epoch": 0.530406186442663, + "grad_norm": 6.964962959289551, + "learning_rate": 0.004940344428446893, + "loss": 8.4938, + "step": 130200 + }, + { + "epoch": 0.5308135644660446, + "grad_norm": 3.655668258666992, + "learning_rate": 0.004940236546007105, + "loss": 8.5877, + "step": 130300 + }, + { + "epoch": 0.531220942489426, + "grad_norm": 8.000005722045898, + "learning_rate": 0.004940128567289394, + "loss": 8.5344, + "step": 130400 + }, + { + "epoch": 0.5316283205128075, + "grad_norm": 8.468135833740234, + "learning_rate": 0.004940020492298025, + "loss": 8.3759, + "step": 130500 + }, + { + "epoch": 0.5320356985361889, + "grad_norm": 6.7972917556762695, + "learning_rate": 0.004939912321037274, + "loss": 8.3218, + "step": 130600 + }, + { + "epoch": 0.5324430765595703, + "grad_norm": 10.092455863952637, + "learning_rate": 0.004939804053511412, + "loss": 8.342, + "step": 130700 + }, + { + "epoch": 0.5328504545829519, + "grad_norm": 4.264617919921875, + "learning_rate": 0.004939695689724729, + "loss": 8.6042, + "step": 130800 + }, + { + "epoch": 0.5332578326063333, + "grad_norm": 7.822512626647949, + "learning_rate": 0.0049395872296815, + "loss": 8.5632, + "step": 130900 + }, + { + "epoch": 0.5336652106297147, + "grad_norm": 3.6305220127105713, + "learning_rate": 0.004939478673386021, + "loss": 8.5285, + "step": 131000 + }, + { + "epoch": 0.5336652106297147, + "eval_MaskedAccuracy": 0.47348632156424786, + "eval_loss": 1.7749598026275635, + "eval_runtime": 551.4523, + "eval_samples_per_second": 115.107, + "eval_steps_per_second": 0.45, + "step": 131000 + }, + { + "epoch": 0.5340725886530963, + "grad_norm": 12.928987503051758, + "learning_rate": 0.004939370020842574, + "loss": 8.4338, + "step": 131100 + }, + { + "epoch": 0.5344799666764777, + "grad_norm": 4.38896369934082, + "learning_rate": 0.004939261272055469, + "loss": 8.5271, + "step": 131200 + }, + { + "epoch": 0.5348873446998591, + "grad_norm": 4.418875694274902, + "learning_rate": 0.004939152427028993, + "loss": 8.3756, + "step": 131300 + }, + { + "epoch": 0.5352947227232406, + "grad_norm": 4.9500651359558105, + "learning_rate": 0.0049390434857674494, + "loss": 8.3532, + "step": 131400 + }, + { + "epoch": 0.5357021007466221, + "grad_norm": 7.733002185821533, + "learning_rate": 0.004938934448275152, + "loss": 8.3283, + "step": 131500 + }, + { + "epoch": 0.5361094787700036, + "grad_norm": 6.218177318572998, + "learning_rate": 0.0049388253145564096, + "loss": 8.3114, + "step": 131600 + }, + { + "epoch": 0.536516856793385, + "grad_norm": 5.060550212860107, + "learning_rate": 0.0049387160846155334, + "loss": 8.3091, + "step": 131700 + }, + { + "epoch": 0.5369242348167664, + "grad_norm": 9.628071784973145, + "learning_rate": 0.004938606758456842, + "loss": 8.3289, + "step": 131800 + }, + { + "epoch": 0.537331612840148, + "grad_norm": 4.559714317321777, + "learning_rate": 0.004938497336084662, + "loss": 8.3044, + "step": 131900 + }, + { + "epoch": 0.5377389908635294, + "grad_norm": 6.963791847229004, + "learning_rate": 0.004938387817503315, + "loss": 8.2859, + "step": 132000 + }, + { + "epoch": 0.5377389908635294, + "eval_MaskedAccuracy": 0.47790219219810026, + "eval_loss": 1.7602800130844116, + "eval_runtime": 606.5816, + "eval_samples_per_second": 104.645, + "eval_steps_per_second": 0.409, + "step": 132000 + }, + { + "epoch": 0.5381463688869108, + "grad_norm": 8.68433952331543, + "learning_rate": 0.004938278202717131, + "loss": 8.316, + "step": 132100 + }, + { + "epoch": 0.5385537469102923, + "grad_norm": 6.305760383605957, + "learning_rate": 0.004938168491730447, + "loss": 8.3034, + "step": 132200 + }, + { + "epoch": 0.5389611249336738, + "grad_norm": 8.102996826171875, + "learning_rate": 0.004938058684547598, + "loss": 8.3054, + "step": 132300 + }, + { + "epoch": 0.5393685029570552, + "grad_norm": 4.82603120803833, + "learning_rate": 0.004937948781172932, + "loss": 8.317, + "step": 132400 + }, + { + "epoch": 0.5397758809804367, + "grad_norm": 9.478666305541992, + "learning_rate": 0.004937838781610782, + "loss": 8.3199, + "step": 132500 + }, + { + "epoch": 0.5401832590038181, + "grad_norm": 6.930224418640137, + "learning_rate": 0.004937728685865504, + "loss": 8.3277, + "step": 132600 + }, + { + "epoch": 0.5405906370271996, + "grad_norm": 6.685028076171875, + "learning_rate": 0.00493761849394145, + "loss": 8.3096, + "step": 132700 + }, + { + "epoch": 0.5409980150505811, + "grad_norm": 11.711406707763672, + "learning_rate": 0.004937508205842981, + "loss": 8.3485, + "step": 132800 + }, + { + "epoch": 0.5414053930739625, + "grad_norm": 4.872280120849609, + "learning_rate": 0.004937397821574446, + "loss": 8.3167, + "step": 132900 + }, + { + "epoch": 0.541812771097344, + "grad_norm": 5.365285873413086, + "learning_rate": 0.004937287341140217, + "loss": 8.3758, + "step": 133000 + }, + { + "epoch": 0.541812771097344, + "eval_MaskedAccuracy": 0.47756522020477266, + "eval_loss": 1.7520103454589844, + "eval_runtime": 573.9139, + "eval_samples_per_second": 110.602, + "eval_steps_per_second": 0.432, + "step": 133000 + }, + { + "epoch": 0.5422201491207255, + "grad_norm": 5.993309020996094, + "learning_rate": 0.004937176764544669, + "loss": 8.325, + "step": 133100 + }, + { + "epoch": 0.5426275271441069, + "grad_norm": 6.58394193649292, + "learning_rate": 0.004937066091792161, + "loss": 8.3942, + "step": 133200 + }, + { + "epoch": 0.5430349051674884, + "grad_norm": 2.4660582542419434, + "learning_rate": 0.004936955322887076, + "loss": 8.607, + "step": 133300 + }, + { + "epoch": 0.5434422831908698, + "grad_norm": 1.6601381301879883, + "learning_rate": 0.004936844457833782, + "loss": 8.6692, + "step": 133400 + }, + { + "epoch": 0.5438496612142513, + "grad_norm": 8.589520454406738, + "learning_rate": 0.004936733496636674, + "loss": 8.6054, + "step": 133500 + }, + { + "epoch": 0.5442570392376328, + "grad_norm": 10.06407642364502, + "learning_rate": 0.004936622439300137, + "loss": 8.4439, + "step": 133600 + }, + { + "epoch": 0.5446644172610142, + "grad_norm": 2.2744853496551514, + "learning_rate": 0.0049365112858285555, + "loss": 8.559, + "step": 133700 + }, + { + "epoch": 0.5450717952843956, + "grad_norm": 2.906050205230713, + "learning_rate": 0.004936400036226332, + "loss": 8.5895, + "step": 133800 + }, + { + "epoch": 0.5454791733077772, + "grad_norm": 4.517184257507324, + "learning_rate": 0.004936288690497856, + "loss": 8.6234, + "step": 133900 + }, + { + "epoch": 0.5458865513311586, + "grad_norm": 4.899981498718262, + "learning_rate": 0.004936177248647536, + "loss": 8.4567, + "step": 134000 + }, + { + "epoch": 0.5458865513311586, + "eval_MaskedAccuracy": 0.4734875833116333, + "eval_loss": 1.77745521068573, + "eval_runtime": 588.8793, + "eval_samples_per_second": 107.791, + "eval_steps_per_second": 0.421, + "step": 134000 + }, + { + "epoch": 0.54629392935454, + "grad_norm": 5.668209075927734, + "learning_rate": 0.004936065710679779, + "loss": 8.4023, + "step": 134100 + }, + { + "epoch": 0.5467013073779216, + "grad_norm": 7.033673286437988, + "learning_rate": 0.0049359540765989965, + "loss": 8.449, + "step": 134200 + }, + { + "epoch": 0.547108685401303, + "grad_norm": 4.8322601318359375, + "learning_rate": 0.00493584234640959, + "loss": 8.3554, + "step": 134300 + }, + { + "epoch": 0.5475160634246845, + "grad_norm": 8.219858169555664, + "learning_rate": 0.004935730520115993, + "loss": 8.3085, + "step": 134400 + }, + { + "epoch": 0.5479234414480659, + "grad_norm": 8.772672653198242, + "learning_rate": 0.004935618597722611, + "loss": 8.3246, + "step": 134500 + }, + { + "epoch": 0.5483308194714474, + "grad_norm": 4.388508319854736, + "learning_rate": 0.0049355065792338745, + "loss": 8.3255, + "step": 134600 + }, + { + "epoch": 0.5487381974948289, + "grad_norm": 7.620246410369873, + "learning_rate": 0.004935394464654222, + "loss": 8.3211, + "step": 134700 + }, + { + "epoch": 0.5491455755182103, + "grad_norm": 8.451358795166016, + "learning_rate": 0.004935282253988074, + "loss": 8.2897, + "step": 134800 + }, + { + "epoch": 0.5495529535415917, + "grad_norm": 8.059579849243164, + "learning_rate": 0.004935169947239869, + "loss": 8.2934, + "step": 134900 + }, + { + "epoch": 0.5499603315649733, + "grad_norm": 5.406369209289551, + "learning_rate": 0.004935057544414045, + "loss": 8.3126, + "step": 135000 + }, + { + "epoch": 0.5499603315649733, + "eval_MaskedAccuracy": 0.4790078726465579, + "eval_loss": 1.7325547933578491, + "eval_runtime": 561.955, + "eval_samples_per_second": 112.956, + "eval_steps_per_second": 0.441, + "step": 135000 + }, + { + "epoch": 0.5503677095883547, + "grad_norm": 6.986901760101318, + "learning_rate": 0.004934945045515049, + "loss": 8.3266, + "step": 135100 + }, + { + "epoch": 0.5507750876117361, + "grad_norm": 6.454778671264648, + "learning_rate": 0.00493483245054733, + "loss": 8.2776, + "step": 135200 + }, + { + "epoch": 0.5511824656351176, + "grad_norm": 5.6888813972473145, + "learning_rate": 0.004934719759515342, + "loss": 8.2973, + "step": 135300 + }, + { + "epoch": 0.5515898436584991, + "grad_norm": 10.446785926818848, + "learning_rate": 0.004934606972423534, + "loss": 8.3179, + "step": 135400 + }, + { + "epoch": 0.5519972216818806, + "grad_norm": 4.9595112800598145, + "learning_rate": 0.004934494089276374, + "loss": 8.2849, + "step": 135500 + }, + { + "epoch": 0.552404599705262, + "grad_norm": 7.039740562438965, + "learning_rate": 0.004934381110078313, + "loss": 8.2879, + "step": 135600 + }, + { + "epoch": 0.5528119777286434, + "grad_norm": 5.197790622711182, + "learning_rate": 0.004934268034833823, + "loss": 8.3102, + "step": 135700 + }, + { + "epoch": 0.553219355752025, + "grad_norm": 9.838872909545898, + "learning_rate": 0.004934154863547372, + "loss": 8.3484, + "step": 135800 + }, + { + "epoch": 0.5536267337754064, + "grad_norm": 8.352523803710938, + "learning_rate": 0.004934041596223441, + "loss": 8.3332, + "step": 135900 + }, + { + "epoch": 0.5540341117987878, + "grad_norm": 6.191901206970215, + "learning_rate": 0.004933928232866506, + "loss": 8.5556, + "step": 136000 + }, + { + "epoch": 0.5540341117987878, + "eval_MaskedAccuracy": 0.45859378365217834, + "eval_loss": 1.8376259803771973, + "eval_runtime": 558.7141, + "eval_samples_per_second": 113.611, + "eval_steps_per_second": 0.444, + "step": 136000 + }, + { + "epoch": 0.5544414898221693, + "grad_norm": 6.703093528747559, + "learning_rate": 0.004933814773481047, + "loss": 8.6737, + "step": 136100 + }, + { + "epoch": 0.5548488678455508, + "grad_norm": 8.970337867736816, + "learning_rate": 0.004933701218071543, + "loss": 8.5675, + "step": 136200 + }, + { + "epoch": 0.5552562458689322, + "grad_norm": 5.416504383087158, + "learning_rate": 0.004933587566642498, + "loss": 8.5321, + "step": 136300 + }, + { + "epoch": 0.5556636238923137, + "grad_norm": 2.75545072555542, + "learning_rate": 0.004933473819198396, + "loss": 8.4297, + "step": 136400 + }, + { + "epoch": 0.5560710019156951, + "grad_norm": 4.553280830383301, + "learning_rate": 0.004933359975743742, + "loss": 8.5563, + "step": 136500 + }, + { + "epoch": 0.5564783799390766, + "grad_norm": 6.438226222991943, + "learning_rate": 0.004933246036283021, + "loss": 8.423, + "step": 136600 + }, + { + "epoch": 0.5568857579624581, + "grad_norm": 9.083197593688965, + "learning_rate": 0.004933132000820752, + "loss": 8.389, + "step": 136700 + }, + { + "epoch": 0.5572931359858395, + "grad_norm": 8.170239448547363, + "learning_rate": 0.00493301786936144, + "loss": 8.3369, + "step": 136800 + }, + { + "epoch": 0.557700514009221, + "grad_norm": 10.418622970581055, + "learning_rate": 0.0049329036419095895, + "loss": 8.3401, + "step": 136900 + }, + { + "epoch": 0.5581078920326025, + "grad_norm": 8.396859169006348, + "learning_rate": 0.004932789318469727, + "loss": 8.3074, + "step": 137000 + }, + { + "epoch": 0.5581078920326025, + "eval_MaskedAccuracy": 0.478139934353604, + "eval_loss": 1.757191777229309, + "eval_runtime": 604.6911, + "eval_samples_per_second": 104.973, + "eval_steps_per_second": 0.41, + "step": 137000 + }, + { + "epoch": 0.5585152700559839, + "grad_norm": 10.67445182800293, + "learning_rate": 0.004932674899046365, + "loss": 8.3036, + "step": 137100 + }, + { + "epoch": 0.5589226480793654, + "grad_norm": 8.79953384399414, + "learning_rate": 0.004932560383644033, + "loss": 8.2774, + "step": 137200 + }, + { + "epoch": 0.5593300261027468, + "grad_norm": 8.39030933380127, + "learning_rate": 0.0049324457722672525, + "loss": 8.274, + "step": 137300 + }, + { + "epoch": 0.5597374041261283, + "grad_norm": 9.26461124420166, + "learning_rate": 0.004932331064920559, + "loss": 8.3156, + "step": 137400 + }, + { + "epoch": 0.5601447821495098, + "grad_norm": 4.736127853393555, + "learning_rate": 0.004932216261608489, + "loss": 8.3264, + "step": 137500 + }, + { + "epoch": 0.5605521601728912, + "grad_norm": 9.321123123168945, + "learning_rate": 0.0049321013623355794, + "loss": 8.285, + "step": 137600 + }, + { + "epoch": 0.5609595381962726, + "grad_norm": 10.020621299743652, + "learning_rate": 0.0049319863671063655, + "loss": 8.2578, + "step": 137700 + }, + { + "epoch": 0.5613669162196542, + "grad_norm": 6.300183296203613, + "learning_rate": 0.0049318712759254014, + "loss": 8.2911, + "step": 137800 + }, + { + "epoch": 0.5617742942430356, + "grad_norm": 8.268228530883789, + "learning_rate": 0.004931756088797239, + "loss": 8.4312, + "step": 137900 + }, + { + "epoch": 0.5621816722664171, + "grad_norm": 9.11463737487793, + "learning_rate": 0.004931640805726424, + "loss": 8.344, + "step": 138000 + }, + { + "epoch": 0.5621816722664171, + "eval_MaskedAccuracy": 0.47664963225182544, + "eval_loss": 1.7589812278747559, + "eval_runtime": 603.7869, + "eval_samples_per_second": 105.13, + "eval_steps_per_second": 0.411, + "step": 138000 + }, + { + "epoch": 0.5625890502897986, + "grad_norm": 8.544374465942383, + "learning_rate": 0.004931525426717527, + "loss": 8.3289, + "step": 138100 + }, + { + "epoch": 0.56299642831318, + "grad_norm": 5.79107141494751, + "learning_rate": 0.004931409951775097, + "loss": 8.3046, + "step": 138200 + }, + { + "epoch": 0.5634038063365615, + "grad_norm": 8.56974983215332, + "learning_rate": 0.004931294380903704, + "loss": 8.3242, + "step": 138300 + }, + { + "epoch": 0.5638111843599429, + "grad_norm": 12.155808448791504, + "learning_rate": 0.004931178714107916, + "loss": 8.3155, + "step": 138400 + }, + { + "epoch": 0.5642185623833244, + "grad_norm": 4.285138130187988, + "learning_rate": 0.004931062951392311, + "loss": 8.2667, + "step": 138500 + }, + { + "epoch": 0.5646259404067059, + "grad_norm": 8.989679336547852, + "learning_rate": 0.004930947092761459, + "loss": 8.5222, + "step": 138600 + }, + { + "epoch": 0.5650333184300873, + "grad_norm": 4.6177978515625, + "learning_rate": 0.004930831138219945, + "loss": 8.6581, + "step": 138700 + }, + { + "epoch": 0.5654406964534687, + "grad_norm": 2.7358109951019287, + "learning_rate": 0.0049307150877723515, + "loss": 8.5795, + "step": 138800 + }, + { + "epoch": 0.5658480744768503, + "grad_norm": 3.21806263923645, + "learning_rate": 0.004930598941423266, + "loss": 8.542, + "step": 138900 + }, + { + "epoch": 0.5662554525002317, + "grad_norm": 5.059041500091553, + "learning_rate": 0.004930482699177287, + "loss": 8.3765, + "step": 139000 + }, + { + "epoch": 0.5662554525002317, + "eval_MaskedAccuracy": 0.47533519953307585, + "eval_loss": 1.7726454734802246, + "eval_runtime": 609.0225, + "eval_samples_per_second": 104.226, + "eval_steps_per_second": 0.407, + "step": 139000 + }, + { + "epoch": 0.5666628305236131, + "grad_norm": 5.088695526123047, + "learning_rate": 0.004930366361038996, + "loss": 8.3651, + "step": 139100 + }, + { + "epoch": 0.5670702085469946, + "grad_norm": 11.858138084411621, + "learning_rate": 0.004930249927013009, + "loss": 8.404, + "step": 139200 + }, + { + "epoch": 0.5674775865703761, + "grad_norm": 5.233626842498779, + "learning_rate": 0.004930133397103916, + "loss": 8.4429, + "step": 139300 + }, + { + "epoch": 0.5678849645937576, + "grad_norm": 9.309755325317383, + "learning_rate": 0.004930016771316329, + "loss": 8.3795, + "step": 139400 + }, + { + "epoch": 0.568292342617139, + "grad_norm": 3.2571256160736084, + "learning_rate": 0.004929900049654861, + "loss": 8.3915, + "step": 139500 + }, + { + "epoch": 0.5686997206405204, + "grad_norm": 23.565610885620117, + "learning_rate": 0.00492978323212412, + "loss": 8.4156, + "step": 139600 + }, + { + "epoch": 0.569107098663902, + "grad_norm": 3.5002174377441406, + "learning_rate": 0.00492966631872873, + "loss": 8.5306, + "step": 139700 + }, + { + "epoch": 0.5695144766872834, + "grad_norm": 0.7858268022537231, + "learning_rate": 0.004929549309473314, + "loss": 8.3792, + "step": 139800 + }, + { + "epoch": 0.5699218547106648, + "grad_norm": 15.595468521118164, + "learning_rate": 0.0049294322043625044, + "loss": 8.5528, + "step": 139900 + }, + { + "epoch": 0.5703292327340463, + "grad_norm": 3.5991625785827637, + "learning_rate": 0.00492931500340092, + "loss": 8.4807, + "step": 140000 + }, + { + "epoch": 0.5703292327340463, + "eval_MaskedAccuracy": 0.47439803391024454, + "eval_loss": 1.7724093198776245, + "eval_runtime": 499.4875, + "eval_samples_per_second": 127.082, + "eval_steps_per_second": 0.497, + "step": 140000 + }, + { + "epoch": 0.5707366107574278, + "grad_norm": 4.950541019439697, + "learning_rate": 0.004929197706593197, + "loss": 8.3725, + "step": 140100 + }, + { + "epoch": 0.5711439887808092, + "grad_norm": 1.7901657819747925, + "learning_rate": 0.004929080313943971, + "loss": 8.3586, + "step": 140200 + }, + { + "epoch": 0.5715513668041907, + "grad_norm": 7.542468547821045, + "learning_rate": 0.004928962825457888, + "loss": 8.5297, + "step": 140300 + }, + { + "epoch": 0.5719587448275721, + "grad_norm": 4.861608028411865, + "learning_rate": 0.004928845241139586, + "loss": 8.5119, + "step": 140400 + }, + { + "epoch": 0.5723661228509537, + "grad_norm": 5.734555244445801, + "learning_rate": 0.004928727560993721, + "loss": 8.3784, + "step": 140500 + }, + { + "epoch": 0.5727735008743351, + "grad_norm": 9.214383125305176, + "learning_rate": 0.00492860978502494, + "loss": 8.3441, + "step": 140600 + }, + { + "epoch": 0.5731808788977165, + "grad_norm": 4.167412757873535, + "learning_rate": 0.004928491913237904, + "loss": 8.4657, + "step": 140700 + }, + { + "epoch": 0.573588256921098, + "grad_norm": 6.125998497009277, + "learning_rate": 0.004928373945637273, + "loss": 8.4581, + "step": 140800 + }, + { + "epoch": 0.5739956349444795, + "grad_norm": 3.892979621887207, + "learning_rate": 0.004928255882227712, + "loss": 8.3622, + "step": 140900 + }, + { + "epoch": 0.5744030129678609, + "grad_norm": 6.536438941955566, + "learning_rate": 0.004928137723013875, + "loss": 8.4651, + "step": 141000 + }, + { + "epoch": 0.5744030129678609, + "eval_MaskedAccuracy": 0.46521993284921864, + "eval_loss": 1.8148672580718994, + "eval_runtime": 612.1042, + "eval_samples_per_second": 103.701, + "eval_steps_per_second": 0.405, + "step": 141000 + }, + { + "epoch": 0.5748103909912424, + "grad_norm": 7.157055854797363, + "learning_rate": 0.004928019468000448, + "loss": 8.4738, + "step": 141100 + }, + { + "epoch": 0.5752177690146238, + "grad_norm": 5.56843900680542, + "learning_rate": 0.004927901117192104, + "loss": 8.3646, + "step": 141200 + }, + { + "epoch": 0.5756251470380053, + "grad_norm": 0.8016031384468079, + "learning_rate": 0.004927782670593521, + "loss": 8.387, + "step": 141300 + }, + { + "epoch": 0.5760325250613868, + "grad_norm": 4.794798374176025, + "learning_rate": 0.004927664128209382, + "loss": 8.4828, + "step": 141400 + }, + { + "epoch": 0.5764399030847682, + "grad_norm": 5.881836891174316, + "learning_rate": 0.004927545490044371, + "loss": 8.3491, + "step": 141500 + }, + { + "epoch": 0.5768472811081496, + "grad_norm": 9.797046661376953, + "learning_rate": 0.004927426756103177, + "loss": 8.385, + "step": 141600 + }, + { + "epoch": 0.5772546591315312, + "grad_norm": 0.9898270964622498, + "learning_rate": 0.004927307926390499, + "loss": 8.3714, + "step": 141700 + }, + { + "epoch": 0.5776620371549126, + "grad_norm": 6.364189147949219, + "learning_rate": 0.004927189000911034, + "loss": 8.5285, + "step": 141800 + }, + { + "epoch": 0.5780694151782941, + "grad_norm": 2.7466721534729004, + "learning_rate": 0.004927069979669483, + "loss": 8.3986, + "step": 141900 + }, + { + "epoch": 0.5784767932016756, + "grad_norm": 7.290796279907227, + "learning_rate": 0.004926950862670552, + "loss": 8.3574, + "step": 142000 + }, + { + "epoch": 0.5784767932016756, + "eval_MaskedAccuracy": 0.4768711731034431, + "eval_loss": 1.7562979459762573, + "eval_runtime": 633.151, + "eval_samples_per_second": 100.254, + "eval_steps_per_second": 0.392, + "step": 142000 + }, + { + "epoch": 0.578884171225057, + "grad_norm": 8.044575691223145, + "learning_rate": 0.0049268316499189455, + "loss": 8.3856, + "step": 142100 + }, + { + "epoch": 0.5792915492484385, + "grad_norm": 9.492392539978027, + "learning_rate": 0.004926712341419386, + "loss": 8.4513, + "step": 142200 + }, + { + "epoch": 0.5796989272718199, + "grad_norm": 3.2362284660339355, + "learning_rate": 0.00492659293717658, + "loss": 8.347, + "step": 142300 + }, + { + "epoch": 0.5801063052952014, + "grad_norm": 7.138864040374756, + "learning_rate": 0.004926473437195258, + "loss": 8.3386, + "step": 142400 + }, + { + "epoch": 0.5805136833185829, + "grad_norm": 7.8301100730896, + "learning_rate": 0.00492635384148014, + "loss": 8.4787, + "step": 142500 + }, + { + "epoch": 0.5809210613419643, + "grad_norm": 7.749133586883545, + "learning_rate": 0.004926234150035957, + "loss": 8.5, + "step": 142600 + }, + { + "epoch": 0.5813284393653457, + "grad_norm": 17.91340446472168, + "learning_rate": 0.004926114362867435, + "loss": 8.4313, + "step": 142700 + }, + { + "epoch": 0.5817358173887273, + "grad_norm": 7.80926513671875, + "learning_rate": 0.004925994479979315, + "loss": 8.4978, + "step": 142800 + }, + { + "epoch": 0.5821431954121087, + "grad_norm": 1.9094536304473877, + "learning_rate": 0.004925874501376334, + "loss": 8.4321, + "step": 142900 + }, + { + "epoch": 0.5825505734354902, + "grad_norm": 7.612975120544434, + "learning_rate": 0.0049257544270632355, + "loss": 8.3529, + "step": 143000 + }, + { + "epoch": 0.5825505734354902, + "eval_MaskedAccuracy": 0.4764777458492212, + "eval_loss": 1.761231780052185, + "eval_runtime": 633.3846, + "eval_samples_per_second": 100.217, + "eval_steps_per_second": 0.392, + "step": 143000 + }, + { + "epoch": 0.5829579514588716, + "grad_norm": 7.09923791885376, + "learning_rate": 0.004925634257044773, + "loss": 8.3188, + "step": 143100 + }, + { + "epoch": 0.5833653294822531, + "grad_norm": 7.374269485473633, + "learning_rate": 0.004925513991325686, + "loss": 8.351, + "step": 143200 + }, + { + "epoch": 0.5837727075056346, + "grad_norm": 3.873291492462158, + "learning_rate": 0.004925393629910736, + "loss": 8.3384, + "step": 143300 + }, + { + "epoch": 0.584180085529016, + "grad_norm": 5.784383296966553, + "learning_rate": 0.004925273172804682, + "loss": 8.4272, + "step": 143400 + }, + { + "epoch": 0.5845874635523974, + "grad_norm": 2.9218976497650146, + "learning_rate": 0.004925152620012286, + "loss": 8.4527, + "step": 143500 + }, + { + "epoch": 0.584994841575779, + "grad_norm": 3.2860617637634277, + "learning_rate": 0.004925031971538311, + "loss": 8.4885, + "step": 143600 + }, + { + "epoch": 0.5854022195991604, + "grad_norm": 10.385235786437988, + "learning_rate": 0.004924911227387532, + "loss": 8.5233, + "step": 143700 + }, + { + "epoch": 0.5858095976225418, + "grad_norm": 7.59553861618042, + "learning_rate": 0.004924790387564717, + "loss": 8.5009, + "step": 143800 + }, + { + "epoch": 0.5862169756459233, + "grad_norm": 3.1202356815338135, + "learning_rate": 0.0049246694520746455, + "loss": 8.4833, + "step": 143900 + }, + { + "epoch": 0.5866243536693048, + "grad_norm": 5.625021457672119, + "learning_rate": 0.0049245484209221035, + "loss": 8.4456, + "step": 144000 + }, + { + "epoch": 0.5866243536693048, + "eval_MaskedAccuracy": 0.4735239587801145, + "eval_loss": 1.7806174755096436, + "eval_runtime": 633.5303, + "eval_samples_per_second": 100.194, + "eval_steps_per_second": 0.391, + "step": 144000 + }, + { + "epoch": 0.5870317316926862, + "grad_norm": 8.998865127563477, + "learning_rate": 0.004924427294111858, + "loss": 8.3641, + "step": 144100 + }, + { + "epoch": 0.5874391097160677, + "grad_norm": 7.449718952178955, + "learning_rate": 0.00492430607164872, + "loss": 8.3482, + "step": 144200 + }, + { + "epoch": 0.5878464877394491, + "grad_norm": 6.765208721160889, + "learning_rate": 0.004924184753537476, + "loss": 8.315, + "step": 144300 + }, + { + "epoch": 0.5882538657628307, + "grad_norm": 6.553773880004883, + "learning_rate": 0.004924063339782917, + "loss": 8.291, + "step": 144400 + }, + { + "epoch": 0.5886612437862121, + "grad_norm": 5.706290245056152, + "learning_rate": 0.0049239418303898465, + "loss": 8.3187, + "step": 144500 + }, + { + "epoch": 0.5890686218095935, + "grad_norm": 8.585490226745605, + "learning_rate": 0.00492382022536307, + "loss": 8.4939, + "step": 144600 + }, + { + "epoch": 0.589475999832975, + "grad_norm": 6.214458465576172, + "learning_rate": 0.00492369852470739, + "loss": 8.481, + "step": 144700 + }, + { + "epoch": 0.5898833778563565, + "grad_norm": 10.59347915649414, + "learning_rate": 0.004923576728427617, + "loss": 8.3291, + "step": 144800 + }, + { + "epoch": 0.5902907558797379, + "grad_norm": 12.514204025268555, + "learning_rate": 0.004923454836528572, + "loss": 8.3762, + "step": 144900 + }, + { + "epoch": 0.5906981339031194, + "grad_norm": 4.104231834411621, + "learning_rate": 0.004923332849015071, + "loss": 8.5356, + "step": 145000 + }, + { + "epoch": 0.5906981339031194, + "eval_MaskedAccuracy": 0.4719728494655418, + "eval_loss": 1.7796320915222168, + "eval_runtime": 601.6769, + "eval_samples_per_second": 105.498, + "eval_steps_per_second": 0.412, + "step": 145000 + }, + { + "epoch": 0.5911055119265008, + "grad_norm": 6.915755748748779, + "learning_rate": 0.004923210765891941, + "loss": 8.3611, + "step": 145100 + }, + { + "epoch": 0.5915128899498823, + "grad_norm": 4.886312484741211, + "learning_rate": 0.004923088587164003, + "loss": 8.3103, + "step": 145200 + }, + { + "epoch": 0.5919202679732638, + "grad_norm": 7.537599086761475, + "learning_rate": 0.004922966312836096, + "loss": 8.3021, + "step": 145300 + }, + { + "epoch": 0.5923276459966452, + "grad_norm": 6.0286054611206055, + "learning_rate": 0.0049228439429130485, + "loss": 8.3138, + "step": 145400 + }, + { + "epoch": 0.5927350240200268, + "grad_norm": 5.109126567840576, + "learning_rate": 0.0049227214773997, + "loss": 8.2938, + "step": 145500 + }, + { + "epoch": 0.5931424020434082, + "grad_norm": 5.64884090423584, + "learning_rate": 0.004922598916300887, + "loss": 8.3078, + "step": 145600 + }, + { + "epoch": 0.5935497800667896, + "grad_norm": 8.799834251403809, + "learning_rate": 0.004922476259621459, + "loss": 8.344, + "step": 145700 + }, + { + "epoch": 0.5939571580901711, + "grad_norm": 5.0454792976379395, + "learning_rate": 0.004922353507366266, + "loss": 8.4802, + "step": 145800 + }, + { + "epoch": 0.5943645361135526, + "grad_norm": 7.879946708679199, + "learning_rate": 0.0049222306595401565, + "loss": 8.3843, + "step": 145900 + }, + { + "epoch": 0.594771914136934, + "grad_norm": 5.273271560668945, + "learning_rate": 0.004922107716147994, + "loss": 8.3082, + "step": 146000 + }, + { + "epoch": 0.594771914136934, + "eval_MaskedAccuracy": 0.4789688832855816, + "eval_loss": 1.7472257614135742, + "eval_runtime": 532.2718, + "eval_samples_per_second": 119.255, + "eval_steps_per_second": 0.466, + "step": 146000 + }, + { + "epoch": 0.5951792921603155, + "grad_norm": 5.88913106918335, + "learning_rate": 0.004921984677194639, + "loss": 8.2821, + "step": 146100 + }, + { + "epoch": 0.5955866701836969, + "grad_norm": 6.070159912109375, + "learning_rate": 0.00492186154268495, + "loss": 8.279, + "step": 146200 + }, + { + "epoch": 0.5959940482070784, + "grad_norm": 3.6523008346557617, + "learning_rate": 0.004921738312623806, + "loss": 8.2728, + "step": 146300 + }, + { + "epoch": 0.5964014262304599, + "grad_norm": 8.161672592163086, + "learning_rate": 0.004921614987016069, + "loss": 8.2584, + "step": 146400 + }, + { + "epoch": 0.5968088042538413, + "grad_norm": 4.909224987030029, + "learning_rate": 0.004921491565866608, + "loss": 8.3252, + "step": 146500 + }, + { + "epoch": 0.5972161822772227, + "grad_norm": 8.278355598449707, + "learning_rate": 0.0049213680491803115, + "loss": 8.4315, + "step": 146600 + }, + { + "epoch": 0.5976235603006043, + "grad_norm": 4.000131607055664, + "learning_rate": 0.004921244436962065, + "loss": 8.3199, + "step": 146700 + }, + { + "epoch": 0.5980309383239857, + "grad_norm": 7.900701522827148, + "learning_rate": 0.004921120729216753, + "loss": 8.2887, + "step": 146800 + }, + { + "epoch": 0.5984383163473672, + "grad_norm": 4.253056049346924, + "learning_rate": 0.0049209969259492664, + "loss": 8.2676, + "step": 146900 + }, + { + "epoch": 0.5988456943707486, + "grad_norm": 3.7337450981140137, + "learning_rate": 0.0049208730271644964, + "loss": 8.4013, + "step": 147000 + }, + { + "epoch": 0.5988456943707486, + "eval_MaskedAccuracy": 0.45803042398340266, + "eval_loss": 1.8447405099868774, + "eval_runtime": 566.7738, + "eval_samples_per_second": 111.995, + "eval_steps_per_second": 0.438, + "step": 147000 + }, + { + "epoch": 0.5992530723941301, + "grad_norm": 8.883218765258789, + "learning_rate": 0.0049207490328673446, + "loss": 8.4825, + "step": 147100 + }, + { + "epoch": 0.5996604504175116, + "grad_norm": 2.6650524139404297, + "learning_rate": 0.0049206249430627175, + "loss": 8.3655, + "step": 147200 + }, + { + "epoch": 0.600067828440893, + "grad_norm": 3.044719696044922, + "learning_rate": 0.004920500757755516, + "loss": 8.5077, + "step": 147300 + }, + { + "epoch": 0.6004752064642744, + "grad_norm": 3.7706477642059326, + "learning_rate": 0.004920376476950655, + "loss": 8.5518, + "step": 147400 + }, + { + "epoch": 0.600882584487656, + "grad_norm": 2.005147933959961, + "learning_rate": 0.004920252100653035, + "loss": 8.5234, + "step": 147500 + }, + { + "epoch": 0.6012899625110374, + "grad_norm": 5.612418174743652, + "learning_rate": 0.004920127628867589, + "loss": 8.5105, + "step": 147600 + }, + { + "epoch": 0.6016973405344188, + "grad_norm": 6.48015832901001, + "learning_rate": 0.004920003061599228, + "loss": 8.4509, + "step": 147700 + }, + { + "epoch": 0.6021047185578003, + "grad_norm": 6.271529197692871, + "learning_rate": 0.00491987839885288, + "loss": 8.3557, + "step": 147800 + }, + { + "epoch": 0.6025120965811818, + "grad_norm": 9.45332145690918, + "learning_rate": 0.004919753640633472, + "loss": 8.3089, + "step": 147900 + }, + { + "epoch": 0.6029194746045633, + "grad_norm": 6.876777648925781, + "learning_rate": 0.004919628786945938, + "loss": 8.3361, + "step": 148000 + }, + { + "epoch": 0.6029194746045633, + "eval_MaskedAccuracy": 0.4786036508942568, + "eval_loss": 1.7497118711471558, + "eval_runtime": 528.3629, + "eval_samples_per_second": 120.137, + "eval_steps_per_second": 0.469, + "step": 148000 + }, + { + "epoch": 0.6033268526279447, + "grad_norm": 7.976334571838379, + "learning_rate": 0.004919503837795211, + "loss": 8.2906, + "step": 148100 + }, + { + "epoch": 0.6037342306513261, + "grad_norm": 7.192055702209473, + "learning_rate": 0.004919378793186239, + "loss": 8.2782, + "step": 148200 + }, + { + "epoch": 0.6041416086747077, + "grad_norm": 9.158371925354004, + "learning_rate": 0.004919253653123958, + "loss": 8.2861, + "step": 148300 + }, + { + "epoch": 0.6045489866980891, + "grad_norm": 6.050761699676514, + "learning_rate": 0.004919128417613319, + "loss": 8.2923, + "step": 148400 + }, + { + "epoch": 0.6049563647214705, + "grad_norm": 6.686251640319824, + "learning_rate": 0.0049190030866592665, + "loss": 8.3018, + "step": 148500 + }, + { + "epoch": 0.605363742744852, + "grad_norm": 8.751076698303223, + "learning_rate": 0.0049188776602667635, + "loss": 8.3218, + "step": 148600 + }, + { + "epoch": 0.6057711207682335, + "grad_norm": 9.721341133117676, + "learning_rate": 0.004918752138440773, + "loss": 8.2988, + "step": 148700 + }, + { + "epoch": 0.6061784987916149, + "grad_norm": 9.185450553894043, + "learning_rate": 0.004918626521186247, + "loss": 8.2807, + "step": 148800 + }, + { + "epoch": 0.6065858768149964, + "grad_norm": 6.511389255523682, + "learning_rate": 0.004918500808508156, + "loss": 8.2823, + "step": 148900 + }, + { + "epoch": 0.6069932548383778, + "grad_norm": 4.399498462677002, + "learning_rate": 0.004918375000411468, + "loss": 8.4705, + "step": 149000 + }, + { + "epoch": 0.6069932548383778, + "eval_MaskedAccuracy": 0.46920769914216875, + "eval_loss": 1.803868055343628, + "eval_runtime": 559.0038, + "eval_samples_per_second": 113.552, + "eval_steps_per_second": 0.444, + "step": 149000 + }, + { + "epoch": 0.6074006328617593, + "grad_norm": 5.837223529815674, + "learning_rate": 0.004918249096901156, + "loss": 8.5529, + "step": 149100 + }, + { + "epoch": 0.6078080108851408, + "grad_norm": 6.941884994506836, + "learning_rate": 0.004918123097982208, + "loss": 8.4394, + "step": 149200 + }, + { + "epoch": 0.6082153889085222, + "grad_norm": 5.623409271240234, + "learning_rate": 0.004917997003659595, + "loss": 8.4554, + "step": 149300 + }, + { + "epoch": 0.6086227669319038, + "grad_norm": 7.008805751800537, + "learning_rate": 0.004917870813938307, + "loss": 8.3244, + "step": 149400 + }, + { + "epoch": 0.6090301449552852, + "grad_norm": 6.360350131988525, + "learning_rate": 0.004917744528823322, + "loss": 8.2924, + "step": 149500 + }, + { + "epoch": 0.6094375229786666, + "grad_norm": 6.548702239990234, + "learning_rate": 0.0049176181483196425, + "loss": 8.3391, + "step": 149600 + }, + { + "epoch": 0.6098449010020481, + "grad_norm": 6.634683609008789, + "learning_rate": 0.004917491672432266, + "loss": 8.2663, + "step": 149700 + }, + { + "epoch": 0.6102522790254296, + "grad_norm": 5.925806522369385, + "learning_rate": 0.004917365101166191, + "loss": 8.2479, + "step": 149800 + }, + { + "epoch": 0.610659657048811, + "grad_norm": 7.2930521965026855, + "learning_rate": 0.004917238434526421, + "loss": 8.2717, + "step": 149900 + }, + { + "epoch": 0.6110670350721925, + "grad_norm": 6.2033796310424805, + "learning_rate": 0.004917111672517968, + "loss": 8.2765, + "step": 150000 + }, + { + "epoch": 0.6110670350721925, + "eval_MaskedAccuracy": 0.48004479177415543, + "eval_loss": 1.741084098815918, + "eval_runtime": 552.5727, + "eval_samples_per_second": 114.874, + "eval_steps_per_second": 0.449, + "step": 150000 + }, + { + "epoch": 0.6114744130955739, + "grad_norm": 2.6907870769500732, + "learning_rate": 0.004916984815145834, + "loss": 8.4017, + "step": 150100 + }, + { + "epoch": 0.6118817911189554, + "grad_norm": 0.7989844679832458, + "learning_rate": 0.004916857862415046, + "loss": 8.4558, + "step": 150200 + }, + { + "epoch": 0.6122891691423369, + "grad_norm": 9.022034645080566, + "learning_rate": 0.004916730814330616, + "loss": 8.6093, + "step": 150300 + }, + { + "epoch": 0.6126965471657183, + "grad_norm": 8.180450439453125, + "learning_rate": 0.004916603670897565, + "loss": 8.3932, + "step": 150400 + }, + { + "epoch": 0.6131039251890998, + "grad_norm": 1.3381363153457642, + "learning_rate": 0.004916476432120923, + "loss": 8.4623, + "step": 150500 + }, + { + "epoch": 0.6135113032124813, + "grad_norm": 6.8818511962890625, + "learning_rate": 0.0049163490980057185, + "loss": 8.5591, + "step": 150600 + }, + { + "epoch": 0.6139186812358627, + "grad_norm": 6.092988967895508, + "learning_rate": 0.004916221668556986, + "loss": 8.4932, + "step": 150700 + }, + { + "epoch": 0.6143260592592442, + "grad_norm": 7.2855448722839355, + "learning_rate": 0.004916094143779766, + "loss": 8.4204, + "step": 150800 + }, + { + "epoch": 0.6147334372826256, + "grad_norm": 4.892360210418701, + "learning_rate": 0.004915966523679098, + "loss": 8.3325, + "step": 150900 + }, + { + "epoch": 0.6151408153060071, + "grad_norm": 1.7383997440338135, + "learning_rate": 0.0049158388082600375, + "loss": 8.3469, + "step": 151000 + }, + { + "epoch": 0.6151408153060071, + "eval_MaskedAccuracy": 0.4731508727785146, + "eval_loss": 1.7681819200515747, + "eval_runtime": 575.5654, + "eval_samples_per_second": 110.285, + "eval_steps_per_second": 0.431, + "step": 151000 + }, + { + "epoch": 0.6155481933293886, + "grad_norm": 36.812713623046875, + "learning_rate": 0.004915710997527619, + "loss": 8.4677, + "step": 151100 + }, + { + "epoch": 0.61595557135277, + "grad_norm": 2.0609169006347656, + "learning_rate": 0.0049155830914869, + "loss": 8.5544, + "step": 151200 + }, + { + "epoch": 0.6163629493761514, + "grad_norm": 3.7911245822906494, + "learning_rate": 0.004915455090142937, + "loss": 8.4017, + "step": 151300 + }, + { + "epoch": 0.616770327399533, + "grad_norm": 6.143694877624512, + "learning_rate": 0.004915326993500799, + "loss": 8.3425, + "step": 151400 + }, + { + "epoch": 0.6171777054229144, + "grad_norm": 2.891474485397339, + "learning_rate": 0.004915198801565535, + "loss": 8.3782, + "step": 151500 + }, + { + "epoch": 0.6175850834462958, + "grad_norm": 3.7956511974334717, + "learning_rate": 0.0049150705143422325, + "loss": 8.4126, + "step": 151600 + }, + { + "epoch": 0.6179924614696773, + "grad_norm": 6.636448383331299, + "learning_rate": 0.004914942131835944, + "loss": 8.4108, + "step": 151700 + }, + { + "epoch": 0.6183998394930588, + "grad_norm": 7.5709052085876465, + "learning_rate": 0.004914813654051754, + "loss": 8.3272, + "step": 151800 + }, + { + "epoch": 0.6188072175164403, + "grad_norm": 6.166447639465332, + "learning_rate": 0.004914685080994747, + "loss": 8.2731, + "step": 151900 + }, + { + "epoch": 0.6192145955398217, + "grad_norm": 8.206730842590332, + "learning_rate": 0.00491455641267, + "loss": 8.2621, + "step": 152000 + }, + { + "epoch": 0.6192145955398217, + "eval_MaskedAccuracy": 0.48002182329563664, + "eval_loss": 1.7566319704055786, + "eval_runtime": 512.7412, + "eval_samples_per_second": 123.797, + "eval_steps_per_second": 0.484, + "step": 152000 + }, + { + "epoch": 0.6196219735632031, + "grad_norm": 9.208566665649414, + "learning_rate": 0.004914427649082595, + "loss": 8.2899, + "step": 152100 + }, + { + "epoch": 0.6200293515865847, + "grad_norm": 6.308719635009766, + "learning_rate": 0.0049142987902376315, + "loss": 8.2675, + "step": 152200 + }, + { + "epoch": 0.6204367296099661, + "grad_norm": 3.0895836353302, + "learning_rate": 0.004914169836140206, + "loss": 8.3639, + "step": 152300 + }, + { + "epoch": 0.6208441076333475, + "grad_norm": 3.028787136077881, + "learning_rate": 0.004914040786795409, + "loss": 8.5091, + "step": 152400 + }, + { + "epoch": 0.621251485656729, + "grad_norm": 7.628536701202393, + "learning_rate": 0.004913911642208341, + "loss": 8.3584, + "step": 152500 + }, + { + "epoch": 0.6216588636801105, + "grad_norm": 14.235428810119629, + "learning_rate": 0.004913782402384117, + "loss": 8.3257, + "step": 152600 + }, + { + "epoch": 0.6220662417034919, + "grad_norm": 12.227068901062012, + "learning_rate": 0.00491365306732784, + "loss": 8.4322, + "step": 152700 + }, + { + "epoch": 0.6224736197268734, + "grad_norm": 1.0032848119735718, + "learning_rate": 0.004913523637044627, + "loss": 8.4773, + "step": 152800 + }, + { + "epoch": 0.6228809977502549, + "grad_norm": 0.7832779884338379, + "learning_rate": 0.004913394111539591, + "loss": 8.4854, + "step": 152900 + }, + { + "epoch": 0.6232883757736364, + "grad_norm": 34.24857711791992, + "learning_rate": 0.004913264490817857, + "loss": 8.5275, + "step": 153000 + }, + { + "epoch": 0.6232883757736364, + "eval_MaskedAccuracy": 0.46698241044431904, + "eval_loss": 1.8068398237228394, + "eval_runtime": 561.1496, + "eval_samples_per_second": 113.118, + "eval_steps_per_second": 0.442, + "step": 153000 + }, + { + "epoch": 0.6236957537970178, + "grad_norm": 5.277442455291748, + "learning_rate": 0.004913134774884545, + "loss": 8.4271, + "step": 153100 + }, + { + "epoch": 0.6241031318203992, + "grad_norm": 9.542320251464844, + "learning_rate": 0.0049130049637447805, + "loss": 8.3496, + "step": 153200 + }, + { + "epoch": 0.6245105098437808, + "grad_norm": 2.137045383453369, + "learning_rate": 0.004912875057403706, + "loss": 8.3199, + "step": 153300 + }, + { + "epoch": 0.6249178878671622, + "grad_norm": 4.267445087432861, + "learning_rate": 0.0049127450558664534, + "loss": 8.4279, + "step": 153400 + }, + { + "epoch": 0.6253252658905436, + "grad_norm": 4.157717227935791, + "learning_rate": 0.0049126149591381596, + "loss": 8.453, + "step": 153500 + }, + { + "epoch": 0.6257326439139251, + "grad_norm": 7.502955913543701, + "learning_rate": 0.00491248476722397, + "loss": 8.4648, + "step": 153600 + }, + { + "epoch": 0.6261400219373066, + "grad_norm": 8.893965721130371, + "learning_rate": 0.004912354480129033, + "loss": 8.4351, + "step": 153700 + }, + { + "epoch": 0.626547399960688, + "grad_norm": 4.194019317626953, + "learning_rate": 0.004912224097858498, + "loss": 8.3335, + "step": 153800 + }, + { + "epoch": 0.6269547779840695, + "grad_norm": 5.781638145446777, + "learning_rate": 0.0049120936204175136, + "loss": 8.3219, + "step": 153900 + }, + { + "epoch": 0.6273621560074509, + "grad_norm": 4.300761699676514, + "learning_rate": 0.004911963047811252, + "loss": 8.3777, + "step": 154000 + }, + { + "epoch": 0.6273621560074509, + "eval_MaskedAccuracy": 0.4780379626562326, + "eval_loss": 1.7531262636184692, + "eval_runtime": 613.0712, + "eval_samples_per_second": 103.538, + "eval_steps_per_second": 0.405, + "step": 154000 + }, + { + "epoch": 0.6277695340308324, + "grad_norm": 5.300182342529297, + "learning_rate": 0.004911832380044862, + "loss": 8.2957, + "step": 154100 + }, + { + "epoch": 0.6281769120542139, + "grad_norm": 7.4471025466918945, + "learning_rate": 0.004911701617123519, + "loss": 8.2896, + "step": 154200 + }, + { + "epoch": 0.6285842900775953, + "grad_norm": 5.349423408508301, + "learning_rate": 0.0049115707590523874, + "loss": 8.3444, + "step": 154300 + }, + { + "epoch": 0.6289916681009768, + "grad_norm": 6.0373053550720215, + "learning_rate": 0.004911439805836638, + "loss": 8.2733, + "step": 154400 + }, + { + "epoch": 0.6293990461243583, + "grad_norm": 8.793858528137207, + "learning_rate": 0.004911308757481456, + "loss": 8.2432, + "step": 154500 + }, + { + "epoch": 0.6298064241477397, + "grad_norm": 1.461338996887207, + "learning_rate": 0.004911177613992021, + "loss": 8.3495, + "step": 154600 + }, + { + "epoch": 0.6302138021711212, + "grad_norm": 4.147045135498047, + "learning_rate": 0.004911046375373516, + "loss": 8.5307, + "step": 154700 + }, + { + "epoch": 0.6306211801945026, + "grad_norm": 6.397643089294434, + "learning_rate": 0.004910915041631126, + "loss": 8.4319, + "step": 154800 + }, + { + "epoch": 0.6310285582178841, + "grad_norm": 6.000017166137695, + "learning_rate": 0.004910783612770045, + "loss": 8.3459, + "step": 154900 + }, + { + "epoch": 0.6314359362412656, + "grad_norm": 4.597378730773926, + "learning_rate": 0.0049106520887954724, + "loss": 8.3301, + "step": 155000 + }, + { + "epoch": 0.6314359362412656, + "eval_MaskedAccuracy": 0.4786940926800944, + "eval_loss": 1.7530438899993896, + "eval_runtime": 550.1194, + "eval_samples_per_second": 115.386, + "eval_steps_per_second": 0.451, + "step": 155000 + }, + { + "epoch": 0.631843314264647, + "grad_norm": 5.755496978759766, + "learning_rate": 0.004910520469712606, + "loss": 8.3068, + "step": 155100 + }, + { + "epoch": 0.6322506922880284, + "grad_norm": 8.617079734802246, + "learning_rate": 0.004910388755526653, + "loss": 8.2794, + "step": 155200 + }, + { + "epoch": 0.63265807031141, + "grad_norm": 2.8633389472961426, + "learning_rate": 0.004910256946242821, + "loss": 8.2812, + "step": 155300 + }, + { + "epoch": 0.6330654483347914, + "grad_norm": 4.23527193069458, + "learning_rate": 0.004910125041866313, + "loss": 8.4366, + "step": 155400 + }, + { + "epoch": 0.6334728263581728, + "grad_norm": 9.153361320495605, + "learning_rate": 0.00490999304240235, + "loss": 8.4435, + "step": 155500 + }, + { + "epoch": 0.6338802043815543, + "grad_norm": 1.2283235788345337, + "learning_rate": 0.004909860947856153, + "loss": 8.5084, + "step": 155600 + }, + { + "epoch": 0.6342875824049358, + "grad_norm": 6.179910182952881, + "learning_rate": 0.004909728758232938, + "loss": 8.4251, + "step": 155700 + }, + { + "epoch": 0.6346949604283173, + "grad_norm": 9.873910903930664, + "learning_rate": 0.004909596473537926, + "loss": 8.3496, + "step": 155800 + }, + { + "epoch": 0.6351023384516987, + "grad_norm": 0.6780279874801636, + "learning_rate": 0.004909464093776359, + "loss": 8.4366, + "step": 155900 + }, + { + "epoch": 0.6355097164750801, + "grad_norm": 6.819117546081543, + "learning_rate": 0.004909331618953468, + "loss": 8.4343, + "step": 156000 + }, + { + "epoch": 0.6355097164750801, + "eval_MaskedAccuracy": 0.4746228182690614, + "eval_loss": 1.7684835195541382, + "eval_runtime": 609.0402, + "eval_samples_per_second": 104.223, + "eval_steps_per_second": 0.407, + "step": 156000 + }, + { + "epoch": 0.6359170944984617, + "grad_norm": 7.058218955993652, + "learning_rate": 0.0049091990490744905, + "loss": 8.3529, + "step": 156100 + }, + { + "epoch": 0.6363244725218431, + "grad_norm": 2.344837188720703, + "learning_rate": 0.004909066384144668, + "loss": 8.4224, + "step": 156200 + }, + { + "epoch": 0.6367318505452245, + "grad_norm": 7.422060489654541, + "learning_rate": 0.00490893362416924, + "loss": 8.4797, + "step": 156300 + }, + { + "epoch": 0.637139228568606, + "grad_norm": 6.679649829864502, + "learning_rate": 0.004908800769153458, + "loss": 8.4663, + "step": 156400 + }, + { + "epoch": 0.6375466065919875, + "grad_norm": 1.7687937021255493, + "learning_rate": 0.004908667819102575, + "loss": 8.3526, + "step": 156500 + }, + { + "epoch": 0.6379539846153689, + "grad_norm": 4.981142997741699, + "learning_rate": 0.004908534774021848, + "loss": 8.3066, + "step": 156600 + }, + { + "epoch": 0.6383613626387504, + "grad_norm": 5.302996635437012, + "learning_rate": 0.004908401633916531, + "loss": 8.3902, + "step": 156700 + }, + { + "epoch": 0.6387687406621319, + "grad_norm": 9.538987159729004, + "learning_rate": 0.0049082683987919, + "loss": 8.3133, + "step": 156800 + }, + { + "epoch": 0.6391761186855134, + "grad_norm": 1.8474633693695068, + "learning_rate": 0.004908135068653213, + "loss": 8.3133, + "step": 156900 + }, + { + "epoch": 0.6395834967088948, + "grad_norm": 8.414902687072754, + "learning_rate": 0.004908001643505744, + "loss": 8.4922, + "step": 157000 + }, + { + "epoch": 0.6395834967088948, + "eval_MaskedAccuracy": 0.47553048548253046, + "eval_loss": 1.7700469493865967, + "eval_runtime": 577.0729, + "eval_samples_per_second": 109.996, + "eval_steps_per_second": 0.43, + "step": 157000 + }, + { + "epoch": 0.6399908747322762, + "grad_norm": 3.761073350906372, + "learning_rate": 0.004907868123354759, + "loss": 8.3444, + "step": 157100 + }, + { + "epoch": 0.6403982527556578, + "grad_norm": 1.2089842557907104, + "learning_rate": 0.004907734508205559, + "loss": 8.4, + "step": 157200 + }, + { + "epoch": 0.6408056307790392, + "grad_norm": 8.433003425598145, + "learning_rate": 0.004907600798063412, + "loss": 8.4174, + "step": 157300 + }, + { + "epoch": 0.6412130088024206, + "grad_norm": 3.711627721786499, + "learning_rate": 0.004907466992933599, + "loss": 8.3402, + "step": 157400 + }, + { + "epoch": 0.6416203868258021, + "grad_norm": 8.343791007995605, + "learning_rate": 0.004907333092821423, + "loss": 8.2904, + "step": 157500 + }, + { + "epoch": 0.6420277648491836, + "grad_norm": 4.790099620819092, + "learning_rate": 0.004907199097732165, + "loss": 8.2752, + "step": 157600 + }, + { + "epoch": 0.642435142872565, + "grad_norm": 6.770668983459473, + "learning_rate": 0.004907065007671135, + "loss": 8.2258, + "step": 157700 + }, + { + "epoch": 0.6428425208959465, + "grad_norm": 6.104489803314209, + "learning_rate": 0.0049069308226436245, + "loss": 8.2567, + "step": 157800 + }, + { + "epoch": 0.6432498989193279, + "grad_norm": 4.77765417098999, + "learning_rate": 0.004906796542654943, + "loss": 8.2373, + "step": 157900 + }, + { + "epoch": 0.6436572769427094, + "grad_norm": 30.578868865966797, + "learning_rate": 0.004906662167710398, + "loss": 8.2344, + "step": 158000 + }, + { + "epoch": 0.6436572769427094, + "eval_MaskedAccuracy": 0.4684047134907011, + "eval_loss": 1.8017596006393433, + "eval_runtime": 636.8838, + "eval_samples_per_second": 99.667, + "eval_steps_per_second": 0.389, + "step": 158000 + }, + { + "epoch": 0.6440646549660909, + "grad_norm": 4.308048248291016, + "learning_rate": 0.004906527697815302, + "loss": 8.5158, + "step": 158100 + }, + { + "epoch": 0.6444720329894723, + "grad_norm": 9.857576370239258, + "learning_rate": 0.004906393132974976, + "loss": 8.498, + "step": 158200 + }, + { + "epoch": 0.6448794110128538, + "grad_norm": 5.082419395446777, + "learning_rate": 0.004906258473194737, + "loss": 8.4119, + "step": 158300 + }, + { + "epoch": 0.6452867890362353, + "grad_norm": 1.5055186748504639, + "learning_rate": 0.004906123718479909, + "loss": 8.4332, + "step": 158400 + }, + { + "epoch": 0.6456941670596167, + "grad_norm": 5.3352580070495605, + "learning_rate": 0.004905988868835811, + "loss": 8.4906, + "step": 158500 + }, + { + "epoch": 0.6461015450829982, + "grad_norm": 3.541630268096924, + "learning_rate": 0.004905853924267782, + "loss": 8.3953, + "step": 158600 + }, + { + "epoch": 0.6465089231063796, + "grad_norm": 4.715964317321777, + "learning_rate": 0.00490571888478116, + "loss": 8.2978, + "step": 158700 + }, + { + "epoch": 0.6469163011297611, + "grad_norm": 4.749425888061523, + "learning_rate": 0.004905583750381281, + "loss": 8.4208, + "step": 158800 + }, + { + "epoch": 0.6473236791531426, + "grad_norm": 5.415860652923584, + "learning_rate": 0.004905448521073488, + "loss": 8.4136, + "step": 158900 + }, + { + "epoch": 0.647731057176524, + "grad_norm": 5.361938953399658, + "learning_rate": 0.004905313196863122, + "loss": 8.2605, + "step": 159000 + }, + { + "epoch": 0.647731057176524, + "eval_MaskedAccuracy": 0.47887928738637614, + "eval_loss": 1.7427128553390503, + "eval_runtime": 652.528, + "eval_samples_per_second": 97.277, + "eval_steps_per_second": 0.38, + "step": 159000 + }, + { + "epoch": 0.6481384351999054, + "grad_norm": 4.954558372497559, + "learning_rate": 0.004905177777755537, + "loss": 8.2781, + "step": 159100 + }, + { + "epoch": 0.648545813223287, + "grad_norm": 5.150397300720215, + "learning_rate": 0.004905042263756091, + "loss": 8.2828, + "step": 159200 + }, + { + "epoch": 0.6489531912466684, + "grad_norm": 7.26869535446167, + "learning_rate": 0.004904906654870136, + "loss": 8.254, + "step": 159300 + }, + { + "epoch": 0.6493605692700499, + "grad_norm": 2.4618828296661377, + "learning_rate": 0.004904770951103038, + "loss": 8.2973, + "step": 159400 + }, + { + "epoch": 0.6497679472934313, + "grad_norm": 9.42536449432373, + "learning_rate": 0.0049046351524601565, + "loss": 8.3643, + "step": 159500 + }, + { + "epoch": 0.6501753253168128, + "grad_norm": 8.196399688720703, + "learning_rate": 0.004904499258946864, + "loss": 8.4622, + "step": 159600 + }, + { + "epoch": 0.6505827033401943, + "grad_norm": 5.312618255615234, + "learning_rate": 0.004904363270568535, + "loss": 8.2905, + "step": 159700 + }, + { + "epoch": 0.6509900813635757, + "grad_norm": 4.90754508972168, + "learning_rate": 0.004904227187330539, + "loss": 8.2546, + "step": 159800 + }, + { + "epoch": 0.6513974593869571, + "grad_norm": 6.547611236572266, + "learning_rate": 0.004904091009238261, + "loss": 8.2654, + "step": 159900 + }, + { + "epoch": 0.6518048374103387, + "grad_norm": 4.732059478759766, + "learning_rate": 0.004903954736297084, + "loss": 8.2189, + "step": 160000 + }, + { + "epoch": 0.6518048374103387, + "eval_MaskedAccuracy": 0.48032427367158237, + "eval_loss": 1.7542961835861206, + "eval_runtime": 711.8517, + "eval_samples_per_second": 89.17, + "eval_steps_per_second": 0.348, + "step": 160000 + }, + { + "epoch": 0.6522122154337201, + "grad_norm": 4.2720184326171875, + "learning_rate": 0.004903818368512396, + "loss": 8.2616, + "step": 160100 + }, + { + "epoch": 0.6526195934571015, + "grad_norm": 5.928277492523193, + "learning_rate": 0.0049036819058895925, + "loss": 8.2555, + "step": 160200 + }, + { + "epoch": 0.653026971480483, + "grad_norm": 7.389522552490234, + "learning_rate": 0.004903545348434064, + "loss": 8.2081, + "step": 160300 + }, + { + "epoch": 0.6534343495038645, + "grad_norm": 8.315999984741211, + "learning_rate": 0.004903408696151203, + "loss": 8.2393, + "step": 160400 + }, + { + "epoch": 0.6538417275272459, + "grad_norm": 5.6302571296691895, + "learning_rate": 0.0049032719490464255, + "loss": 8.2477, + "step": 160500 + }, + { + "epoch": 0.6542491055506274, + "grad_norm": 6.31380558013916, + "learning_rate": 0.004903135107125126, + "loss": 8.2348, + "step": 160600 + }, + { + "epoch": 0.6546564835740089, + "grad_norm": 1.6448034048080444, + "learning_rate": 0.0049029981703927125, + "loss": 8.2952, + "step": 160700 + }, + { + "epoch": 0.6550638615973904, + "grad_norm": 2.5281286239624023, + "learning_rate": 0.004902861138854616, + "loss": 8.5663, + "step": 160800 + }, + { + "epoch": 0.6554712396207718, + "grad_norm": 5.702998161315918, + "learning_rate": 0.00490272401251624, + "loss": 8.5445, + "step": 160900 + }, + { + "epoch": 0.6558786176441532, + "grad_norm": 7.67074728012085, + "learning_rate": 0.004902586791383006, + "loss": 8.5403, + "step": 161000 + }, + { + "epoch": 0.6558786176441532, + "eval_MaskedAccuracy": 0.4705086464907593, + "eval_loss": 1.8039331436157227, + "eval_runtime": 632.0011, + "eval_samples_per_second": 100.437, + "eval_steps_per_second": 0.392, + "step": 161000 + }, + { + "epoch": 0.6562859956675348, + "grad_norm": 4.913644790649414, + "learning_rate": 0.004902449475460347, + "loss": 8.397, + "step": 161100 + }, + { + "epoch": 0.6566933736909162, + "grad_norm": 6.445557117462158, + "learning_rate": 0.004902312064753687, + "loss": 8.3314, + "step": 161200 + }, + { + "epoch": 0.6571007517142976, + "grad_norm": 10.202012062072754, + "learning_rate": 0.0049021745592684645, + "loss": 8.2995, + "step": 161300 + }, + { + "epoch": 0.6575081297376791, + "grad_norm": 7.1944403648376465, + "learning_rate": 0.004902036959010109, + "loss": 8.2713, + "step": 161400 + }, + { + "epoch": 0.6579155077610606, + "grad_norm": 6.703820705413818, + "learning_rate": 0.004901899263984063, + "loss": 8.265, + "step": 161500 + }, + { + "epoch": 0.658322885784442, + "grad_norm": 1.373208999633789, + "learning_rate": 0.004901761474195769, + "loss": 8.2205, + "step": 161600 + }, + { + "epoch": 0.6587302638078235, + "grad_norm": 9.944040298461914, + "learning_rate": 0.004901623589650673, + "loss": 8.4279, + "step": 161700 + }, + { + "epoch": 0.6591376418312049, + "grad_norm": 1.1036696434020996, + "learning_rate": 0.004901485610354228, + "loss": 8.4604, + "step": 161800 + }, + { + "epoch": 0.6595450198545865, + "grad_norm": 1.1476789712905884, + "learning_rate": 0.00490134753631189, + "loss": 8.5233, + "step": 161900 + }, + { + "epoch": 0.6599523978779679, + "grad_norm": 2.216991662979126, + "learning_rate": 0.004901209367529117, + "loss": 8.5094, + "step": 162000 + }, + { + "epoch": 0.6599523978779679, + "eval_MaskedAccuracy": 0.4710504473440415, + "eval_loss": 1.7816741466522217, + "eval_runtime": 691.9791, + "eval_samples_per_second": 91.731, + "eval_steps_per_second": 0.358, + "step": 162000 + }, + { + "epoch": 0.6603597759013493, + "grad_norm": 8.646736145019531, + "learning_rate": 0.004901071104011381, + "loss": 8.4393, + "step": 162100 + }, + { + "epoch": 0.6607671539247308, + "grad_norm": 4.416395664215088, + "learning_rate": 0.00490093274576413, + "loss": 8.3638, + "step": 162200 + }, + { + "epoch": 0.6611745319481123, + "grad_norm": 5.148403167724609, + "learning_rate": 0.004900794292792844, + "loss": 8.3285, + "step": 162300 + }, + { + "epoch": 0.6615819099714937, + "grad_norm": 1.9926255941390991, + "learning_rate": 0.004900655745102995, + "loss": 8.3124, + "step": 162400 + }, + { + "epoch": 0.6619892879948752, + "grad_norm": 6.878363132476807, + "learning_rate": 0.004900517102700059, + "loss": 8.3455, + "step": 162500 + }, + { + "epoch": 0.6623966660182566, + "grad_norm": 8.724032402038574, + "learning_rate": 0.004900378365589525, + "loss": 8.331, + "step": 162600 + }, + { + "epoch": 0.6628040440416381, + "grad_norm": 8.976378440856934, + "learning_rate": 0.004900239533776872, + "loss": 8.2772, + "step": 162700 + }, + { + "epoch": 0.6632114220650196, + "grad_norm": 4.16154146194458, + "learning_rate": 0.004900100607267592, + "loss": 8.2905, + "step": 162800 + }, + { + "epoch": 0.663618800088401, + "grad_norm": 10.567460060119629, + "learning_rate": 0.004899961586067173, + "loss": 8.4158, + "step": 162900 + }, + { + "epoch": 0.6640261781117824, + "grad_norm": 3.53060245513916, + "learning_rate": 0.004899822470181118, + "loss": 8.3608, + "step": 163000 + }, + { + "epoch": 0.6640261781117824, + "eval_MaskedAccuracy": 0.47414340365640284, + "eval_loss": 1.773229956626892, + "eval_runtime": 641.1106, + "eval_samples_per_second": 99.009, + "eval_steps_per_second": 0.387, + "step": 163000 + }, + { + "epoch": 0.664433556135164, + "grad_norm": 4.364983558654785, + "learning_rate": 0.0048996832596149235, + "loss": 8.3465, + "step": 163100 + }, + { + "epoch": 0.6648409341585454, + "grad_norm": 3.7606041431427, + "learning_rate": 0.004899543954374083, + "loss": 8.3469, + "step": 163200 + }, + { + "epoch": 0.6652483121819269, + "grad_norm": 5.685455799102783, + "learning_rate": 0.004899404554464118, + "loss": 8.2964, + "step": 163300 + }, + { + "epoch": 0.6656556902053083, + "grad_norm": 10.511970520019531, + "learning_rate": 0.004899265059890539, + "loss": 8.4242, + "step": 163400 + }, + { + "epoch": 0.6660630682286898, + "grad_norm": 5.845661640167236, + "learning_rate": 0.004899125470658849, + "loss": 8.4663, + "step": 163500 + }, + { + "epoch": 0.6664704462520713, + "grad_norm": 6.697454929351807, + "learning_rate": 0.004898985786774578, + "loss": 8.3532, + "step": 163600 + }, + { + "epoch": 0.6668778242754527, + "grad_norm": 6.9508376121521, + "learning_rate": 0.004898846008243234, + "loss": 8.3082, + "step": 163700 + }, + { + "epoch": 0.6672852022988341, + "grad_norm": 7.337489604949951, + "learning_rate": 0.00489870613507036, + "loss": 8.293, + "step": 163800 + }, + { + "epoch": 0.6676925803222157, + "grad_norm": 6.600030422210693, + "learning_rate": 0.00489856616726148, + "loss": 8.2511, + "step": 163900 + }, + { + "epoch": 0.6680999583455971, + "grad_norm": 8.571969032287598, + "learning_rate": 0.0048984261048221275, + "loss": 8.2392, + "step": 164000 + }, + { + "epoch": 0.6680999583455971, + "eval_MaskedAccuracy": 0.48152138193449695, + "eval_loss": 1.7331830263137817, + "eval_runtime": 525.0777, + "eval_samples_per_second": 120.889, + "eval_steps_per_second": 0.472, + "step": 164000 + }, + { + "epoch": 0.6685073363689785, + "grad_norm": 5.374701499938965, + "learning_rate": 0.004898285947757848, + "loss": 8.2564, + "step": 164100 + }, + { + "epoch": 0.66891471439236, + "grad_norm": 0.8485134840011597, + "learning_rate": 0.004898145696074172, + "loss": 8.3965, + "step": 164200 + }, + { + "epoch": 0.6693220924157415, + "grad_norm": 7.08267879486084, + "learning_rate": 0.004898005349776646, + "loss": 8.4201, + "step": 164300 + }, + { + "epoch": 0.669729470439123, + "grad_norm": 9.659616470336914, + "learning_rate": 0.004897864908870817, + "loss": 8.2908, + "step": 164400 + }, + { + "epoch": 0.6701368484625044, + "grad_norm": 2.042405366897583, + "learning_rate": 0.004897724373362244, + "loss": 8.3799, + "step": 164500 + }, + { + "epoch": 0.6705442264858859, + "grad_norm": 3.2800698280334473, + "learning_rate": 0.004897583743256474, + "loss": 8.5298, + "step": 164600 + }, + { + "epoch": 0.6709516045092674, + "grad_norm": 2.759813070297241, + "learning_rate": 0.004897443018559074, + "loss": 8.4652, + "step": 164700 + }, + { + "epoch": 0.6713589825326488, + "grad_norm": 5.804178237915039, + "learning_rate": 0.00489730219927561, + "loss": 8.3006, + "step": 164800 + }, + { + "epoch": 0.6717663605560302, + "grad_norm": 3.275555372238159, + "learning_rate": 0.004897161285411646, + "loss": 8.3407, + "step": 164900 + }, + { + "epoch": 0.6721737385794118, + "grad_norm": 6.999525547027588, + "learning_rate": 0.004897020276972746, + "loss": 8.3238, + "step": 165000 + }, + { + "epoch": 0.6721737385794118, + "eval_MaskedAccuracy": 0.47875500870127324, + "eval_loss": 1.7594870328903198, + "eval_runtime": 658.3203, + "eval_samples_per_second": 96.421, + "eval_steps_per_second": 0.377, + "step": 165000 + }, + { + "epoch": 0.6725811166027932, + "grad_norm": 5.938575267791748, + "learning_rate": 0.0048968791739644945, + "loss": 8.2815, + "step": 165100 + }, + { + "epoch": 0.6729884946261746, + "grad_norm": 9.04610824584961, + "learning_rate": 0.004896737976392463, + "loss": 8.2412, + "step": 165200 + }, + { + "epoch": 0.6733958726495561, + "grad_norm": 5.345634460449219, + "learning_rate": 0.0048965966842622445, + "loss": 8.2667, + "step": 165300 + }, + { + "epoch": 0.6738032506729376, + "grad_norm": 8.443628311157227, + "learning_rate": 0.0048964552975794134, + "loss": 8.2333, + "step": 165400 + }, + { + "epoch": 0.674210628696319, + "grad_norm": 4.402365684509277, + "learning_rate": 0.004896313816349567, + "loss": 8.3441, + "step": 165500 + }, + { + "epoch": 0.6746180067197005, + "grad_norm": 10.775688171386719, + "learning_rate": 0.0048961722405782985, + "loss": 8.3199, + "step": 165600 + }, + { + "epoch": 0.6750253847430819, + "grad_norm": 3.270447254180908, + "learning_rate": 0.004896030570271197, + "loss": 8.528, + "step": 165700 + }, + { + "epoch": 0.6754327627664635, + "grad_norm": 4.356602191925049, + "learning_rate": 0.004895888805433864, + "loss": 8.3563, + "step": 165800 + }, + { + "epoch": 0.6758401407898449, + "grad_norm": 7.3610100746154785, + "learning_rate": 0.0048957469460719095, + "loss": 8.3676, + "step": 165900 + }, + { + "epoch": 0.6762475188132263, + "grad_norm": 9.1506986618042, + "learning_rate": 0.004895604992190948, + "loss": 8.3984, + "step": 166000 + }, + { + "epoch": 0.6762475188132263, + "eval_MaskedAccuracy": 0.4770375833347552, + "eval_loss": 1.7629963159561157, + "eval_runtime": 551.11, + "eval_samples_per_second": 115.178, + "eval_steps_per_second": 0.45, + "step": 166000 + }, + { + "epoch": 0.6766548968366078, + "grad_norm": 10.568071365356445, + "learning_rate": 0.004895462943796584, + "loss": 8.2751, + "step": 166100 + }, + { + "epoch": 0.6770622748599893, + "grad_norm": 4.640679836273193, + "learning_rate": 0.004895320800894436, + "loss": 8.2899, + "step": 166200 + }, + { + "epoch": 0.6774696528833707, + "grad_norm": 7.352025032043457, + "learning_rate": 0.004895178563490116, + "loss": 8.363, + "step": 166300 + }, + { + "epoch": 0.6778770309067522, + "grad_norm": 9.487617492675781, + "learning_rate": 0.004895036231589264, + "loss": 8.3213, + "step": 166400 + }, + { + "epoch": 0.6782844089301336, + "grad_norm": 5.678429126739502, + "learning_rate": 0.004894893805197495, + "loss": 8.2882, + "step": 166500 + }, + { + "epoch": 0.6786917869535151, + "grad_norm": 6.872136116027832, + "learning_rate": 0.004894751284320436, + "loss": 8.3267, + "step": 166600 + }, + { + "epoch": 0.6790991649768966, + "grad_norm": 9.50109577178955, + "learning_rate": 0.004894608668963724, + "loss": 8.4098, + "step": 166700 + }, + { + "epoch": 0.679506543000278, + "grad_norm": 17.34059715270996, + "learning_rate": 0.004894465959133007, + "loss": 8.4633, + "step": 166800 + }, + { + "epoch": 0.6799139210236596, + "grad_norm": 1.1411833763122559, + "learning_rate": 0.004894323154833926, + "loss": 8.4391, + "step": 166900 + }, + { + "epoch": 0.680321299047041, + "grad_norm": 5.388540744781494, + "learning_rate": 0.004894180256072116, + "loss": 8.3438, + "step": 167000 + }, + { + "epoch": 0.680321299047041, + "eval_MaskedAccuracy": 0.47816457264364465, + "eval_loss": 1.7510942220687866, + "eval_runtime": 621.2353, + "eval_samples_per_second": 102.177, + "eval_steps_per_second": 0.399, + "step": 167000 + }, + { + "epoch": 0.6807286770704224, + "grad_norm": 6.737392902374268, + "learning_rate": 0.004894037262853236, + "loss": 8.2886, + "step": 167100 + }, + { + "epoch": 0.6811360550938039, + "grad_norm": 6.978421211242676, + "learning_rate": 0.00489389417518293, + "loss": 8.3475, + "step": 167200 + }, + { + "epoch": 0.6815434331171853, + "grad_norm": 0.6066324710845947, + "learning_rate": 0.0048937509930668584, + "loss": 8.3985, + "step": 167300 + }, + { + "epoch": 0.6819508111405668, + "grad_norm": 10.237112998962402, + "learning_rate": 0.0048936077165106845, + "loss": 8.4404, + "step": 167400 + }, + { + "epoch": 0.6823581891639483, + "grad_norm": 6.3599748611450195, + "learning_rate": 0.004893464345520071, + "loss": 8.3485, + "step": 167500 + }, + { + "epoch": 0.6827655671873297, + "grad_norm": 5.881896495819092, + "learning_rate": 0.004893320880100694, + "loss": 8.2703, + "step": 167600 + }, + { + "epoch": 0.6831729452107111, + "grad_norm": 18.456289291381836, + "learning_rate": 0.0048931773202582195, + "loss": 8.2824, + "step": 167700 + }, + { + "epoch": 0.6835803232340927, + "grad_norm": 6.398762226104736, + "learning_rate": 0.00489303366599832, + "loss": 8.4018, + "step": 167800 + }, + { + "epoch": 0.6839877012574741, + "grad_norm": 3.5661845207214355, + "learning_rate": 0.004892889917326678, + "loss": 8.4498, + "step": 167900 + }, + { + "epoch": 0.6843950792808555, + "grad_norm": 2.394167184829712, + "learning_rate": 0.004892746074248975, + "loss": 8.4847, + "step": 168000 + }, + { + "epoch": 0.6843950792808555, + "eval_MaskedAccuracy": 0.47158817847142775, + "eval_loss": 1.7774649858474731, + "eval_runtime": 569.0822, + "eval_samples_per_second": 111.541, + "eval_steps_per_second": 0.436, + "step": 168000 + }, + { + "epoch": 0.6848024573042371, + "grad_norm": 7.174911975860596, + "learning_rate": 0.004892602136770898, + "loss": 8.4165, + "step": 168100 + }, + { + "epoch": 0.6852098353276185, + "grad_norm": 3.2920947074890137, + "learning_rate": 0.0048924581048981446, + "loss": 8.3214, + "step": 168200 + }, + { + "epoch": 0.685617213351, + "grad_norm": 12.373799324035645, + "learning_rate": 0.004892313978636404, + "loss": 8.3641, + "step": 168300 + }, + { + "epoch": 0.6860245913743814, + "grad_norm": 3.5428054332733154, + "learning_rate": 0.004892169757991367, + "loss": 8.3479, + "step": 168400 + }, + { + "epoch": 0.6864319693977629, + "grad_norm": 5.213382244110107, + "learning_rate": 0.004892025442968745, + "loss": 8.2797, + "step": 168500 + }, + { + "epoch": 0.6868393474211444, + "grad_norm": 9.658862113952637, + "learning_rate": 0.004891881033574241, + "loss": 8.2737, + "step": 168600 + }, + { + "epoch": 0.6872467254445258, + "grad_norm": 4.990167140960693, + "learning_rate": 0.0048917365298135635, + "loss": 8.333, + "step": 168700 + }, + { + "epoch": 0.6876541034679072, + "grad_norm": 7.842627048492432, + "learning_rate": 0.0048915919316924236, + "loss": 8.3933, + "step": 168800 + }, + { + "epoch": 0.6880614814912888, + "grad_norm": 2.300215721130371, + "learning_rate": 0.004891447239216542, + "loss": 8.2848, + "step": 168900 + }, + { + "epoch": 0.6884688595146702, + "grad_norm": 4.792848110198975, + "learning_rate": 0.004891302452391639, + "loss": 8.261, + "step": 169000 + }, + { + "epoch": 0.6884688595146702, + "eval_MaskedAccuracy": 0.480171492947199, + "eval_loss": 1.7475031614303589, + "eval_runtime": 656.8193, + "eval_samples_per_second": 96.642, + "eval_steps_per_second": 0.378, + "step": 169000 + }, + { + "epoch": 0.6888762375380516, + "grad_norm": 8.848061561584473, + "learning_rate": 0.0048911575712234315, + "loss": 8.2706, + "step": 169100 + }, + { + "epoch": 0.6892836155614331, + "grad_norm": 3.295029640197754, + "learning_rate": 0.004891012595717655, + "loss": 8.3237, + "step": 169200 + }, + { + "epoch": 0.6896909935848146, + "grad_norm": 5.978100776672363, + "learning_rate": 0.004890867525880042, + "loss": 8.4107, + "step": 169300 + }, + { + "epoch": 0.6900983716081961, + "grad_norm": 0.6782967448234558, + "learning_rate": 0.0048907223617163246, + "loss": 8.4126, + "step": 169400 + }, + { + "epoch": 0.6905057496315775, + "grad_norm": 6.610276222229004, + "learning_rate": 0.004890577103232248, + "loss": 8.4434, + "step": 169500 + }, + { + "epoch": 0.6909131276549589, + "grad_norm": 0.7157424688339233, + "learning_rate": 0.004890431750433545, + "loss": 8.4295, + "step": 169600 + }, + { + "epoch": 0.6913205056783405, + "grad_norm": 10.258817672729492, + "learning_rate": 0.004890286303325968, + "loss": 8.3865, + "step": 169700 + }, + { + "epoch": 0.6917278837017219, + "grad_norm": 2.953791856765747, + "learning_rate": 0.004890140761915269, + "loss": 8.2532, + "step": 169800 + }, + { + "epoch": 0.6921352617251033, + "grad_norm": 5.6295270919799805, + "learning_rate": 0.004889995126207192, + "loss": 8.289, + "step": 169900 + }, + { + "epoch": 0.6925426397484848, + "grad_norm": 0.6412755250930786, + "learning_rate": 0.0048898493962075, + "loss": 8.2589, + "step": 170000 + }, + { + "epoch": 0.6925426397484848, + "eval_MaskedAccuracy": 0.4773358240766947, + "eval_loss": 1.7543995380401611, + "eval_runtime": 609.8017, + "eval_samples_per_second": 104.093, + "eval_steps_per_second": 0.407, + "step": 170000 + }, + { + "epoch": 0.6929500177718663, + "grad_norm": 0.6819936037063599, + "learning_rate": 0.004889703571921961, + "loss": 8.36, + "step": 170100 + }, + { + "epoch": 0.6933573957952477, + "grad_norm": 4.7626214027404785, + "learning_rate": 0.004889557653356331, + "loss": 8.4279, + "step": 170200 + }, + { + "epoch": 0.6937647738186292, + "grad_norm": 10.89247989654541, + "learning_rate": 0.004889411640516388, + "loss": 8.3306, + "step": 170300 + }, + { + "epoch": 0.6941721518420106, + "grad_norm": 0.7468447089195251, + "learning_rate": 0.004889265533407896, + "loss": 8.3002, + "step": 170400 + }, + { + "epoch": 0.6945795298653921, + "grad_norm": 3.9062106609344482, + "learning_rate": 0.004889119332036641, + "loss": 8.408, + "step": 170500 + }, + { + "epoch": 0.6949869078887736, + "grad_norm": 8.741107940673828, + "learning_rate": 0.004888973036408398, + "loss": 8.3202, + "step": 170600 + }, + { + "epoch": 0.695394285912155, + "grad_norm": 3.7794833183288574, + "learning_rate": 0.004888826646528943, + "loss": 8.3113, + "step": 170700 + }, + { + "epoch": 0.6958016639355366, + "grad_norm": 1.2744481563568115, + "learning_rate": 0.004888680162404073, + "loss": 8.4118, + "step": 170800 + }, + { + "epoch": 0.696209041958918, + "grad_norm": 9.467698097229004, + "learning_rate": 0.00488853358403958, + "loss": 8.3829, + "step": 170900 + }, + { + "epoch": 0.6966164199822994, + "grad_norm": 1.5702000856399536, + "learning_rate": 0.0048883869114412564, + "loss": 8.4425, + "step": 171000 + }, + { + "epoch": 0.6966164199822994, + "eval_MaskedAccuracy": 0.47248101723780406, + "eval_loss": 1.7865709066390991, + "eval_runtime": 538.5232, + "eval_samples_per_second": 117.87, + "eval_steps_per_second": 0.461, + "step": 171000 + }, + { + "epoch": 0.6970237980056809, + "grad_norm": 9.437499046325684, + "learning_rate": 0.004888240144614903, + "loss": 8.4318, + "step": 171100 + }, + { + "epoch": 0.6974311760290623, + "grad_norm": 3.6925880908966064, + "learning_rate": 0.004888093283566316, + "loss": 8.4064, + "step": 171200 + }, + { + "epoch": 0.6978385540524438, + "grad_norm": 5.525794506072998, + "learning_rate": 0.004887946328301304, + "loss": 8.3119, + "step": 171300 + }, + { + "epoch": 0.6982459320758253, + "grad_norm": 5.053781032562256, + "learning_rate": 0.004887799278825681, + "loss": 8.2764, + "step": 171400 + }, + { + "epoch": 0.6986533100992067, + "grad_norm": 5.076804161071777, + "learning_rate": 0.004887652135145267, + "loss": 8.2487, + "step": 171500 + }, + { + "epoch": 0.6990606881225881, + "grad_norm": 2.078188896179199, + "learning_rate": 0.004887504897265856, + "loss": 8.283, + "step": 171600 + }, + { + "epoch": 0.6994680661459697, + "grad_norm": 3.048565149307251, + "learning_rate": 0.0048873575651932925, + "loss": 8.3919, + "step": 171700 + }, + { + "epoch": 0.6998754441693511, + "grad_norm": 1.426169753074646, + "learning_rate": 0.004887210138933391, + "loss": 8.4645, + "step": 171800 + }, + { + "epoch": 0.7002828221927326, + "grad_norm": 2.3991544246673584, + "learning_rate": 0.004887062618491986, + "loss": 8.4391, + "step": 171900 + }, + { + "epoch": 0.7006902002161141, + "grad_norm": 2.98624587059021, + "learning_rate": 0.004886915003874903, + "loss": 8.4758, + "step": 172000 + }, + { + "epoch": 0.7006902002161141, + "eval_MaskedAccuracy": 0.4691883130532846, + "eval_loss": 1.790522813796997, + "eval_runtime": 515.1359, + "eval_samples_per_second": 123.222, + "eval_steps_per_second": 0.481, + "step": 172000 + }, + { + "epoch": 0.7010975782394955, + "grad_norm": 4.742863178253174, + "learning_rate": 0.004886767295087984, + "loss": 8.3807, + "step": 172100 + }, + { + "epoch": 0.701504956262877, + "grad_norm": 3.5053226947784424, + "learning_rate": 0.004886619492137062, + "loss": 8.3139, + "step": 172200 + }, + { + "epoch": 0.7019123342862584, + "grad_norm": 8.727147102355957, + "learning_rate": 0.004886471595027978, + "loss": 8.2916, + "step": 172300 + }, + { + "epoch": 0.7023197123096399, + "grad_norm": 4.358945369720459, + "learning_rate": 0.004886323603766593, + "loss": 8.287, + "step": 172400 + }, + { + "epoch": 0.7027270903330214, + "grad_norm": 4.907960891723633, + "learning_rate": 0.00488617551835875, + "loss": 8.2544, + "step": 172500 + }, + { + "epoch": 0.7031344683564028, + "grad_norm": 2.0375194549560547, + "learning_rate": 0.004886027338810299, + "loss": 8.2582, + "step": 172600 + }, + { + "epoch": 0.7035418463797842, + "grad_norm": 5.98710298538208, + "learning_rate": 0.004885879065127108, + "loss": 8.3614, + "step": 172700 + }, + { + "epoch": 0.7039492244031658, + "grad_norm": 3.117462635040283, + "learning_rate": 0.0048857306973150265, + "loss": 8.2707, + "step": 172800 + }, + { + "epoch": 0.7043566024265472, + "grad_norm": 2.485978603363037, + "learning_rate": 0.004885582235379937, + "loss": 8.3653, + "step": 172900 + }, + { + "epoch": 0.7047639804499286, + "grad_norm": 4.734591960906982, + "learning_rate": 0.004885433679327683, + "loss": 8.407, + "step": 173000 + }, + { + "epoch": 0.7047639804499286, + "eval_MaskedAccuracy": 0.47496040524555855, + "eval_loss": 1.7643200159072876, + "eval_runtime": 545.751, + "eval_samples_per_second": 116.309, + "eval_steps_per_second": 0.454, + "step": 173000 + }, + { + "epoch": 0.7051713584733101, + "grad_norm": 2.7611308097839355, + "learning_rate": 0.004885285029164163, + "loss": 8.4275, + "step": 173100 + }, + { + "epoch": 0.7055787364966916, + "grad_norm": 6.302596569061279, + "learning_rate": 0.004885136284895244, + "loss": 8.3923, + "step": 173200 + }, + { + "epoch": 0.7059861145200731, + "grad_norm": 3.493154287338257, + "learning_rate": 0.00488498744652681, + "loss": 8.3187, + "step": 173300 + }, + { + "epoch": 0.7063934925434545, + "grad_norm": 3.9514620304107666, + "learning_rate": 0.004884838514064751, + "loss": 8.2603, + "step": 173400 + }, + { + "epoch": 0.7068008705668359, + "grad_norm": 7.091209411621094, + "learning_rate": 0.004884689487514938, + "loss": 8.3939, + "step": 173500 + }, + { + "epoch": 0.7072082485902175, + "grad_norm": 8.952984809875488, + "learning_rate": 0.004884540366883272, + "loss": 8.3596, + "step": 173600 + }, + { + "epoch": 0.7076156266135989, + "grad_norm": 4.8596649169921875, + "learning_rate": 0.00488439115217565, + "loss": 8.3088, + "step": 173700 + }, + { + "epoch": 0.7080230046369803, + "grad_norm": 1.3575553894042969, + "learning_rate": 0.004884241843397972, + "loss": 8.2992, + "step": 173800 + }, + { + "epoch": 0.7084303826603618, + "grad_norm": 4.548696517944336, + "learning_rate": 0.0048840924405561354, + "loss": 8.2768, + "step": 173900 + }, + { + "epoch": 0.7088377606837433, + "grad_norm": 4.349992275238037, + "learning_rate": 0.004883942943656046, + "loss": 8.319, + "step": 174000 + }, + { + "epoch": 0.7088377606837433, + "eval_MaskedAccuracy": 0.4732281269916888, + "eval_loss": 1.7842973470687866, + "eval_runtime": 651.7468, + "eval_samples_per_second": 97.394, + "eval_steps_per_second": 0.381, + "step": 174000 + }, + { + "epoch": 0.7092451387071247, + "grad_norm": 6.522211074829102, + "learning_rate": 0.004883793352703624, + "loss": 8.3246, + "step": 174100 + }, + { + "epoch": 0.7096525167305062, + "grad_norm": 6.698734283447266, + "learning_rate": 0.00488364366770477, + "loss": 8.2695, + "step": 174200 + }, + { + "epoch": 0.7100598947538876, + "grad_norm": 3.7172343730926514, + "learning_rate": 0.004883493888665418, + "loss": 8.4204, + "step": 174300 + }, + { + "epoch": 0.7104672727772692, + "grad_norm": 3.83941388130188, + "learning_rate": 0.004883344015591483, + "loss": 8.3345, + "step": 174400 + }, + { + "epoch": 0.7108746508006506, + "grad_norm": 7.8376970291137695, + "learning_rate": 0.004883194048488885, + "loss": 8.3458, + "step": 174500 + }, + { + "epoch": 0.711282028824032, + "grad_norm": 4.667509078979492, + "learning_rate": 0.004883043987363555, + "loss": 8.2559, + "step": 174600 + }, + { + "epoch": 0.7116894068474136, + "grad_norm": 5.577969551086426, + "learning_rate": 0.004882893832221431, + "loss": 8.2355, + "step": 174700 + }, + { + "epoch": 0.712096784870795, + "grad_norm": 7.191335201263428, + "learning_rate": 0.004882743583068447, + "loss": 8.3568, + "step": 174800 + }, + { + "epoch": 0.7125041628941764, + "grad_norm": 2.6492528915405273, + "learning_rate": 0.004882593239910542, + "loss": 8.2721, + "step": 174900 + }, + { + "epoch": 0.7129115409175579, + "grad_norm": 1.0866893529891968, + "learning_rate": 0.004882442802753663, + "loss": 8.3197, + "step": 175000 + }, + { + "epoch": 0.7129115409175579, + "eval_MaskedAccuracy": 0.4768590181371025, + "eval_loss": 1.7546759843826294, + "eval_runtime": 575.3248, + "eval_samples_per_second": 110.331, + "eval_steps_per_second": 0.431, + "step": 175000 + }, + { + "epoch": 0.7133189189409394, + "grad_norm": 7.7724456787109375, + "learning_rate": 0.004882292271603746, + "loss": 8.3555, + "step": 175100 + }, + { + "epoch": 0.7137262969643208, + "grad_norm": 3.023310422897339, + "learning_rate": 0.004882141646466756, + "loss": 8.2732, + "step": 175200 + }, + { + "epoch": 0.7141336749877023, + "grad_norm": 1.2857904434204102, + "learning_rate": 0.004881990927348646, + "loss": 8.2973, + "step": 175300 + }, + { + "epoch": 0.7145410530110837, + "grad_norm": 9.354995727539062, + "learning_rate": 0.004881840114255371, + "loss": 8.437, + "step": 175400 + }, + { + "epoch": 0.7149484310344651, + "grad_norm": 7.249983310699463, + "learning_rate": 0.004881689207192886, + "loss": 8.3225, + "step": 175500 + }, + { + "epoch": 0.7153558090578467, + "grad_norm": 3.115715980529785, + "learning_rate": 0.0048815382061671705, + "loss": 8.2588, + "step": 175600 + }, + { + "epoch": 0.7157631870812281, + "grad_norm": 6.985172271728516, + "learning_rate": 0.00488138711118419, + "loss": 8.2741, + "step": 175700 + }, + { + "epoch": 0.7161705651046096, + "grad_norm": 3.081441879272461, + "learning_rate": 0.004881235922249919, + "loss": 8.326, + "step": 175800 + }, + { + "epoch": 0.7165779431279911, + "grad_norm": 4.380251407623291, + "learning_rate": 0.004881084639370334, + "loss": 8.2507, + "step": 175900 + }, + { + "epoch": 0.7169853211513725, + "grad_norm": 1.9868642091751099, + "learning_rate": 0.004880933262551412, + "loss": 8.3041, + "step": 176000 + }, + { + "epoch": 0.7169853211513725, + "eval_MaskedAccuracy": 0.47505992785756423, + "eval_loss": 1.7654045820236206, + "eval_runtime": 650.8605, + "eval_samples_per_second": 97.526, + "eval_steps_per_second": 0.381, + "step": 176000 + }, + { + "epoch": 0.717392699174754, + "grad_norm": 4.893762111663818, + "learning_rate": 0.004880781791799138, + "loss": 8.4209, + "step": 176100 + }, + { + "epoch": 0.7178000771981354, + "grad_norm": 4.261211395263672, + "learning_rate": 0.004880630227119507, + "loss": 8.3148, + "step": 176200 + }, + { + "epoch": 0.7182074552215169, + "grad_norm": 1.6230965852737427, + "learning_rate": 0.004880478568518509, + "loss": 8.4022, + "step": 176300 + }, + { + "epoch": 0.7186148332448984, + "grad_norm": 1.696700930595398, + "learning_rate": 0.004880326816002138, + "loss": 8.3866, + "step": 176400 + }, + { + "epoch": 0.7190222112682798, + "grad_norm": 5.27692985534668, + "learning_rate": 0.0048801749695763915, + "loss": 8.2963, + "step": 176500 + }, + { + "epoch": 0.7194295892916612, + "grad_norm": 5.148913383483887, + "learning_rate": 0.004880023029247276, + "loss": 8.2342, + "step": 176600 + }, + { + "epoch": 0.7198369673150428, + "grad_norm": 9.100824356079102, + "learning_rate": 0.004879870995020799, + "loss": 8.2097, + "step": 176700 + }, + { + "epoch": 0.7202443453384242, + "grad_norm": 1.109858512878418, + "learning_rate": 0.0048797188669029625, + "loss": 8.3588, + "step": 176800 + }, + { + "epoch": 0.7206517233618056, + "grad_norm": 6.3910322189331055, + "learning_rate": 0.004879566644899795, + "loss": 8.4023, + "step": 176900 + }, + { + "epoch": 0.7210591013851871, + "grad_norm": 2.662142515182495, + "learning_rate": 0.004879414329017307, + "loss": 8.2879, + "step": 177000 + }, + { + "epoch": 0.7210591013851871, + "eval_MaskedAccuracy": 0.4803536924996919, + "eval_loss": 1.7457139492034912, + "eval_runtime": 563.9514, + "eval_samples_per_second": 112.556, + "eval_steps_per_second": 0.44, + "step": 177000 + }, + { + "epoch": 0.7214664794085686, + "grad_norm": 0.49664661288261414, + "learning_rate": 0.004879261919261518, + "loss": 8.2449, + "step": 177100 + }, + { + "epoch": 0.7218738574319501, + "grad_norm": 2.5227560997009277, + "learning_rate": 0.004879109415638463, + "loss": 8.3815, + "step": 177200 + }, + { + "epoch": 0.7222812354553315, + "grad_norm": 7.061991214752197, + "learning_rate": 0.0048789568181541575, + "loss": 8.3137, + "step": 177300 + }, + { + "epoch": 0.7226886134787129, + "grad_norm": 6.182156562805176, + "learning_rate": 0.004878804126814647, + "loss": 8.2393, + "step": 177400 + }, + { + "epoch": 0.7230959915020945, + "grad_norm": 6.707774639129639, + "learning_rate": 0.004878651341625962, + "loss": 8.2374, + "step": 177500 + }, + { + "epoch": 0.7235033695254759, + "grad_norm": 8.791891098022461, + "learning_rate": 0.0048784984625941455, + "loss": 8.2205, + "step": 177600 + }, + { + "epoch": 0.7239107475488573, + "grad_norm": 3.0393781661987305, + "learning_rate": 0.004878345489725241, + "loss": 8.1966, + "step": 177700 + }, + { + "epoch": 0.7243181255722388, + "grad_norm": 7.30899715423584, + "learning_rate": 0.004878192423025295, + "loss": 8.356, + "step": 177800 + }, + { + "epoch": 0.7247255035956203, + "grad_norm": 3.005924940109253, + "learning_rate": 0.00487803926250036, + "loss": 8.3474, + "step": 177900 + }, + { + "epoch": 0.7251328816190017, + "grad_norm": 5.2232666015625, + "learning_rate": 0.004877886008156489, + "loss": 8.2996, + "step": 178000 + }, + { + "epoch": 0.7251328816190017, + "eval_MaskedAccuracy": 0.48003651713678797, + "eval_loss": 1.7420556545257568, + "eval_runtime": 587.5573, + "eval_samples_per_second": 108.034, + "eval_steps_per_second": 0.422, + "step": 178000 + }, + { + "epoch": 0.7255402596423832, + "grad_norm": 2.2677695751190186, + "learning_rate": 0.004877732659999748, + "loss": 8.357, + "step": 178100 + }, + { + "epoch": 0.7259476376657646, + "grad_norm": 8.819666862487793, + "learning_rate": 0.004877579218036192, + "loss": 8.2959, + "step": 178200 + }, + { + "epoch": 0.7263550156891462, + "grad_norm": 3.094377279281616, + "learning_rate": 0.00487742568227189, + "loss": 8.2471, + "step": 178300 + }, + { + "epoch": 0.7267623937125276, + "grad_norm": 4.4123125076293945, + "learning_rate": 0.004877272052712911, + "loss": 8.2269, + "step": 178400 + }, + { + "epoch": 0.727169771735909, + "grad_norm": 3.1336193084716797, + "learning_rate": 0.00487711832936533, + "loss": 8.3825, + "step": 178500 + }, + { + "epoch": 0.7275771497592906, + "grad_norm": 2.860325336456299, + "learning_rate": 0.004876964512235228, + "loss": 8.4471, + "step": 178600 + }, + { + "epoch": 0.727984527782672, + "grad_norm": 4.890446186065674, + "learning_rate": 0.004876810601328681, + "loss": 8.4431, + "step": 178700 + }, + { + "epoch": 0.7283919058060534, + "grad_norm": 4.390522003173828, + "learning_rate": 0.004876656596651773, + "loss": 8.305, + "step": 178800 + }, + { + "epoch": 0.7287992838294349, + "grad_norm": 6.204839706420898, + "learning_rate": 0.004876502498210598, + "loss": 8.2462, + "step": 178900 + }, + { + "epoch": 0.7292066618528164, + "grad_norm": 4.126978874206543, + "learning_rate": 0.004876348306011245, + "loss": 8.2346, + "step": 179000 + }, + { + "epoch": 0.7292066618528164, + "eval_MaskedAccuracy": 0.4809214081013844, + "eval_loss": 1.7444720268249512, + "eval_runtime": 597.017, + "eval_samples_per_second": 106.322, + "eval_steps_per_second": 0.415, + "step": 179000 + }, + { + "epoch": 0.7296140398761978, + "grad_norm": 1.363865852355957, + "learning_rate": 0.004876194020059811, + "loss": 8.3255, + "step": 179100 + }, + { + "epoch": 0.7300214178995793, + "grad_norm": 4.949935436248779, + "learning_rate": 0.004876039640362395, + "loss": 8.2939, + "step": 179200 + }, + { + "epoch": 0.7304287959229607, + "grad_norm": 6.188343048095703, + "learning_rate": 0.004875885166925098, + "loss": 8.3008, + "step": 179300 + }, + { + "epoch": 0.7308361739463421, + "grad_norm": 6.2942070960998535, + "learning_rate": 0.004875730599754032, + "loss": 8.3243, + "step": 179400 + }, + { + "epoch": 0.7312435519697237, + "grad_norm": 2.720975160598755, + "learning_rate": 0.00487557593885531, + "loss": 8.2844, + "step": 179500 + }, + { + "epoch": 0.7316509299931051, + "grad_norm": 3.3526389598846436, + "learning_rate": 0.004875421184235038, + "loss": 8.2559, + "step": 179600 + }, + { + "epoch": 0.7320583080164866, + "grad_norm": 8.793871879577637, + "learning_rate": 0.0048752663358993375, + "loss": 8.2198, + "step": 179700 + }, + { + "epoch": 0.7324656860398681, + "grad_norm": 6.481076717376709, + "learning_rate": 0.004875111393854334, + "loss": 8.186, + "step": 179800 + }, + { + "epoch": 0.7328730640632495, + "grad_norm": 6.413854122161865, + "learning_rate": 0.004874956358106152, + "loss": 8.208, + "step": 179900 + }, + { + "epoch": 0.733280442086631, + "grad_norm": 6.735567092895508, + "learning_rate": 0.004874801228660921, + "loss": 8.1682, + "step": 180000 + }, + { + "epoch": 0.733280442086631, + "eval_MaskedAccuracy": 0.482909605258875, + "eval_loss": 1.729762077331543, + "eval_runtime": 590.1032, + "eval_samples_per_second": 107.568, + "eval_steps_per_second": 0.42, + "step": 180000 + }, + { + "epoch": 0.7336878201100124, + "grad_norm": 2.869108200073242, + "learning_rate": 0.004874646005524773, + "loss": 8.3157, + "step": 180100 + }, + { + "epoch": 0.7340951981333939, + "grad_norm": 5.248936176300049, + "learning_rate": 0.0048744906887038374, + "loss": 8.3361, + "step": 180200 + }, + { + "epoch": 0.7345025761567754, + "grad_norm": 2.505849838256836, + "learning_rate": 0.0048743352782042665, + "loss": 8.4022, + "step": 180300 + }, + { + "epoch": 0.7349099541801568, + "grad_norm": 6.214993000030518, + "learning_rate": 0.0048741797740322005, + "loss": 8.3097, + "step": 180400 + }, + { + "epoch": 0.7353173322035382, + "grad_norm": 5.4167890548706055, + "learning_rate": 0.004874024176193786, + "loss": 8.3306, + "step": 180500 + }, + { + "epoch": 0.7357247102269198, + "grad_norm": 1.3875280618667603, + "learning_rate": 0.004873868484695169, + "loss": 8.3948, + "step": 180600 + }, + { + "epoch": 0.7361320882503012, + "grad_norm": 6.5398125648498535, + "learning_rate": 0.004873712699542514, + "loss": 8.3518, + "step": 180700 + }, + { + "epoch": 0.7365394662736827, + "grad_norm": 6.465213298797607, + "learning_rate": 0.004873556820741981, + "loss": 8.2596, + "step": 180800 + }, + { + "epoch": 0.7369468442970641, + "grad_norm": 7.302810192108154, + "learning_rate": 0.004873400848299721, + "loss": 8.2491, + "step": 180900 + }, + { + "epoch": 0.7373542223204456, + "grad_norm": 7.1983466148376465, + "learning_rate": 0.00487324478222191, + "loss": 8.2249, + "step": 181000 + }, + { + "epoch": 0.7373542223204456, + "eval_MaskedAccuracy": 0.4827771830212063, + "eval_loss": 1.7254915237426758, + "eval_runtime": 599.5115, + "eval_samples_per_second": 105.88, + "eval_steps_per_second": 0.414, + "step": 181000 + }, + { + "epoch": 0.7377616003438271, + "grad_norm": 7.062850475311279, + "learning_rate": 0.004873088622514718, + "loss": 8.195, + "step": 181100 + }, + { + "epoch": 0.7381689783672085, + "grad_norm": 7.937536716461182, + "learning_rate": 0.004872932369184326, + "loss": 8.1692, + "step": 181200 + }, + { + "epoch": 0.7385763563905899, + "grad_norm": 5.309225559234619, + "learning_rate": 0.004872776022236897, + "loss": 8.1978, + "step": 181300 + }, + { + "epoch": 0.7389837344139715, + "grad_norm": 3.140054225921631, + "learning_rate": 0.004872619581678617, + "loss": 8.235, + "step": 181400 + }, + { + "epoch": 0.7393911124373529, + "grad_norm": 9.557893753051758, + "learning_rate": 0.004872463047515668, + "loss": 8.3554, + "step": 181500 + }, + { + "epoch": 0.7397984904607343, + "grad_norm": 3.6274309158325195, + "learning_rate": 0.004872306419754254, + "loss": 8.2993, + "step": 181600 + }, + { + "epoch": 0.7402058684841158, + "grad_norm": 4.35516357421875, + "learning_rate": 0.0048721496984005545, + "loss": 8.2472, + "step": 181700 + }, + { + "epoch": 0.7406132465074973, + "grad_norm": 0.8326847553253174, + "learning_rate": 0.004871992883460763, + "loss": 8.3899, + "step": 181800 + }, + { + "epoch": 0.7410206245308787, + "grad_norm": 8.322855949401855, + "learning_rate": 0.0048718359749410824, + "loss": 8.3415, + "step": 181900 + }, + { + "epoch": 0.7414280025542602, + "grad_norm": 2.791518449783325, + "learning_rate": 0.004871678972847715, + "loss": 8.2363, + "step": 182000 + }, + { + "epoch": 0.7414280025542602, + "eval_MaskedAccuracy": 0.481054271312736, + "eval_loss": 1.738249659538269, + "eval_runtime": 728.8516, + "eval_samples_per_second": 87.09, + "eval_steps_per_second": 0.34, + "step": 182000 + }, + { + "epoch": 0.7418353805776416, + "grad_norm": 2.209365129470825, + "learning_rate": 0.004871521877186866, + "loss": 8.2858, + "step": 182100 + }, + { + "epoch": 0.7422427586010232, + "grad_norm": 3.326539993286133, + "learning_rate": 0.004871364687964754, + "loss": 8.317, + "step": 182200 + }, + { + "epoch": 0.7426501366244046, + "grad_norm": 6.098221302032471, + "learning_rate": 0.004871207405187595, + "loss": 8.2708, + "step": 182300 + }, + { + "epoch": 0.743057514647786, + "grad_norm": 3.575340509414673, + "learning_rate": 0.004871050028861597, + "loss": 8.2523, + "step": 182400 + }, + { + "epoch": 0.7434648926711676, + "grad_norm": 4.331416606903076, + "learning_rate": 0.004870892558992985, + "loss": 8.4662, + "step": 182500 + }, + { + "epoch": 0.743872270694549, + "grad_norm": 1.635528564453125, + "learning_rate": 0.004870734995587994, + "loss": 8.3721, + "step": 182600 + }, + { + "epoch": 0.7442796487179304, + "grad_norm": 5.0897393226623535, + "learning_rate": 0.004870577338652834, + "loss": 8.4446, + "step": 182700 + }, + { + "epoch": 0.7446870267413119, + "grad_norm": 0.853996217250824, + "learning_rate": 0.004870419588193759, + "loss": 8.4412, + "step": 182800 + }, + { + "epoch": 0.7450944047646934, + "grad_norm": 2.650754451751709, + "learning_rate": 0.004870261744216997, + "loss": 8.4779, + "step": 182900 + }, + { + "epoch": 0.7455017827880748, + "grad_norm": 3.4979281425476074, + "learning_rate": 0.004870103806728794, + "loss": 8.4302, + "step": 183000 + }, + { + "epoch": 0.7455017827880748, + "eval_MaskedAccuracy": 0.47285557428123637, + "eval_loss": 1.7717115879058838, + "eval_runtime": 338.4866, + "eval_samples_per_second": 187.529, + "eval_steps_per_second": 0.733, + "step": 183000 + }, + { + "epoch": 0.7459091608114563, + "grad_norm": 3.0517659187316895, + "learning_rate": 0.004869945775735384, + "loss": 8.4316, + "step": 183100 + }, + { + "epoch": 0.7463165388348377, + "grad_norm": 5.8500518798828125, + "learning_rate": 0.004869787651243016, + "loss": 8.3039, + "step": 183200 + }, + { + "epoch": 0.7467239168582193, + "grad_norm": 8.321171760559082, + "learning_rate": 0.004869629433257944, + "loss": 8.2616, + "step": 183300 + }, + { + "epoch": 0.7471312948816007, + "grad_norm": 3.770230293273926, + "learning_rate": 0.004869471121786423, + "loss": 8.223, + "step": 183400 + }, + { + "epoch": 0.7475386729049821, + "grad_norm": 5.35031795501709, + "learning_rate": 0.00486931271683471, + "loss": 8.2396, + "step": 183500 + }, + { + "epoch": 0.7479460509283636, + "grad_norm": 1.7099127769470215, + "learning_rate": 0.004869154218409075, + "loss": 8.2524, + "step": 183600 + }, + { + "epoch": 0.7483534289517451, + "grad_norm": 6.3081560134887695, + "learning_rate": 0.004868995626515777, + "loss": 8.3537, + "step": 183700 + }, + { + "epoch": 0.7487608069751265, + "grad_norm": 4.459376811981201, + "learning_rate": 0.004868836941161092, + "loss": 8.2728, + "step": 183800 + }, + { + "epoch": 0.749168184998508, + "grad_norm": 0.8340696096420288, + "learning_rate": 0.0048686781623512795, + "loss": 8.2427, + "step": 183900 + }, + { + "epoch": 0.7495755630218894, + "grad_norm": 5.280702590942383, + "learning_rate": 0.004868519290092641, + "loss": 8.3643, + "step": 184000 + }, + { + "epoch": 0.7495755630218894, + "eval_MaskedAccuracy": 0.4768380283348209, + "eval_loss": 1.7494298219680786, + "eval_runtime": 579.1395, + "eval_samples_per_second": 109.604, + "eval_steps_per_second": 0.428, + "step": 184000 + }, + { + "epoch": 0.7499829410452709, + "grad_norm": 4.741583824157715, + "learning_rate": 0.004868360324391437, + "loss": 8.2572, + "step": 184100 + }, + { + "epoch": 0.7503903190686524, + "grad_norm": 7.211680889129639, + "learning_rate": 0.004868201265253965, + "loss": 8.2716, + "step": 184200 + }, + { + "epoch": 0.7507976970920338, + "grad_norm": 6.9085588455200195, + "learning_rate": 0.004868042112686504, + "loss": 8.2017, + "step": 184300 + }, + { + "epoch": 0.7512050751154152, + "grad_norm": 6.018723964691162, + "learning_rate": 0.004867882866695344, + "loss": 8.3096, + "step": 184400 + }, + { + "epoch": 0.7516124531387968, + "grad_norm": 3.685755491256714, + "learning_rate": 0.004867723527286793, + "loss": 8.3765, + "step": 184500 + }, + { + "epoch": 0.7520198311621782, + "grad_norm": 8.629942893981934, + "learning_rate": 0.004867564094467144, + "loss": 8.3778, + "step": 184600 + }, + { + "epoch": 0.7524272091855597, + "grad_norm": 6.810964584350586, + "learning_rate": 0.0048674045682427, + "loss": 8.348, + "step": 184700 + }, + { + "epoch": 0.7528345872089411, + "grad_norm": 3.482246160507202, + "learning_rate": 0.004867244948619771, + "loss": 8.2458, + "step": 184800 + }, + { + "epoch": 0.7532419652323226, + "grad_norm": 6.576715469360352, + "learning_rate": 0.0048670852356046635, + "loss": 8.2585, + "step": 184900 + }, + { + "epoch": 0.7536493432557041, + "grad_norm": 5.824215888977051, + "learning_rate": 0.004866925429203695, + "loss": 8.2768, + "step": 185000 + }, + { + "epoch": 0.7536493432557041, + "eval_MaskedAccuracy": 0.4760389425166015, + "eval_loss": 1.7571865320205688, + "eval_runtime": 711.5395, + "eval_samples_per_second": 89.209, + "eval_steps_per_second": 0.349, + "step": 185000 + }, + { + "epoch": 0.7540567212790855, + "grad_norm": 6.5599164962768555, + "learning_rate": 0.004866765529423172, + "loss": 8.434, + "step": 185100 + }, + { + "epoch": 0.7544640993024669, + "grad_norm": 4.8812479972839355, + "learning_rate": 0.004866605536269431, + "loss": 8.3051, + "step": 185200 + }, + { + "epoch": 0.7548714773258485, + "grad_norm": 2.002317428588867, + "learning_rate": 0.00486644544974879, + "loss": 8.3022, + "step": 185300 + }, + { + "epoch": 0.7552788553492299, + "grad_norm": 12.033347129821777, + "learning_rate": 0.004866285269867578, + "loss": 8.3938, + "step": 185400 + }, + { + "epoch": 0.7556862333726113, + "grad_norm": 5.843840599060059, + "learning_rate": 0.004866124996632135, + "loss": 8.3901, + "step": 185500 + }, + { + "epoch": 0.7560936113959928, + "grad_norm": 11.444961547851562, + "learning_rate": 0.004865964630048786, + "loss": 8.3889, + "step": 185600 + }, + { + "epoch": 0.7565009894193743, + "grad_norm": 2.2533037662506104, + "learning_rate": 0.004865804170123884, + "loss": 8.3197, + "step": 185700 + }, + { + "epoch": 0.7569083674427558, + "grad_norm": 5.55755615234375, + "learning_rate": 0.0048656436168637756, + "loss": 8.2175, + "step": 185800 + }, + { + "epoch": 0.7573157454661372, + "grad_norm": 7.763909339904785, + "learning_rate": 0.0048654829702747876, + "loss": 8.2303, + "step": 185900 + }, + { + "epoch": 0.7577231234895186, + "grad_norm": 6.464959144592285, + "learning_rate": 0.0048653222303632815, + "loss": 8.2303, + "step": 186000 + }, + { + "epoch": 0.7577231234895186, + "eval_MaskedAccuracy": 0.47948798329779957, + "eval_loss": 1.7422912120819092, + "eval_runtime": 562.167, + "eval_samples_per_second": 112.913, + "eval_steps_per_second": 0.441, + "step": 186000 + }, + { + "epoch": 0.7581305015129002, + "grad_norm": 4.902656078338623, + "learning_rate": 0.004865161397135624, + "loss": 8.3223, + "step": 186100 + }, + { + "epoch": 0.7585378795362816, + "grad_norm": 8.939846992492676, + "learning_rate": 0.004865000470598161, + "loss": 8.2164, + "step": 186200 + }, + { + "epoch": 0.758945257559663, + "grad_norm": 7.8905792236328125, + "learning_rate": 0.004864839450757255, + "loss": 8.2015, + "step": 186300 + }, + { + "epoch": 0.7593526355830446, + "grad_norm": 3.3256075382232666, + "learning_rate": 0.004864678337619275, + "loss": 8.3425, + "step": 186400 + }, + { + "epoch": 0.759760013606426, + "grad_norm": 1.2854762077331543, + "learning_rate": 0.004864517131190597, + "loss": 8.3916, + "step": 186500 + }, + { + "epoch": 0.7601673916298074, + "grad_norm": 7.77571439743042, + "learning_rate": 0.004864355831477576, + "loss": 8.3372, + "step": 186600 + }, + { + "epoch": 0.7605747696531889, + "grad_norm": 2.4621174335479736, + "learning_rate": 0.004864194438486606, + "loss": 8.2787, + "step": 186700 + }, + { + "epoch": 0.7609821476765704, + "grad_norm": 9.451438903808594, + "learning_rate": 0.004864032952224062, + "loss": 8.2355, + "step": 186800 + }, + { + "epoch": 0.7613895256999518, + "grad_norm": 5.953160285949707, + "learning_rate": 0.004863871372696324, + "loss": 8.2821, + "step": 186900 + }, + { + "epoch": 0.7617969037233333, + "grad_norm": 2.5793473720550537, + "learning_rate": 0.004863709699909789, + "loss": 8.2502, + "step": 187000 + }, + { + "epoch": 0.7617969037233333, + "eval_MaskedAccuracy": 0.48226549821371534, + "eval_loss": 1.7357350587844849, + "eval_runtime": 549.9313, + "eval_samples_per_second": 115.425, + "eval_steps_per_second": 0.451, + "step": 187000 + }, + { + "epoch": 0.7622042817467147, + "grad_norm": 8.321205139160156, + "learning_rate": 0.0048635479338708446, + "loss": 8.2542, + "step": 187100 + }, + { + "epoch": 0.7626116597700963, + "grad_norm": 10.006879806518555, + "learning_rate": 0.004863386074585885, + "loss": 8.4086, + "step": 187200 + }, + { + "epoch": 0.7630190377934777, + "grad_norm": 4.29319953918457, + "learning_rate": 0.0048632241220613135, + "loss": 8.4075, + "step": 187300 + }, + { + "epoch": 0.7634264158168591, + "grad_norm": 3.3111531734466553, + "learning_rate": 0.004863062076303527, + "loss": 8.3403, + "step": 187400 + }, + { + "epoch": 0.7638337938402406, + "grad_norm": 4.626220703125, + "learning_rate": 0.004862899937318938, + "loss": 8.3157, + "step": 187500 + }, + { + "epoch": 0.7642411718636221, + "grad_norm": 10.10457992553711, + "learning_rate": 0.004862737705113952, + "loss": 8.3852, + "step": 187600 + }, + { + "epoch": 0.7646485498870035, + "grad_norm": 6.324151992797852, + "learning_rate": 0.004862575379694986, + "loss": 8.2689, + "step": 187700 + }, + { + "epoch": 0.765055927910385, + "grad_norm": 2.697753667831421, + "learning_rate": 0.0048624129610684505, + "loss": 8.2501, + "step": 187800 + }, + { + "epoch": 0.7654633059337664, + "grad_norm": 3.552042245864868, + "learning_rate": 0.00486225044924077, + "loss": 8.3004, + "step": 187900 + }, + { + "epoch": 0.7658706839571479, + "grad_norm": 6.898065567016602, + "learning_rate": 0.004862087844218373, + "loss": 8.3036, + "step": 188000 + }, + { + "epoch": 0.7658706839571479, + "eval_MaskedAccuracy": 0.48030488243074565, + "eval_loss": 1.7357308864593506, + "eval_runtime": 686.6961, + "eval_samples_per_second": 92.437, + "eval_steps_per_second": 0.361, + "step": 188000 + }, + { + "epoch": 0.7662780619805294, + "grad_norm": 2.1473889350891113, + "learning_rate": 0.004861925146007684, + "loss": 8.2513, + "step": 188100 + }, + { + "epoch": 0.7666854400039108, + "grad_norm": 3.4288330078125, + "learning_rate": 0.004861762354615138, + "loss": 8.403, + "step": 188200 + }, + { + "epoch": 0.7670928180272923, + "grad_norm": 2.9138879776000977, + "learning_rate": 0.004861599470047177, + "loss": 8.3305, + "step": 188300 + }, + { + "epoch": 0.7675001960506738, + "grad_norm": 6.231320381164551, + "learning_rate": 0.0048614364923102345, + "loss": 8.318, + "step": 188400 + }, + { + "epoch": 0.7679075740740552, + "grad_norm": 6.695070266723633, + "learning_rate": 0.004861273421410755, + "loss": 8.2621, + "step": 188500 + }, + { + "epoch": 0.7683149520974367, + "grad_norm": 0.6956781148910522, + "learning_rate": 0.004861110257355179, + "loss": 8.2153, + "step": 188600 + }, + { + "epoch": 0.7687223301208181, + "grad_norm": 1.3013149499893188, + "learning_rate": 0.004860947000149968, + "loss": 8.343, + "step": 188700 + }, + { + "epoch": 0.7691297081441996, + "grad_norm": 1.5729635953903198, + "learning_rate": 0.004860783649801571, + "loss": 8.3601, + "step": 188800 + }, + { + "epoch": 0.7695370861675811, + "grad_norm": 5.179935932159424, + "learning_rate": 0.004860620206316447, + "loss": 8.2648, + "step": 188900 + }, + { + "epoch": 0.7699444641909625, + "grad_norm": 4.961496829986572, + "learning_rate": 0.004860456669701059, + "loss": 8.2418, + "step": 189000 + }, + { + "epoch": 0.7699444641909625, + "eval_MaskedAccuracy": 0.4811944379313617, + "eval_loss": 1.7441548109054565, + "eval_runtime": 494.085, + "eval_samples_per_second": 128.472, + "eval_steps_per_second": 0.502, + "step": 189000 + }, + { + "epoch": 0.7703518422143439, + "grad_norm": 6.041499137878418, + "learning_rate": 0.00486029303996187, + "loss": 8.2266, + "step": 189100 + }, + { + "epoch": 0.7707592202377255, + "grad_norm": 7.916848182678223, + "learning_rate": 0.004860129317105353, + "loss": 8.3198, + "step": 189200 + }, + { + "epoch": 0.7711665982611069, + "grad_norm": 3.322267770767212, + "learning_rate": 0.00485996550113798, + "loss": 8.2506, + "step": 189300 + }, + { + "epoch": 0.7715739762844883, + "grad_norm": 4.494008541107178, + "learning_rate": 0.004859801592066219, + "loss": 8.2102, + "step": 189400 + }, + { + "epoch": 0.7719813543078698, + "grad_norm": 8.828492164611816, + "learning_rate": 0.004859637589896558, + "loss": 8.2277, + "step": 189500 + }, + { + "epoch": 0.7723887323312513, + "grad_norm": 6.599806308746338, + "learning_rate": 0.00485947349463548, + "loss": 8.353, + "step": 189600 + }, + { + "epoch": 0.7727961103546328, + "grad_norm": 0.9402397274971008, + "learning_rate": 0.004859309306289469, + "loss": 8.3772, + "step": 189700 + }, + { + "epoch": 0.7732034883780142, + "grad_norm": 21.609149932861328, + "learning_rate": 0.0048591450248650185, + "loss": 8.373, + "step": 189800 + }, + { + "epoch": 0.7736108664013956, + "grad_norm": 5.11705207824707, + "learning_rate": 0.004858980650368628, + "loss": 8.3223, + "step": 189900 + }, + { + "epoch": 0.7740182444247772, + "grad_norm": 3.844451427459717, + "learning_rate": 0.004858816182806792, + "loss": 8.2618, + "step": 190000 + }, + { + "epoch": 0.7740182444247772, + "eval_MaskedAccuracy": 0.4816098762850331, + "eval_loss": 1.7375717163085938, + "eval_runtime": 406.9675, + "eval_samples_per_second": 155.973, + "eval_steps_per_second": 0.609, + "step": 190000 + }, + { + "epoch": 0.7744256224481586, + "grad_norm": 4.231984615325928, + "learning_rate": 0.004858651622186017, + "loss": 8.2077, + "step": 190100 + }, + { + "epoch": 0.77483300047154, + "grad_norm": 4.590526103973389, + "learning_rate": 0.0048584869685128025, + "loss": 8.1978, + "step": 190200 + }, + { + "epoch": 0.7752403784949216, + "grad_norm": 4.9509429931640625, + "learning_rate": 0.004858322221793658, + "loss": 8.1997, + "step": 190300 + }, + { + "epoch": 0.775647756518303, + "grad_norm": 5.255951404571533, + "learning_rate": 0.004858157382035101, + "loss": 8.1573, + "step": 190400 + }, + { + "epoch": 0.7760551345416844, + "grad_norm": 6.821035385131836, + "learning_rate": 0.004857992449243652, + "loss": 8.1405, + "step": 190500 + }, + { + "epoch": 0.7764625125650659, + "grad_norm": 4.925536155700684, + "learning_rate": 0.004857827423425829, + "loss": 8.1715, + "step": 190600 + }, + { + "epoch": 0.7768698905884474, + "grad_norm": 7.465526580810547, + "learning_rate": 0.004857662304588147, + "loss": 8.1682, + "step": 190700 + }, + { + "epoch": 0.7772772686118289, + "grad_norm": 5.358131408691406, + "learning_rate": 0.004857497092737141, + "loss": 8.3492, + "step": 190800 + }, + { + "epoch": 0.7776846466352103, + "grad_norm": 7.660170078277588, + "learning_rate": 0.0048573317878793446, + "loss": 8.2337, + "step": 190900 + }, + { + "epoch": 0.7780920246585917, + "grad_norm": 1.671234369277954, + "learning_rate": 0.004857166390021285, + "loss": 8.3512, + "step": 191000 + }, + { + "epoch": 0.7780920246585917, + "eval_MaskedAccuracy": 0.47423356197852706, + "eval_loss": 1.7600901126861572, + "eval_runtime": 713.4928, + "eval_samples_per_second": 88.965, + "eval_steps_per_second": 0.348, + "step": 191000 + }, + { + "epoch": 0.7784994026819733, + "grad_norm": 6.394827842712402, + "learning_rate": 0.004857000899169509, + "loss": 8.4037, + "step": 191100 + }, + { + "epoch": 0.7789067807053547, + "grad_norm": 25.751123428344727, + "learning_rate": 0.004856835315330559, + "loss": 8.4075, + "step": 191200 + }, + { + "epoch": 0.7793141587287361, + "grad_norm": 2.641561269760132, + "learning_rate": 0.004856669638510987, + "loss": 8.335, + "step": 191300 + }, + { + "epoch": 0.7797215367521176, + "grad_norm": 3.0915346145629883, + "learning_rate": 0.004856503868717332, + "loss": 8.2223, + "step": 191400 + }, + { + "epoch": 0.7801289147754991, + "grad_norm": 8.381845474243164, + "learning_rate": 0.00485633800595615, + "loss": 8.2886, + "step": 191500 + }, + { + "epoch": 0.7805362927988805, + "grad_norm": 3.6453874111175537, + "learning_rate": 0.004856172050233998, + "loss": 8.3798, + "step": 191600 + }, + { + "epoch": 0.780943670822262, + "grad_norm": 7.310152530670166, + "learning_rate": 0.004856006001557444, + "loss": 8.3099, + "step": 191700 + }, + { + "epoch": 0.7813510488456434, + "grad_norm": 2.473210096359253, + "learning_rate": 0.004855839859933053, + "loss": 8.2403, + "step": 191800 + }, + { + "epoch": 0.7817584268690249, + "grad_norm": 3.5466408729553223, + "learning_rate": 0.004855673625367388, + "loss": 8.1866, + "step": 191900 + }, + { + "epoch": 0.7821658048924064, + "grad_norm": 4.585660457611084, + "learning_rate": 0.0048555072978670195, + "loss": 8.3252, + "step": 192000 + }, + { + "epoch": 0.7821658048924064, + "eval_MaskedAccuracy": 0.4768169207854122, + "eval_loss": 1.760154366493225, + "eval_runtime": 611.2653, + "eval_samples_per_second": 103.844, + "eval_steps_per_second": 0.406, + "step": 192000 + }, + { + "epoch": 0.7825731829157878, + "grad_norm": 4.345408916473389, + "learning_rate": 0.004855340877438528, + "loss": 8.3354, + "step": 192100 + }, + { + "epoch": 0.7829805609391693, + "grad_norm": 6.655874729156494, + "learning_rate": 0.004855174364088492, + "loss": 8.2959, + "step": 192200 + }, + { + "epoch": 0.7833879389625508, + "grad_norm": 3.4941017627716064, + "learning_rate": 0.004855007757823497, + "loss": 8.1996, + "step": 192300 + }, + { + "epoch": 0.7837953169859322, + "grad_norm": 4.970099925994873, + "learning_rate": 0.004854841058650124, + "loss": 8.3653, + "step": 192400 + }, + { + "epoch": 0.7842026950093137, + "grad_norm": 0.5349174737930298, + "learning_rate": 0.00485467426657497, + "loss": 8.3485, + "step": 192500 + }, + { + "epoch": 0.7846100730326951, + "grad_norm": 0.6330676078796387, + "learning_rate": 0.004854507381604621, + "loss": 8.3705, + "step": 192600 + }, + { + "epoch": 0.7850174510560766, + "grad_norm": 3.8508827686309814, + "learning_rate": 0.004854340403745676, + "loss": 8.3403, + "step": 192700 + }, + { + "epoch": 0.7854248290794581, + "grad_norm": 6.208312511444092, + "learning_rate": 0.004854173333004748, + "loss": 8.3188, + "step": 192800 + }, + { + "epoch": 0.7858322071028395, + "grad_norm": 6.375416278839111, + "learning_rate": 0.004854006169388426, + "loss": 8.2416, + "step": 192900 + }, + { + "epoch": 0.7862395851262209, + "grad_norm": 5.320347309112549, + "learning_rate": 0.004853838912903331, + "loss": 8.2092, + "step": 193000 + }, + { + "epoch": 0.7862395851262209, + "eval_MaskedAccuracy": 0.4831812609564819, + "eval_loss": 1.7338577508926392, + "eval_runtime": 646.1593, + "eval_samples_per_second": 98.236, + "eval_steps_per_second": 0.384, + "step": 193000 + }, + { + "epoch": 0.7866469631496025, + "grad_norm": 6.850183486938477, + "learning_rate": 0.0048536715635560754, + "loss": 8.2364, + "step": 193100 + }, + { + "epoch": 0.7870543411729839, + "grad_norm": 8.502579689025879, + "learning_rate": 0.0048535041213532614, + "loss": 8.2027, + "step": 193200 + }, + { + "epoch": 0.7874617191963654, + "grad_norm": 4.212939739227295, + "learning_rate": 0.0048533365863015245, + "loss": 8.3491, + "step": 193300 + }, + { + "epoch": 0.7878690972197469, + "grad_norm": 5.045016765594482, + "learning_rate": 0.004853168958407482, + "loss": 8.3094, + "step": 193400 + }, + { + "epoch": 0.7882764752431283, + "grad_norm": 2.5898854732513428, + "learning_rate": 0.0048530012376777594, + "loss": 8.3289, + "step": 193500 + }, + { + "epoch": 0.7886838532665098, + "grad_norm": 5.046809673309326, + "learning_rate": 0.0048528334241189925, + "loss": 8.2256, + "step": 193600 + }, + { + "epoch": 0.7890912312898912, + "grad_norm": 8.55445384979248, + "learning_rate": 0.0048526655177378135, + "loss": 8.2187, + "step": 193700 + }, + { + "epoch": 0.7894986093132726, + "grad_norm": 2.4749691486358643, + "learning_rate": 0.004852497518540859, + "loss": 8.1947, + "step": 193800 + }, + { + "epoch": 0.7899059873366542, + "grad_norm": 3.637096643447876, + "learning_rate": 0.004852329426534777, + "loss": 8.2319, + "step": 193900 + }, + { + "epoch": 0.7903133653600356, + "grad_norm": 4.979690074920654, + "learning_rate": 0.004852161241726202, + "loss": 8.1991, + "step": 194000 + }, + { + "epoch": 0.7903133653600356, + "eval_MaskedAccuracy": 0.4823404586882282, + "eval_loss": 1.726420283317566, + "eval_runtime": 590.1867, + "eval_samples_per_second": 107.552, + "eval_steps_per_second": 0.42, + "step": 194000 + }, + { + "epoch": 0.790720743383417, + "grad_norm": 4.382778167724609, + "learning_rate": 0.004851992964121791, + "loss": 8.2844, + "step": 194100 + }, + { + "epoch": 0.7911281214067986, + "grad_norm": 10.199458122253418, + "learning_rate": 0.004851824593728198, + "loss": 8.3121, + "step": 194200 + }, + { + "epoch": 0.79153549943018, + "grad_norm": 2.9020090103149414, + "learning_rate": 0.004851656130552078, + "loss": 8.3118, + "step": 194300 + }, + { + "epoch": 0.7919428774535614, + "grad_norm": 0.5850937366485596, + "learning_rate": 0.0048514875746000895, + "loss": 8.2712, + "step": 194400 + }, + { + "epoch": 0.7923502554769429, + "grad_norm": 2.5323173999786377, + "learning_rate": 0.004851318925878893, + "loss": 8.3253, + "step": 194500 + }, + { + "epoch": 0.7927576335003244, + "grad_norm": 1.724382996559143, + "learning_rate": 0.004851150184395162, + "loss": 8.3354, + "step": 194600 + }, + { + "epoch": 0.7931650115237059, + "grad_norm": 6.714611530303955, + "learning_rate": 0.004850981350155566, + "loss": 8.3365, + "step": 194700 + }, + { + "epoch": 0.7935723895470873, + "grad_norm": 2.9206438064575195, + "learning_rate": 0.0048508124231667745, + "loss": 8.2588, + "step": 194800 + }, + { + "epoch": 0.7939797675704687, + "grad_norm": 6.360642433166504, + "learning_rate": 0.004850643403435478, + "loss": 8.243, + "step": 194900 + }, + { + "epoch": 0.7943871455938503, + "grad_norm": 1.3880046606063843, + "learning_rate": 0.004850474290968347, + "loss": 8.304, + "step": 195000 + }, + { + "epoch": 0.7943871455938503, + "eval_MaskedAccuracy": 0.4745305035933312, + "eval_loss": 1.7650314569473267, + "eval_runtime": 612.4558, + "eval_samples_per_second": 103.642, + "eval_steps_per_second": 0.405, + "step": 195000 + }, + { + "epoch": 0.7947945236172317, + "grad_norm": 7.486367225646973, + "learning_rate": 0.004850305085772075, + "loss": 8.3045, + "step": 195100 + }, + { + "epoch": 0.7952019016406131, + "grad_norm": 6.169907569885254, + "learning_rate": 0.004850135787853349, + "loss": 8.1965, + "step": 195200 + }, + { + "epoch": 0.7956092796639946, + "grad_norm": 0.9016996026039124, + "learning_rate": 0.004849966397218864, + "loss": 8.3548, + "step": 195300 + }, + { + "epoch": 0.7960166576873761, + "grad_norm": 1.0470725297927856, + "learning_rate": 0.004849796913875317, + "loss": 8.353, + "step": 195400 + }, + { + "epoch": 0.7964240357107575, + "grad_norm": 6.29168176651001, + "learning_rate": 0.004849627337829407, + "loss": 8.289, + "step": 195500 + }, + { + "epoch": 0.796831413734139, + "grad_norm": 8.154321670532227, + "learning_rate": 0.004849457669087843, + "loss": 8.2741, + "step": 195600 + }, + { + "epoch": 0.7972387917575204, + "grad_norm": 6.071690559387207, + "learning_rate": 0.004849287907657329, + "loss": 8.2308, + "step": 195700 + }, + { + "epoch": 0.7976461697809019, + "grad_norm": 6.007944107055664, + "learning_rate": 0.004849118053544568, + "loss": 8.2006, + "step": 195800 + }, + { + "epoch": 0.7980535478042834, + "grad_norm": 3.953540563583374, + "learning_rate": 0.004848948106756286, + "loss": 8.2979, + "step": 195900 + }, + { + "epoch": 0.7984609258276648, + "grad_norm": 7.509738922119141, + "learning_rate": 0.004848778067299203, + "loss": 8.2262, + "step": 196000 + }, + { + "epoch": 0.7984609258276648, + "eval_MaskedAccuracy": 0.479848862356713, + "eval_loss": 1.7400267124176025, + "eval_runtime": 513.6673, + "eval_samples_per_second": 123.574, + "eval_steps_per_second": 0.483, + "step": 196000 + }, + { + "epoch": 0.7988683038510463, + "grad_norm": 6.1329545974731445, + "learning_rate": 0.004848607935180032, + "loss": 8.2389, + "step": 196100 + }, + { + "epoch": 0.7992756818744278, + "grad_norm": 11.7472505569458, + "learning_rate": 0.004848437710405505, + "loss": 8.2541, + "step": 196200 + }, + { + "epoch": 0.7996830598978092, + "grad_norm": 0.4944990575313568, + "learning_rate": 0.00484826739298235, + "loss": 8.3522, + "step": 196300 + }, + { + "epoch": 0.8000904379211907, + "grad_norm": 5.864392280578613, + "learning_rate": 0.004848096982917298, + "loss": 8.2947, + "step": 196400 + }, + { + "epoch": 0.8004978159445721, + "grad_norm": 5.442176342010498, + "learning_rate": 0.004847926480217097, + "loss": 8.277, + "step": 196500 + }, + { + "epoch": 0.8009051939679536, + "grad_norm": 2.9440054893493652, + "learning_rate": 0.004847755884888478, + "loss": 8.4247, + "step": 196600 + }, + { + "epoch": 0.8013125719913351, + "grad_norm": 17.107017517089844, + "learning_rate": 0.004847585196938187, + "loss": 8.3849, + "step": 196700 + }, + { + "epoch": 0.8017199500147165, + "grad_norm": 0.6312718391418457, + "learning_rate": 0.004847414416372972, + "loss": 8.3402, + "step": 196800 + }, + { + "epoch": 0.8021273280380979, + "grad_norm": 4.328444480895996, + "learning_rate": 0.004847243543199599, + "loss": 8.3553, + "step": 196900 + }, + { + "epoch": 0.8025347060614795, + "grad_norm": 4.721122741699219, + "learning_rate": 0.004847072577424805, + "loss": 8.2438, + "step": 197000 + }, + { + "epoch": 0.8025347060614795, + "eval_MaskedAccuracy": 0.4820260232919574, + "eval_loss": 1.7297272682189941, + "eval_runtime": 687.611, + "eval_samples_per_second": 92.314, + "eval_steps_per_second": 0.361, + "step": 197000 + }, + { + "epoch": 0.8029420840848609, + "grad_norm": 6.645321369171143, + "learning_rate": 0.004846901519055362, + "loss": 8.2717, + "step": 197100 + }, + { + "epoch": 0.8033494621082424, + "grad_norm": 6.457027912139893, + "learning_rate": 0.00484673036809802, + "loss": 8.2698, + "step": 197200 + }, + { + "epoch": 0.8037568401316239, + "grad_norm": 2.528052806854248, + "learning_rate": 0.004846559124559557, + "loss": 8.213, + "step": 197300 + }, + { + "epoch": 0.8041642181550053, + "grad_norm": 2.305532217025757, + "learning_rate": 0.004846387788446739, + "loss": 8.2166, + "step": 197400 + }, + { + "epoch": 0.8045715961783868, + "grad_norm": 10.579450607299805, + "learning_rate": 0.004846216359766331, + "loss": 8.3314, + "step": 197500 + }, + { + "epoch": 0.8049789742017682, + "grad_norm": 1.0553219318389893, + "learning_rate": 0.004846044838525123, + "loss": 8.3549, + "step": 197600 + }, + { + "epoch": 0.8053863522251496, + "grad_norm": 6.5963335037231445, + "learning_rate": 0.004845873224729893, + "loss": 8.3154, + "step": 197700 + }, + { + "epoch": 0.8057937302485312, + "grad_norm": 3.267850160598755, + "learning_rate": 0.004845701518387424, + "loss": 8.3146, + "step": 197800 + }, + { + "epoch": 0.8062011082719126, + "grad_norm": 3.7062668800354004, + "learning_rate": 0.004845529719504509, + "loss": 8.3205, + "step": 197900 + }, + { + "epoch": 0.806608486295294, + "grad_norm": 7.010178565979004, + "learning_rate": 0.004845357828087938, + "loss": 8.3182, + "step": 198000 + }, + { + "epoch": 0.806608486295294, + "eval_MaskedAccuracy": 0.47622369932805414, + "eval_loss": 1.7665005922317505, + "eval_runtime": 476.6637, + "eval_samples_per_second": 133.167, + "eval_steps_per_second": 0.52, + "step": 198000 + }, + { + "epoch": 0.8070158643186756, + "grad_norm": 5.485898017883301, + "learning_rate": 0.004845185844144501, + "loss": 8.2616, + "step": 198100 + }, + { + "epoch": 0.807423242342057, + "grad_norm": 4.966923713684082, + "learning_rate": 0.0048450137676810025, + "loss": 8.3116, + "step": 198200 + }, + { + "epoch": 0.8078306203654384, + "grad_norm": 6.153297424316406, + "learning_rate": 0.00484484159870424, + "loss": 8.2513, + "step": 198300 + }, + { + "epoch": 0.8082379983888199, + "grad_norm": 2.917039155960083, + "learning_rate": 0.004844669337221025, + "loss": 8.2135, + "step": 198400 + }, + { + "epoch": 0.8086453764122014, + "grad_norm": 3.5864193439483643, + "learning_rate": 0.004844496983238169, + "loss": 8.2675, + "step": 198500 + }, + { + "epoch": 0.8090527544355829, + "grad_norm": 5.927716255187988, + "learning_rate": 0.004844324536762485, + "loss": 8.2741, + "step": 198600 + }, + { + "epoch": 0.8094601324589643, + "grad_norm": 5.406866073608398, + "learning_rate": 0.0048441519978007904, + "loss": 8.1962, + "step": 198700 + }, + { + "epoch": 0.8098675104823457, + "grad_norm": 4.743386745452881, + "learning_rate": 0.004843979366359911, + "loss": 8.1696, + "step": 198800 + }, + { + "epoch": 0.8102748885057273, + "grad_norm": 3.488309621810913, + "learning_rate": 0.004843806642446664, + "loss": 8.1757, + "step": 198900 + }, + { + "epoch": 0.8106822665291087, + "grad_norm": 3.7075552940368652, + "learning_rate": 0.004843633826067882, + "loss": 8.1915, + "step": 199000 + }, + { + "epoch": 0.8106822665291087, + "eval_MaskedAccuracy": 0.479225615367175, + "eval_loss": 1.742332935333252, + "eval_runtime": 730.2844, + "eval_samples_per_second": 86.92, + "eval_steps_per_second": 0.34, + "step": 199000 + }, + { + "epoch": 0.8110896445524901, + "grad_norm": 1.1896377801895142, + "learning_rate": 0.004843460917230393, + "loss": 8.2984, + "step": 199100 + }, + { + "epoch": 0.8114970225758716, + "grad_norm": 0.9556285738945007, + "learning_rate": 0.004843287915941042, + "loss": 8.3639, + "step": 199200 + }, + { + "epoch": 0.8119044005992531, + "grad_norm": 7.886438846588135, + "learning_rate": 0.0048431148222066595, + "loss": 8.3558, + "step": 199300 + }, + { + "epoch": 0.8123117786226345, + "grad_norm": 6.792365074157715, + "learning_rate": 0.004842941636034093, + "loss": 8.3307, + "step": 199400 + }, + { + "epoch": 0.812719156646016, + "grad_norm": 3.0975301265716553, + "learning_rate": 0.0048427683574301895, + "loss": 8.2943, + "step": 199500 + }, + { + "epoch": 0.8131265346693974, + "grad_norm": 4.0706281661987305, + "learning_rate": 0.0048425949864017975, + "loss": 8.2511, + "step": 199600 + }, + { + "epoch": 0.813533912692779, + "grad_norm": 1.3213456869125366, + "learning_rate": 0.0048424215229557754, + "loss": 8.2188, + "step": 199700 + }, + { + "epoch": 0.8139412907161604, + "grad_norm": 3.0209662914276123, + "learning_rate": 0.004842247967098978, + "loss": 8.2851, + "step": 199800 + }, + { + "epoch": 0.8143486687395418, + "grad_norm": 4.856011390686035, + "learning_rate": 0.00484207431883827, + "loss": 8.203, + "step": 199900 + }, + { + "epoch": 0.8147560467629233, + "grad_norm": 4.1549296379089355, + "learning_rate": 0.004841900578180518, + "loss": 8.316, + "step": 200000 + }, + { + "epoch": 0.8147560467629233, + "eval_MaskedAccuracy": 0.47899325286543376, + "eval_loss": 1.7504117488861084, + "eval_runtime": 626.9726, + "eval_samples_per_second": 101.242, + "eval_steps_per_second": 0.396, + "step": 200000 + }, + { + "epoch": 0.8151634247863048, + "grad_norm": 6.633772373199463, + "learning_rate": 0.0048417267451325895, + "loss": 8.2596, + "step": 200100 + }, + { + "epoch": 0.8155708028096862, + "grad_norm": 2.556081533432007, + "learning_rate": 0.004841552819701353, + "loss": 8.2201, + "step": 200200 + }, + { + "epoch": 0.8159781808330677, + "grad_norm": 5.128891468048096, + "learning_rate": 0.004841378801893684, + "loss": 8.2111, + "step": 200300 + }, + { + "epoch": 0.8163855588564491, + "grad_norm": 7.334069728851318, + "learning_rate": 0.004841204691716469, + "loss": 8.201, + "step": 200400 + }, + { + "epoch": 0.8167929368798306, + "grad_norm": 2.5291171073913574, + "learning_rate": 0.004841030489176587, + "loss": 8.2851, + "step": 200500 + }, + { + "epoch": 0.8172003149032121, + "grad_norm": 3.6551196575164795, + "learning_rate": 0.004840856194280921, + "loss": 8.2998, + "step": 200600 + }, + { + "epoch": 0.8176076929265935, + "grad_norm": 5.844996452331543, + "learning_rate": 0.004840681807036361, + "loss": 8.2128, + "step": 200700 + }, + { + "epoch": 0.8180150709499749, + "grad_norm": 0.4380483627319336, + "learning_rate": 0.004840507327449816, + "loss": 8.2821, + "step": 200800 + }, + { + "epoch": 0.8184224489733565, + "grad_norm": 5.027729511260986, + "learning_rate": 0.004840332755528171, + "loss": 8.2823, + "step": 200900 + }, + { + "epoch": 0.8188298269967379, + "grad_norm": 4.696913719177246, + "learning_rate": 0.004840158091278329, + "loss": 8.2708, + "step": 201000 + }, + { + "epoch": 0.8188298269967379, + "eval_MaskedAccuracy": 0.4771777011503206, + "eval_loss": 1.7562700510025024, + "eval_runtime": 481.2755, + "eval_samples_per_second": 131.891, + "eval_steps_per_second": 0.515, + "step": 201000 + }, + { + "epoch": 0.8192372050201194, + "grad_norm": 4.170097827911377, + "learning_rate": 0.004839983334707201, + "loss": 8.2597, + "step": 201100 + }, + { + "epoch": 0.8196445830435009, + "grad_norm": 3.8776440620422363, + "learning_rate": 0.004839808485821691, + "loss": 8.2401, + "step": 201200 + }, + { + "epoch": 0.8200519610668823, + "grad_norm": 5.236380577087402, + "learning_rate": 0.004839633544628719, + "loss": 8.1898, + "step": 201300 + }, + { + "epoch": 0.8204593390902638, + "grad_norm": 0.6355003118515015, + "learning_rate": 0.004839458511135197, + "loss": 8.309, + "step": 201400 + }, + { + "epoch": 0.8208667171136452, + "grad_norm": 4.979259967803955, + "learning_rate": 0.004839283385348045, + "loss": 8.2883, + "step": 201500 + }, + { + "epoch": 0.8212740951370266, + "grad_norm": 6.542636871337891, + "learning_rate": 0.00483910816727419, + "loss": 8.1954, + "step": 201600 + }, + { + "epoch": 0.8216814731604082, + "grad_norm": 2.4118804931640625, + "learning_rate": 0.004838932856920554, + "loss": 8.2065, + "step": 201700 + }, + { + "epoch": 0.8220888511837896, + "grad_norm": 6.951291561126709, + "learning_rate": 0.004838757454294068, + "loss": 8.2174, + "step": 201800 + }, + { + "epoch": 0.822496229207171, + "grad_norm": 6.4014177322387695, + "learning_rate": 0.004838581959401661, + "loss": 8.2311, + "step": 201900 + }, + { + "epoch": 0.8229036072305526, + "grad_norm": 7.916312217712402, + "learning_rate": 0.0048384063722502835, + "loss": 8.1887, + "step": 202000 + }, + { + "epoch": 0.8229036072305526, + "eval_MaskedAccuracy": 0.47698157197199315, + "eval_loss": 1.75382399559021, + "eval_runtime": 508.5186, + "eval_samples_per_second": 124.825, + "eval_steps_per_second": 0.488, + "step": 202000 + }, + { + "epoch": 0.823310985253934, + "grad_norm": 4.69990348815918, + "learning_rate": 0.004838230692846875, + "loss": 8.2324, + "step": 202100 + }, + { + "epoch": 0.8237183632773155, + "grad_norm": 1.1547861099243164, + "learning_rate": 0.004838054921198377, + "loss": 8.3198, + "step": 202200 + }, + { + "epoch": 0.8241257413006969, + "grad_norm": 2.918012857437134, + "learning_rate": 0.004837879057311743, + "loss": 8.3413, + "step": 202300 + }, + { + "epoch": 0.8245331193240784, + "grad_norm": 7.248411655426025, + "learning_rate": 0.004837703101193917, + "loss": 8.3124, + "step": 202400 + }, + { + "epoch": 0.8249404973474599, + "grad_norm": 2.161184549331665, + "learning_rate": 0.004837527052851858, + "loss": 8.2408, + "step": 202500 + }, + { + "epoch": 0.8253478753708413, + "grad_norm": 5.13443660736084, + "learning_rate": 0.004837350912292535, + "loss": 8.2681, + "step": 202600 + }, + { + "epoch": 0.8257552533942227, + "grad_norm": 5.112154960632324, + "learning_rate": 0.0048371746795229, + "loss": 8.2744, + "step": 202700 + }, + { + "epoch": 0.8261626314176043, + "grad_norm": 5.996163368225098, + "learning_rate": 0.0048369983545499306, + "loss": 8.2252, + "step": 202800 + }, + { + "epoch": 0.8265700094409857, + "grad_norm": 4.185304164886475, + "learning_rate": 0.0048368219373806, + "loss": 8.1794, + "step": 202900 + }, + { + "epoch": 0.8269773874643671, + "grad_norm": 5.415948867797852, + "learning_rate": 0.004836645428021872, + "loss": 8.2494, + "step": 203000 + }, + { + "epoch": 0.8269773874643671, + "eval_MaskedAccuracy": 0.4802600949969667, + "eval_loss": 1.7474703788757324, + "eval_runtime": 557.2325, + "eval_samples_per_second": 113.913, + "eval_steps_per_second": 0.445, + "step": 203000 + }, + { + "epoch": 0.8273847654877486, + "grad_norm": 2.4445393085479736, + "learning_rate": 0.004836468826480733, + "loss": 8.1957, + "step": 203100 + }, + { + "epoch": 0.8277921435111301, + "grad_norm": 3.8953542709350586, + "learning_rate": 0.004836292132764158, + "loss": 8.2104, + "step": 203200 + }, + { + "epoch": 0.8281995215345115, + "grad_norm": 3.1333765983581543, + "learning_rate": 0.004836115346879137, + "loss": 8.2912, + "step": 203300 + }, + { + "epoch": 0.828606899557893, + "grad_norm": 3.902015209197998, + "learning_rate": 0.004835938468832664, + "loss": 8.2786, + "step": 203400 + }, + { + "epoch": 0.8290142775812744, + "grad_norm": 6.234856605529785, + "learning_rate": 0.004835761498631721, + "loss": 8.2223, + "step": 203500 + }, + { + "epoch": 0.829421655604656, + "grad_norm": 1.9330283403396606, + "learning_rate": 0.004835584436283312, + "loss": 8.1993, + "step": 203600 + }, + { + "epoch": 0.8298290336280374, + "grad_norm": 1.8898357152938843, + "learning_rate": 0.004835407281794435, + "loss": 8.2911, + "step": 203700 + }, + { + "epoch": 0.8302364116514188, + "grad_norm": 7.752363681793213, + "learning_rate": 0.00483523003517209, + "loss": 8.305, + "step": 203800 + }, + { + "epoch": 0.8306437896748003, + "grad_norm": 1.9813470840454102, + "learning_rate": 0.004835052696423295, + "loss": 8.3072, + "step": 203900 + }, + { + "epoch": 0.8310511676981818, + "grad_norm": 4.670867443084717, + "learning_rate": 0.004834875265555053, + "loss": 8.2232, + "step": 204000 + }, + { + "epoch": 0.8310511676981818, + "eval_MaskedAccuracy": 0.4832815956989023, + "eval_loss": 1.7318273782730103, + "eval_runtime": 478.7258, + "eval_samples_per_second": 132.594, + "eval_steps_per_second": 0.518, + "step": 204000 + }, + { + "epoch": 0.8314585457215632, + "grad_norm": 6.116964340209961, + "learning_rate": 0.004834697742574377, + "loss": 8.2046, + "step": 204100 + }, + { + "epoch": 0.8318659237449447, + "grad_norm": 2.5044636726379395, + "learning_rate": 0.004834520127488294, + "loss": 8.2273, + "step": 204200 + }, + { + "epoch": 0.8322733017683261, + "grad_norm": 4.650371551513672, + "learning_rate": 0.0048343424203038205, + "loss": 8.3231, + "step": 204300 + }, + { + "epoch": 0.8326806797917076, + "grad_norm": 2.9488110542297363, + "learning_rate": 0.00483416462102798, + "loss": 8.3034, + "step": 204400 + }, + { + "epoch": 0.8330880578150891, + "grad_norm": 4.800295352935791, + "learning_rate": 0.004833986729667807, + "loss": 8.2262, + "step": 204500 + }, + { + "epoch": 0.8334954358384705, + "grad_norm": 5.492660045623779, + "learning_rate": 0.004833808746230335, + "loss": 8.2045, + "step": 204600 + }, + { + "epoch": 0.833902813861852, + "grad_norm": 3.42221999168396, + "learning_rate": 0.004833630670722601, + "loss": 8.3011, + "step": 204700 + }, + { + "epoch": 0.8343101918852335, + "grad_norm": 5.113192558288574, + "learning_rate": 0.004833452503151636, + "loss": 8.3247, + "step": 204800 + }, + { + "epoch": 0.8347175699086149, + "grad_norm": 3.8817861080169678, + "learning_rate": 0.004833274243524488, + "loss": 8.3233, + "step": 204900 + }, + { + "epoch": 0.8351249479319964, + "grad_norm": 4.52166748046875, + "learning_rate": 0.004833095891848208, + "loss": 8.2378, + "step": 205000 + }, + { + "epoch": 0.8351249479319964, + "eval_MaskedAccuracy": 0.48239745624371705, + "eval_loss": 1.7319921255111694, + "eval_runtime": 463.2795, + "eval_samples_per_second": 137.014, + "eval_steps_per_second": 0.535, + "step": 205000 + }, + { + "epoch": 0.8355323259553779, + "grad_norm": 1.309309720993042, + "learning_rate": 0.004832917448129844, + "loss": 8.2326, + "step": 205100 + }, + { + "epoch": 0.8359397039787593, + "grad_norm": 5.92283296585083, + "learning_rate": 0.004832738912376453, + "loss": 8.32, + "step": 205200 + }, + { + "epoch": 0.8363470820021408, + "grad_norm": 3.4327287673950195, + "learning_rate": 0.004832560284595092, + "loss": 8.2146, + "step": 205300 + }, + { + "epoch": 0.8367544600255222, + "grad_norm": 3.296018600463867, + "learning_rate": 0.004832381564792831, + "loss": 8.2043, + "step": 205400 + }, + { + "epoch": 0.8371618380489037, + "grad_norm": 12.764690399169922, + "learning_rate": 0.004832202752976721, + "loss": 8.2826, + "step": 205500 + }, + { + "epoch": 0.8375692160722852, + "grad_norm": 10.4335355758667, + "learning_rate": 0.004832023849153844, + "loss": 8.3267, + "step": 205600 + }, + { + "epoch": 0.8379765940956666, + "grad_norm": 2.16225528717041, + "learning_rate": 0.00483184485333127, + "loss": 8.2872, + "step": 205700 + }, + { + "epoch": 0.838383972119048, + "grad_norm": 5.040310859680176, + "learning_rate": 0.004831665765516073, + "loss": 8.1863, + "step": 205800 + }, + { + "epoch": 0.8387913501424296, + "grad_norm": 6.956847190856934, + "learning_rate": 0.00483148658571533, + "loss": 8.1587, + "step": 205900 + }, + { + "epoch": 0.839198728165811, + "grad_norm": 3.0041253566741943, + "learning_rate": 0.004831307313936133, + "loss": 8.1822, + "step": 206000 + }, + { + "epoch": 0.839198728165811, + "eval_MaskedAccuracy": 0.48427526915216307, + "eval_loss": 1.7122535705566406, + "eval_runtime": 597.3019, + "eval_samples_per_second": 106.271, + "eval_steps_per_second": 0.415, + "step": 206000 + }, + { + "epoch": 0.8396061061891925, + "grad_norm": 12.86263370513916, + "learning_rate": 0.004831127950185568, + "loss": 8.1797, + "step": 206100 + }, + { + "epoch": 0.8400134842125739, + "grad_norm": 6.6114702224731445, + "learning_rate": 0.004830948494470716, + "loss": 8.2865, + "step": 206200 + }, + { + "epoch": 0.8404208622359554, + "grad_norm": 1.7653467655181885, + "learning_rate": 0.0048307689467986855, + "loss": 8.3733, + "step": 206300 + }, + { + "epoch": 0.8408282402593369, + "grad_norm": 4.993772983551025, + "learning_rate": 0.004830589307176563, + "loss": 8.3702, + "step": 206400 + }, + { + "epoch": 0.8412356182827183, + "grad_norm": 4.925052165985107, + "learning_rate": 0.004830409575611455, + "loss": 8.3024, + "step": 206500 + }, + { + "epoch": 0.8416429963060997, + "grad_norm": 4.0374064445495605, + "learning_rate": 0.004830229752110483, + "loss": 8.266, + "step": 206600 + }, + { + "epoch": 0.8420503743294813, + "grad_norm": 5.8857502937316895, + "learning_rate": 0.004830049836680735, + "loss": 8.2176, + "step": 206700 + }, + { + "epoch": 0.8424577523528627, + "grad_norm": 2.368356943130493, + "learning_rate": 0.004829869829329334, + "loss": 8.1805, + "step": 206800 + }, + { + "epoch": 0.8428651303762441, + "grad_norm": 1.3813912868499756, + "learning_rate": 0.0048296897300634, + "loss": 8.2906, + "step": 206900 + }, + { + "epoch": 0.8432725083996256, + "grad_norm": 2.6662254333496094, + "learning_rate": 0.004829509538890036, + "loss": 8.3135, + "step": 207000 + }, + { + "epoch": 0.8432725083996256, + "eval_MaskedAccuracy": 0.4793077358420627, + "eval_loss": 1.7458363771438599, + "eval_runtime": 594.7579, + "eval_samples_per_second": 106.726, + "eval_steps_per_second": 0.417, + "step": 207000 + }, + { + "epoch": 0.8436798864230071, + "grad_norm": 4.135429382324219, + "learning_rate": 0.004829329255816385, + "loss": 8.3306, + "step": 207100 + }, + { + "epoch": 0.8440872644463886, + "grad_norm": 4.559167861938477, + "learning_rate": 0.004829148880849567, + "loss": 8.2143, + "step": 207200 + }, + { + "epoch": 0.84449464246977, + "grad_norm": 5.941912651062012, + "learning_rate": 0.004828968413996712, + "loss": 8.2289, + "step": 207300 + }, + { + "epoch": 0.8449020204931514, + "grad_norm": 5.322763919830322, + "learning_rate": 0.00482878785526496, + "loss": 8.2137, + "step": 207400 + }, + { + "epoch": 0.845309398516533, + "grad_norm": 3.8385138511657715, + "learning_rate": 0.004828607204661446, + "loss": 8.2313, + "step": 207500 + }, + { + "epoch": 0.8457167765399144, + "grad_norm": 5.177248001098633, + "learning_rate": 0.00482842646219331, + "loss": 8.2714, + "step": 207600 + }, + { + "epoch": 0.8461241545632958, + "grad_norm": 5.964561939239502, + "learning_rate": 0.004828245627867698, + "loss": 8.2937, + "step": 207700 + }, + { + "epoch": 0.8465315325866773, + "grad_norm": 2.427088499069214, + "learning_rate": 0.004828064701691758, + "loss": 8.1818, + "step": 207800 + }, + { + "epoch": 0.8469389106100588, + "grad_norm": 8.923009872436523, + "learning_rate": 0.0048278836836726435, + "loss": 8.2604, + "step": 207900 + }, + { + "epoch": 0.8473462886334402, + "grad_norm": 2.955294370651245, + "learning_rate": 0.004827702573817521, + "loss": 8.2247, + "step": 208000 + }, + { + "epoch": 0.8473462886334402, + "eval_MaskedAccuracy": 0.4828405931102095, + "eval_loss": 1.7338587045669556, + "eval_runtime": 401.7982, + "eval_samples_per_second": 157.98, + "eval_steps_per_second": 0.617, + "step": 208000 + }, + { + "epoch": 0.8477536666568217, + "grad_norm": 4.922137260437012, + "learning_rate": 0.0048275213721335405, + "loss": 8.1972, + "step": 208100 + }, + { + "epoch": 0.8481610446802031, + "grad_norm": 7.44797945022583, + "learning_rate": 0.004827340078627866, + "loss": 8.1997, + "step": 208200 + }, + { + "epoch": 0.8485684227035846, + "grad_norm": 3.5857973098754883, + "learning_rate": 0.004827158693307668, + "loss": 8.1282, + "step": 208300 + }, + { + "epoch": 0.8489758007269661, + "grad_norm": 5.662656784057617, + "learning_rate": 0.004826977216180114, + "loss": 8.1625, + "step": 208400 + }, + { + "epoch": 0.8493831787503475, + "grad_norm": 5.123605728149414, + "learning_rate": 0.004826795647252382, + "loss": 8.1483, + "step": 208500 + }, + { + "epoch": 0.8497905567737291, + "grad_norm": 6.954997539520264, + "learning_rate": 0.0048266139865316545, + "loss": 8.1298, + "step": 208600 + }, + { + "epoch": 0.8501979347971105, + "grad_norm": 5.438270092010498, + "learning_rate": 0.004826432234025106, + "loss": 8.1321, + "step": 208700 + }, + { + "epoch": 0.8506053128204919, + "grad_norm": 6.686572551727295, + "learning_rate": 0.004826250389739933, + "loss": 8.0882, + "step": 208800 + }, + { + "epoch": 0.8510126908438734, + "grad_norm": 5.251838207244873, + "learning_rate": 0.004826068453683314, + "loss": 8.1225, + "step": 208900 + }, + { + "epoch": 0.8514200688672549, + "grad_norm": 4.220588684082031, + "learning_rate": 0.004825886425862444, + "loss": 8.1675, + "step": 209000 + }, + { + "epoch": 0.8514200688672549, + "eval_MaskedAccuracy": 0.48592952135121487, + "eval_loss": 1.7134249210357666, + "eval_runtime": 416.3753, + "eval_samples_per_second": 152.449, + "eval_steps_per_second": 0.596, + "step": 209000 + }, + { + "epoch": 0.8518274468906363, + "grad_norm": 8.761860847473145, + "learning_rate": 0.004825704306284519, + "loss": 8.0808, + "step": 209100 + }, + { + "epoch": 0.8522348249140178, + "grad_norm": 4.385628700256348, + "learning_rate": 0.004825522094956739, + "loss": 8.1519, + "step": 209200 + }, + { + "epoch": 0.8526422029373992, + "grad_norm": 6.701074600219727, + "learning_rate": 0.004825339791886317, + "loss": 8.1493, + "step": 209300 + }, + { + "epoch": 0.8530495809607807, + "grad_norm": 4.271905422210693, + "learning_rate": 0.004825157397080448, + "loss": 8.131, + "step": 209400 + }, + { + "epoch": 0.8534569589841622, + "grad_norm": 5.665177345275879, + "learning_rate": 0.004824974910546352, + "loss": 8.1649, + "step": 209500 + }, + { + "epoch": 0.8538643370075436, + "grad_norm": 1.0476863384246826, + "learning_rate": 0.004824792332291235, + "loss": 8.2765, + "step": 209600 + }, + { + "epoch": 0.8542717150309251, + "grad_norm": 5.7169928550720215, + "learning_rate": 0.004824609662322323, + "loss": 8.3905, + "step": 209700 + }, + { + "epoch": 0.8546790930543066, + "grad_norm": 3.344644784927368, + "learning_rate": 0.004824426900646833, + "loss": 8.327, + "step": 209800 + }, + { + "epoch": 0.855086471077688, + "grad_norm": 3.115905523300171, + "learning_rate": 0.004824244047271991, + "loss": 8.2777, + "step": 209900 + }, + { + "epoch": 0.8554938491010695, + "grad_norm": 2.913144111633301, + "learning_rate": 0.004824061102205035, + "loss": 8.2966, + "step": 210000 + }, + { + "epoch": 0.8554938491010695, + "eval_MaskedAccuracy": 0.4802323235623799, + "eval_loss": 1.7398486137390137, + "eval_runtime": 612.4014, + "eval_samples_per_second": 103.651, + "eval_steps_per_second": 0.405, + "step": 210000 + }, + { + "epoch": 0.8559012271244509, + "grad_norm": 4.763663291931152, + "learning_rate": 0.0048238780654531865, + "loss": 8.1959, + "step": 210100 + }, + { + "epoch": 0.8563086051478324, + "grad_norm": 7.354310512542725, + "learning_rate": 0.004823694937023684, + "loss": 8.198, + "step": 210200 + }, + { + "epoch": 0.8567159831712139, + "grad_norm": 1.976967692375183, + "learning_rate": 0.0048235117169237686, + "loss": 8.3064, + "step": 210300 + }, + { + "epoch": 0.8571233611945953, + "grad_norm": 5.958095073699951, + "learning_rate": 0.004823328405160683, + "loss": 8.3467, + "step": 210400 + }, + { + "epoch": 0.8575307392179767, + "grad_norm": 5.468306064605713, + "learning_rate": 0.004823145001741677, + "loss": 8.2654, + "step": 210500 + }, + { + "epoch": 0.8579381172413583, + "grad_norm": 3.822810173034668, + "learning_rate": 0.004822961506674, + "loss": 8.2077, + "step": 210600 + }, + { + "epoch": 0.8583454952647397, + "grad_norm": 7.963845252990723, + "learning_rate": 0.004822777919964915, + "loss": 8.1987, + "step": 210700 + }, + { + "epoch": 0.8587528732881211, + "grad_norm": 8.342864990234375, + "learning_rate": 0.004822594241621665, + "loss": 8.2783, + "step": 210800 + }, + { + "epoch": 0.8591602513115026, + "grad_norm": 5.5617852210998535, + "learning_rate": 0.004822410471651524, + "loss": 8.3149, + "step": 210900 + }, + { + "epoch": 0.8595676293348841, + "grad_norm": 3.1351521015167236, + "learning_rate": 0.004822226610061748, + "loss": 8.1942, + "step": 211000 + }, + { + "epoch": 0.8595676293348841, + "eval_MaskedAccuracy": 0.48314665494330045, + "eval_loss": 1.7261412143707275, + "eval_runtime": 672.4997, + "eval_samples_per_second": 94.388, + "eval_steps_per_second": 0.369, + "step": 211000 + }, + { + "epoch": 0.8599750073582656, + "grad_norm": 4.378059387207031, + "learning_rate": 0.004822042656859616, + "loss": 8.205, + "step": 211100 + }, + { + "epoch": 0.860382385381647, + "grad_norm": 3.8245508670806885, + "learning_rate": 0.0048218586120523915, + "loss": 8.2617, + "step": 211200 + }, + { + "epoch": 0.8607897634050284, + "grad_norm": 3.023498773574829, + "learning_rate": 0.004821674475647355, + "loss": 8.2805, + "step": 211300 + }, + { + "epoch": 0.86119714142841, + "grad_norm": 5.668664455413818, + "learning_rate": 0.004821490247651791, + "loss": 8.1826, + "step": 211400 + }, + { + "epoch": 0.8616045194517914, + "grad_norm": 5.902626991271973, + "learning_rate": 0.004821305928072968, + "loss": 8.1585, + "step": 211500 + }, + { + "epoch": 0.8620118974751728, + "grad_norm": 4.480663299560547, + "learning_rate": 0.004821121516918187, + "loss": 8.1259, + "step": 211600 + }, + { + "epoch": 0.8624192754985543, + "grad_norm": 7.29260778427124, + "learning_rate": 0.004820937014194738, + "loss": 8.1503, + "step": 211700 + }, + { + "epoch": 0.8628266535219358, + "grad_norm": 3.6903669834136963, + "learning_rate": 0.004820752419909912, + "loss": 8.1551, + "step": 211800 + }, + { + "epoch": 0.8632340315453172, + "grad_norm": 4.0630388259887695, + "learning_rate": 0.00482056773407101, + "loss": 8.1472, + "step": 211900 + }, + { + "epoch": 0.8636414095686987, + "grad_norm": 4.894790172576904, + "learning_rate": 0.004820382956685329, + "loss": 8.1251, + "step": 212000 + }, + { + "epoch": 0.8636414095686987, + "eval_MaskedAccuracy": 0.48551628369224803, + "eval_loss": 1.7234163284301758, + "eval_runtime": 639.0434, + "eval_samples_per_second": 99.33, + "eval_steps_per_second": 0.388, + "step": 212000 + }, + { + "epoch": 0.8640487875920801, + "grad_norm": 5.390108585357666, + "learning_rate": 0.00482019808776018, + "loss": 8.1315, + "step": 212100 + }, + { + "epoch": 0.8644561656154617, + "grad_norm": 5.917113780975342, + "learning_rate": 0.00482001312730286, + "loss": 8.1449, + "step": 212200 + }, + { + "epoch": 0.8648635436388431, + "grad_norm": 6.876038551330566, + "learning_rate": 0.004819828075320687, + "loss": 8.108, + "step": 212300 + }, + { + "epoch": 0.8652709216622245, + "grad_norm": 4.333359241485596, + "learning_rate": 0.004819642931820984, + "loss": 8.1293, + "step": 212400 + }, + { + "epoch": 0.8656782996856061, + "grad_norm": 4.042921543121338, + "learning_rate": 0.0048194576968110725, + "loss": 8.2434, + "step": 212500 + }, + { + "epoch": 0.8660856777089875, + "grad_norm": 20.584726333618164, + "learning_rate": 0.004819272370298266, + "loss": 8.3424, + "step": 212600 + }, + { + "epoch": 0.8664930557323689, + "grad_norm": 0.9013800621032715, + "learning_rate": 0.004819086952289896, + "loss": 8.4333, + "step": 212700 + }, + { + "epoch": 0.8669004337557504, + "grad_norm": 2.704648971557617, + "learning_rate": 0.004818901442793292, + "loss": 8.3012, + "step": 212800 + }, + { + "epoch": 0.8673078117791319, + "grad_norm": 5.314194679260254, + "learning_rate": 0.00481871584181579, + "loss": 8.2101, + "step": 212900 + }, + { + "epoch": 0.8677151898025133, + "grad_norm": 5.427734851837158, + "learning_rate": 0.004818530149364725, + "loss": 8.1636, + "step": 213000 + }, + { + "epoch": 0.8677151898025133, + "eval_MaskedAccuracy": 0.4840721691740113, + "eval_loss": 1.7264434099197388, + "eval_runtime": 526.1976, + "eval_samples_per_second": 120.631, + "eval_steps_per_second": 0.471, + "step": 213000 + }, + { + "epoch": 0.8681225678258948, + "grad_norm": 3.8905630111694336, + "learning_rate": 0.004818344365447432, + "loss": 8.1378, + "step": 213100 + }, + { + "epoch": 0.8685299458492762, + "grad_norm": 3.7320637702941895, + "learning_rate": 0.004818158490071262, + "loss": 8.1892, + "step": 213200 + }, + { + "epoch": 0.8689373238726577, + "grad_norm": 6.944694995880127, + "learning_rate": 0.004817972523243571, + "loss": 8.1385, + "step": 213300 + }, + { + "epoch": 0.8693447018960392, + "grad_norm": 3.708341598510742, + "learning_rate": 0.004817786464971709, + "loss": 8.1633, + "step": 213400 + }, + { + "epoch": 0.8697520799194206, + "grad_norm": 4.6970720291137695, + "learning_rate": 0.004817600315263026, + "loss": 8.1282, + "step": 213500 + }, + { + "epoch": 0.8701594579428021, + "grad_norm": 4.433615207672119, + "learning_rate": 0.0048174140741248815, + "loss": 8.1219, + "step": 213600 + }, + { + "epoch": 0.8705668359661836, + "grad_norm": 3.3684661388397217, + "learning_rate": 0.0048172277415646425, + "loss": 8.1104, + "step": 213700 + }, + { + "epoch": 0.870974213989565, + "grad_norm": 5.546548366546631, + "learning_rate": 0.004817041317589675, + "loss": 8.1474, + "step": 213800 + }, + { + "epoch": 0.8713815920129465, + "grad_norm": 4.353720188140869, + "learning_rate": 0.004816854802207345, + "loss": 8.1071, + "step": 213900 + }, + { + "epoch": 0.8717889700363279, + "grad_norm": 3.027634859085083, + "learning_rate": 0.0048166681954250265, + "loss": 8.097, + "step": 214000 + }, + { + "epoch": 0.8717889700363279, + "eval_MaskedAccuracy": 0.4856966584998987, + "eval_loss": 1.7125508785247803, + "eval_runtime": 593.1009, + "eval_samples_per_second": 107.024, + "eval_steps_per_second": 0.418, + "step": 214000 + }, + { + "epoch": 0.8721963480597094, + "grad_norm": 6.464120388031006, + "learning_rate": 0.00481648149725011, + "loss": 8.2545, + "step": 214100 + }, + { + "epoch": 0.8726037260830909, + "grad_norm": 1.8093879222869873, + "learning_rate": 0.004816294707689969, + "loss": 8.3416, + "step": 214200 + }, + { + "epoch": 0.8730111041064723, + "grad_norm": 9.328423500061035, + "learning_rate": 0.004816107826751984, + "loss": 8.3573, + "step": 214300 + }, + { + "epoch": 0.8734184821298537, + "grad_norm": 6.261687755584717, + "learning_rate": 0.004815920854443545, + "loss": 8.3704, + "step": 214400 + }, + { + "epoch": 0.8738258601532353, + "grad_norm": 5.908254623413086, + "learning_rate": 0.0048157337907720425, + "loss": 8.2167, + "step": 214500 + }, + { + "epoch": 0.8742332381766167, + "grad_norm": 4.408290386199951, + "learning_rate": 0.004815546635744878, + "loss": 8.2114, + "step": 214600 + }, + { + "epoch": 0.8746406161999982, + "grad_norm": 6.703604221343994, + "learning_rate": 0.004815359389369443, + "loss": 8.2206, + "step": 214700 + }, + { + "epoch": 0.8750479942233796, + "grad_norm": 4.957728385925293, + "learning_rate": 0.004815172051653151, + "loss": 8.2255, + "step": 214800 + }, + { + "epoch": 0.8754553722467611, + "grad_norm": 8.815536499023438, + "learning_rate": 0.004814984622603397, + "loss": 8.229, + "step": 214900 + }, + { + "epoch": 0.8758627502701426, + "grad_norm": 7.434412956237793, + "learning_rate": 0.0048147971022275944, + "loss": 8.3109, + "step": 215000 + }, + { + "epoch": 0.8758627502701426, + "eval_MaskedAccuracy": 0.4821472093482054, + "eval_loss": 1.7343100309371948, + "eval_runtime": 575.8219, + "eval_samples_per_second": 110.235, + "eval_steps_per_second": 0.431, + "step": 215000 + }, + { + "epoch": 0.876270128293524, + "grad_norm": 3.0055723190307617, + "learning_rate": 0.004814609490533161, + "loss": 8.239, + "step": 215100 + }, + { + "epoch": 0.8766775063169054, + "grad_norm": 2.9075305461883545, + "learning_rate": 0.004814421787527511, + "loss": 8.2266, + "step": 215200 + }, + { + "epoch": 0.877084884340287, + "grad_norm": 2.666818141937256, + "learning_rate": 0.004814233993218074, + "loss": 8.271, + "step": 215300 + }, + { + "epoch": 0.8774922623636684, + "grad_norm": 5.189484119415283, + "learning_rate": 0.0048140461076122635, + "loss": 8.2923, + "step": 215400 + }, + { + "epoch": 0.8778996403870498, + "grad_norm": 2.3578219413757324, + "learning_rate": 0.00481385813071751, + "loss": 8.2224, + "step": 215500 + }, + { + "epoch": 0.8783070184104314, + "grad_norm": 5.166990280151367, + "learning_rate": 0.004813670062541242, + "loss": 8.1742, + "step": 215600 + }, + { + "epoch": 0.8787143964338128, + "grad_norm": 3.812676429748535, + "learning_rate": 0.004813481903090902, + "loss": 8.2637, + "step": 215700 + }, + { + "epoch": 0.8791217744571942, + "grad_norm": 4.2266645431518555, + "learning_rate": 0.004813293652373921, + "loss": 8.3079, + "step": 215800 + }, + { + "epoch": 0.8795291524805757, + "grad_norm": 7.559784889221191, + "learning_rate": 0.004813105310397747, + "loss": 8.2331, + "step": 215900 + }, + { + "epoch": 0.8799365305039571, + "grad_norm": 2.5249407291412354, + "learning_rate": 0.004812916877169828, + "loss": 8.2012, + "step": 216000 + }, + { + "epoch": 0.8799365305039571, + "eval_MaskedAccuracy": 0.4843116807781347, + "eval_loss": 1.7242289781570435, + "eval_runtime": 619.7865, + "eval_samples_per_second": 102.416, + "eval_steps_per_second": 0.4, + "step": 216000 + }, + { + "epoch": 0.8803439085273387, + "grad_norm": 7.770275592803955, + "learning_rate": 0.004812728352697608, + "loss": 8.2158, + "step": 216100 + }, + { + "epoch": 0.8807512865507201, + "grad_norm": 5.232234477996826, + "learning_rate": 0.004812539736988547, + "loss": 8.3067, + "step": 216200 + }, + { + "epoch": 0.8811586645741015, + "grad_norm": 6.027997016906738, + "learning_rate": 0.004812351030050096, + "loss": 8.1876, + "step": 216300 + }, + { + "epoch": 0.8815660425974831, + "grad_norm": 3.0700552463531494, + "learning_rate": 0.004812162231889718, + "loss": 8.1517, + "step": 216400 + }, + { + "epoch": 0.8819734206208645, + "grad_norm": 2.3771629333496094, + "learning_rate": 0.004811973342514877, + "loss": 8.1428, + "step": 216500 + }, + { + "epoch": 0.8823807986442459, + "grad_norm": 2.1454079151153564, + "learning_rate": 0.0048117843619330455, + "loss": 8.2716, + "step": 216600 + }, + { + "epoch": 0.8827881766676274, + "grad_norm": 6.59639835357666, + "learning_rate": 0.004811595290151687, + "loss": 8.2629, + "step": 216700 + }, + { + "epoch": 0.8831955546910089, + "grad_norm": 3.637233257293701, + "learning_rate": 0.004811406127178285, + "loss": 8.1797, + "step": 216800 + }, + { + "epoch": 0.8836029327143903, + "grad_norm": 5.26485013961792, + "learning_rate": 0.004811216873020314, + "loss": 8.144, + "step": 216900 + }, + { + "epoch": 0.8840103107377718, + "grad_norm": 3.676942825317383, + "learning_rate": 0.004811027527685249, + "loss": 8.1487, + "step": 217000 + }, + { + "epoch": 0.8840103107377718, + "eval_MaskedAccuracy": 0.4837425891402228, + "eval_loss": 1.7248014211654663, + "eval_runtime": 715.549, + "eval_samples_per_second": 88.71, + "eval_steps_per_second": 0.347, + "step": 217000 + }, + { + "epoch": 0.8844176887611532, + "grad_norm": 5.9648356437683105, + "learning_rate": 0.00481083809118058, + "loss": 8.2781, + "step": 217100 + }, + { + "epoch": 0.8848250667845347, + "grad_norm": 2.26716947555542, + "learning_rate": 0.004810648563513804, + "loss": 8.3153, + "step": 217200 + }, + { + "epoch": 0.8852324448079162, + "grad_norm": 4.865094184875488, + "learning_rate": 0.004810458944692406, + "loss": 8.3395, + "step": 217300 + }, + { + "epoch": 0.8856398228312976, + "grad_norm": 3.7103941440582275, + "learning_rate": 0.004810269234723887, + "loss": 8.3044, + "step": 217400 + }, + { + "epoch": 0.8860472008546791, + "grad_norm": 6.04404878616333, + "learning_rate": 0.004810079433615747, + "loss": 8.2884, + "step": 217500 + }, + { + "epoch": 0.8864545788780606, + "grad_norm": 8.202356338500977, + "learning_rate": 0.004809889541375489, + "loss": 8.2241, + "step": 217600 + }, + { + "epoch": 0.886861956901442, + "grad_norm": 4.739055156707764, + "learning_rate": 0.004809699558010626, + "loss": 8.2208, + "step": 217700 + }, + { + "epoch": 0.8872693349248235, + "grad_norm": 4.190792560577393, + "learning_rate": 0.0048095094835286555, + "loss": 8.2104, + "step": 217800 + }, + { + "epoch": 0.8876767129482049, + "grad_norm": 4.8854241371154785, + "learning_rate": 0.0048093193179371, + "loss": 8.1439, + "step": 217900 + }, + { + "epoch": 0.8880840909715864, + "grad_norm": 5.460559368133545, + "learning_rate": 0.004809129061243479, + "loss": 8.1067, + "step": 218000 + }, + { + "epoch": 0.8880840909715864, + "eval_MaskedAccuracy": 0.4865924815837562, + "eval_loss": 1.7140520811080933, + "eval_runtime": 552.9316, + "eval_samples_per_second": 114.799, + "eval_steps_per_second": 0.449, + "step": 218000 + }, + { + "epoch": 0.8884914689949679, + "grad_norm": 12.777691841125488, + "learning_rate": 0.0048089387134553135, + "loss": 8.1755, + "step": 218100 + }, + { + "epoch": 0.8888988470183493, + "grad_norm": 5.406383991241455, + "learning_rate": 0.004808748274580128, + "loss": 8.2656, + "step": 218200 + }, + { + "epoch": 0.8893062250417307, + "grad_norm": 6.635910511016846, + "learning_rate": 0.004808557744625446, + "loss": 8.2507, + "step": 218300 + }, + { + "epoch": 0.8897136030651123, + "grad_norm": 5.01666259765625, + "learning_rate": 0.004808367123598811, + "loss": 8.1709, + "step": 218400 + }, + { + "epoch": 0.8901209810884937, + "grad_norm": 1.3777137994766235, + "learning_rate": 0.004808176411507757, + "loss": 8.1806, + "step": 218500 + }, + { + "epoch": 0.8905283591118752, + "grad_norm": 0.8259178996086121, + "learning_rate": 0.004807985608359816, + "loss": 8.2648, + "step": 218600 + }, + { + "epoch": 0.8909357371352566, + "grad_norm": 5.552270889282227, + "learning_rate": 0.004807794714162537, + "loss": 8.3019, + "step": 218700 + }, + { + "epoch": 0.8913431151586381, + "grad_norm": 5.504025459289551, + "learning_rate": 0.004807603728923469, + "loss": 8.2998, + "step": 218800 + }, + { + "epoch": 0.8917504931820196, + "grad_norm": 3.2907345294952393, + "learning_rate": 0.004807412652650158, + "loss": 8.2237, + "step": 218900 + }, + { + "epoch": 0.892157871205401, + "grad_norm": 2.9608821868896484, + "learning_rate": 0.004807221485350164, + "loss": 8.1758, + "step": 219000 + }, + { + "epoch": 0.892157871205401, + "eval_MaskedAccuracy": 0.48468191219256895, + "eval_loss": 1.7179230451583862, + "eval_runtime": 607.2542, + "eval_samples_per_second": 104.53, + "eval_steps_per_second": 0.408, + "step": 219000 + }, + { + "epoch": 0.8925652492287824, + "grad_norm": 4.358753204345703, + "learning_rate": 0.004807030227031042, + "loss": 8.1467, + "step": 219100 + }, + { + "epoch": 0.892972627252164, + "grad_norm": 3.311702013015747, + "learning_rate": 0.004806838877700354, + "loss": 8.1996, + "step": 219200 + }, + { + "epoch": 0.8933800052755454, + "grad_norm": 4.425884246826172, + "learning_rate": 0.00480664743736566, + "loss": 8.1468, + "step": 219300 + }, + { + "epoch": 0.8937873832989268, + "grad_norm": 4.711964130401611, + "learning_rate": 0.004806455906034538, + "loss": 8.1468, + "step": 219400 + }, + { + "epoch": 0.8941947613223084, + "grad_norm": 18.98705291748047, + "learning_rate": 0.004806264283714551, + "loss": 8.1923, + "step": 219500 + }, + { + "epoch": 0.8946021393456898, + "grad_norm": 4.508944511413574, + "learning_rate": 0.004806072570413277, + "loss": 8.2644, + "step": 219600 + }, + { + "epoch": 0.8950095173690712, + "grad_norm": 6.128897190093994, + "learning_rate": 0.004805880766138295, + "loss": 8.1459, + "step": 219700 + }, + { + "epoch": 0.8954168953924527, + "grad_norm": 3.9684112071990967, + "learning_rate": 0.004805688870897193, + "loss": 8.1247, + "step": 219800 + }, + { + "epoch": 0.8958242734158341, + "grad_norm": 4.559117794036865, + "learning_rate": 0.004805496884697555, + "loss": 8.1418, + "step": 219900 + }, + { + "epoch": 0.8962316514392157, + "grad_norm": 4.458334922790527, + "learning_rate": 0.004805304807546975, + "loss": 8.1243, + "step": 220000 + }, + { + "epoch": 0.8962316514392157, + "eval_MaskedAccuracy": 0.4864829520402149, + "eval_loss": 1.7186992168426514, + "eval_runtime": 605.6496, + "eval_samples_per_second": 104.806, + "eval_steps_per_second": 0.409, + "step": 220000 + }, + { + "epoch": 0.8966390294625971, + "grad_norm": 6.1991286277771, + "learning_rate": 0.004805112639453043, + "loss": 8.1469, + "step": 220100 + }, + { + "epoch": 0.8970464074859785, + "grad_norm": 4.865583896636963, + "learning_rate": 0.004804920380423352, + "loss": 8.0631, + "step": 220200 + }, + { + "epoch": 0.8974537855093601, + "grad_norm": 4.732316493988037, + "learning_rate": 0.004804728030465512, + "loss": 8.0916, + "step": 220300 + }, + { + "epoch": 0.8978611635327415, + "grad_norm": 13.524624824523926, + "learning_rate": 0.004804535589587124, + "loss": 8.1176, + "step": 220400 + }, + { + "epoch": 0.8982685415561229, + "grad_norm": 5.715292930603027, + "learning_rate": 0.004804343057795791, + "loss": 8.2386, + "step": 220500 + }, + { + "epoch": 0.8986759195795044, + "grad_norm": 5.178189277648926, + "learning_rate": 0.004804150435099133, + "loss": 8.2304, + "step": 220600 + }, + { + "epoch": 0.8990832976028859, + "grad_norm": 4.106097221374512, + "learning_rate": 0.004803957721504758, + "loss": 8.203, + "step": 220700 + }, + { + "epoch": 0.8994906756262673, + "grad_norm": 6.310755729675293, + "learning_rate": 0.004803764917020284, + "loss": 8.1797, + "step": 220800 + }, + { + "epoch": 0.8998980536496488, + "grad_norm": 6.1565752029418945, + "learning_rate": 0.004803572021653345, + "loss": 8.0992, + "step": 220900 + }, + { + "epoch": 0.9003054316730302, + "grad_norm": 6.599998474121094, + "learning_rate": 0.004803379035411562, + "loss": 8.1698, + "step": 221000 + }, + { + "epoch": 0.9003054316730302, + "eval_MaskedAccuracy": 0.4855747451078135, + "eval_loss": 1.7180355787277222, + "eval_runtime": 632.8244, + "eval_samples_per_second": 100.306, + "eval_steps_per_second": 0.392, + "step": 221000 + }, + { + "epoch": 0.9007128096964118, + "grad_norm": 1.949260950088501, + "learning_rate": 0.004803185958302558, + "loss": 8.2509, + "step": 221100 + }, + { + "epoch": 0.9011201877197932, + "grad_norm": 4.0300211906433105, + "learning_rate": 0.004802992790333973, + "loss": 8.2532, + "step": 221200 + }, + { + "epoch": 0.9015275657431746, + "grad_norm": 4.199332237243652, + "learning_rate": 0.0048027995315134435, + "loss": 8.2503, + "step": 221300 + }, + { + "epoch": 0.9019349437665561, + "grad_norm": 5.702756881713867, + "learning_rate": 0.004802606181848606, + "loss": 8.1876, + "step": 221400 + }, + { + "epoch": 0.9023423217899376, + "grad_norm": 5.0414838790893555, + "learning_rate": 0.004802412741347112, + "loss": 8.1659, + "step": 221500 + }, + { + "epoch": 0.902749699813319, + "grad_norm": 3.2576799392700195, + "learning_rate": 0.004802219210016601, + "loss": 8.1218, + "step": 221600 + }, + { + "epoch": 0.9031570778367005, + "grad_norm": 3.9214446544647217, + "learning_rate": 0.004802025587864734, + "loss": 8.1341, + "step": 221700 + }, + { + "epoch": 0.9035644558600819, + "grad_norm": 6.032358646392822, + "learning_rate": 0.004801831874899155, + "loss": 8.1105, + "step": 221800 + }, + { + "epoch": 0.9039718338834634, + "grad_norm": 6.424914836883545, + "learning_rate": 0.004801638071127528, + "loss": 8.1357, + "step": 221900 + }, + { + "epoch": 0.9043792119068449, + "grad_norm": 3.933969020843506, + "learning_rate": 0.004801444176557512, + "loss": 8.1293, + "step": 222000 + }, + { + "epoch": 0.9043792119068449, + "eval_MaskedAccuracy": 0.486812708382253, + "eval_loss": 1.7057329416275024, + "eval_runtime": 636.6308, + "eval_samples_per_second": 99.706, + "eval_steps_per_second": 0.39, + "step": 222000 + }, + { + "epoch": 0.9047865899302263, + "grad_norm": 4.703785419464111, + "learning_rate": 0.004801250191196776, + "loss": 8.1116, + "step": 222100 + }, + { + "epoch": 0.9051939679536077, + "grad_norm": 4.067076206207275, + "learning_rate": 0.0048010561150529955, + "loss": 8.1207, + "step": 222200 + }, + { + "epoch": 0.9056013459769893, + "grad_norm": 4.6195502281188965, + "learning_rate": 0.004800861948133838, + "loss": 8.114, + "step": 222300 + }, + { + "epoch": 0.9060087240003707, + "grad_norm": 3.2566609382629395, + "learning_rate": 0.0048006676904469745, + "loss": 8.2789, + "step": 222400 + }, + { + "epoch": 0.9064161020237522, + "grad_norm": 5.413635730743408, + "learning_rate": 0.00480047334200009, + "loss": 8.3084, + "step": 222500 + }, + { + "epoch": 0.9068234800471336, + "grad_norm": 5.0853095054626465, + "learning_rate": 0.004800278902800868, + "loss": 8.2031, + "step": 222600 + }, + { + "epoch": 0.9072308580705151, + "grad_norm": 3.3110313415527344, + "learning_rate": 0.004800084372856993, + "loss": 8.3235, + "step": 222700 + }, + { + "epoch": 0.9076382360938966, + "grad_norm": 2.2277956008911133, + "learning_rate": 0.004799889752176161, + "loss": 8.2674, + "step": 222800 + }, + { + "epoch": 0.908045614117278, + "grad_norm": 10.274942398071289, + "learning_rate": 0.004799695040766057, + "loss": 8.3539, + "step": 222900 + }, + { + "epoch": 0.9084529921406594, + "grad_norm": 4.456951141357422, + "learning_rate": 0.004799500238634387, + "loss": 8.3261, + "step": 223000 + }, + { + "epoch": 0.9084529921406594, + "eval_MaskedAccuracy": 0.47838385192669225, + "eval_loss": 1.750592827796936, + "eval_runtime": 567.4718, + "eval_samples_per_second": 111.858, + "eval_steps_per_second": 0.437, + "step": 223000 + }, + { + "epoch": 0.908860370164041, + "grad_norm": 6.92998743057251, + "learning_rate": 0.004799305345788852, + "loss": 8.1969, + "step": 223100 + }, + { + "epoch": 0.9092677481874224, + "grad_norm": 2.9873671531677246, + "learning_rate": 0.0047991103622371575, + "loss": 8.1887, + "step": 223200 + }, + { + "epoch": 0.9096751262108038, + "grad_norm": 4.813361167907715, + "learning_rate": 0.0047989152879870075, + "loss": 8.178, + "step": 223300 + }, + { + "epoch": 0.9100825042341854, + "grad_norm": 2.244819402694702, + "learning_rate": 0.0047987201230461135, + "loss": 8.2732, + "step": 223400 + }, + { + "epoch": 0.9104898822575668, + "grad_norm": 4.322752475738525, + "learning_rate": 0.004798524867422201, + "loss": 8.2718, + "step": 223500 + }, + { + "epoch": 0.9108972602809483, + "grad_norm": 3.421313762664795, + "learning_rate": 0.004798329521122984, + "loss": 8.2029, + "step": 223600 + }, + { + "epoch": 0.9113046383043297, + "grad_norm": 4.621647834777832, + "learning_rate": 0.004798134084156185, + "loss": 8.27, + "step": 223700 + }, + { + "epoch": 0.9117120163277111, + "grad_norm": 4.301358699798584, + "learning_rate": 0.004797938556529529, + "loss": 8.1699, + "step": 223800 + }, + { + "epoch": 0.9121193943510927, + "grad_norm": 2.086094617843628, + "learning_rate": 0.004797742938250747, + "loss": 8.217, + "step": 223900 + }, + { + "epoch": 0.9125267723744741, + "grad_norm": 4.431074142456055, + "learning_rate": 0.0047975472293275805, + "loss": 8.2045, + "step": 224000 + }, + { + "epoch": 0.9125267723744741, + "eval_MaskedAccuracy": 0.48309843252640383, + "eval_loss": 1.7208740711212158, + "eval_runtime": 585.9545, + "eval_samples_per_second": 108.329, + "eval_steps_per_second": 0.423, + "step": 224000 + }, + { + "epoch": 0.9129341503978555, + "grad_norm": 5.076047897338867, + "learning_rate": 0.004797351429767757, + "loss": 8.2371, + "step": 224100 + }, + { + "epoch": 0.9133415284212371, + "grad_norm": 2.933804512023926, + "learning_rate": 0.004797155539579018, + "loss": 8.2578, + "step": 224200 + }, + { + "epoch": 0.9137489064446185, + "grad_norm": 2.729539632797241, + "learning_rate": 0.004796959558769112, + "loss": 8.2741, + "step": 224300 + }, + { + "epoch": 0.9141562844679999, + "grad_norm": 4.2180962562561035, + "learning_rate": 0.004796763487345783, + "loss": 8.1683, + "step": 224400 + }, + { + "epoch": 0.9145636624913814, + "grad_norm": 5.814899444580078, + "learning_rate": 0.0047965673253167924, + "loss": 8.2012, + "step": 224500 + }, + { + "epoch": 0.9149710405147629, + "grad_norm": 1.9807255268096924, + "learning_rate": 0.004796371072689886, + "loss": 8.176, + "step": 224600 + }, + { + "epoch": 0.9153784185381443, + "grad_norm": 6.077608585357666, + "learning_rate": 0.004796174729472829, + "loss": 8.1689, + "step": 224700 + }, + { + "epoch": 0.9157857965615258, + "grad_norm": 2.7711193561553955, + "learning_rate": 0.004795978295673379, + "loss": 8.2387, + "step": 224800 + }, + { + "epoch": 0.9161931745849072, + "grad_norm": 3.4545180797576904, + "learning_rate": 0.004795781771299311, + "loss": 8.152, + "step": 224900 + }, + { + "epoch": 0.9166005526082888, + "grad_norm": 5.071669578552246, + "learning_rate": 0.004795585156358379, + "loss": 8.2743, + "step": 225000 + }, + { + "epoch": 0.9166005526082888, + "eval_MaskedAccuracy": 0.48105270695267294, + "eval_loss": 1.7473244667053223, + "eval_runtime": 547.578, + "eval_samples_per_second": 115.921, + "eval_steps_per_second": 0.453, + "step": 225000 + }, + { + "epoch": 0.9170079306316702, + "grad_norm": 6.067659378051758, + "learning_rate": 0.0047953884508583665, + "loss": 8.2189, + "step": 225100 + }, + { + "epoch": 0.9174153086550516, + "grad_norm": 3.740722179412842, + "learning_rate": 0.004795191654807049, + "loss": 8.2815, + "step": 225200 + }, + { + "epoch": 0.9178226866784331, + "grad_norm": 1.9998103380203247, + "learning_rate": 0.004794994768212203, + "loss": 8.2135, + "step": 225300 + }, + { + "epoch": 0.9182300647018146, + "grad_norm": 4.962710380554199, + "learning_rate": 0.004794797791081626, + "loss": 8.166, + "step": 225400 + }, + { + "epoch": 0.918637442725196, + "grad_norm": 0.5499230623245239, + "learning_rate": 0.0047946007234230936, + "loss": 8.1433, + "step": 225500 + }, + { + "epoch": 0.9190448207485775, + "grad_norm": 3.616839647293091, + "learning_rate": 0.004794403565244396, + "loss": 8.267, + "step": 225600 + }, + { + "epoch": 0.9194521987719589, + "grad_norm": 2.500506639480591, + "learning_rate": 0.004794206316553328, + "loss": 8.1619, + "step": 225700 + }, + { + "epoch": 0.9198595767953404, + "grad_norm": 4.259340286254883, + "learning_rate": 0.0047940089773576946, + "loss": 8.2626, + "step": 225800 + }, + { + "epoch": 0.9202669548187219, + "grad_norm": 3.6628053188323975, + "learning_rate": 0.004793811547665293, + "loss": 8.2379, + "step": 225900 + }, + { + "epoch": 0.9206743328421033, + "grad_norm": 3.1447908878326416, + "learning_rate": 0.0047936140274839325, + "loss": 8.2492, + "step": 226000 + }, + { + "epoch": 0.9206743328421033, + "eval_MaskedAccuracy": 0.47904936406652626, + "eval_loss": 1.7529270648956299, + "eval_runtime": 577.6785, + "eval_samples_per_second": 109.881, + "eval_steps_per_second": 0.429, + "step": 226000 + }, + { + "epoch": 0.9210817108654848, + "grad_norm": 8.835447311401367, + "learning_rate": 0.00479341641682142, + "loss": 8.2834, + "step": 226100 + }, + { + "epoch": 0.9214890888888663, + "grad_norm": 6.550544738769531, + "learning_rate": 0.004793218715685563, + "loss": 8.2341, + "step": 226200 + }, + { + "epoch": 0.9218964669122477, + "grad_norm": 6.690319061279297, + "learning_rate": 0.0047930209240841844, + "loss": 8.182, + "step": 226300 + }, + { + "epoch": 0.9223038449356292, + "grad_norm": 5.749084949493408, + "learning_rate": 0.004792823042025101, + "loss": 8.2981, + "step": 226400 + }, + { + "epoch": 0.9227112229590106, + "grad_norm": 4.789872169494629, + "learning_rate": 0.004792625069516135, + "loss": 8.1843, + "step": 226500 + }, + { + "epoch": 0.9231186009823921, + "grad_norm": 2.5787711143493652, + "learning_rate": 0.004792427006565109, + "loss": 8.1366, + "step": 226600 + }, + { + "epoch": 0.9235259790057736, + "grad_norm": 3.2012808322906494, + "learning_rate": 0.004792228853179861, + "loss": 8.198, + "step": 226700 + }, + { + "epoch": 0.923933357029155, + "grad_norm": 3.811048984527588, + "learning_rate": 0.004792030609368229, + "loss": 8.1901, + "step": 226800 + }, + { + "epoch": 0.9243407350525364, + "grad_norm": 4.617664813995361, + "learning_rate": 0.004791832275138043, + "loss": 8.1577, + "step": 226900 + }, + { + "epoch": 0.924748113075918, + "grad_norm": 6.046353340148926, + "learning_rate": 0.004791633850497142, + "loss": 8.1584, + "step": 227000 + }, + { + "epoch": 0.924748113075918, + "eval_MaskedAccuracy": 0.4844946360455185, + "eval_loss": 1.7199093103408813, + "eval_runtime": 634.3689, + "eval_samples_per_second": 100.062, + "eval_steps_per_second": 0.391, + "step": 227000 + }, + { + "epoch": 0.9251554910992994, + "grad_norm": 3.4626755714416504, + "learning_rate": 0.004791435335453377, + "loss": 8.2272, + "step": 227100 + }, + { + "epoch": 0.9255628691226808, + "grad_norm": 1.8042727708816528, + "learning_rate": 0.004791236730014602, + "loss": 8.1952, + "step": 227200 + }, + { + "epoch": 0.9259702471460624, + "grad_norm": 2.550905466079712, + "learning_rate": 0.004791038034188654, + "loss": 8.294, + "step": 227300 + }, + { + "epoch": 0.9263776251694438, + "grad_norm": 0.6780903339385986, + "learning_rate": 0.0047908392479833984, + "loss": 8.2901, + "step": 227400 + }, + { + "epoch": 0.9267850031928253, + "grad_norm": 2.728419542312622, + "learning_rate": 0.004790640371406698, + "loss": 8.3103, + "step": 227500 + }, + { + "epoch": 0.9271923812162067, + "grad_norm": 2.4830403327941895, + "learning_rate": 0.004790441404466405, + "loss": 8.26, + "step": 227600 + }, + { + "epoch": 0.9275997592395882, + "grad_norm": 5.732196807861328, + "learning_rate": 0.004790242347170389, + "loss": 8.2094, + "step": 227700 + }, + { + "epoch": 0.9280071372629697, + "grad_norm": 5.323036193847656, + "learning_rate": 0.004790043199526522, + "loss": 8.174, + "step": 227800 + }, + { + "epoch": 0.9284145152863511, + "grad_norm": 5.812360763549805, + "learning_rate": 0.004789843961542677, + "loss": 8.116, + "step": 227900 + }, + { + "epoch": 0.9288218933097325, + "grad_norm": 5.612771034240723, + "learning_rate": 0.0047896446332267315, + "loss": 8.0961, + "step": 228000 + }, + { + "epoch": 0.9288218933097325, + "eval_MaskedAccuracy": 0.48676481560811663, + "eval_loss": 1.708063006401062, + "eval_runtime": 536.141, + "eval_samples_per_second": 118.394, + "eval_steps_per_second": 0.463, + "step": 228000 + }, + { + "epoch": 0.9292292713331141, + "grad_norm": 5.939042091369629, + "learning_rate": 0.004789445214586563, + "loss": 8.1447, + "step": 228100 + }, + { + "epoch": 0.9296366493564955, + "grad_norm": 5.644360065460205, + "learning_rate": 0.004789245705630057, + "loss": 8.0991, + "step": 228200 + }, + { + "epoch": 0.9300440273798769, + "grad_norm": 6.242342948913574, + "learning_rate": 0.004789046106365095, + "loss": 8.0959, + "step": 228300 + }, + { + "epoch": 0.9304514054032584, + "grad_norm": 4.7664475440979, + "learning_rate": 0.004788846416799579, + "loss": 8.1116, + "step": 228400 + }, + { + "epoch": 0.9308587834266399, + "grad_norm": 4.1409807205200195, + "learning_rate": 0.0047886466369413995, + "loss": 8.1154, + "step": 228500 + }, + { + "epoch": 0.9312661614500214, + "grad_norm": 5.9070658683776855, + "learning_rate": 0.004788446766798459, + "loss": 8.1011, + "step": 228600 + }, + { + "epoch": 0.9316735394734028, + "grad_norm": 4.916849136352539, + "learning_rate": 0.004788246806378654, + "loss": 8.0977, + "step": 228700 + }, + { + "epoch": 0.9320809174967842, + "grad_norm": 4.7911553382873535, + "learning_rate": 0.00478804675568989, + "loss": 8.0778, + "step": 228800 + }, + { + "epoch": 0.9324882955201658, + "grad_norm": 4.861231803894043, + "learning_rate": 0.004787846614740072, + "loss": 8.0898, + "step": 228900 + }, + { + "epoch": 0.9328956735435472, + "grad_norm": 2.1762640476226807, + "learning_rate": 0.004787646383537121, + "loss": 8.142, + "step": 229000 + }, + { + "epoch": 0.9328956735435472, + "eval_MaskedAccuracy": 0.4788053354069694, + "eval_loss": 1.7484434843063354, + "eval_runtime": 558.994, + "eval_samples_per_second": 113.554, + "eval_steps_per_second": 0.444, + "step": 229000 + }, + { + "epoch": 0.9333030515669286, + "grad_norm": 4.250251770019531, + "learning_rate": 0.004787446062088946, + "loss": 8.2858, + "step": 229100 + }, + { + "epoch": 0.9337104295903101, + "grad_norm": 3.8813138008117676, + "learning_rate": 0.004787245650403472, + "loss": 8.2115, + "step": 229200 + }, + { + "epoch": 0.9341178076136916, + "grad_norm": 5.123370170593262, + "learning_rate": 0.00478704514848862, + "loss": 8.1835, + "step": 229300 + }, + { + "epoch": 0.934525185637073, + "grad_norm": 5.150139331817627, + "learning_rate": 0.004786844556352322, + "loss": 8.2508, + "step": 229400 + }, + { + "epoch": 0.9349325636604545, + "grad_norm": 6.384917736053467, + "learning_rate": 0.004786643874002503, + "loss": 8.1474, + "step": 229500 + }, + { + "epoch": 0.9353399416838359, + "grad_norm": 5.372497081756592, + "learning_rate": 0.0047864431014470995, + "loss": 8.1572, + "step": 229600 + }, + { + "epoch": 0.9357473197072174, + "grad_norm": 6.097373008728027, + "learning_rate": 0.004786242238694046, + "loss": 8.1109, + "step": 229700 + }, + { + "epoch": 0.9361546977305989, + "grad_norm": 4.463663578033447, + "learning_rate": 0.004786041285751292, + "loss": 8.1152, + "step": 229800 + }, + { + "epoch": 0.9365620757539803, + "grad_norm": 5.550436973571777, + "learning_rate": 0.004785840242626772, + "loss": 8.115, + "step": 229900 + }, + { + "epoch": 0.9369694537773618, + "grad_norm": 6.260983467102051, + "learning_rate": 0.0047856391093284425, + "loss": 8.1214, + "step": 230000 + }, + { + "epoch": 0.9369694537773618, + "eval_MaskedAccuracy": 0.48657717794435645, + "eval_loss": 1.7167582511901855, + "eval_runtime": 611.5136, + "eval_samples_per_second": 103.801, + "eval_steps_per_second": 0.406, + "step": 230000 + }, + { + "epoch": 0.9373768318007433, + "grad_norm": 5.187522888183594, + "learning_rate": 0.004785437885864245, + "loss": 8.1297, + "step": 230100 + }, + { + "epoch": 0.9377842098241247, + "grad_norm": 5.606808185577393, + "learning_rate": 0.00478523657224214, + "loss": 8.1035, + "step": 230200 + }, + { + "epoch": 0.9381915878475062, + "grad_norm": 3.369248867034912, + "learning_rate": 0.0047850351684700886, + "loss": 8.2793, + "step": 230300 + }, + { + "epoch": 0.9385989658708876, + "grad_norm": 6.86972188949585, + "learning_rate": 0.004784833674556057, + "loss": 8.2764, + "step": 230400 + }, + { + "epoch": 0.9390063438942691, + "grad_norm": 6.138077259063721, + "learning_rate": 0.004784632090508006, + "loss": 8.1714, + "step": 230500 + }, + { + "epoch": 0.9394137219176506, + "grad_norm": 6.523768901824951, + "learning_rate": 0.004784430416333906, + "loss": 8.1312, + "step": 230600 + }, + { + "epoch": 0.939821099941032, + "grad_norm": 4.867851257324219, + "learning_rate": 0.004784228652041728, + "loss": 8.1492, + "step": 230700 + }, + { + "epoch": 0.9402284779644134, + "grad_norm": 8.429380416870117, + "learning_rate": 0.004784026797639454, + "loss": 8.0925, + "step": 230800 + }, + { + "epoch": 0.940635855987795, + "grad_norm": 5.946533203125, + "learning_rate": 0.0047838248531350625, + "loss": 8.0991, + "step": 230900 + }, + { + "epoch": 0.9410432340111764, + "grad_norm": 5.149491786956787, + "learning_rate": 0.004783622818536535, + "loss": 8.0938, + "step": 231000 + }, + { + "epoch": 0.9410432340111764, + "eval_MaskedAccuracy": 0.4877264425603884, + "eval_loss": 1.7004443407058716, + "eval_runtime": 635.2588, + "eval_samples_per_second": 99.921, + "eval_steps_per_second": 0.39, + "step": 231000 + }, + { + "epoch": 0.9414506120345579, + "grad_norm": 3.676353693008423, + "learning_rate": 0.004783420693851859, + "loss": 8.0954, + "step": 231100 + }, + { + "epoch": 0.9418579900579394, + "grad_norm": 8.989383697509766, + "learning_rate": 0.004783218479089029, + "loss": 8.1397, + "step": 231200 + }, + { + "epoch": 0.9422653680813208, + "grad_norm": 8.321223258972168, + "learning_rate": 0.004783016174256035, + "loss": 8.3312, + "step": 231300 + }, + { + "epoch": 0.9426727461047023, + "grad_norm": 4.853785037994385, + "learning_rate": 0.004782813779360881, + "loss": 8.2826, + "step": 231400 + }, + { + "epoch": 0.9430801241280837, + "grad_norm": 2.8699069023132324, + "learning_rate": 0.004782611294411558, + "loss": 8.2423, + "step": 231500 + }, + { + "epoch": 0.9434875021514652, + "grad_norm": 2.4450199604034424, + "learning_rate": 0.004782408719416081, + "loss": 8.2719, + "step": 231600 + }, + { + "epoch": 0.9438948801748467, + "grad_norm": 6.831642150878906, + "learning_rate": 0.004782206054382461, + "loss": 8.1932, + "step": 231700 + }, + { + "epoch": 0.9443022581982281, + "grad_norm": 1.0070613622665405, + "learning_rate": 0.004782003299318706, + "loss": 8.2675, + "step": 231800 + }, + { + "epoch": 0.9447096362216095, + "grad_norm": 4.175207138061523, + "learning_rate": 0.004781800454232826, + "loss": 8.2211, + "step": 231900 + }, + { + "epoch": 0.9451170142449911, + "grad_norm": 4.457779407501221, + "learning_rate": 0.004781597519132854, + "loss": 8.1839, + "step": 232000 + }, + { + "epoch": 0.9451170142449911, + "eval_MaskedAccuracy": 0.4806816294542539, + "eval_loss": 1.7424594163894653, + "eval_runtime": 585.7934, + "eval_samples_per_second": 108.359, + "eval_steps_per_second": 0.423, + "step": 232000 + }, + { + "epoch": 0.9455243922683725, + "grad_norm": 1.6671425104141235, + "learning_rate": 0.0047813944940268025, + "loss": 8.2148, + "step": 232100 + }, + { + "epoch": 0.9459317702917539, + "grad_norm": 2.40228271484375, + "learning_rate": 0.004781191378922703, + "loss": 8.219, + "step": 232200 + }, + { + "epoch": 0.9463391483151354, + "grad_norm": 2.7009570598602295, + "learning_rate": 0.004780988173828587, + "loss": 8.205, + "step": 232300 + }, + { + "epoch": 0.9467465263385169, + "grad_norm": 2.485184907913208, + "learning_rate": 0.004780784878752484, + "loss": 8.1793, + "step": 232400 + }, + { + "epoch": 0.9471539043618984, + "grad_norm": 10.31472110748291, + "learning_rate": 0.00478058149370243, + "loss": 8.247, + "step": 232500 + }, + { + "epoch": 0.9475612823852798, + "grad_norm": 5.390857696533203, + "learning_rate": 0.004780378018686473, + "loss": 8.2979, + "step": 232600 + }, + { + "epoch": 0.9479686604086612, + "grad_norm": 1.4720979928970337, + "learning_rate": 0.004780174453712653, + "loss": 8.1861, + "step": 232700 + }, + { + "epoch": 0.9483760384320428, + "grad_norm": 0.8540545701980591, + "learning_rate": 0.004779970798789013, + "loss": 8.213, + "step": 232800 + }, + { + "epoch": 0.9487834164554242, + "grad_norm": 2.123645305633545, + "learning_rate": 0.004779767053923611, + "loss": 8.2196, + "step": 232900 + }, + { + "epoch": 0.9491907944788056, + "grad_norm": 0.45868319272994995, + "learning_rate": 0.0047795632191245056, + "loss": 8.2264, + "step": 233000 + }, + { + "epoch": 0.9491907944788056, + "eval_MaskedAccuracy": 0.4805641898194909, + "eval_loss": 1.7387572526931763, + "eval_runtime": 633.9238, + "eval_samples_per_second": 100.132, + "eval_steps_per_second": 0.391, + "step": 233000 + }, + { + "epoch": 0.9495981725021871, + "grad_norm": 6.627081394195557, + "learning_rate": 0.004779359294399752, + "loss": 8.1783, + "step": 233100 + }, + { + "epoch": 0.9500055505255686, + "grad_norm": 6.244868278503418, + "learning_rate": 0.004779155279757411, + "loss": 8.154, + "step": 233200 + }, + { + "epoch": 0.95041292854895, + "grad_norm": 6.573578834533691, + "learning_rate": 0.00477895117520555, + "loss": 8.1343, + "step": 233300 + }, + { + "epoch": 0.9508203065723315, + "grad_norm": 5.183780193328857, + "learning_rate": 0.004778746980752234, + "loss": 8.1178, + "step": 233400 + }, + { + "epoch": 0.9512276845957129, + "grad_norm": 7.468198776245117, + "learning_rate": 0.004778542696405532, + "loss": 8.1136, + "step": 233500 + }, + { + "epoch": 0.9516350626190945, + "grad_norm": 4.690491676330566, + "learning_rate": 0.004778338322173534, + "loss": 8.1155, + "step": 233600 + }, + { + "epoch": 0.9520424406424759, + "grad_norm": 5.657881736755371, + "learning_rate": 0.004778133858064316, + "loss": 8.1164, + "step": 233700 + }, + { + "epoch": 0.9524498186658573, + "grad_norm": 5.6973772048950195, + "learning_rate": 0.004777929304085957, + "loss": 8.0804, + "step": 233800 + }, + { + "epoch": 0.9528571966892389, + "grad_norm": 3.773672103881836, + "learning_rate": 0.004777724660246545, + "loss": 8.0917, + "step": 233900 + }, + { + "epoch": 0.9532645747126203, + "grad_norm": 5.486586570739746, + "learning_rate": 0.004777519926554175, + "loss": 8.0745, + "step": 234000 + }, + { + "epoch": 0.9532645747126203, + "eval_MaskedAccuracy": 0.4871141334534211, + "eval_loss": 1.711511492729187, + "eval_runtime": 565.6537, + "eval_samples_per_second": 112.217, + "eval_steps_per_second": 0.438, + "step": 234000 + }, + { + "epoch": 0.9536719527360017, + "grad_norm": 6.356557369232178, + "learning_rate": 0.004777315103016933, + "loss": 8.0888, + "step": 234100 + }, + { + "epoch": 0.9540793307593832, + "grad_norm": 4.9865546226501465, + "learning_rate": 0.004777110189642922, + "loss": 8.0503, + "step": 234200 + }, + { + "epoch": 0.9544867087827646, + "grad_norm": 3.628206968307495, + "learning_rate": 0.0047769051864402475, + "loss": 8.1838, + "step": 234300 + }, + { + "epoch": 0.9548940868061461, + "grad_norm": 4.247488498687744, + "learning_rate": 0.004776700093417009, + "loss": 8.2788, + "step": 234400 + }, + { + "epoch": 0.9553014648295276, + "grad_norm": 2.128532886505127, + "learning_rate": 0.004776494910581318, + "loss": 8.2221, + "step": 234500 + }, + { + "epoch": 0.955708842852909, + "grad_norm": 4.686913013458252, + "learning_rate": 0.004776289637941285, + "loss": 8.189, + "step": 234600 + }, + { + "epoch": 0.9561162208762904, + "grad_norm": 3.012515068054199, + "learning_rate": 0.00477608427550502, + "loss": 8.2732, + "step": 234700 + }, + { + "epoch": 0.956523598899672, + "grad_norm": 2.0626823902130127, + "learning_rate": 0.004775878823280654, + "loss": 8.2672, + "step": 234800 + }, + { + "epoch": 0.9569309769230534, + "grad_norm": 2.8990352153778076, + "learning_rate": 0.004775673281276299, + "loss": 8.2235, + "step": 234900 + }, + { + "epoch": 0.9573383549464349, + "grad_norm": 3.8956902027130127, + "learning_rate": 0.004775467649500082, + "loss": 8.1327, + "step": 235000 + }, + { + "epoch": 0.9573383549464349, + "eval_MaskedAccuracy": 0.4866171493477707, + "eval_loss": 1.7065390348434448, + "eval_runtime": 585.4599, + "eval_samples_per_second": 108.421, + "eval_steps_per_second": 0.424, + "step": 235000 + }, + { + "epoch": 0.9577457329698164, + "grad_norm": 3.7147789001464844, + "learning_rate": 0.004775261927960138, + "loss": 8.1519, + "step": 235100 + }, + { + "epoch": 0.9581531109931978, + "grad_norm": 2.170576333999634, + "learning_rate": 0.004775056116664596, + "loss": 8.2474, + "step": 235200 + }, + { + "epoch": 0.9585604890165793, + "grad_norm": 2.1491498947143555, + "learning_rate": 0.0047748502156215915, + "loss": 8.2071, + "step": 235300 + }, + { + "epoch": 0.9589678670399607, + "grad_norm": 6.1416401863098145, + "learning_rate": 0.004774644224839271, + "loss": 8.1715, + "step": 235400 + }, + { + "epoch": 0.9593752450633422, + "grad_norm": 1.067305326461792, + "learning_rate": 0.004774438144325776, + "loss": 8.1493, + "step": 235500 + }, + { + "epoch": 0.9597826230867237, + "grad_norm": 3.929832696914673, + "learning_rate": 0.004774231974089251, + "loss": 8.2178, + "step": 235600 + }, + { + "epoch": 0.9601900011101051, + "grad_norm": 4.910709857940674, + "learning_rate": 0.004774025714137852, + "loss": 8.1504, + "step": 235700 + }, + { + "epoch": 0.9605973791334865, + "grad_norm": 4.173421382904053, + "learning_rate": 0.004773819364479737, + "loss": 8.2535, + "step": 235800 + }, + { + "epoch": 0.9610047571568681, + "grad_norm": 0.8288520574569702, + "learning_rate": 0.004773612925123056, + "loss": 8.2277, + "step": 235900 + }, + { + "epoch": 0.9614121351802495, + "grad_norm": 20.392513275146484, + "learning_rate": 0.0047734063960759646, + "loss": 8.234, + "step": 236000 + }, + { + "epoch": 0.9614121351802495, + "eval_MaskedAccuracy": 0.4783882551329281, + "eval_loss": 1.7532862424850464, + "eval_runtime": 514.8995, + "eval_samples_per_second": 123.278, + "eval_steps_per_second": 0.482, + "step": 236000 + }, + { + "epoch": 0.961819513203631, + "grad_norm": 2.5947885513305664, + "learning_rate": 0.004773199777346642, + "loss": 8.256, + "step": 236100 + }, + { + "epoch": 0.9622268912270124, + "grad_norm": 8.043744087219238, + "learning_rate": 0.004772993068943253, + "loss": 8.1542, + "step": 236200 + }, + { + "epoch": 0.9626342692503939, + "grad_norm": 5.169923305511475, + "learning_rate": 0.004772786270873959, + "loss": 8.129, + "step": 236300 + }, + { + "epoch": 0.9630416472737754, + "grad_norm": 4.159392356872559, + "learning_rate": 0.004772579383146947, + "loss": 8.2045, + "step": 236400 + }, + { + "epoch": 0.9634490252971568, + "grad_norm": 3.3252909183502197, + "learning_rate": 0.004772372405770392, + "loss": 8.1375, + "step": 236500 + }, + { + "epoch": 0.9638564033205382, + "grad_norm": 4.924101829528809, + "learning_rate": 0.004772165338752485, + "loss": 8.1663, + "step": 236600 + }, + { + "epoch": 0.9642637813439198, + "grad_norm": 3.6718101501464844, + "learning_rate": 0.0047719581821014075, + "loss": 8.1902, + "step": 236700 + }, + { + "epoch": 0.9646711593673012, + "grad_norm": 3.7462430000305176, + "learning_rate": 0.004771750935825343, + "loss": 8.1595, + "step": 236800 + }, + { + "epoch": 0.9650785373906826, + "grad_norm": 5.799289703369141, + "learning_rate": 0.004771543599932501, + "loss": 8.1417, + "step": 236900 + }, + { + "epoch": 0.9654859154140641, + "grad_norm": 5.882828712463379, + "learning_rate": 0.0047713361744310566, + "loss": 8.0669, + "step": 237000 + }, + { + "epoch": 0.9654859154140641, + "eval_MaskedAccuracy": 0.48794035925860585, + "eval_loss": 1.7013667821884155, + "eval_runtime": 567.2874, + "eval_samples_per_second": 111.894, + "eval_steps_per_second": 0.437, + "step": 237000 + }, + { + "epoch": 0.9658932934374456, + "grad_norm": 4.099752426147461, + "learning_rate": 0.0047711286593292235, + "loss": 8.1008, + "step": 237100 + }, + { + "epoch": 0.966300671460827, + "grad_norm": 13.680000305175781, + "learning_rate": 0.004770921054635203, + "loss": 8.08, + "step": 237200 + }, + { + "epoch": 0.9667080494842085, + "grad_norm": 4.279512882232666, + "learning_rate": 0.004770713360357208, + "loss": 8.2708, + "step": 237300 + }, + { + "epoch": 0.9671154275075899, + "grad_norm": 4.536948204040527, + "learning_rate": 0.004770505576503441, + "loss": 8.1607, + "step": 237400 + }, + { + "epoch": 0.9675228055309715, + "grad_norm": 4.030037879943848, + "learning_rate": 0.004770297703082127, + "loss": 8.075, + "step": 237500 + }, + { + "epoch": 0.9679301835543529, + "grad_norm": 6.2266459465026855, + "learning_rate": 0.004770089740101478, + "loss": 8.0967, + "step": 237600 + }, + { + "epoch": 0.9683375615777343, + "grad_norm": 4.494446754455566, + "learning_rate": 0.004769881687569713, + "loss": 8.0975, + "step": 237700 + }, + { + "epoch": 0.9687449396011159, + "grad_norm": 4.861915111541748, + "learning_rate": 0.004769673545495061, + "loss": 8.125, + "step": 237800 + }, + { + "epoch": 0.9691523176244973, + "grad_norm": 6.571840286254883, + "learning_rate": 0.004769465313885759, + "loss": 8.1002, + "step": 237900 + }, + { + "epoch": 0.9695596956478787, + "grad_norm": 1.3625410795211792, + "learning_rate": 0.004769256992750033, + "loss": 8.2364, + "step": 238000 + }, + { + "epoch": 0.9695596956478787, + "eval_MaskedAccuracy": 0.48214247495957324, + "eval_loss": 1.7290294170379639, + "eval_runtime": 535.4762, + "eval_samples_per_second": 118.541, + "eval_steps_per_second": 0.463, + "step": 238000 + }, + { + "epoch": 0.9699670736712602, + "grad_norm": 4.830008029937744, + "learning_rate": 0.004769048582096117, + "loss": 8.1727, + "step": 238100 + }, + { + "epoch": 0.9703744516946416, + "grad_norm": 1.5990383625030518, + "learning_rate": 0.0047688400819322525, + "loss": 8.1817, + "step": 238200 + }, + { + "epoch": 0.9707818297180231, + "grad_norm": 3.1707334518432617, + "learning_rate": 0.004768631492266677, + "loss": 8.238, + "step": 238300 + }, + { + "epoch": 0.9711892077414046, + "grad_norm": 8.788178443908691, + "learning_rate": 0.00476842281310764, + "loss": 8.2217, + "step": 238400 + }, + { + "epoch": 0.971596585764786, + "grad_norm": 2.474496364593506, + "learning_rate": 0.004768214044463394, + "loss": 8.2857, + "step": 238500 + }, + { + "epoch": 0.9720039637881674, + "grad_norm": 5.40533971786499, + "learning_rate": 0.004768005186342199, + "loss": 8.2393, + "step": 238600 + }, + { + "epoch": 0.972411341811549, + "grad_norm": 4.223481178283691, + "learning_rate": 0.0047677962387523065, + "loss": 8.1331, + "step": 238700 + }, + { + "epoch": 0.9728187198349304, + "grad_norm": 5.080321788787842, + "learning_rate": 0.004767587201701966, + "loss": 8.1194, + "step": 238800 + }, + { + "epoch": 0.9732260978583119, + "grad_norm": 4.823538303375244, + "learning_rate": 0.004767378075199454, + "loss": 8.1349, + "step": 238900 + }, + { + "epoch": 0.9736334758816934, + "grad_norm": 5.152235507965088, + "learning_rate": 0.00476716885925304, + "loss": 8.0819, + "step": 239000 + }, + { + "epoch": 0.9736334758816934, + "eval_MaskedAccuracy": 0.48745703216586467, + "eval_loss": 1.6963986158370972, + "eval_runtime": 620.819, + "eval_samples_per_second": 102.246, + "eval_steps_per_second": 0.399, + "step": 239000 + }, + { + "epoch": 0.9740408539050748, + "grad_norm": 9.305587768554688, + "learning_rate": 0.004766959553870994, + "loss": 8.0992, + "step": 239100 + }, + { + "epoch": 0.9744482319284563, + "grad_norm": 3.530278205871582, + "learning_rate": 0.004766750159061589, + "loss": 8.0774, + "step": 239200 + }, + { + "epoch": 0.9748556099518377, + "grad_norm": 1.316430926322937, + "learning_rate": 0.004766540674833102, + "loss": 8.1489, + "step": 239300 + }, + { + "epoch": 0.9752629879752192, + "grad_norm": 3.6197664737701416, + "learning_rate": 0.004766331101193819, + "loss": 8.2662, + "step": 239400 + }, + { + "epoch": 0.9756703659986007, + "grad_norm": 2.255551815032959, + "learning_rate": 0.004766121438152025, + "loss": 8.2711, + "step": 239500 + }, + { + "epoch": 0.9760777440219821, + "grad_norm": 3.01822566986084, + "learning_rate": 0.004765911685716005, + "loss": 8.2768, + "step": 239600 + }, + { + "epoch": 0.9764851220453635, + "grad_norm": 3.766021490097046, + "learning_rate": 0.004765701843894052, + "loss": 8.1865, + "step": 239700 + }, + { + "epoch": 0.9768925000687451, + "grad_norm": 1.137575387954712, + "learning_rate": 0.004765491912694465, + "loss": 8.218, + "step": 239800 + }, + { + "epoch": 0.9772998780921265, + "grad_norm": 1.7848414182662964, + "learning_rate": 0.00476528189212554, + "loss": 8.2495, + "step": 239900 + }, + { + "epoch": 0.977707256115508, + "grad_norm": 4.410600185394287, + "learning_rate": 0.004765071782195583, + "loss": 8.2364, + "step": 240000 + }, + { + "epoch": 0.977707256115508, + "eval_MaskedAccuracy": 0.4809854694355085, + "eval_loss": 1.7356899976730347, + "eval_runtime": 675.4649, + "eval_samples_per_second": 93.974, + "eval_steps_per_second": 0.367, + "step": 240000 + }, + { + "epoch": 0.9781146341388894, + "grad_norm": 1.4677624702453613, + "learning_rate": 0.0047648615829129016, + "loss": 8.2211, + "step": 240100 + }, + { + "epoch": 0.9785220121622709, + "grad_norm": 3.9822754859924316, + "learning_rate": 0.004764651294285809, + "loss": 8.1547, + "step": 240200 + }, + { + "epoch": 0.9789293901856524, + "grad_norm": 6.896245002746582, + "learning_rate": 0.00476444091632261, + "loss": 8.1219, + "step": 240300 + }, + { + "epoch": 0.9793367682090338, + "grad_norm": 3.443446397781372, + "learning_rate": 0.0047642304490316326, + "loss": 8.1556, + "step": 240400 + }, + { + "epoch": 0.9797441462324152, + "grad_norm": 3.5489230155944824, + "learning_rate": 0.004764019892421191, + "loss": 8.185, + "step": 240500 + }, + { + "epoch": 0.9801515242557968, + "grad_norm": 3.103484630584717, + "learning_rate": 0.004763809246499613, + "loss": 8.1697, + "step": 240600 + }, + { + "epoch": 0.9805589022791782, + "grad_norm": 3.4157602787017822, + "learning_rate": 0.004763598511275223, + "loss": 8.295, + "step": 240700 + }, + { + "epoch": 0.9809662803025596, + "grad_norm": 1.5067360401153564, + "learning_rate": 0.004763387686756353, + "loss": 8.2628, + "step": 240800 + }, + { + "epoch": 0.9813736583259411, + "grad_norm": 3.985395669937134, + "learning_rate": 0.004763176772951338, + "loss": 8.2128, + "step": 240900 + }, + { + "epoch": 0.9817810363493226, + "grad_norm": 2.806593418121338, + "learning_rate": 0.00476296576986852, + "loss": 8.145, + "step": 241000 + }, + { + "epoch": 0.9817810363493226, + "eval_MaskedAccuracy": 0.48619838289017225, + "eval_loss": 1.7169946432113647, + "eval_runtime": 610.259, + "eval_samples_per_second": 104.015, + "eval_steps_per_second": 0.406, + "step": 241000 + }, + { + "epoch": 0.982188414372704, + "grad_norm": 3.1887853145599365, + "learning_rate": 0.004762754677516234, + "loss": 8.2021, + "step": 241100 + }, + { + "epoch": 0.9825957923960855, + "grad_norm": 5.074123382568359, + "learning_rate": 0.004762543495902832, + "loss": 8.182, + "step": 241200 + }, + { + "epoch": 0.9830031704194669, + "grad_norm": 5.17244815826416, + "learning_rate": 0.00476233222503666, + "loss": 8.1297, + "step": 241300 + }, + { + "epoch": 0.9834105484428485, + "grad_norm": 7.497429370880127, + "learning_rate": 0.004762120864926077, + "loss": 8.1496, + "step": 241400 + }, + { + "epoch": 0.9838179264662299, + "grad_norm": 1.6269021034240723, + "learning_rate": 0.004761909415579431, + "loss": 8.1612, + "step": 241500 + }, + { + "epoch": 0.9842253044896113, + "grad_norm": 3.574080228805542, + "learning_rate": 0.004761697877005083, + "loss": 8.1715, + "step": 241600 + }, + { + "epoch": 0.9846326825129929, + "grad_norm": 4.916479587554932, + "learning_rate": 0.004761486249211402, + "loss": 8.1714, + "step": 241700 + }, + { + "epoch": 0.9850400605363743, + "grad_norm": 2.030855178833008, + "learning_rate": 0.0047612745322067485, + "loss": 8.1239, + "step": 241800 + }, + { + "epoch": 0.9854474385597557, + "grad_norm": 9.08359432220459, + "learning_rate": 0.004761062725999501, + "loss": 8.2372, + "step": 241900 + }, + { + "epoch": 0.9858548165831372, + "grad_norm": 6.1944193840026855, + "learning_rate": 0.00476085083059802, + "loss": 8.1997, + "step": 242000 + }, + { + "epoch": 0.9858548165831372, + "eval_MaskedAccuracy": 0.4853934323516833, + "eval_loss": 1.713708758354187, + "eval_runtime": 617.3324, + "eval_samples_per_second": 102.823, + "eval_steps_per_second": 0.402, + "step": 242000 + }, + { + "epoch": 0.9862621946065186, + "grad_norm": 2.0614213943481445, + "learning_rate": 0.004760638846010693, + "loss": 8.2301, + "step": 242100 + }, + { + "epoch": 0.9866695726299001, + "grad_norm": 2.5254554748535156, + "learning_rate": 0.004760426772245896, + "loss": 8.2202, + "step": 242200 + }, + { + "epoch": 0.9870769506532816, + "grad_norm": 3.4275104999542236, + "learning_rate": 0.004760214609312019, + "loss": 8.1426, + "step": 242300 + }, + { + "epoch": 0.987484328676663, + "grad_norm": 5.422964572906494, + "learning_rate": 0.004760002357217439, + "loss": 8.1568, + "step": 242400 + }, + { + "epoch": 0.9878917067000446, + "grad_norm": 1.7899110317230225, + "learning_rate": 0.004759790015970564, + "loss": 8.1831, + "step": 242500 + }, + { + "epoch": 0.988299084723426, + "grad_norm": 3.7543323040008545, + "learning_rate": 0.004759577585579777, + "loss": 8.2366, + "step": 242600 + }, + { + "epoch": 0.9887064627468074, + "grad_norm": 5.622766017913818, + "learning_rate": 0.0047593650660534784, + "loss": 8.2532, + "step": 242700 + }, + { + "epoch": 0.9891138407701889, + "grad_norm": 4.1286821365356445, + "learning_rate": 0.004759152457400075, + "loss": 8.2909, + "step": 242800 + }, + { + "epoch": 0.9895212187935704, + "grad_norm": 5.105898380279541, + "learning_rate": 0.0047589397596279675, + "loss": 8.1774, + "step": 242900 + }, + { + "epoch": 0.9899285968169518, + "grad_norm": 4.220027923583984, + "learning_rate": 0.004758726972745564, + "loss": 8.1656, + "step": 243000 + }, + { + "epoch": 0.9899285968169518, + "eval_MaskedAccuracy": 0.48216152134175716, + "eval_loss": 1.731658935546875, + "eval_runtime": 620.3479, + "eval_samples_per_second": 102.323, + "eval_steps_per_second": 0.4, + "step": 243000 + }, + { + "epoch": 0.9903359748403333, + "grad_norm": 3.5671005249023438, + "learning_rate": 0.004758514096761283, + "loss": 8.2275, + "step": 243100 + }, + { + "epoch": 0.9907433528637147, + "grad_norm": 2.6987688541412354, + "learning_rate": 0.004758301131683541, + "loss": 8.2028, + "step": 243200 + }, + { + "epoch": 0.9911507308870962, + "grad_norm": 5.813295841217041, + "learning_rate": 0.004758088077520751, + "loss": 8.2244, + "step": 243300 + }, + { + "epoch": 0.9915581089104777, + "grad_norm": 2.056358575820923, + "learning_rate": 0.004757874934281339, + "loss": 8.1549, + "step": 243400 + }, + { + "epoch": 0.9919654869338591, + "grad_norm": 2.728684663772583, + "learning_rate": 0.004757661701973738, + "loss": 8.1494, + "step": 243500 + }, + { + "epoch": 0.9923728649572405, + "grad_norm": 4.71746826171875, + "learning_rate": 0.004757448380606363, + "loss": 8.11, + "step": 243600 + }, + { + "epoch": 0.9927802429806221, + "grad_norm": 4.784518241882324, + "learning_rate": 0.004757234970187655, + "loss": 8.1885, + "step": 243700 + }, + { + "epoch": 0.9931876210040035, + "grad_norm": 2.37805438041687, + "learning_rate": 0.004757021470726056, + "loss": 8.1919, + "step": 243800 + }, + { + "epoch": 0.993594999027385, + "grad_norm": 3.5506582260131836, + "learning_rate": 0.004756807882230003, + "loss": 8.1366, + "step": 243900 + }, + { + "epoch": 0.9940023770507664, + "grad_norm": 0.5849319100379944, + "learning_rate": 0.004756594204707939, + "loss": 8.2411, + "step": 244000 + }, + { + "epoch": 0.9940023770507664, + "eval_MaskedAccuracy": 0.48107103285078806, + "eval_loss": 1.7513779401779175, + "eval_runtime": 556.9936, + "eval_samples_per_second": 113.962, + "eval_steps_per_second": 0.445, + "step": 244000 + }, + { + "epoch": 0.9944097550741479, + "grad_norm": 1.4658851623535156, + "learning_rate": 0.004756380438168312, + "loss": 8.2649, + "step": 244100 + }, + { + "epoch": 0.9948171330975294, + "grad_norm": 6.136865615844727, + "learning_rate": 0.004756166582619584, + "loss": 8.2526, + "step": 244200 + }, + { + "epoch": 0.9952245111209108, + "grad_norm": 6.565762996673584, + "learning_rate": 0.004755952638070195, + "loss": 8.2155, + "step": 244300 + }, + { + "epoch": 0.9956318891442922, + "grad_norm": 5.260137557983398, + "learning_rate": 0.004755738604528605, + "loss": 8.2215, + "step": 244400 + }, + { + "epoch": 0.9960392671676738, + "grad_norm": 2.4687938690185547, + "learning_rate": 0.004755524482003282, + "loss": 8.228, + "step": 244500 + }, + { + "epoch": 0.9964466451910552, + "grad_norm": 0.6773878335952759, + "learning_rate": 0.0047553102705026875, + "loss": 8.2016, + "step": 244600 + }, + { + "epoch": 0.9968540232144366, + "grad_norm": 5.026801109313965, + "learning_rate": 0.0047550959700352995, + "loss": 8.2244, + "step": 244700 + }, + { + "epoch": 0.9972614012378181, + "grad_norm": 5.554771423339844, + "learning_rate": 0.004754881580609581, + "loss": 8.1483, + "step": 244800 + }, + { + "epoch": 0.9976687792611996, + "grad_norm": 4.744398593902588, + "learning_rate": 0.004754667102234011, + "loss": 8.1485, + "step": 244900 + }, + { + "epoch": 0.9980761572845811, + "grad_norm": 9.949603080749512, + "learning_rate": 0.00475445253491707, + "loss": 8.2084, + "step": 245000 + }, + { + "epoch": 0.9980761572845811, + "eval_MaskedAccuracy": 0.47898123664693637, + "eval_loss": 1.7470967769622803, + "eval_runtime": 537.0918, + "eval_samples_per_second": 118.185, + "eval_steps_per_second": 0.462, + "step": 245000 + }, + { + "epoch": 0.9984835353079625, + "grad_norm": 4.404254913330078, + "learning_rate": 0.004754237878667238, + "loss": 8.2251, + "step": 245100 + }, + { + "epoch": 0.9988909133313439, + "grad_norm": 4.610202312469482, + "learning_rate": 0.004754023133493004, + "loss": 8.1291, + "step": 245200 + }, + { + "epoch": 0.9992982913547255, + "grad_norm": 4.8737335205078125, + "learning_rate": 0.004753808299402859, + "loss": 8.1327, + "step": 245300 + }, + { + "epoch": 0.9997056693781069, + "grad_norm": 1.3142606019973755, + "learning_rate": 0.004753593376405291, + "loss": 8.1187, + "step": 245400 + }, + { + "epoch": 1.0001130474014883, + "grad_norm": 6.105260848999023, + "learning_rate": 0.004753378364508799, + "loss": 8.1851, + "step": 245500 + }, + { + "epoch": 1.0005204254248699, + "grad_norm": 1.9481598138809204, + "learning_rate": 0.00475316326372189, + "loss": 8.1759, + "step": 245600 + }, + { + "epoch": 1.0009278034482514, + "grad_norm": 3.2028346061706543, + "learning_rate": 0.00475294807405306, + "loss": 8.1454, + "step": 245700 + }, + { + "epoch": 1.0013351814716327, + "grad_norm": 2.652489185333252, + "learning_rate": 0.004752732795510824, + "loss": 8.1798, + "step": 245800 + }, + { + "epoch": 1.0017425594950142, + "grad_norm": 6.72688627243042, + "learning_rate": 0.00475251742810368, + "loss": 8.2579, + "step": 245900 + }, + { + "epoch": 1.0021499375183958, + "grad_norm": 4.054202556610107, + "learning_rate": 0.004752301971840156, + "loss": 8.211, + "step": 246000 + }, + { + "epoch": 1.0021499375183958, + "eval_MaskedAccuracy": 0.4853460253722327, + "eval_loss": 1.714086651802063, + "eval_runtime": 148.5626, + "eval_samples_per_second": 427.268, + "eval_steps_per_second": 1.669, + "step": 246000 + }, + { + "epoch": 1.002557315541777, + "grad_norm": 1.8084056377410889, + "learning_rate": 0.0047520864267287695, + "loss": 8.2046, + "step": 246100 + }, + { + "epoch": 1.0029646935651586, + "grad_norm": 0.5682147145271301, + "learning_rate": 0.004751870792778029, + "loss": 8.23, + "step": 246200 + }, + { + "epoch": 1.0033720715885401, + "grad_norm": 3.379467010498047, + "learning_rate": 0.004751655069996472, + "loss": 8.2539, + "step": 246300 + }, + { + "epoch": 1.0037794496119214, + "grad_norm": 2.14129638671875, + "learning_rate": 0.004751439258392628, + "loss": 8.1796, + "step": 246400 + }, + { + "epoch": 1.004186827635303, + "grad_norm": 6.0373921394348145, + "learning_rate": 0.004751223357975017, + "loss": 8.1889, + "step": 246500 + }, + { + "epoch": 1.0045942056586845, + "grad_norm": 5.169281959533691, + "learning_rate": 0.004751007368752187, + "loss": 8.2046, + "step": 246600 + }, + { + "epoch": 1.0050015836820658, + "grad_norm": 6.113409996032715, + "learning_rate": 0.004750791290732675, + "loss": 8.108, + "step": 246700 + }, + { + "epoch": 1.0054089617054474, + "grad_norm": 2.174652338027954, + "learning_rate": 0.004750575123925025, + "loss": 8.1574, + "step": 246800 + }, + { + "epoch": 1.005816339728829, + "grad_norm": 4.004303455352783, + "learning_rate": 0.0047503588683377755, + "loss": 8.1607, + "step": 246900 + }, + { + "epoch": 1.0062237177522102, + "grad_norm": 2.085434913635254, + "learning_rate": 0.004750142523979482, + "loss": 8.1984, + "step": 247000 + }, + { + "epoch": 1.0062237177522102, + "eval_MaskedAccuracy": 0.48152066804454813, + "eval_loss": 1.7368111610412598, + "eval_runtime": 148.9571, + "eval_samples_per_second": 426.136, + "eval_steps_per_second": 1.665, + "step": 247000 + }, + { + "epoch": 1.0066310957755917, + "grad_norm": 4.9853434562683105, + "learning_rate": 0.004749926090858699, + "loss": 8.1975, + "step": 247100 + }, + { + "epoch": 1.0070384737989733, + "grad_norm": 1.6406573057174683, + "learning_rate": 0.004749709568983974, + "loss": 8.1232, + "step": 247200 + }, + { + "epoch": 1.0074458518223546, + "grad_norm": 5.71746826171875, + "learning_rate": 0.004749492958363876, + "loss": 8.1713, + "step": 247300 + }, + { + "epoch": 1.007853229845736, + "grad_norm": 2.008352279663086, + "learning_rate": 0.004749276259006973, + "loss": 8.102, + "step": 247400 + }, + { + "epoch": 1.0082606078691176, + "grad_norm": 1.7535492181777954, + "learning_rate": 0.004749059470921824, + "loss": 8.2237, + "step": 247500 + }, + { + "epoch": 1.008667985892499, + "grad_norm": 6.515242576599121, + "learning_rate": 0.004748842594117008, + "loss": 8.2053, + "step": 247600 + }, + { + "epoch": 1.0090753639158805, + "grad_norm": 3.829084873199463, + "learning_rate": 0.004748625628601089, + "loss": 8.2227, + "step": 247700 + }, + { + "epoch": 1.009482741939262, + "grad_norm": 3.9138801097869873, + "learning_rate": 0.004748408574382645, + "loss": 8.1249, + "step": 247800 + }, + { + "epoch": 1.0098901199626433, + "grad_norm": 6.763348579406738, + "learning_rate": 0.004748191431470262, + "loss": 8.1166, + "step": 247900 + }, + { + "epoch": 1.0102974979860249, + "grad_norm": 0.8834948539733887, + "learning_rate": 0.004747974199872523, + "loss": 8.1769, + "step": 248000 + }, + { + "epoch": 1.0102974979860249, + "eval_MaskedAccuracy": 0.48299971296835226, + "eval_loss": 1.7294466495513916, + "eval_runtime": 149.6861, + "eval_samples_per_second": 424.061, + "eval_steps_per_second": 1.657, + "step": 248000 + }, + { + "epoch": 1.0107048760094064, + "grad_norm": 3.2210018634796143, + "learning_rate": 0.004747756879598021, + "loss": 8.2426, + "step": 248100 + }, + { + "epoch": 1.011112254032788, + "grad_norm": 4.237231254577637, + "learning_rate": 0.00474753947065534, + "loss": 8.169, + "step": 248200 + }, + { + "epoch": 1.0115196320561692, + "grad_norm": 5.079657077789307, + "learning_rate": 0.004747321973053085, + "loss": 8.1268, + "step": 248300 + }, + { + "epoch": 1.0119270100795508, + "grad_norm": 3.452561378479004, + "learning_rate": 0.004747104386799851, + "loss": 8.0861, + "step": 248400 + }, + { + "epoch": 1.0123343881029323, + "grad_norm": 4.034000396728516, + "learning_rate": 0.004746886711904237, + "loss": 8.1469, + "step": 248500 + }, + { + "epoch": 1.0127417661263136, + "grad_norm": 1.7607423067092896, + "learning_rate": 0.004746668948374848, + "loss": 8.2048, + "step": 248600 + }, + { + "epoch": 1.0131491441496951, + "grad_norm": 3.3386831283569336, + "learning_rate": 0.004746451096220302, + "loss": 8.25, + "step": 248700 + }, + { + "epoch": 1.0135565221730767, + "grad_norm": 5.06843900680542, + "learning_rate": 0.004746233155449206, + "loss": 8.135, + "step": 248800 + }, + { + "epoch": 1.013963900196458, + "grad_norm": 5.210528373718262, + "learning_rate": 0.004746015126070175, + "loss": 8.1363, + "step": 248900 + }, + { + "epoch": 1.0143712782198395, + "grad_norm": 3.991243839263916, + "learning_rate": 0.004745797008091829, + "loss": 8.1689, + "step": 249000 + }, + { + "epoch": 1.0143712782198395, + "eval_MaskedAccuracy": 0.48222734201463563, + "eval_loss": 1.7312606573104858, + "eval_runtime": 149.2811, + "eval_samples_per_second": 425.211, + "eval_steps_per_second": 1.661, + "step": 249000 + }, + { + "epoch": 1.014778656243221, + "grad_norm": 5.869924545288086, + "learning_rate": 0.004745578801522792, + "loss": 8.1905, + "step": 249100 + }, + { + "epoch": 1.0151860342666024, + "grad_norm": 3.2548134326934814, + "learning_rate": 0.004745360506371691, + "loss": 8.2221, + "step": 249200 + }, + { + "epoch": 1.015593412289984, + "grad_norm": 6.3848137855529785, + "learning_rate": 0.00474514212264715, + "loss": 8.2396, + "step": 249300 + }, + { + "epoch": 1.0160007903133654, + "grad_norm": 5.2388105392456055, + "learning_rate": 0.004744923650357814, + "loss": 8.2026, + "step": 249400 + }, + { + "epoch": 1.0164081683367467, + "grad_norm": 3.3096816539764404, + "learning_rate": 0.004744705089512321, + "loss": 8.1698, + "step": 249500 + }, + { + "epoch": 1.0168155463601283, + "grad_norm": 2.923670530319214, + "learning_rate": 0.004744486440119309, + "loss": 8.2163, + "step": 249600 + }, + { + "epoch": 1.0172229243835098, + "grad_norm": 4.202099323272705, + "learning_rate": 0.00474426770218742, + "loss": 8.1819, + "step": 249700 + }, + { + "epoch": 1.0176303024068911, + "grad_norm": 6.3655829429626465, + "learning_rate": 0.004744048875725303, + "loss": 8.2079, + "step": 249800 + }, + { + "epoch": 1.0180376804302727, + "grad_norm": 2.7881789207458496, + "learning_rate": 0.004743829960741609, + "loss": 8.1448, + "step": 249900 + }, + { + "epoch": 1.0184450584536542, + "grad_norm": 1.5355042219161987, + "learning_rate": 0.004743610957244993, + "loss": 8.1285, + "step": 250000 + }, + { + "epoch": 1.0184450584536542, + "eval_MaskedAccuracy": 0.48120418762355, + "eval_loss": 1.7278765439987183, + "eval_runtime": 149.5109, + "eval_samples_per_second": 424.558, + "eval_steps_per_second": 1.659, + "step": 250000 + }, + { + "epoch": 1.0188524364770355, + "grad_norm": 6.138425350189209, + "learning_rate": 0.004743391865244105, + "loss": 8.1409, + "step": 250100 + }, + { + "epoch": 1.019259814500417, + "grad_norm": 1.641326665878296, + "learning_rate": 0.004743172684747619, + "loss": 8.1702, + "step": 250200 + }, + { + "epoch": 1.0196671925237986, + "grad_norm": 5.6269989013671875, + "learning_rate": 0.004742953415764191, + "loss": 8.1567, + "step": 250300 + }, + { + "epoch": 1.0200745705471799, + "grad_norm": 4.5842766761779785, + "learning_rate": 0.004742734058302499, + "loss": 8.1058, + "step": 250400 + }, + { + "epoch": 1.0204819485705614, + "grad_norm": 4.902566909790039, + "learning_rate": 0.004742514612371211, + "loss": 8.0938, + "step": 250500 + }, + { + "epoch": 1.020889326593943, + "grad_norm": 2.6178858280181885, + "learning_rate": 0.004742295077979008, + "loss": 8.1237, + "step": 250600 + }, + { + "epoch": 1.0212967046173245, + "grad_norm": 3.580260992050171, + "learning_rate": 0.004742075455134564, + "loss": 8.1047, + "step": 250700 + }, + { + "epoch": 1.0217040826407058, + "grad_norm": 6.789980888366699, + "learning_rate": 0.004741855743846565, + "loss": 8.1315, + "step": 250800 + }, + { + "epoch": 1.0221114606640873, + "grad_norm": 6.0821638107299805, + "learning_rate": 0.00474163594412369, + "loss": 8.1029, + "step": 250900 + }, + { + "epoch": 1.0225188386874688, + "grad_norm": 4.657712459564209, + "learning_rate": 0.004741416055974637, + "loss": 8.081, + "step": 251000 + }, + { + "epoch": 1.0225188386874688, + "eval_MaskedAccuracy": 0.4886493288375345, + "eval_loss": 1.7067351341247559, + "eval_runtime": 158.5147, + "eval_samples_per_second": 400.442, + "eval_steps_per_second": 1.565, + "step": 251000 + }, + { + "epoch": 1.0229262167108502, + "grad_norm": 3.2733216285705566, + "learning_rate": 0.004741196079408091, + "loss": 8.1162, + "step": 251100 + }, + { + "epoch": 1.0233335947342317, + "grad_norm": 4.076113700866699, + "learning_rate": 0.004740976014432755, + "loss": 8.2341, + "step": 251200 + }, + { + "epoch": 1.0237409727576132, + "grad_norm": 6.664559364318848, + "learning_rate": 0.0047407558610573315, + "loss": 8.2386, + "step": 251300 + }, + { + "epoch": 1.0241483507809945, + "grad_norm": 5.388659477233887, + "learning_rate": 0.004740535619290521, + "loss": 8.1699, + "step": 251400 + }, + { + "epoch": 1.024555728804376, + "grad_norm": 4.1781840324401855, + "learning_rate": 0.004740315289141033, + "loss": 8.1318, + "step": 251500 + }, + { + "epoch": 1.0249631068277576, + "grad_norm": 5.653961658477783, + "learning_rate": 0.004740094870617571, + "loss": 8.2242, + "step": 251600 + }, + { + "epoch": 1.025370484851139, + "grad_norm": 6.724490165710449, + "learning_rate": 0.004739874363728855, + "loss": 8.218, + "step": 251700 + }, + { + "epoch": 1.0257778628745204, + "grad_norm": 6.656342029571533, + "learning_rate": 0.004739653768483603, + "loss": 8.1545, + "step": 251800 + }, + { + "epoch": 1.026185240897902, + "grad_norm": 3.1695098876953125, + "learning_rate": 0.004739433084890538, + "loss": 8.256, + "step": 251900 + }, + { + "epoch": 1.0265926189212833, + "grad_norm": 6.061586856842041, + "learning_rate": 0.00473921231295838, + "loss": 8.1495, + "step": 252000 + }, + { + "epoch": 1.0265926189212833, + "eval_MaskedAccuracy": 0.48661222703316337, + "eval_loss": 1.71462881565094, + "eval_runtime": 152.3222, + "eval_samples_per_second": 416.722, + "eval_steps_per_second": 1.628, + "step": 252000 + }, + { + "epoch": 1.0269999969446648, + "grad_norm": 2.984637498855591, + "learning_rate": 0.004738991452695858, + "loss": 8.174, + "step": 252100 + }, + { + "epoch": 1.0274073749680463, + "grad_norm": 4.735746383666992, + "learning_rate": 0.0047387705041117105, + "loss": 8.2237, + "step": 252200 + }, + { + "epoch": 1.0278147529914277, + "grad_norm": 2.597001552581787, + "learning_rate": 0.004738549467214665, + "loss": 8.1202, + "step": 252300 + }, + { + "epoch": 1.0282221310148092, + "grad_norm": 3.530951738357544, + "learning_rate": 0.004738328342013465, + "loss": 8.1646, + "step": 252400 + }, + { + "epoch": 1.0286295090381907, + "grad_norm": 5.350266933441162, + "learning_rate": 0.004738107128516851, + "loss": 8.1563, + "step": 252500 + }, + { + "epoch": 1.029036887061572, + "grad_norm": 2.237780809402466, + "learning_rate": 0.004737885826733565, + "loss": 8.1153, + "step": 252600 + }, + { + "epoch": 1.0294442650849536, + "grad_norm": 6.5042829513549805, + "learning_rate": 0.00473766443667236, + "loss": 8.1714, + "step": 252700 + }, + { + "epoch": 1.029851643108335, + "grad_norm": 2.9459121227264404, + "learning_rate": 0.004737442958341991, + "loss": 8.1588, + "step": 252800 + }, + { + "epoch": 1.0302590211317164, + "grad_norm": 4.08525276184082, + "learning_rate": 0.004737221391751213, + "loss": 8.0959, + "step": 252900 + }, + { + "epoch": 1.030666399155098, + "grad_norm": 4.466893196105957, + "learning_rate": 0.004736999736908782, + "loss": 8.1995, + "step": 253000 + }, + { + "epoch": 1.030666399155098, + "eval_MaskedAccuracy": 0.4787248450113544, + "eval_loss": 1.7428325414657593, + "eval_runtime": 151.8794, + "eval_samples_per_second": 417.937, + "eval_steps_per_second": 1.633, + "step": 253000 + }, + { + "epoch": 1.0310737771784795, + "grad_norm": 8.401349067687988, + "learning_rate": 0.0047367779938234655, + "loss": 8.2618, + "step": 253100 + }, + { + "epoch": 1.031481155201861, + "grad_norm": 4.149782180786133, + "learning_rate": 0.004736556162504031, + "loss": 8.2328, + "step": 253200 + }, + { + "epoch": 1.0318885332252423, + "grad_norm": 7.0294084548950195, + "learning_rate": 0.004736334242959242, + "loss": 8.1726, + "step": 253300 + }, + { + "epoch": 1.0322959112486239, + "grad_norm": 2.5511868000030518, + "learning_rate": 0.004736112235197877, + "loss": 8.1455, + "step": 253400 + }, + { + "epoch": 1.0327032892720054, + "grad_norm": 4.843980312347412, + "learning_rate": 0.0047358901392287114, + "loss": 8.177, + "step": 253500 + }, + { + "epoch": 1.0331106672953867, + "grad_norm": 3.1217896938323975, + "learning_rate": 0.004735667955060528, + "loss": 8.1667, + "step": 253600 + }, + { + "epoch": 1.0335180453187682, + "grad_norm": 13.546120643615723, + "learning_rate": 0.0047354456827021114, + "loss": 8.173, + "step": 253700 + }, + { + "epoch": 1.0339254233421498, + "grad_norm": 2.89715576171875, + "learning_rate": 0.004735223322162252, + "loss": 8.2104, + "step": 253800 + }, + { + "epoch": 1.034332801365531, + "grad_norm": 5.824563980102539, + "learning_rate": 0.0047350008734497314, + "loss": 8.2252, + "step": 253900 + }, + { + "epoch": 1.0347401793889126, + "grad_norm": 5.2591729164123535, + "learning_rate": 0.004734778336573352, + "loss": 8.2093, + "step": 254000 + }, + { + "epoch": 1.0347401793889126, + "eval_MaskedAccuracy": 0.4854302678060531, + "eval_loss": 1.71346914768219, + "eval_runtime": 150.9444, + "eval_samples_per_second": 420.526, + "eval_steps_per_second": 1.643, + "step": 254000 + }, + { + "epoch": 1.0351475574122941, + "grad_norm": 3.1785802841186523, + "learning_rate": 0.004734555711541909, + "loss": 8.1264, + "step": 254100 + }, + { + "epoch": 1.0355549354356754, + "grad_norm": 1.5771757364273071, + "learning_rate": 0.004734332998364197, + "loss": 8.1249, + "step": 254200 + }, + { + "epoch": 1.035962313459057, + "grad_norm": 3.380925178527832, + "learning_rate": 0.004734110197049031, + "loss": 8.1276, + "step": 254300 + }, + { + "epoch": 1.0363696914824385, + "grad_norm": 5.520992755889893, + "learning_rate": 0.004733887307605224, + "loss": 8.1856, + "step": 254400 + }, + { + "epoch": 1.0367770695058198, + "grad_norm": 2.0409340858459473, + "learning_rate": 0.004733664330041577, + "loss": 8.177, + "step": 254500 + }, + { + "epoch": 1.0371844475292014, + "grad_norm": 5.373855113983154, + "learning_rate": 0.004733441264366908, + "loss": 8.2082, + "step": 254600 + }, + { + "epoch": 1.037591825552583, + "grad_norm": 2.790085792541504, + "learning_rate": 0.004733218110590038, + "loss": 8.1732, + "step": 254700 + }, + { + "epoch": 1.0379992035759642, + "grad_norm": 1.4873521327972412, + "learning_rate": 0.004732994868719789, + "loss": 8.1835, + "step": 254800 + }, + { + "epoch": 1.0384065815993457, + "grad_norm": 1.530429720878601, + "learning_rate": 0.004732771538764989, + "loss": 8.2247, + "step": 254900 + }, + { + "epoch": 1.0388139596227273, + "grad_norm": 3.61285400390625, + "learning_rate": 0.004732548120734462, + "loss": 8.1749, + "step": 255000 + }, + { + "epoch": 1.0388139596227273, + "eval_MaskedAccuracy": 0.4864262992232778, + "eval_loss": 1.7230299711227417, + "eval_runtime": 149.6605, + "eval_samples_per_second": 424.133, + "eval_steps_per_second": 1.657, + "step": 255000 + }, + { + "epoch": 1.0392213376461086, + "grad_norm": 2.3742315769195557, + "learning_rate": 0.0047323246146370465, + "loss": 8.1234, + "step": 255100 + }, + { + "epoch": 1.0396287156694901, + "grad_norm": 5.732882499694824, + "learning_rate": 0.004732101020481578, + "loss": 8.176, + "step": 255200 + }, + { + "epoch": 1.0400360936928716, + "grad_norm": 5.98242712020874, + "learning_rate": 0.004731877338276891, + "loss": 8.2219, + "step": 255300 + }, + { + "epoch": 1.040443471716253, + "grad_norm": 5.500507831573486, + "learning_rate": 0.0047316535680318305, + "loss": 8.2961, + "step": 255400 + }, + { + "epoch": 1.0408508497396345, + "grad_norm": 3.187746524810791, + "learning_rate": 0.004731429709755249, + "loss": 8.1657, + "step": 255500 + }, + { + "epoch": 1.041258227763016, + "grad_norm": 6.253506660461426, + "learning_rate": 0.0047312057634559986, + "loss": 8.1167, + "step": 255600 + }, + { + "epoch": 1.0416656057863976, + "grad_norm": 2.829343795776367, + "learning_rate": 0.004730981729142925, + "loss": 8.1404, + "step": 255700 + }, + { + "epoch": 1.0420729838097789, + "grad_norm": 2.9654922485351562, + "learning_rate": 0.00473075760682488, + "loss": 8.1371, + "step": 255800 + }, + { + "epoch": 1.0424803618331604, + "grad_norm": 1.2807074785232544, + "learning_rate": 0.004730533396510732, + "loss": 8.1983, + "step": 255900 + }, + { + "epoch": 1.042887739856542, + "grad_norm": 4.129685401916504, + "learning_rate": 0.004730309098209351, + "loss": 8.174, + "step": 256000 + }, + { + "epoch": 1.042887739856542, + "eval_MaskedAccuracy": 0.4800277906718458, + "eval_loss": 1.7342748641967773, + "eval_runtime": 192.1508, + "eval_samples_per_second": 330.345, + "eval_steps_per_second": 1.291, + "step": 256000 + }, + { + "epoch": 1.0432951178799232, + "grad_norm": 3.0830535888671875, + "learning_rate": 0.004730084711929603, + "loss": 8.1937, + "step": 256100 + }, + { + "epoch": 1.0437024959033048, + "grad_norm": 5.282797813415527, + "learning_rate": 0.00472986023768035, + "loss": 8.1215, + "step": 256200 + }, + { + "epoch": 1.0441098739266863, + "grad_norm": 0.7001706957817078, + "learning_rate": 0.004729635675470472, + "loss": 8.1979, + "step": 256300 + }, + { + "epoch": 1.0445172519500676, + "grad_norm": 3.220350980758667, + "learning_rate": 0.004729411025308854, + "loss": 8.1868, + "step": 256400 + }, + { + "epoch": 1.0449246299734491, + "grad_norm": 5.503873348236084, + "learning_rate": 0.004729186287204369, + "loss": 8.1578, + "step": 256500 + }, + { + "epoch": 1.0453320079968307, + "grad_norm": 3.1256799697875977, + "learning_rate": 0.0047289614611659035, + "loss": 8.1322, + "step": 256600 + }, + { + "epoch": 1.045739386020212, + "grad_norm": 3.134495496749878, + "learning_rate": 0.004728736547202346, + "loss": 8.1981, + "step": 256700 + }, + { + "epoch": 1.0461467640435935, + "grad_norm": 6.195826530456543, + "learning_rate": 0.004728511545322587, + "loss": 8.1569, + "step": 256800 + }, + { + "epoch": 1.046554142066975, + "grad_norm": 5.16770076751709, + "learning_rate": 0.0047282864555355224, + "loss": 8.1899, + "step": 256900 + }, + { + "epoch": 1.0469615200903564, + "grad_norm": 5.35223388671875, + "learning_rate": 0.004728061277850063, + "loss": 8.1473, + "step": 257000 + }, + { + "epoch": 1.0469615200903564, + "eval_MaskedAccuracy": 0.48726242103151224, + "eval_loss": 1.7100180387496948, + "eval_runtime": 149.1608, + "eval_samples_per_second": 425.554, + "eval_steps_per_second": 1.663, + "step": 257000 + }, + { + "epoch": 1.047368898113738, + "grad_norm": 1.6657848358154297, + "learning_rate": 0.004727836012275092, + "loss": 8.1177, + "step": 257100 + }, + { + "epoch": 1.0477762761371194, + "grad_norm": 7.9431233406066895, + "learning_rate": 0.004727610658819527, + "loss": 8.168, + "step": 257200 + }, + { + "epoch": 1.0481836541605007, + "grad_norm": 2.4482877254486084, + "learning_rate": 0.00472738521749228, + "loss": 8.2099, + "step": 257300 + }, + { + "epoch": 1.0485910321838823, + "grad_norm": 3.727255344390869, + "learning_rate": 0.004727159688302251, + "loss": 8.2143, + "step": 257400 + }, + { + "epoch": 1.0489984102072638, + "grad_norm": 1.0347565412521362, + "learning_rate": 0.004726934071258369, + "loss": 8.129, + "step": 257500 + }, + { + "epoch": 1.0494057882306451, + "grad_norm": 3.988304615020752, + "learning_rate": 0.00472670836636955, + "loss": 8.1695, + "step": 257600 + }, + { + "epoch": 1.0498131662540267, + "grad_norm": 3.0877161026000977, + "learning_rate": 0.0047264825736447165, + "loss": 8.1442, + "step": 257700 + }, + { + "epoch": 1.0502205442774082, + "grad_norm": 5.7359232902526855, + "learning_rate": 0.004726256693092799, + "loss": 8.1259, + "step": 257800 + }, + { + "epoch": 1.0506279223007895, + "grad_norm": 4.0298662185668945, + "learning_rate": 0.004726030724722721, + "loss": 8.0781, + "step": 257900 + }, + { + "epoch": 1.051035300324171, + "grad_norm": 4.852095127105713, + "learning_rate": 0.004725804668543424, + "loss": 8.0993, + "step": 258000 + }, + { + "epoch": 1.051035300324171, + "eval_MaskedAccuracy": 0.48923874445373744, + "eval_loss": 1.7012666463851929, + "eval_runtime": 151.5858, + "eval_samples_per_second": 418.746, + "eval_steps_per_second": 1.636, + "step": 258000 + }, + { + "epoch": 1.0514426783475526, + "grad_norm": 4.054843902587891, + "learning_rate": 0.004725578524563837, + "loss": 8.0835, + "step": 258100 + }, + { + "epoch": 1.051850056370934, + "grad_norm": 5.810976982116699, + "learning_rate": 0.0047253522927929075, + "loss": 8.0914, + "step": 258200 + }, + { + "epoch": 1.0522574343943154, + "grad_norm": 3.9607856273651123, + "learning_rate": 0.004725125973239575, + "loss": 8.1012, + "step": 258300 + }, + { + "epoch": 1.052664812417697, + "grad_norm": 3.03961181640625, + "learning_rate": 0.004724899565912787, + "loss": 8.144, + "step": 258400 + }, + { + "epoch": 1.0530721904410785, + "grad_norm": 6.912770748138428, + "learning_rate": 0.004724673070821498, + "loss": 8.2314, + "step": 258500 + }, + { + "epoch": 1.0534795684644598, + "grad_norm": 1.1770914793014526, + "learning_rate": 0.004724446487974659, + "loss": 8.1517, + "step": 258600 + }, + { + "epoch": 1.0538869464878413, + "grad_norm": 10.230463981628418, + "learning_rate": 0.004724219817381233, + "loss": 8.2025, + "step": 258700 + }, + { + "epoch": 1.0542943245112228, + "grad_norm": 0.9877458214759827, + "learning_rate": 0.004723993059050179, + "loss": 8.156, + "step": 258800 + }, + { + "epoch": 1.0547017025346042, + "grad_norm": 1.8062012195587158, + "learning_rate": 0.004723766212990459, + "loss": 8.2504, + "step": 258900 + }, + { + "epoch": 1.0551090805579857, + "grad_norm": 4.536640644073486, + "learning_rate": 0.004723539279211045, + "loss": 8.249, + "step": 259000 + }, + { + "epoch": 1.0551090805579857, + "eval_MaskedAccuracy": 0.48295881254614076, + "eval_loss": 1.7288438081741333, + "eval_runtime": 151.1894, + "eval_samples_per_second": 419.844, + "eval_steps_per_second": 1.64, + "step": 259000 + }, + { + "epoch": 1.0555164585813672, + "grad_norm": 3.7817294597625732, + "learning_rate": 0.0047233122577209075, + "loss": 8.2358, + "step": 259100 + }, + { + "epoch": 1.0559238366047485, + "grad_norm": 4.458771705627441, + "learning_rate": 0.004723085148529026, + "loss": 8.1856, + "step": 259200 + }, + { + "epoch": 1.05633121462813, + "grad_norm": 4.099343299865723, + "learning_rate": 0.00472285795164437, + "loss": 8.1814, + "step": 259300 + }, + { + "epoch": 1.0567385926515116, + "grad_norm": 2.2912771701812744, + "learning_rate": 0.004722630667075925, + "loss": 8.1541, + "step": 259400 + }, + { + "epoch": 1.057145970674893, + "grad_norm": 5.370869159698486, + "learning_rate": 0.004722403294832682, + "loss": 8.2239, + "step": 259500 + }, + { + "epoch": 1.0575533486982744, + "grad_norm": 5.158105850219727, + "learning_rate": 0.004722175834923627, + "loss": 8.2159, + "step": 259600 + }, + { + "epoch": 1.057960726721656, + "grad_norm": 3.3874893188476562, + "learning_rate": 0.004721948287357746, + "loss": 8.1775, + "step": 259700 + }, + { + "epoch": 1.0583681047450373, + "grad_norm": 6.313094139099121, + "learning_rate": 0.004721720652144048, + "loss": 8.1786, + "step": 259800 + }, + { + "epoch": 1.0587754827684188, + "grad_norm": 7.082878112792969, + "learning_rate": 0.004721492929291526, + "loss": 8.1443, + "step": 259900 + }, + { + "epoch": 1.0591828607918004, + "grad_norm": 7.82448673248291, + "learning_rate": 0.0047212651188091835, + "loss": 8.1311, + "step": 260000 + }, + { + "epoch": 1.0591828607918004, + "eval_MaskedAccuracy": 0.4816183055664994, + "eval_loss": 1.7276784181594849, + "eval_runtime": 155.418, + "eval_samples_per_second": 408.421, + "eval_steps_per_second": 1.596, + "step": 260000 + }, + { + "epoch": 1.0595902388151817, + "grad_norm": 1.3936896324157715, + "learning_rate": 0.004721037220706029, + "loss": 8.1921, + "step": 260100 + }, + { + "epoch": 1.0599976168385632, + "grad_norm": 0.652860701084137, + "learning_rate": 0.004720809234991075, + "loss": 8.1643, + "step": 260200 + }, + { + "epoch": 1.0604049948619447, + "grad_norm": 6.536575794219971, + "learning_rate": 0.004720581161673324, + "loss": 8.2325, + "step": 260300 + }, + { + "epoch": 1.060812372885326, + "grad_norm": 4.015500068664551, + "learning_rate": 0.004720353000761806, + "loss": 8.225, + "step": 260400 + }, + { + "epoch": 1.0612197509087076, + "grad_norm": 4.177878379821777, + "learning_rate": 0.004720124752265536, + "loss": 8.1605, + "step": 260500 + }, + { + "epoch": 1.061627128932089, + "grad_norm": 0.9724656343460083, + "learning_rate": 0.004719896416193533, + "loss": 8.1223, + "step": 260600 + }, + { + "epoch": 1.0620345069554706, + "grad_norm": 5.476675033569336, + "learning_rate": 0.004719667992554826, + "loss": 8.2556, + "step": 260700 + }, + { + "epoch": 1.062441884978852, + "grad_norm": 3.0141918659210205, + "learning_rate": 0.0047194394813584555, + "loss": 8.1575, + "step": 260800 + }, + { + "epoch": 1.0628492630022335, + "grad_norm": 2.6944692134857178, + "learning_rate": 0.00471921088261344, + "loss": 8.1083, + "step": 260900 + }, + { + "epoch": 1.063256641025615, + "grad_norm": 2.877093553543091, + "learning_rate": 0.004718982196328832, + "loss": 8.1973, + "step": 261000 + }, + { + "epoch": 1.063256641025615, + "eval_MaskedAccuracy": 0.48363264676187867, + "eval_loss": 1.72201669216156, + "eval_runtime": 148.4137, + "eval_samples_per_second": 427.696, + "eval_steps_per_second": 1.671, + "step": 261000 + }, + { + "epoch": 1.0636640190489963, + "grad_norm": 2.6658310890197754, + "learning_rate": 0.004718753422513663, + "loss": 8.1741, + "step": 261100 + }, + { + "epoch": 1.0640713970723779, + "grad_norm": 3.4677579402923584, + "learning_rate": 0.004718524561176982, + "loss": 8.1563, + "step": 261200 + }, + { + "epoch": 1.0644787750957594, + "grad_norm": 1.8401967287063599, + "learning_rate": 0.004718295612327835, + "loss": 8.2139, + "step": 261300 + }, + { + "epoch": 1.0648861531191407, + "grad_norm": 3.109457492828369, + "learning_rate": 0.00471806657597528, + "loss": 8.207, + "step": 261400 + }, + { + "epoch": 1.0652935311425222, + "grad_norm": 4.015644550323486, + "learning_rate": 0.0047178374521283616, + "loss": 8.2151, + "step": 261500 + }, + { + "epoch": 1.0657009091659038, + "grad_norm": 4.507015228271484, + "learning_rate": 0.004717608240796146, + "loss": 8.2224, + "step": 261600 + }, + { + "epoch": 1.066108287189285, + "grad_norm": 5.073245048522949, + "learning_rate": 0.004717378941987694, + "loss": 8.1965, + "step": 261700 + }, + { + "epoch": 1.0665156652126666, + "grad_norm": 2.549006700515747, + "learning_rate": 0.004717149555712068, + "loss": 8.2085, + "step": 261800 + }, + { + "epoch": 1.0669230432360481, + "grad_norm": 4.281580448150635, + "learning_rate": 0.004716920081978343, + "loss": 8.1909, + "step": 261900 + }, + { + "epoch": 1.0673304212594295, + "grad_norm": 4.725358963012695, + "learning_rate": 0.004716690520795584, + "loss": 8.1232, + "step": 262000 + }, + { + "epoch": 1.0673304212594295, + "eval_MaskedAccuracy": 0.4883496074575056, + "eval_loss": 1.6964771747589111, + "eval_runtime": 164.2241, + "eval_samples_per_second": 386.521, + "eval_steps_per_second": 1.51, + "step": 262000 + }, + { + "epoch": 1.067737799282811, + "grad_norm": 4.8197760581970215, + "learning_rate": 0.004716460872172865, + "loss": 8.1073, + "step": 262100 + }, + { + "epoch": 1.0681451773061925, + "grad_norm": 1.8663020133972168, + "learning_rate": 0.004716231136119275, + "loss": 8.0647, + "step": 262200 + }, + { + "epoch": 1.0685525553295738, + "grad_norm": 2.7896194458007812, + "learning_rate": 0.004716001312643891, + "loss": 8.1042, + "step": 262300 + }, + { + "epoch": 1.0689599333529554, + "grad_norm": 3.972243309020996, + "learning_rate": 0.004715771401755794, + "loss": 8.1265, + "step": 262400 + }, + { + "epoch": 1.069367311376337, + "grad_norm": 3.569995403289795, + "learning_rate": 0.004715541403464084, + "loss": 8.0601, + "step": 262500 + }, + { + "epoch": 1.0697746893997182, + "grad_norm": 2.492865562438965, + "learning_rate": 0.004715311317777855, + "loss": 8.1619, + "step": 262600 + }, + { + "epoch": 1.0701820674230997, + "grad_norm": 2.858626365661621, + "learning_rate": 0.004715081144706195, + "loss": 8.238, + "step": 262700 + }, + { + "epoch": 1.0705894454464813, + "grad_norm": 3.3485522270202637, + "learning_rate": 0.004714850884258203, + "loss": 8.1726, + "step": 262800 + }, + { + "epoch": 1.0709968234698626, + "grad_norm": 2.575921058654785, + "learning_rate": 0.004714620536442993, + "loss": 8.1583, + "step": 262900 + }, + { + "epoch": 1.0714042014932441, + "grad_norm": 2.6548173427581787, + "learning_rate": 0.00471439010126966, + "loss": 8.1958, + "step": 263000 + }, + { + "epoch": 1.0714042014932441, + "eval_MaskedAccuracy": 0.48232082736317583, + "eval_loss": 1.734666109085083, + "eval_runtime": 162.083, + "eval_samples_per_second": 391.627, + "eval_steps_per_second": 1.53, + "step": 263000 + }, + { + "epoch": 1.0718115795166256, + "grad_norm": 6.802894592285156, + "learning_rate": 0.004714159578747321, + "loss": 8.2134, + "step": 263100 + }, + { + "epoch": 1.0722189575400072, + "grad_norm": 4.141062259674072, + "learning_rate": 0.004713928968885087, + "loss": 8.2439, + "step": 263200 + }, + { + "epoch": 1.0726263355633885, + "grad_norm": 5.311740398406982, + "learning_rate": 0.004713698271692076, + "loss": 8.1981, + "step": 263300 + }, + { + "epoch": 1.07303371358677, + "grad_norm": 11.01203441619873, + "learning_rate": 0.004713467487177409, + "loss": 8.1446, + "step": 263400 + }, + { + "epoch": 1.0734410916101516, + "grad_norm": 6.425957202911377, + "learning_rate": 0.004713236615350214, + "loss": 8.2184, + "step": 263500 + }, + { + "epoch": 1.0738484696335329, + "grad_norm": 4.091139793395996, + "learning_rate": 0.004713005656219612, + "loss": 8.1619, + "step": 263600 + }, + { + "epoch": 1.0742558476569144, + "grad_norm": 4.363187313079834, + "learning_rate": 0.004712774609794744, + "loss": 8.1349, + "step": 263700 + }, + { + "epoch": 1.074663225680296, + "grad_norm": 3.226597309112549, + "learning_rate": 0.004712543476084728, + "loss": 8.1829, + "step": 263800 + }, + { + "epoch": 1.0750706037036772, + "grad_norm": 5.673817157745361, + "learning_rate": 0.004712312255098713, + "loss": 8.1851, + "step": 263900 + }, + { + "epoch": 1.0754779817270588, + "grad_norm": 5.958888053894043, + "learning_rate": 0.004712080946845837, + "loss": 8.1547, + "step": 264000 + }, + { + "epoch": 1.0754779817270588, + "eval_MaskedAccuracy": 0.4843080681283223, + "eval_loss": 1.7236666679382324, + "eval_runtime": 189.0288, + "eval_samples_per_second": 335.801, + "eval_steps_per_second": 1.312, + "step": 264000 + }, + { + "epoch": 1.0758853597504403, + "grad_norm": 3.0145950317382812, + "learning_rate": 0.004711849551335251, + "loss": 8.2086, + "step": 264100 + }, + { + "epoch": 1.0762927377738216, + "grad_norm": 1.837198257446289, + "learning_rate": 0.004711618068576095, + "loss": 8.2113, + "step": 264200 + }, + { + "epoch": 1.0767001157972031, + "grad_norm": 3.117332935333252, + "learning_rate": 0.004711386498577533, + "loss": 8.1258, + "step": 264300 + }, + { + "epoch": 1.0771074938205847, + "grad_norm": 3.062966823577881, + "learning_rate": 0.004711154841348704, + "loss": 8.1532, + "step": 264400 + }, + { + "epoch": 1.077514871843966, + "grad_norm": 4.211122989654541, + "learning_rate": 0.004710923096898773, + "loss": 8.1588, + "step": 264500 + }, + { + "epoch": 1.0779222498673475, + "grad_norm": 2.284803867340088, + "learning_rate": 0.004710691265236909, + "loss": 8.1051, + "step": 264600 + }, + { + "epoch": 1.078329627890729, + "grad_norm": 4.682508945465088, + "learning_rate": 0.004710459346372274, + "loss": 8.0787, + "step": 264700 + }, + { + "epoch": 1.0787370059141104, + "grad_norm": 5.156423568725586, + "learning_rate": 0.004710227340314039, + "loss": 8.0659, + "step": 264800 + }, + { + "epoch": 1.079144383937492, + "grad_norm": 3.078960657119751, + "learning_rate": 0.004709995247071368, + "loss": 8.1013, + "step": 264900 + }, + { + "epoch": 1.0795517619608734, + "grad_norm": 3.2186193466186523, + "learning_rate": 0.00470976306665344, + "loss": 8.087, + "step": 265000 + }, + { + "epoch": 1.0795517619608734, + "eval_MaskedAccuracy": 0.488403056545127, + "eval_loss": 1.7033782005310059, + "eval_runtime": 163.3397, + "eval_samples_per_second": 388.613, + "eval_steps_per_second": 1.518, + "step": 265000 + }, + { + "epoch": 1.0799591399842547, + "grad_norm": 5.391504764556885, + "learning_rate": 0.0047095307990694365, + "loss": 8.0565, + "step": 265100 + }, + { + "epoch": 1.0803665180076363, + "grad_norm": 2.081413507461548, + "learning_rate": 0.0047092984443285425, + "loss": 8.1123, + "step": 265200 + }, + { + "epoch": 1.0807738960310178, + "grad_norm": 3.115123748779297, + "learning_rate": 0.004709066002439943, + "loss": 8.1011, + "step": 265300 + }, + { + "epoch": 1.0811812740543991, + "grad_norm": 4.943385124206543, + "learning_rate": 0.00470883347341282, + "loss": 8.0437, + "step": 265400 + }, + { + "epoch": 1.0815886520777807, + "grad_norm": 4.331999778747559, + "learning_rate": 0.004708600857256373, + "loss": 8.0744, + "step": 265500 + }, + { + "epoch": 1.0819960301011622, + "grad_norm": 4.476188659667969, + "learning_rate": 0.004708368153979797, + "loss": 8.0748, + "step": 265600 + }, + { + "epoch": 1.0824034081245437, + "grad_norm": 5.666794300079346, + "learning_rate": 0.0047081353635923025, + "loss": 8.0852, + "step": 265700 + }, + { + "epoch": 1.082810786147925, + "grad_norm": 5.742965221405029, + "learning_rate": 0.004707902486103087, + "loss": 8.077, + "step": 265800 + }, + { + "epoch": 1.0832181641713066, + "grad_norm": 2.3758745193481445, + "learning_rate": 0.004707669521521353, + "loss": 8.1068, + "step": 265900 + }, + { + "epoch": 1.083625542194688, + "grad_norm": 3.634063720703125, + "learning_rate": 0.004707436469856306, + "loss": 8.0879, + "step": 266000 + }, + { + "epoch": 1.083625542194688, + "eval_MaskedAccuracy": 0.4891889805537006, + "eval_loss": 1.7100374698638916, + "eval_runtime": 174.0548, + "eval_samples_per_second": 364.69, + "eval_steps_per_second": 1.425, + "step": 266000 + }, + { + "epoch": 1.0840329202180694, + "grad_norm": 3.887019157409668, + "learning_rate": 0.004707203331117164, + "loss": 8.0937, + "step": 266100 + }, + { + "epoch": 1.084440298241451, + "grad_norm": 5.891707897186279, + "learning_rate": 0.0047069701053131475, + "loss": 8.081, + "step": 266200 + }, + { + "epoch": 1.0848476762648325, + "grad_norm": 4.093088626861572, + "learning_rate": 0.0047067367924534765, + "loss": 8.0458, + "step": 266300 + }, + { + "epoch": 1.0852550542882138, + "grad_norm": 16.209623336791992, + "learning_rate": 0.00470650339254738, + "loss": 8.0872, + "step": 266400 + }, + { + "epoch": 1.0856624323115953, + "grad_norm": 3.369231939315796, + "learning_rate": 0.004706269905604073, + "loss": 8.25, + "step": 266500 + }, + { + "epoch": 1.0860698103349768, + "grad_norm": 1.8260586261749268, + "learning_rate": 0.004706036331632798, + "loss": 8.2503, + "step": 266600 + }, + { + "epoch": 1.0864771883583582, + "grad_norm": 4.970149993896484, + "learning_rate": 0.00470580267064278, + "loss": 8.23, + "step": 266700 + }, + { + "epoch": 1.0868845663817397, + "grad_norm": 1.7472909688949585, + "learning_rate": 0.004705568922643267, + "loss": 8.1158, + "step": 266800 + }, + { + "epoch": 1.0872919444051212, + "grad_norm": 1.5036593675613403, + "learning_rate": 0.004705335087643487, + "loss": 8.2032, + "step": 266900 + }, + { + "epoch": 1.0876993224285025, + "grad_norm": 4.026489734649658, + "learning_rate": 0.004705101165652698, + "loss": 8.2286, + "step": 267000 + }, + { + "epoch": 1.0876993224285025, + "eval_MaskedAccuracy": 0.48110589174391627, + "eval_loss": 1.7291169166564941, + "eval_runtime": 155.1183, + "eval_samples_per_second": 409.21, + "eval_steps_per_second": 1.599, + "step": 267000 + }, + { + "epoch": 1.088106700451884, + "grad_norm": 2.847480535507202, + "learning_rate": 0.004704867156680141, + "loss": 8.1889, + "step": 267100 + }, + { + "epoch": 1.0885140784752656, + "grad_norm": 5.766560077667236, + "learning_rate": 0.004704633060735069, + "loss": 8.1671, + "step": 267200 + }, + { + "epoch": 1.088921456498647, + "grad_norm": 1.6122405529022217, + "learning_rate": 0.004704398877826742, + "loss": 8.1129, + "step": 267300 + }, + { + "epoch": 1.0893288345220284, + "grad_norm": 1.7545127868652344, + "learning_rate": 0.00470416460796441, + "loss": 8.1343, + "step": 267400 + }, + { + "epoch": 1.08973621254541, + "grad_norm": 2.776700019836426, + "learning_rate": 0.004703930251157339, + "loss": 8.1142, + "step": 267500 + }, + { + "epoch": 1.0901435905687913, + "grad_norm": 0.6854613423347473, + "learning_rate": 0.0047036958074147955, + "loss": 8.154, + "step": 267600 + }, + { + "epoch": 1.0905509685921728, + "grad_norm": 2.7014808654785156, + "learning_rate": 0.00470346127674605, + "loss": 8.1321, + "step": 267700 + }, + { + "epoch": 1.0909583466155544, + "grad_norm": 10.678851127624512, + "learning_rate": 0.004703226659160372, + "loss": 8.1209, + "step": 267800 + }, + { + "epoch": 1.0913657246389357, + "grad_norm": 1.0287868976593018, + "learning_rate": 0.004702991954667038, + "loss": 8.1732, + "step": 267900 + }, + { + "epoch": 1.0917731026623172, + "grad_norm": 2.418203592300415, + "learning_rate": 0.004702757163275323, + "loss": 8.2017, + "step": 268000 + }, + { + "epoch": 1.0917731026623172, + "eval_MaskedAccuracy": 0.48407416086684335, + "eval_loss": 1.7229300737380981, + "eval_runtime": 160.1907, + "eval_samples_per_second": 396.253, + "eval_steps_per_second": 1.548, + "step": 268000 + }, + { + "epoch": 1.0921804806856987, + "grad_norm": 4.111920356750488, + "learning_rate": 0.004702522284994514, + "loss": 8.1566, + "step": 268100 + }, + { + "epoch": 1.0925878587090803, + "grad_norm": 1.994573950767517, + "learning_rate": 0.004702287319833896, + "loss": 8.1353, + "step": 268200 + }, + { + "epoch": 1.0929952367324616, + "grad_norm": 1.087450623512268, + "learning_rate": 0.00470205226780276, + "loss": 8.1521, + "step": 268300 + }, + { + "epoch": 1.093402614755843, + "grad_norm": 2.8973805904388428, + "learning_rate": 0.004701817128910397, + "loss": 8.1979, + "step": 268400 + }, + { + "epoch": 1.0938099927792246, + "grad_norm": 3.912325382232666, + "learning_rate": 0.004701581903166106, + "loss": 8.1635, + "step": 268500 + }, + { + "epoch": 1.094217370802606, + "grad_norm": 3.248401641845703, + "learning_rate": 0.00470134659057918, + "loss": 8.1411, + "step": 268600 + }, + { + "epoch": 1.0946247488259875, + "grad_norm": 5.1933135986328125, + "learning_rate": 0.004701111191158926, + "loss": 8.173, + "step": 268700 + }, + { + "epoch": 1.095032126849369, + "grad_norm": 1.2353286743164062, + "learning_rate": 0.004700875704914657, + "loss": 8.1581, + "step": 268800 + }, + { + "epoch": 1.0954395048727503, + "grad_norm": 4.2090253829956055, + "learning_rate": 0.004700640131855669, + "loss": 8.1331, + "step": 268900 + }, + { + "epoch": 1.0958468828961319, + "grad_norm": 3.521942377090454, + "learning_rate": 0.0047004044719912855, + "loss": 8.1954, + "step": 269000 + }, + { + "epoch": 1.0958468828961319, + "eval_MaskedAccuracy": 0.48426087111627897, + "eval_loss": 1.7280783653259277, + "eval_runtime": 156.8039, + "eval_samples_per_second": 404.811, + "eval_steps_per_second": 1.582, + "step": 269000 + }, + { + "epoch": 1.0962542609195134, + "grad_norm": 1.2542732954025269, + "learning_rate": 0.004700168725330823, + "loss": 8.139, + "step": 269100 + }, + { + "epoch": 1.0966616389428947, + "grad_norm": 3.3350350856781006, + "learning_rate": 0.004699932891883601, + "loss": 8.1973, + "step": 269200 + }, + { + "epoch": 1.0970690169662762, + "grad_norm": 2.256042003631592, + "learning_rate": 0.004699696971658945, + "loss": 8.1227, + "step": 269300 + }, + { + "epoch": 1.0974763949896578, + "grad_norm": 1.9328958988189697, + "learning_rate": 0.004699460964666176, + "loss": 8.0896, + "step": 269400 + }, + { + "epoch": 1.097883773013039, + "grad_norm": 2.058600425720215, + "learning_rate": 0.004699224870914627, + "loss": 8.168, + "step": 269500 + }, + { + "epoch": 1.0982911510364206, + "grad_norm": 4.162485599517822, + "learning_rate": 0.004698988690413631, + "loss": 8.0951, + "step": 269600 + }, + { + "epoch": 1.0986985290598021, + "grad_norm": 2.3824028968811035, + "learning_rate": 0.004698752423172528, + "loss": 8.0478, + "step": 269700 + }, + { + "epoch": 1.0991059070831835, + "grad_norm": 3.044100284576416, + "learning_rate": 0.004698516069200664, + "loss": 8.0492, + "step": 269800 + }, + { + "epoch": 1.099513285106565, + "grad_norm": 4.95687198638916, + "learning_rate": 0.004698279628507376, + "loss": 8.0768, + "step": 269900 + }, + { + "epoch": 1.0999206631299465, + "grad_norm": 1.5366508960723877, + "learning_rate": 0.004698043101102013, + "loss": 8.0825, + "step": 270000 + }, + { + "epoch": 1.0999206631299465, + "eval_MaskedAccuracy": 0.48612449373834676, + "eval_loss": 1.71591055393219, + "eval_runtime": 166.5615, + "eval_samples_per_second": 381.096, + "eval_steps_per_second": 1.489, + "step": 270000 + }, + { + "epoch": 1.1003280411533278, + "grad_norm": 2.692131996154785, + "learning_rate": 0.0046978064869939344, + "loss": 8.1676, + "step": 270100 + }, + { + "epoch": 1.1007354191767094, + "grad_norm": 4.03659200668335, + "learning_rate": 0.004697569786192483, + "loss": 8.1174, + "step": 270200 + }, + { + "epoch": 1.101142797200091, + "grad_norm": 0.7524415850639343, + "learning_rate": 0.004697332998707021, + "loss": 8.1263, + "step": 270300 + }, + { + "epoch": 1.1015501752234722, + "grad_norm": 6.00818395614624, + "learning_rate": 0.004697096124546906, + "loss": 8.1965, + "step": 270400 + }, + { + "epoch": 1.1019575532468537, + "grad_norm": 3.065471887588501, + "learning_rate": 0.004696859163721507, + "loss": 8.2128, + "step": 270500 + }, + { + "epoch": 1.1023649312702353, + "grad_norm": 3.4066545963287354, + "learning_rate": 0.004696622116240192, + "loss": 8.1677, + "step": 270600 + }, + { + "epoch": 1.1027723092936168, + "grad_norm": 5.791946887969971, + "learning_rate": 0.0046963849821123355, + "loss": 8.115, + "step": 270700 + }, + { + "epoch": 1.1031796873169981, + "grad_norm": 6.0821533203125, + "learning_rate": 0.004696147761347314, + "loss": 8.1141, + "step": 270800 + }, + { + "epoch": 1.1035870653403796, + "grad_norm": 4.188292980194092, + "learning_rate": 0.004695910453954499, + "loss": 8.1271, + "step": 270900 + }, + { + "epoch": 1.1039944433637612, + "grad_norm": 3.8692679405212402, + "learning_rate": 0.004695673059943279, + "loss": 8.1337, + "step": 271000 + }, + { + "epoch": 1.1039944433637612, + "eval_MaskedAccuracy": 0.4849790188195527, + "eval_loss": 1.7239805459976196, + "eval_runtime": 171.5484, + "eval_samples_per_second": 370.018, + "eval_steps_per_second": 1.446, + "step": 271000 + }, + { + "epoch": 1.1044018213871425, + "grad_norm": 2.2323920726776123, + "learning_rate": 0.004695435579323033, + "loss": 8.2123, + "step": 271100 + }, + { + "epoch": 1.104809199410524, + "grad_norm": 0.9265953302383423, + "learning_rate": 0.004695198012103152, + "loss": 8.2061, + "step": 271200 + }, + { + "epoch": 1.1052165774339056, + "grad_norm": 5.013404369354248, + "learning_rate": 0.004694960358293025, + "loss": 8.1286, + "step": 271300 + }, + { + "epoch": 1.1056239554572869, + "grad_norm": 0.45778951048851013, + "learning_rate": 0.004694722617902061, + "loss": 8.1433, + "step": 271400 + }, + { + "epoch": 1.1060313334806684, + "grad_norm": 5.009927749633789, + "learning_rate": 0.004694484790939649, + "loss": 8.1416, + "step": 271500 + }, + { + "epoch": 1.10643871150405, + "grad_norm": 2.8697478771209717, + "learning_rate": 0.004694246877415187, + "loss": 8.0945, + "step": 271600 + }, + { + "epoch": 1.1068460895274312, + "grad_norm": 3.8795487880706787, + "learning_rate": 0.0046940088773380914, + "loss": 8.1403, + "step": 271700 + }, + { + "epoch": 1.1072534675508128, + "grad_norm": 6.586146354675293, + "learning_rate": 0.004693770790717769, + "loss": 8.1388, + "step": 271800 + }, + { + "epoch": 1.1076608455741943, + "grad_norm": 4.505802154541016, + "learning_rate": 0.004693532617563631, + "loss": 8.1552, + "step": 271900 + }, + { + "epoch": 1.1080682235975756, + "grad_norm": 2.4296302795410156, + "learning_rate": 0.004693294357885089, + "loss": 8.0803, + "step": 272000 + }, + { + "epoch": 1.1080682235975756, + "eval_MaskedAccuracy": 0.4881155818758434, + "eval_loss": 1.6957036256790161, + "eval_runtime": 209.1919, + "eval_samples_per_second": 303.434, + "eval_steps_per_second": 1.186, + "step": 272000 + }, + { + "epoch": 1.1084756016209572, + "grad_norm": 7.506979465484619, + "learning_rate": 0.00469305601169157, + "loss": 8.1333, + "step": 272100 + }, + { + "epoch": 1.1088829796443387, + "grad_norm": 2.638509511947632, + "learning_rate": 0.004692817578992502, + "loss": 8.1584, + "step": 272200 + }, + { + "epoch": 1.10929035766772, + "grad_norm": 3.3887877464294434, + "learning_rate": 0.004692579059797297, + "loss": 8.1036, + "step": 272300 + }, + { + "epoch": 1.1096977356911015, + "grad_norm": 4.049765110015869, + "learning_rate": 0.004692340454115392, + "loss": 8.0382, + "step": 272400 + }, + { + "epoch": 1.110105113714483, + "grad_norm": 4.908907413482666, + "learning_rate": 0.0046921017619562235, + "loss": 8.0715, + "step": 272500 + }, + { + "epoch": 1.1105124917378644, + "grad_norm": 4.042283535003662, + "learning_rate": 0.004691862983329218, + "loss": 8.0545, + "step": 272600 + }, + { + "epoch": 1.110919869761246, + "grad_norm": 4.053991794586182, + "learning_rate": 0.0046916241182438255, + "loss": 8.0384, + "step": 272700 + }, + { + "epoch": 1.1113272477846274, + "grad_norm": 3.4201111793518066, + "learning_rate": 0.004691385166709484, + "loss": 8.0466, + "step": 272800 + }, + { + "epoch": 1.1117346258080087, + "grad_norm": 4.905217170715332, + "learning_rate": 0.0046911461287356465, + "loss": 8.0787, + "step": 272900 + }, + { + "epoch": 1.1121420038313903, + "grad_norm": 0.5103607773780823, + "learning_rate": 0.004690907004331768, + "loss": 8.1088, + "step": 273000 + }, + { + "epoch": 1.1121420038313903, + "eval_MaskedAccuracy": 0.48666966660284766, + "eval_loss": 1.707664132118225, + "eval_runtime": 155.8076, + "eval_samples_per_second": 407.4, + "eval_steps_per_second": 1.592, + "step": 273000 + }, + { + "epoch": 1.1125493818547718, + "grad_norm": 3.7668569087982178, + "learning_rate": 0.004690667793507286, + "loss": 8.1373, + "step": 273100 + }, + { + "epoch": 1.1129567598781533, + "grad_norm": 1.944350004196167, + "learning_rate": 0.00469042849627167, + "loss": 8.1208, + "step": 273200 + }, + { + "epoch": 1.1133641379015347, + "grad_norm": 5.168220043182373, + "learning_rate": 0.0046901891126343714, + "loss": 8.1387, + "step": 273300 + }, + { + "epoch": 1.1137715159249162, + "grad_norm": 2.502412796020508, + "learning_rate": 0.0046899496426048605, + "loss": 8.1277, + "step": 273400 + }, + { + "epoch": 1.1141788939482977, + "grad_norm": 0.8807030320167542, + "learning_rate": 0.00468971008619261, + "loss": 8.1554, + "step": 273500 + }, + { + "epoch": 1.114586271971679, + "grad_norm": 3.1593713760375977, + "learning_rate": 0.004689470443407077, + "loss": 8.1502, + "step": 273600 + }, + { + "epoch": 1.1149936499950606, + "grad_norm": 4.367379665374756, + "learning_rate": 0.0046892307142577545, + "loss": 8.2173, + "step": 273700 + }, + { + "epoch": 1.115401028018442, + "grad_norm": 3.056534767150879, + "learning_rate": 0.0046889908987541055, + "loss": 8.1163, + "step": 273800 + }, + { + "epoch": 1.1158084060418234, + "grad_norm": 1.1638544797897339, + "learning_rate": 0.004688750996905612, + "loss": 8.1309, + "step": 273900 + }, + { + "epoch": 1.116215784065205, + "grad_norm": 2.2497355937957764, + "learning_rate": 0.004688511008721758, + "loss": 8.2247, + "step": 274000 + }, + { + "epoch": 1.116215784065205, + "eval_MaskedAccuracy": 0.4831780076449618, + "eval_loss": 1.7273262739181519, + "eval_runtime": 161.3412, + "eval_samples_per_second": 393.427, + "eval_steps_per_second": 1.537, + "step": 274000 + }, + { + "epoch": 1.1166231620885865, + "grad_norm": 6.01616096496582, + "learning_rate": 0.004688270934212037, + "loss": 8.1929, + "step": 274100 + }, + { + "epoch": 1.1170305401119678, + "grad_norm": 2.2356910705566406, + "learning_rate": 0.004688030773385945, + "loss": 8.0901, + "step": 274200 + }, + { + "epoch": 1.1174379181353493, + "grad_norm": 4.811047077178955, + "learning_rate": 0.004687790526252971, + "loss": 8.1381, + "step": 274300 + }, + { + "epoch": 1.1178452961587309, + "grad_norm": 3.640188217163086, + "learning_rate": 0.004687550192822613, + "loss": 8.1452, + "step": 274400 + }, + { + "epoch": 1.1182526741821122, + "grad_norm": 2.9838287830352783, + "learning_rate": 0.004687309773104376, + "loss": 8.0714, + "step": 274500 + }, + { + "epoch": 1.1186600522054937, + "grad_norm": 5.863322734832764, + "learning_rate": 0.004687069267107753, + "loss": 8.0812, + "step": 274600 + }, + { + "epoch": 1.1190674302288752, + "grad_norm": 4.292328834533691, + "learning_rate": 0.004686828674842264, + "loss": 8.0808, + "step": 274700 + }, + { + "epoch": 1.1194748082522565, + "grad_norm": 5.0022053718566895, + "learning_rate": 0.004686587996317418, + "loss": 8.0865, + "step": 274800 + }, + { + "epoch": 1.119882186275638, + "grad_norm": 3.4901864528656006, + "learning_rate": 0.004686347231542733, + "loss": 8.08, + "step": 274900 + }, + { + "epoch": 1.1202895642990196, + "grad_norm": 1.724212408065796, + "learning_rate": 0.004686106380527726, + "loss": 8.1934, + "step": 275000 + }, + { + "epoch": 1.1202895642990196, + "eval_MaskedAccuracy": 0.4846983685409245, + "eval_loss": 1.7235397100448608, + "eval_runtime": 256.0296, + "eval_samples_per_second": 247.924, + "eval_steps_per_second": 0.969, + "step": 275000 + }, + { + "epoch": 1.120696942322401, + "grad_norm": 0.8583388924598694, + "learning_rate": 0.004685865443281921, + "loss": 8.1504, + "step": 275100 + }, + { + "epoch": 1.1211043203457824, + "grad_norm": 2.7991883754730225, + "learning_rate": 0.004685624419814842, + "loss": 8.1777, + "step": 275200 + }, + { + "epoch": 1.121511698369164, + "grad_norm": 7.151505947113037, + "learning_rate": 0.004685383310136006, + "loss": 8.1491, + "step": 275300 + }, + { + "epoch": 1.1219190763925453, + "grad_norm": 7.722084045410156, + "learning_rate": 0.004685142114254963, + "loss": 8.149, + "step": 275400 + }, + { + "epoch": 1.1223264544159268, + "grad_norm": 6.095405101776123, + "learning_rate": 0.004684900832181243, + "loss": 8.1325, + "step": 275500 + }, + { + "epoch": 1.1227338324393084, + "grad_norm": 1.316977858543396, + "learning_rate": 0.004684659463924384, + "loss": 8.1143, + "step": 275600 + }, + { + "epoch": 1.1231412104626899, + "grad_norm": 8.855969429016113, + "learning_rate": 0.004684418009493925, + "loss": 8.1615, + "step": 275700 + }, + { + "epoch": 1.1235485884860712, + "grad_norm": 3.414064645767212, + "learning_rate": 0.004684176468899419, + "loss": 8.1684, + "step": 275800 + }, + { + "epoch": 1.1239559665094527, + "grad_norm": 3.0379483699798584, + "learning_rate": 0.004683934842150418, + "loss": 8.058, + "step": 275900 + }, + { + "epoch": 1.1243633445328343, + "grad_norm": 3.9102447032928467, + "learning_rate": 0.004683693129256457, + "loss": 8.1565, + "step": 276000 + }, + { + "epoch": 1.1243633445328343, + "eval_MaskedAccuracy": 0.48240115204096146, + "eval_loss": 1.7249886989593506, + "eval_runtime": 173.6473, + "eval_samples_per_second": 365.546, + "eval_steps_per_second": 1.428, + "step": 276000 + }, + { + "epoch": 1.1247707225562156, + "grad_norm": 4.398492813110352, + "learning_rate": 0.004683451330227109, + "loss": 8.1829, + "step": 276100 + }, + { + "epoch": 1.125178100579597, + "grad_norm": 3.6306369304656982, + "learning_rate": 0.004683209445071927, + "loss": 8.0959, + "step": 276200 + }, + { + "epoch": 1.1255854786029786, + "grad_norm": 4.454225063323975, + "learning_rate": 0.004682967473800478, + "loss": 8.0981, + "step": 276300 + }, + { + "epoch": 1.12599285662636, + "grad_norm": 3.1625213623046875, + "learning_rate": 0.004682725416422324, + "loss": 8.0729, + "step": 276400 + }, + { + "epoch": 1.1264002346497415, + "grad_norm": 4.1124701499938965, + "learning_rate": 0.0046824832729470385, + "loss": 8.0742, + "step": 276500 + }, + { + "epoch": 1.126807612673123, + "grad_norm": 4.031746864318848, + "learning_rate": 0.004682241043384192, + "loss": 8.0598, + "step": 276600 + }, + { + "epoch": 1.1272149906965043, + "grad_norm": 3.8960773944854736, + "learning_rate": 0.0046819987277433636, + "loss": 8.0928, + "step": 276700 + }, + { + "epoch": 1.1276223687198859, + "grad_norm": 4.482669830322266, + "learning_rate": 0.004681756326034138, + "loss": 8.0778, + "step": 276800 + }, + { + "epoch": 1.1280297467432674, + "grad_norm": 2.868711471557617, + "learning_rate": 0.004681513838266088, + "loss": 8.0551, + "step": 276900 + }, + { + "epoch": 1.1284371247666487, + "grad_norm": 4.186704635620117, + "learning_rate": 0.00468127126444881, + "loss": 8.1249, + "step": 277000 + }, + { + "epoch": 1.1284371247666487, + "eval_MaskedAccuracy": 0.4834509396531186, + "eval_loss": 1.724225640296936, + "eval_runtime": 171.7546, + "eval_samples_per_second": 369.574, + "eval_steps_per_second": 1.444, + "step": 277000 + }, + { + "epoch": 1.1288445027900302, + "grad_norm": 3.7420201301574707, + "learning_rate": 0.004681028604591884, + "loss": 8.1495, + "step": 277100 + }, + { + "epoch": 1.1292518808134118, + "grad_norm": 4.511409759521484, + "learning_rate": 0.00468078585870491, + "loss": 8.0943, + "step": 277200 + }, + { + "epoch": 1.129659258836793, + "grad_norm": 5.29677152633667, + "learning_rate": 0.00468054302679748, + "loss": 8.0926, + "step": 277300 + }, + { + "epoch": 1.1300666368601746, + "grad_norm": 4.133951663970947, + "learning_rate": 0.0046803001088792035, + "loss": 8.0612, + "step": 277400 + }, + { + "epoch": 1.1304740148835561, + "grad_norm": 4.754375457763672, + "learning_rate": 0.004680057104959683, + "loss": 8.0477, + "step": 277500 + }, + { + "epoch": 1.1308813929069375, + "grad_norm": 5.114566326141357, + "learning_rate": 0.00467981401504852, + "loss": 8.0638, + "step": 277600 + }, + { + "epoch": 1.131288770930319, + "grad_norm": 4.64804744720459, + "learning_rate": 0.004679570839155321, + "loss": 8.0741, + "step": 277700 + }, + { + "epoch": 1.1316961489537005, + "grad_norm": 2.923494815826416, + "learning_rate": 0.004679327577289709, + "loss": 8.1535, + "step": 277800 + }, + { + "epoch": 1.1321035269770818, + "grad_norm": 3.0124878883361816, + "learning_rate": 0.004679084229461299, + "loss": 8.1356, + "step": 277900 + }, + { + "epoch": 1.1325109050004634, + "grad_norm": 4.786769866943359, + "learning_rate": 0.004678840795679713, + "loss": 8.119, + "step": 278000 + }, + { + "epoch": 1.1325109050004634, + "eval_MaskedAccuracy": 0.4894147655588561, + "eval_loss": 1.7013311386108398, + "eval_runtime": 233.4884, + "eval_samples_per_second": 271.859, + "eval_steps_per_second": 1.062, + "step": 278000 + }, + { + "epoch": 1.132918283023845, + "grad_norm": 5.455703258514404, + "learning_rate": 0.004678597275954576, + "loss": 8.0748, + "step": 278100 + }, + { + "epoch": 1.1333256610472264, + "grad_norm": 5.843703746795654, + "learning_rate": 0.004678353670295513, + "loss": 8.1148, + "step": 278200 + }, + { + "epoch": 1.1337330390706077, + "grad_norm": 2.1924307346343994, + "learning_rate": 0.004678109978712153, + "loss": 8.1491, + "step": 278300 + }, + { + "epoch": 1.1341404170939893, + "grad_norm": 5.719590663909912, + "learning_rate": 0.0046778662012141314, + "loss": 8.1309, + "step": 278400 + }, + { + "epoch": 1.1345477951173706, + "grad_norm": 5.64662504196167, + "learning_rate": 0.0046776223378110875, + "loss": 8.1755, + "step": 278500 + }, + { + "epoch": 1.1349551731407521, + "grad_norm": 4.763120651245117, + "learning_rate": 0.00467737838851267, + "loss": 8.2121, + "step": 278600 + }, + { + "epoch": 1.1353625511641336, + "grad_norm": 4.780452251434326, + "learning_rate": 0.004677134353328508, + "loss": 8.1014, + "step": 278700 + }, + { + "epoch": 1.1357699291875152, + "grad_norm": 4.963366985321045, + "learning_rate": 0.004676890232268258, + "loss": 8.0473, + "step": 278800 + }, + { + "epoch": 1.1361773072108965, + "grad_norm": 3.391356945037842, + "learning_rate": 0.004676646025341571, + "loss": 8.0508, + "step": 278900 + }, + { + "epoch": 1.136584685234278, + "grad_norm": 5.09703254699707, + "learning_rate": 0.004676401732558105, + "loss": 8.0855, + "step": 279000 + }, + { + "epoch": 1.136584685234278, + "eval_MaskedAccuracy": 0.4898350904715383, + "eval_loss": 1.6970337629318237, + "eval_runtime": 182.909, + "eval_samples_per_second": 347.036, + "eval_steps_per_second": 1.356, + "step": 279000 + }, + { + "epoch": 1.1369920632576596, + "grad_norm": 4.194087982177734, + "learning_rate": 0.0046761573539275065, + "loss": 8.0722, + "step": 279100 + }, + { + "epoch": 1.1373994412810409, + "grad_norm": 3.768462657928467, + "learning_rate": 0.00467591288945945, + "loss": 8.1768, + "step": 279200 + }, + { + "epoch": 1.1378068193044224, + "grad_norm": 2.8276965618133545, + "learning_rate": 0.004675668339163595, + "loss": 8.1721, + "step": 279300 + }, + { + "epoch": 1.138214197327804, + "grad_norm": 3.0509397983551025, + "learning_rate": 0.004675423703049611, + "loss": 8.1377, + "step": 279400 + }, + { + "epoch": 1.1386215753511852, + "grad_norm": 1.7160394191741943, + "learning_rate": 0.004675178981127179, + "loss": 8.1548, + "step": 279500 + }, + { + "epoch": 1.1390289533745668, + "grad_norm": 2.2624850273132324, + "learning_rate": 0.004674934173405958, + "loss": 8.1776, + "step": 279600 + }, + { + "epoch": 1.1394363313979483, + "grad_norm": 7.251711368560791, + "learning_rate": 0.004674689279895635, + "loss": 8.1354, + "step": 279700 + }, + { + "epoch": 1.1398437094213296, + "grad_norm": 5.023645401000977, + "learning_rate": 0.00467444430060589, + "loss": 8.2043, + "step": 279800 + }, + { + "epoch": 1.1402510874447112, + "grad_norm": 3.3399031162261963, + "learning_rate": 0.004674199235546409, + "loss": 8.0932, + "step": 279900 + }, + { + "epoch": 1.1406584654680927, + "grad_norm": 3.627291202545166, + "learning_rate": 0.0046739540847268776, + "loss": 8.065, + "step": 280000 + }, + { + "epoch": 1.1406584654680927, + "eval_MaskedAccuracy": 0.4889675875917004, + "eval_loss": 1.7060195207595825, + "eval_runtime": 209.0401, + "eval_samples_per_second": 303.655, + "eval_steps_per_second": 1.186, + "step": 280000 + }, + { + "epoch": 1.141065843491474, + "grad_norm": 5.373180866241455, + "learning_rate": 0.00467370884815699, + "loss": 8.091, + "step": 280100 + }, + { + "epoch": 1.1414732215148555, + "grad_norm": 6.052729606628418, + "learning_rate": 0.004673463525846452, + "loss": 8.0751, + "step": 280200 + }, + { + "epoch": 1.141880599538237, + "grad_norm": 3.9188849925994873, + "learning_rate": 0.004673218117804952, + "loss": 8.1804, + "step": 280300 + }, + { + "epoch": 1.1422879775616184, + "grad_norm": 4.439460754394531, + "learning_rate": 0.004672972624042189, + "loss": 8.1365, + "step": 280400 + }, + { + "epoch": 1.142695355585, + "grad_norm": 7.058625221252441, + "learning_rate": 0.004672727044567876, + "loss": 8.1494, + "step": 280500 + }, + { + "epoch": 1.1431027336083814, + "grad_norm": 3.8022348880767822, + "learning_rate": 0.004672481379391715, + "loss": 8.1378, + "step": 280600 + }, + { + "epoch": 1.143510111631763, + "grad_norm": 3.5311572551727295, + "learning_rate": 0.004672235628523426, + "loss": 8.1137, + "step": 280700 + }, + { + "epoch": 1.1439174896551443, + "grad_norm": 4.319153785705566, + "learning_rate": 0.004671989791972717, + "loss": 8.0777, + "step": 280800 + }, + { + "epoch": 1.1443248676785258, + "grad_norm": 4.833874225616455, + "learning_rate": 0.004671743869749315, + "loss": 8.1024, + "step": 280900 + }, + { + "epoch": 1.1447322457019071, + "grad_norm": 3.737802743911743, + "learning_rate": 0.004671497861862941, + "loss": 8.078, + "step": 281000 + }, + { + "epoch": 1.1447322457019071, + "eval_MaskedAccuracy": 0.4893886676306348, + "eval_loss": 1.7081918716430664, + "eval_runtime": 181.4112, + "eval_samples_per_second": 349.901, + "eval_steps_per_second": 1.367, + "step": 281000 + }, + { + "epoch": 1.1451396237252887, + "grad_norm": 2.8301610946655273, + "learning_rate": 0.004671251768323314, + "loss": 8.0738, + "step": 281100 + }, + { + "epoch": 1.1455470017486702, + "grad_norm": 4.914058208465576, + "learning_rate": 0.004671005589140168, + "loss": 8.0886, + "step": 281200 + }, + { + "epoch": 1.1459543797720517, + "grad_norm": 3.273712158203125, + "learning_rate": 0.004670759324323236, + "loss": 8.0935, + "step": 281300 + }, + { + "epoch": 1.146361757795433, + "grad_norm": 4.916646480560303, + "learning_rate": 0.004670512973882251, + "loss": 8.0889, + "step": 281400 + }, + { + "epoch": 1.1467691358188146, + "grad_norm": 7.305778980255127, + "learning_rate": 0.00467026653782696, + "loss": 8.1171, + "step": 281500 + }, + { + "epoch": 1.147176513842196, + "grad_norm": 2.4786717891693115, + "learning_rate": 0.004670020016167105, + "loss": 8.2327, + "step": 281600 + }, + { + "epoch": 1.1475838918655774, + "grad_norm": 3.4263861179351807, + "learning_rate": 0.004669773408912425, + "loss": 8.2002, + "step": 281700 + }, + { + "epoch": 1.147991269888959, + "grad_norm": 4.081550121307373, + "learning_rate": 0.0046695267160726745, + "loss": 8.1621, + "step": 281800 + }, + { + "epoch": 1.1483986479123405, + "grad_norm": 2.9435956478118896, + "learning_rate": 0.004669279937657605, + "loss": 8.0995, + "step": 281900 + }, + { + "epoch": 1.1488060259357218, + "grad_norm": 3.1086764335632324, + "learning_rate": 0.0046690330736769755, + "loss": 8.0852, + "step": 282000 + }, + { + "epoch": 1.1488060259357218, + "eval_MaskedAccuracy": 0.48949397088198715, + "eval_loss": 1.698317527770996, + "eval_runtime": 171.0137, + "eval_samples_per_second": 371.175, + "eval_steps_per_second": 1.45, + "step": 282000 + }, + { + "epoch": 1.1492134039591033, + "grad_norm": 4.455804347991943, + "learning_rate": 0.004668786124140548, + "loss": 8.0668, + "step": 282100 + }, + { + "epoch": 1.1496207819824849, + "grad_norm": 8.845037460327148, + "learning_rate": 0.0046685390890580806, + "loss": 8.0871, + "step": 282200 + }, + { + "epoch": 1.1500281600058662, + "grad_norm": 2.697097063064575, + "learning_rate": 0.004668291968439348, + "loss": 8.1008, + "step": 282300 + }, + { + "epoch": 1.1504355380292477, + "grad_norm": 1.5756800174713135, + "learning_rate": 0.004668044762294112, + "loss": 8.0934, + "step": 282400 + }, + { + "epoch": 1.1508429160526292, + "grad_norm": 3.879124641418457, + "learning_rate": 0.004667797470632144, + "loss": 8.1046, + "step": 282500 + }, + { + "epoch": 1.1512502940760105, + "grad_norm": 3.2476515769958496, + "learning_rate": 0.004667550093463231, + "loss": 8.0288, + "step": 282600 + }, + { + "epoch": 1.151657672099392, + "grad_norm": 3.167145252227783, + "learning_rate": 0.004667302630797136, + "loss": 8.0793, + "step": 282700 + }, + { + "epoch": 1.1520650501227736, + "grad_norm": 6.005479335784912, + "learning_rate": 0.004667055082643657, + "loss": 8.0578, + "step": 282800 + }, + { + "epoch": 1.152472428146155, + "grad_norm": 3.0901143550872803, + "learning_rate": 0.00466680744901258, + "loss": 8.1376, + "step": 282900 + }, + { + "epoch": 1.1528798061695364, + "grad_norm": 0.9978356957435608, + "learning_rate": 0.004666559729913692, + "loss": 8.2153, + "step": 283000 + }, + { + "epoch": 1.1528798061695364, + "eval_MaskedAccuracy": 0.4848038331343619, + "eval_loss": 1.721642255783081, + "eval_runtime": 235.129, + "eval_samples_per_second": 269.962, + "eval_steps_per_second": 1.055, + "step": 283000 + }, + { + "epoch": 1.153287184192918, + "grad_norm": 3.1643245220184326, + "learning_rate": 0.004666311925356788, + "loss": 8.1531, + "step": 283100 + }, + { + "epoch": 1.1536945622162995, + "grad_norm": 0.6623817682266235, + "learning_rate": 0.004666064035351659, + "loss": 8.1229, + "step": 283200 + }, + { + "epoch": 1.1541019402396808, + "grad_norm": 4.834157943725586, + "learning_rate": 0.0046658160599081155, + "loss": 8.1745, + "step": 283300 + }, + { + "epoch": 1.1545093182630624, + "grad_norm": 2.770371437072754, + "learning_rate": 0.004665567999035959, + "loss": 8.1868, + "step": 283400 + }, + { + "epoch": 1.1549166962864437, + "grad_norm": 2.7883384227752686, + "learning_rate": 0.004665319852744992, + "loss": 8.0956, + "step": 283500 + }, + { + "epoch": 1.1553240743098252, + "grad_norm": 3.587266445159912, + "learning_rate": 0.004665071621045019, + "loss": 8.1426, + "step": 283600 + }, + { + "epoch": 1.1557314523332067, + "grad_norm": 2.944342851638794, + "learning_rate": 0.004664823303945865, + "loss": 8.1905, + "step": 283700 + }, + { + "epoch": 1.1561388303565883, + "grad_norm": 4.455312252044678, + "learning_rate": 0.004664574901457347, + "loss": 8.0936, + "step": 283800 + }, + { + "epoch": 1.1565462083799696, + "grad_norm": 2.765490770339966, + "learning_rate": 0.004664326413589277, + "loss": 8.0715, + "step": 283900 + }, + { + "epoch": 1.156953586403351, + "grad_norm": 5.531642436981201, + "learning_rate": 0.004664077840351492, + "loss": 8.0301, + "step": 284000 + }, + { + "epoch": 1.156953586403351, + "eval_MaskedAccuracy": 0.49008495701906885, + "eval_loss": 1.6991403102874756, + "eval_runtime": 226.8438, + "eval_samples_per_second": 279.822, + "eval_steps_per_second": 1.093, + "step": 284000 + }, + { + "epoch": 1.1573609644267326, + "grad_norm": 3.5490705966949463, + "learning_rate": 0.004663829181753806, + "loss": 8.0236, + "step": 284100 + }, + { + "epoch": 1.157768342450114, + "grad_norm": 3.0921506881713867, + "learning_rate": 0.004663580437806063, + "loss": 8.0636, + "step": 284200 + }, + { + "epoch": 1.1581757204734955, + "grad_norm": 1.8780794143676758, + "learning_rate": 0.0046633316085180915, + "loss": 8.0648, + "step": 284300 + }, + { + "epoch": 1.158583098496877, + "grad_norm": 3.8255105018615723, + "learning_rate": 0.004663082693899725, + "loss": 8.0325, + "step": 284400 + }, + { + "epoch": 1.1589904765202583, + "grad_norm": 5.340209007263184, + "learning_rate": 0.0046628336939608115, + "loss": 8.057, + "step": 284500 + }, + { + "epoch": 1.1593978545436399, + "grad_norm": 3.661881446838379, + "learning_rate": 0.004662584608711187, + "loss": 8.0247, + "step": 284600 + }, + { + "epoch": 1.1598052325670214, + "grad_norm": 4.300080299377441, + "learning_rate": 0.0046623354381607005, + "loss": 8.0536, + "step": 284700 + }, + { + "epoch": 1.1602126105904027, + "grad_norm": 5.313004493713379, + "learning_rate": 0.004662086182319206, + "loss": 8.0564, + "step": 284800 + }, + { + "epoch": 1.1606199886137842, + "grad_norm": 3.9499144554138184, + "learning_rate": 0.004661836841196555, + "loss": 8.0147, + "step": 284900 + }, + { + "epoch": 1.1610273666371658, + "grad_norm": 3.613180637359619, + "learning_rate": 0.004661587414802613, + "loss": 8.0632, + "step": 285000 + }, + { + "epoch": 1.1610273666371658, + "eval_MaskedAccuracy": 0.4901985433871838, + "eval_loss": 1.706134557723999, + "eval_runtime": 176.7895, + "eval_samples_per_second": 359.048, + "eval_steps_per_second": 1.403, + "step": 285000 + }, + { + "epoch": 1.161434744660547, + "grad_norm": 3.91060471534729, + "learning_rate": 0.004661337903147235, + "loss": 8.0707, + "step": 285100 + }, + { + "epoch": 1.1618421226839286, + "grad_norm": 1.3036532402038574, + "learning_rate": 0.0046610883062402925, + "loss": 8.1888, + "step": 285200 + }, + { + "epoch": 1.1622495007073101, + "grad_norm": 0.8332093954086304, + "learning_rate": 0.004660838624091639, + "loss": 8.1748, + "step": 285300 + }, + { + "epoch": 1.1626568787306915, + "grad_norm": 2.4648735523223877, + "learning_rate": 0.004660588856711164, + "loss": 8.193, + "step": 285400 + }, + { + "epoch": 1.163064256754073, + "grad_norm": 2.6603598594665527, + "learning_rate": 0.004660339004108723, + "loss": 8.2083, + "step": 285500 + }, + { + "epoch": 1.1634716347774545, + "grad_norm": 2.0850071907043457, + "learning_rate": 0.00466008906629421, + "loss": 8.1916, + "step": 285600 + }, + { + "epoch": 1.163879012800836, + "grad_norm": 5.8866424560546875, + "learning_rate": 0.004659839043277496, + "loss": 8.1063, + "step": 285700 + }, + { + "epoch": 1.1642863908242174, + "grad_norm": 4.506791591644287, + "learning_rate": 0.004659588935068473, + "loss": 8.0866, + "step": 285800 + }, + { + "epoch": 1.164693768847599, + "grad_norm": 4.221245765686035, + "learning_rate": 0.004659338741677024, + "loss": 8.0499, + "step": 285900 + }, + { + "epoch": 1.1651011468709802, + "grad_norm": 6.313796520233154, + "learning_rate": 0.004659088463113041, + "loss": 8.0061, + "step": 286000 + }, + { + "epoch": 1.1651011468709802, + "eval_MaskedAccuracy": 0.4906126527733975, + "eval_loss": 1.6945469379425049, + "eval_runtime": 158.8981, + "eval_samples_per_second": 399.476, + "eval_steps_per_second": 1.561, + "step": 286000 + }, + { + "epoch": 1.1655085248943617, + "grad_norm": 3.875453472137451, + "learning_rate": 0.004658838099386422, + "loss": 8.0584, + "step": 286100 + }, + { + "epoch": 1.1659159029177433, + "grad_norm": 3.0014939308166504, + "learning_rate": 0.004658587650507059, + "loss": 8.0632, + "step": 286200 + }, + { + "epoch": 1.1663232809411248, + "grad_norm": 2.730990171432495, + "learning_rate": 0.004658337116484863, + "loss": 7.9925, + "step": 286300 + }, + { + "epoch": 1.1667306589645061, + "grad_norm": 4.797238349914551, + "learning_rate": 0.00465808649732973, + "loss": 8.036, + "step": 286400 + }, + { + "epoch": 1.1671380369878877, + "grad_norm": 4.819352626800537, + "learning_rate": 0.004657835793051574, + "loss": 8.0317, + "step": 286500 + }, + { + "epoch": 1.1675454150112692, + "grad_norm": 1.2718093395233154, + "learning_rate": 0.00465758500366031, + "loss": 8.0646, + "step": 286600 + }, + { + "epoch": 1.1679527930346505, + "grad_norm": 2.9197137355804443, + "learning_rate": 0.004657334129165845, + "loss": 8.1509, + "step": 286700 + }, + { + "epoch": 1.168360171058032, + "grad_norm": 11.091897964477539, + "learning_rate": 0.0046570831695780975, + "loss": 8.1928, + "step": 286800 + }, + { + "epoch": 1.1687675490814136, + "grad_norm": 4.137915134429932, + "learning_rate": 0.004656832124906996, + "loss": 8.1989, + "step": 286900 + }, + { + "epoch": 1.1691749271047949, + "grad_norm": 4.835000514984131, + "learning_rate": 0.0046565809951624616, + "loss": 8.0976, + "step": 287000 + }, + { + "epoch": 1.1691749271047949, + "eval_MaskedAccuracy": 0.48856667125114417, + "eval_loss": 1.700318694114685, + "eval_runtime": 179.8052, + "eval_samples_per_second": 353.026, + "eval_steps_per_second": 1.379, + "step": 287000 + }, + { + "epoch": 1.1695823051281764, + "grad_norm": 3.653043508529663, + "learning_rate": 0.004656329780354424, + "loss": 8.0464, + "step": 287100 + }, + { + "epoch": 1.169989683151558, + "grad_norm": 4.344825267791748, + "learning_rate": 0.004656078480492809, + "loss": 8.0559, + "step": 287200 + }, + { + "epoch": 1.1703970611749392, + "grad_norm": 4.034629821777344, + "learning_rate": 0.004655827095587565, + "loss": 8.0518, + "step": 287300 + }, + { + "epoch": 1.1708044391983208, + "grad_norm": 1.9681981801986694, + "learning_rate": 0.004655575625648619, + "loss": 8.0562, + "step": 287400 + }, + { + "epoch": 1.1712118172217023, + "grad_norm": 5.370085716247559, + "learning_rate": 0.004655324070685913, + "loss": 8.0785, + "step": 287500 + }, + { + "epoch": 1.1716191952450836, + "grad_norm": 13.195560455322266, + "learning_rate": 0.004655072430709397, + "loss": 8.0701, + "step": 287600 + }, + { + "epoch": 1.1720265732684652, + "grad_norm": 4.355969429016113, + "learning_rate": 0.004654820705729013, + "loss": 8.1519, + "step": 287700 + }, + { + "epoch": 1.1724339512918467, + "grad_norm": 4.580450534820557, + "learning_rate": 0.004654568895754724, + "loss": 8.0764, + "step": 287800 + }, + { + "epoch": 1.172841329315228, + "grad_norm": 4.757116794586182, + "learning_rate": 0.004654317000796484, + "loss": 8.0572, + "step": 287900 + }, + { + "epoch": 1.1732487073386095, + "grad_norm": 2.296961545944214, + "learning_rate": 0.0046540650208642355, + "loss": 8.1, + "step": 288000 + }, + { + "epoch": 1.1732487073386095, + "eval_MaskedAccuracy": 0.48794103054963767, + "eval_loss": 1.70279061794281, + "eval_runtime": 227.6202, + "eval_samples_per_second": 278.868, + "eval_steps_per_second": 1.09, + "step": 288000 + }, + { + "epoch": 1.173656085361991, + "grad_norm": 2.192230224609375, + "learning_rate": 0.004653812955967959, + "loss": 8.1132, + "step": 288100 + }, + { + "epoch": 1.1740634633853726, + "grad_norm": 3.585665464401245, + "learning_rate": 0.004653560806117613, + "loss": 8.1349, + "step": 288200 + }, + { + "epoch": 1.174470841408754, + "grad_norm": 4.043099880218506, + "learning_rate": 0.004653308571323171, + "loss": 8.1289, + "step": 288300 + }, + { + "epoch": 1.1748782194321354, + "grad_norm": 3.1525979042053223, + "learning_rate": 0.004653056251594601, + "loss": 8.2137, + "step": 288400 + }, + { + "epoch": 1.1752855974555168, + "grad_norm": 3.7244136333465576, + "learning_rate": 0.00465280384694188, + "loss": 8.1575, + "step": 288500 + }, + { + "epoch": 1.1756929754788983, + "grad_norm": 4.780149459838867, + "learning_rate": 0.004652551357374978, + "loss": 8.0906, + "step": 288600 + }, + { + "epoch": 1.1761003535022798, + "grad_norm": 3.728311061859131, + "learning_rate": 0.0046522987829038884, + "loss": 8.0462, + "step": 288700 + }, + { + "epoch": 1.1765077315256613, + "grad_norm": 4.469501972198486, + "learning_rate": 0.004652046123538593, + "loss": 8.0955, + "step": 288800 + }, + { + "epoch": 1.1769151095490427, + "grad_norm": 3.7230305671691895, + "learning_rate": 0.004651793379289087, + "loss": 8.0194, + "step": 288900 + }, + { + "epoch": 1.1773224875724242, + "grad_norm": 4.39764404296875, + "learning_rate": 0.004651540550165351, + "loss": 8.051, + "step": 289000 + }, + { + "epoch": 1.1773224875724242, + "eval_MaskedAccuracy": 0.49087479124984285, + "eval_loss": 1.6909769773483276, + "eval_runtime": 558.9363, + "eval_samples_per_second": 113.566, + "eval_steps_per_second": 0.444, + "step": 289000 + }, + { + "epoch": 1.1777298655958057, + "grad_norm": 6.782959461212158, + "learning_rate": 0.0046512876361773856, + "loss": 8.032, + "step": 289100 + }, + { + "epoch": 1.178137243619187, + "grad_norm": 1.3018677234649658, + "learning_rate": 0.004651034637335197, + "loss": 8.174, + "step": 289200 + }, + { + "epoch": 1.1785446216425686, + "grad_norm": 1.7693558931350708, + "learning_rate": 0.00465078155364878, + "loss": 8.1965, + "step": 289300 + }, + { + "epoch": 1.17895199966595, + "grad_norm": 4.082480430603027, + "learning_rate": 0.004650528385128143, + "loss": 8.1515, + "step": 289400 + }, + { + "epoch": 1.1793593776893314, + "grad_norm": 4.21183967590332, + "learning_rate": 0.0046502751317832915, + "loss": 8.0705, + "step": 289500 + }, + { + "epoch": 1.179766755712713, + "grad_norm": 3.432459831237793, + "learning_rate": 0.004650021793624242, + "loss": 8.0865, + "step": 289600 + }, + { + "epoch": 1.1801741337360945, + "grad_norm": 4.862122535705566, + "learning_rate": 0.004649768370661001, + "loss": 8.0546, + "step": 289700 + }, + { + "epoch": 1.1805815117594758, + "grad_norm": 4.301804542541504, + "learning_rate": 0.004649514862903603, + "loss": 8.0604, + "step": 289800 + }, + { + "epoch": 1.1809888897828573, + "grad_norm": 3.298346757888794, + "learning_rate": 0.004649261270362061, + "loss": 8.0628, + "step": 289900 + }, + { + "epoch": 1.1813962678062389, + "grad_norm": 6.476545333862305, + "learning_rate": 0.004649007593046405, + "loss": 8.1626, + "step": 290000 + }, + { + "epoch": 1.1813962678062389, + "eval_MaskedAccuracy": 0.4817633937202652, + "eval_loss": 1.730674386024475, + "eval_runtime": 160.0866, + "eval_samples_per_second": 396.51, + "eval_steps_per_second": 1.549, + "step": 290000 + }, + { + "epoch": 1.1818036458296202, + "grad_norm": 4.453090667724609, + "learning_rate": 0.00464875383096666, + "loss": 8.1244, + "step": 290100 + }, + { + "epoch": 1.1822110238530017, + "grad_norm": 5.203738212585449, + "learning_rate": 0.00464849998413286, + "loss": 8.1195, + "step": 290200 + }, + { + "epoch": 1.1826184018763832, + "grad_norm": 5.900394916534424, + "learning_rate": 0.004648246052555043, + "loss": 8.1849, + "step": 290300 + }, + { + "epoch": 1.1830257798997645, + "grad_norm": 1.8650020360946655, + "learning_rate": 0.004647992036243241, + "loss": 8.184, + "step": 290400 + }, + { + "epoch": 1.183433157923146, + "grad_norm": 2.952779769897461, + "learning_rate": 0.004647737935207507, + "loss": 8.1314, + "step": 290500 + }, + { + "epoch": 1.1838405359465276, + "grad_norm": 1.1895891427993774, + "learning_rate": 0.004647483749457883, + "loss": 8.0934, + "step": 290600 + }, + { + "epoch": 1.1842479139699091, + "grad_norm": 2.0459725856781006, + "learning_rate": 0.004647229479004416, + "loss": 8.1721, + "step": 290700 + }, + { + "epoch": 1.1846552919932904, + "grad_norm": 3.5183563232421875, + "learning_rate": 0.004646975123857159, + "loss": 8.1091, + "step": 290800 + }, + { + "epoch": 1.185062670016672, + "grad_norm": 1.6890475749969482, + "learning_rate": 0.004646720684026171, + "loss": 8.1511, + "step": 290900 + }, + { + "epoch": 1.1854700480400533, + "grad_norm": 3.2172505855560303, + "learning_rate": 0.004646466159521512, + "loss": 8.0632, + "step": 291000 + }, + { + "epoch": 1.1854700480400533, + "eval_MaskedAccuracy": 0.4890016386016322, + "eval_loss": 1.703794002532959, + "eval_runtime": 264.6484, + "eval_samples_per_second": 239.85, + "eval_steps_per_second": 0.937, + "step": 291000 + }, + { + "epoch": 1.1858774260634348, + "grad_norm": 1.6402572393417358, + "learning_rate": 0.00464621155035324, + "loss": 8.1184, + "step": 291100 + }, + { + "epoch": 1.1862848040868164, + "grad_norm": 3.0767862796783447, + "learning_rate": 0.004645956856531417, + "loss": 8.1399, + "step": 291200 + }, + { + "epoch": 1.186692182110198, + "grad_norm": 3.245995283126831, + "learning_rate": 0.004645702078066124, + "loss": 8.0841, + "step": 291300 + }, + { + "epoch": 1.1870995601335792, + "grad_norm": 4.78672981262207, + "learning_rate": 0.004645447214967429, + "loss": 8.0896, + "step": 291400 + }, + { + "epoch": 1.1875069381569607, + "grad_norm": 6.897592544555664, + "learning_rate": 0.0046451922672454075, + "loss": 8.0885, + "step": 291500 + }, + { + "epoch": 1.1879143161803423, + "grad_norm": 2.3824174404144287, + "learning_rate": 0.004644937234910144, + "loss": 8.1196, + "step": 291600 + }, + { + "epoch": 1.1883216942037236, + "grad_norm": 5.0155863761901855, + "learning_rate": 0.00464468211797171, + "loss": 8.0814, + "step": 291700 + }, + { + "epoch": 1.188729072227105, + "grad_norm": 1.3886222839355469, + "learning_rate": 0.0046444269164402, + "loss": 8.1123, + "step": 291800 + }, + { + "epoch": 1.1891364502504866, + "grad_norm": 4.824828624725342, + "learning_rate": 0.004644171630325706, + "loss": 8.1269, + "step": 291900 + }, + { + "epoch": 1.189543828273868, + "grad_norm": 1.4126006364822388, + "learning_rate": 0.004643916259638313, + "loss": 8.1427, + "step": 292000 + }, + { + "epoch": 1.189543828273868, + "eval_MaskedAccuracy": 0.48855525663268745, + "eval_loss": 1.6854230165481567, + "eval_runtime": 178.3745, + "eval_samples_per_second": 355.858, + "eval_steps_per_second": 1.39, + "step": 292000 + }, + { + "epoch": 1.1899512062972495, + "grad_norm": 2.1228461265563965, + "learning_rate": 0.004643660804388119, + "loss": 8.1371, + "step": 292100 + }, + { + "epoch": 1.190358584320631, + "grad_norm": 0.6336086988449097, + "learning_rate": 0.004643405264585224, + "loss": 8.1392, + "step": 292200 + }, + { + "epoch": 1.1907659623440123, + "grad_norm": 0.878674328327179, + "learning_rate": 0.004643149640239736, + "loss": 8.1752, + "step": 292300 + }, + { + "epoch": 1.1911733403673939, + "grad_norm": 5.786048889160156, + "learning_rate": 0.004642893931361753, + "loss": 8.1784, + "step": 292400 + }, + { + "epoch": 1.1915807183907754, + "grad_norm": 6.48675012588501, + "learning_rate": 0.004642638137961386, + "loss": 8.1513, + "step": 292500 + }, + { + "epoch": 1.1919880964141567, + "grad_norm": 4.116359233856201, + "learning_rate": 0.004642382260048754, + "loss": 8.1237, + "step": 292600 + }, + { + "epoch": 1.1923954744375382, + "grad_norm": 5.0022406578063965, + "learning_rate": 0.004642126297633973, + "loss": 8.0614, + "step": 292700 + }, + { + "epoch": 1.1928028524609198, + "grad_norm": 4.177118301391602, + "learning_rate": 0.004641870250727155, + "loss": 8.0497, + "step": 292800 + }, + { + "epoch": 1.193210230484301, + "grad_norm": 3.184532642364502, + "learning_rate": 0.004641614119338434, + "loss": 8.0297, + "step": 292900 + }, + { + "epoch": 1.1936176085076826, + "grad_norm": 3.5414209365844727, + "learning_rate": 0.004641357903477921, + "loss": 8.0677, + "step": 293000 + }, + { + "epoch": 1.1936176085076826, + "eval_MaskedAccuracy": 0.4917024225587292, + "eval_loss": 1.6840522289276123, + "eval_runtime": 525.4173, + "eval_samples_per_second": 120.811, + "eval_steps_per_second": 0.472, + "step": 293000 + }, + { + "epoch": 1.1940249865310641, + "grad_norm": 4.260696887969971, + "learning_rate": 0.004641101603155761, + "loss": 8.0335, + "step": 293100 + }, + { + "epoch": 1.1944323645544457, + "grad_norm": 6.748197078704834, + "learning_rate": 0.004640845218382074, + "loss": 8.0371, + "step": 293200 + }, + { + "epoch": 1.194839742577827, + "grad_norm": 8.22083568572998, + "learning_rate": 0.004640588749167006, + "loss": 8.0992, + "step": 293300 + }, + { + "epoch": 1.1952471206012085, + "grad_norm": 2.970147132873535, + "learning_rate": 0.004640332195520685, + "loss": 8.1274, + "step": 293400 + }, + { + "epoch": 1.1956544986245898, + "grad_norm": 1.9968116283416748, + "learning_rate": 0.0046400755574532646, + "loss": 8.083, + "step": 293500 + }, + { + "epoch": 1.1960618766479714, + "grad_norm": 3.9696598052978516, + "learning_rate": 0.004639818834974893, + "loss": 8.043, + "step": 293600 + }, + { + "epoch": 1.196469254671353, + "grad_norm": 3.1897449493408203, + "learning_rate": 0.004639562028095717, + "loss": 8.0187, + "step": 293700 + }, + { + "epoch": 1.1968766326947344, + "grad_norm": 2.5749704837799072, + "learning_rate": 0.004639305136825884, + "loss": 8.0338, + "step": 293800 + }, + { + "epoch": 1.1972840107181157, + "grad_norm": 5.313751697540283, + "learning_rate": 0.004639048161175557, + "loss": 8.0858, + "step": 293900 + }, + { + "epoch": 1.1976913887414973, + "grad_norm": 4.109214782714844, + "learning_rate": 0.004638791101154886, + "loss": 8.1647, + "step": 294000 + }, + { + "epoch": 1.1976913887414973, + "eval_MaskedAccuracy": 0.4850518393627376, + "eval_loss": 1.7173192501068115, + "eval_runtime": 186.1655, + "eval_samples_per_second": 340.965, + "eval_steps_per_second": 1.332, + "step": 294000 + }, + { + "epoch": 1.1980987667648788, + "grad_norm": 2.07083797454834, + "learning_rate": 0.004638533956774044, + "loss": 8.1601, + "step": 294100 + }, + { + "epoch": 1.1985061447882601, + "grad_norm": 0.9491326808929443, + "learning_rate": 0.004638276728043195, + "loss": 8.1223, + "step": 294200 + }, + { + "epoch": 1.1989135228116417, + "grad_norm": 1.581750750541687, + "learning_rate": 0.0046380194149725044, + "loss": 8.1301, + "step": 294300 + }, + { + "epoch": 1.1993209008350232, + "grad_norm": 3.208538293838501, + "learning_rate": 0.004637762017572147, + "loss": 8.1662, + "step": 294400 + }, + { + "epoch": 1.1997282788584045, + "grad_norm": 2.7052595615386963, + "learning_rate": 0.004637504535852303, + "loss": 8.0273, + "step": 294500 + }, + { + "epoch": 1.200135656881786, + "grad_norm": 8.370946884155273, + "learning_rate": 0.004637246969823157, + "loss": 8.1228, + "step": 294600 + }, + { + "epoch": 1.2005430349051676, + "grad_norm": 2.6555542945861816, + "learning_rate": 0.004636989319494877, + "loss": 8.1628, + "step": 294700 + }, + { + "epoch": 1.2009504129285489, + "grad_norm": 1.5750937461853027, + "learning_rate": 0.0046367315848776566, + "loss": 8.1815, + "step": 294800 + }, + { + "epoch": 1.2013577909519304, + "grad_norm": 4.36472749710083, + "learning_rate": 0.004636473765981692, + "loss": 8.1676, + "step": 294900 + }, + { + "epoch": 1.201765168975312, + "grad_norm": 1.2535489797592163, + "learning_rate": 0.00463621586281716, + "loss": 8.1392, + "step": 295000 + }, + { + "epoch": 1.201765168975312, + "eval_MaskedAccuracy": 0.48718782684736733, + "eval_loss": 1.709524154663086, + "eval_runtime": 187.0772, + "eval_samples_per_second": 339.304, + "eval_steps_per_second": 1.326, + "step": 295000 + }, + { + "epoch": 1.2021725469986932, + "grad_norm": 2.1562795639038086, + "learning_rate": 0.00463595787539427, + "loss": 8.1355, + "step": 295100 + }, + { + "epoch": 1.2025799250220748, + "grad_norm": 2.3468427658081055, + "learning_rate": 0.004635699803723219, + "loss": 8.107, + "step": 295200 + }, + { + "epoch": 1.2029873030454563, + "grad_norm": 1.1948546171188354, + "learning_rate": 0.004635441647814208, + "loss": 8.1078, + "step": 295300 + }, + { + "epoch": 1.2033946810688376, + "grad_norm": 5.46122407913208, + "learning_rate": 0.00463518340767744, + "loss": 8.1465, + "step": 295400 + }, + { + "epoch": 1.2038020590922192, + "grad_norm": 3.8204262256622314, + "learning_rate": 0.004634925083323132, + "loss": 8.0802, + "step": 295500 + }, + { + "epoch": 1.2042094371156007, + "grad_norm": 3.1001439094543457, + "learning_rate": 0.004634666674761497, + "loss": 8.0605, + "step": 295600 + }, + { + "epoch": 1.2046168151389822, + "grad_norm": 4.485762596130371, + "learning_rate": 0.004634408182002743, + "loss": 8.0497, + "step": 295700 + }, + { + "epoch": 1.2050241931623635, + "grad_norm": 0.7458304762840271, + "learning_rate": 0.004634149605057095, + "loss": 8.0634, + "step": 295800 + }, + { + "epoch": 1.205431571185745, + "grad_norm": 3.6596946716308594, + "learning_rate": 0.004633890943934777, + "loss": 8.125, + "step": 295900 + }, + { + "epoch": 1.2058389492091264, + "grad_norm": 1.1765960454940796, + "learning_rate": 0.004633632198646016, + "loss": 8.1403, + "step": 296000 + }, + { + "epoch": 1.2058389492091264, + "eval_MaskedAccuracy": 0.4867809973571029, + "eval_loss": 1.7117823362350464, + "eval_runtime": 165.8442, + "eval_samples_per_second": 382.745, + "eval_steps_per_second": 1.495, + "step": 296000 + }, + { + "epoch": 1.206246327232508, + "grad_norm": 1.9596821069717407, + "learning_rate": 0.004633373369201036, + "loss": 8.1842, + "step": 296100 + }, + { + "epoch": 1.2066537052558894, + "grad_norm": 2.8808438777923584, + "learning_rate": 0.004633114455610075, + "loss": 8.2099, + "step": 296200 + }, + { + "epoch": 1.207061083279271, + "grad_norm": 5.307520389556885, + "learning_rate": 0.004632855457883367, + "loss": 8.0813, + "step": 296300 + }, + { + "epoch": 1.2074684613026523, + "grad_norm": 0.6345957517623901, + "learning_rate": 0.004632596376031153, + "loss": 8.136, + "step": 296400 + }, + { + "epoch": 1.2078758393260338, + "grad_norm": 1.8907524347305298, + "learning_rate": 0.004632337210063677, + "loss": 8.1015, + "step": 296500 + }, + { + "epoch": 1.2082832173494154, + "grad_norm": 4.043900966644287, + "learning_rate": 0.00463207795999118, + "loss": 8.1081, + "step": 296600 + }, + { + "epoch": 1.2086905953727967, + "grad_norm": 5.490545749664307, + "learning_rate": 0.004631818625823919, + "loss": 8.1222, + "step": 296700 + }, + { + "epoch": 1.2090979733961782, + "grad_norm": 4.196577548980713, + "learning_rate": 0.004631559207572141, + "loss": 8.0542, + "step": 296800 + }, + { + "epoch": 1.2095053514195597, + "grad_norm": 3.866391658782959, + "learning_rate": 0.004631299705246102, + "loss": 8.0695, + "step": 296900 + }, + { + "epoch": 1.209912729442941, + "grad_norm": 0.752380907535553, + "learning_rate": 0.0046310401188560685, + "loss": 8.0092, + "step": 297000 + }, + { + "epoch": 1.209912729442941, + "eval_MaskedAccuracy": 0.4891799310806632, + "eval_loss": 1.6924179792404175, + "eval_runtime": 171.3721, + "eval_samples_per_second": 370.399, + "eval_steps_per_second": 1.447, + "step": 297000 + }, + { + "epoch": 1.2103201074663226, + "grad_norm": 3.039321184158325, + "learning_rate": 0.004630780448412297, + "loss": 8.1151, + "step": 297100 + }, + { + "epoch": 1.210727485489704, + "grad_norm": 1.9718246459960938, + "learning_rate": 0.004630520693925053, + "loss": 8.1389, + "step": 297200 + }, + { + "epoch": 1.2111348635130854, + "grad_norm": 0.9533228278160095, + "learning_rate": 0.004630260855404601, + "loss": 8.139, + "step": 297300 + }, + { + "epoch": 1.211542241536467, + "grad_norm": 3.9541590213775635, + "learning_rate": 0.004630000932861223, + "loss": 8.16, + "step": 297400 + }, + { + "epoch": 1.2119496195598485, + "grad_norm": 2.471520185470581, + "learning_rate": 0.004629740926305191, + "loss": 8.179, + "step": 297500 + }, + { + "epoch": 1.2123569975832298, + "grad_norm": 4.447343826293945, + "learning_rate": 0.00462948083574679, + "loss": 8.0903, + "step": 297600 + }, + { + "epoch": 1.2127643756066113, + "grad_norm": 4.4200758934021, + "learning_rate": 0.004629220661196297, + "loss": 8.0701, + "step": 297700 + }, + { + "epoch": 1.2131717536299929, + "grad_norm": 3.8681282997131348, + "learning_rate": 0.004628960402664008, + "loss": 8.0591, + "step": 297800 + }, + { + "epoch": 1.2135791316533742, + "grad_norm": 4.3446197509765625, + "learning_rate": 0.004628700060160198, + "loss": 8.0233, + "step": 297900 + }, + { + "epoch": 1.2139865096767557, + "grad_norm": 8.378327369689941, + "learning_rate": 0.004628439633695168, + "loss": 8.0572, + "step": 298000 + }, + { + "epoch": 1.2139865096767557, + "eval_MaskedAccuracy": 0.4878226944878192, + "eval_loss": 1.7110786437988281, + "eval_runtime": 226.902, + "eval_samples_per_second": 279.751, + "eval_steps_per_second": 1.093, + "step": 298000 + }, + { + "epoch": 1.2143938877001372, + "grad_norm": 3.936922311782837, + "learning_rate": 0.004628179123279213, + "loss": 8.0832, + "step": 298100 + }, + { + "epoch": 1.2148012657235188, + "grad_norm": 3.7519657611846924, + "learning_rate": 0.004627918528922626, + "loss": 8.1479, + "step": 298200 + }, + { + "epoch": 1.2152086437469, + "grad_norm": 2.771986246109009, + "learning_rate": 0.004627657850635724, + "loss": 8.18, + "step": 298300 + }, + { + "epoch": 1.2156160217702816, + "grad_norm": 1.1955420970916748, + "learning_rate": 0.0046273970884288034, + "loss": 8.1782, + "step": 298400 + }, + { + "epoch": 1.216023399793663, + "grad_norm": 2.453226089477539, + "learning_rate": 0.004627136242312175, + "loss": 8.1279, + "step": 298500 + }, + { + "epoch": 1.2164307778170445, + "grad_norm": 2.992370367050171, + "learning_rate": 0.0046268753122961515, + "loss": 8.0821, + "step": 298600 + }, + { + "epoch": 1.216838155840426, + "grad_norm": 5.242353439331055, + "learning_rate": 0.004626614298391044, + "loss": 8.1247, + "step": 298700 + }, + { + "epoch": 1.2172455338638075, + "grad_norm": 1.5426554679870605, + "learning_rate": 0.00462635320060717, + "loss": 8.0914, + "step": 298800 + }, + { + "epoch": 1.2176529118871888, + "grad_norm": 5.316330909729004, + "learning_rate": 0.004626092018954864, + "loss": 8.051, + "step": 298900 + }, + { + "epoch": 1.2180602899105704, + "grad_norm": 3.0452542304992676, + "learning_rate": 0.004625830753444443, + "loss": 8.0352, + "step": 299000 + }, + { + "epoch": 1.2180602899105704, + "eval_MaskedAccuracy": 0.49069970591165174, + "eval_loss": 1.6971735954284668, + "eval_runtime": 168.9447, + "eval_samples_per_second": 375.721, + "eval_steps_per_second": 1.468, + "step": 299000 + }, + { + "epoch": 1.218467667933952, + "grad_norm": 3.606816053390503, + "learning_rate": 0.004625569404086243, + "loss": 8.0061, + "step": 299100 + }, + { + "epoch": 1.2188750459573332, + "grad_norm": 4.931297302246094, + "learning_rate": 0.00462530797089059, + "loss": 8.0126, + "step": 299200 + }, + { + "epoch": 1.2192824239807147, + "grad_norm": 5.3326873779296875, + "learning_rate": 0.0046250464538678225, + "loss": 7.9827, + "step": 299300 + }, + { + "epoch": 1.2196898020040963, + "grad_norm": 14.773650169372559, + "learning_rate": 0.0046247848530282815, + "loss": 8.1116, + "step": 299400 + }, + { + "epoch": 1.2200971800274776, + "grad_norm": 4.419100761413574, + "learning_rate": 0.004624523168382312, + "loss": 8.1343, + "step": 299500 + }, + { + "epoch": 1.2205045580508591, + "grad_norm": 2.470949649810791, + "learning_rate": 0.0046242613999402556, + "loss": 8.1014, + "step": 299600 + }, + { + "epoch": 1.2209119360742406, + "grad_norm": 1.0886467695236206, + "learning_rate": 0.004623999547712456, + "loss": 8.0791, + "step": 299700 + }, + { + "epoch": 1.221319314097622, + "grad_norm": 1.0229309797286987, + "learning_rate": 0.0046237376117092745, + "loss": 8.1222, + "step": 299800 + }, + { + "epoch": 1.2217266921210035, + "grad_norm": 3.866816759109497, + "learning_rate": 0.0046234755919410545, + "loss": 8.1587, + "step": 299900 + }, + { + "epoch": 1.222134070144385, + "grad_norm": 5.334160327911377, + "learning_rate": 0.00462321348841817, + "loss": 8.0816, + "step": 300000 + }, + { + "epoch": 1.222134070144385, + "eval_MaskedAccuracy": 0.4896748310121843, + "eval_loss": 1.706262230873108, + "eval_runtime": 195.5812, + "eval_samples_per_second": 324.551, + "eval_steps_per_second": 1.268, + "step": 300000 + }, + { + "epoch": 1.2225414481677663, + "grad_norm": 3.508280038833618, + "learning_rate": 0.004622951301150977, + "loss": 8.0479, + "step": 300100 + }, + { + "epoch": 1.2229488261911479, + "grad_norm": 3.168687343597412, + "learning_rate": 0.004622689030149839, + "loss": 8.0469, + "step": 300200 + }, + { + "epoch": 1.2233562042145294, + "grad_norm": 2.397977352142334, + "learning_rate": 0.004622426675425136, + "loss": 8.1201, + "step": 300300 + }, + { + "epoch": 1.2237635822379107, + "grad_norm": 2.019606113433838, + "learning_rate": 0.004622164236987225, + "loss": 8.1686, + "step": 300400 + }, + { + "epoch": 1.2241709602612922, + "grad_norm": 2.7145845890045166, + "learning_rate": 0.004621901714846492, + "loss": 8.0884, + "step": 300500 + }, + { + "epoch": 1.2245783382846738, + "grad_norm": 2.8610126972198486, + "learning_rate": 0.004621639109013311, + "loss": 8.0361, + "step": 300600 + }, + { + "epoch": 1.2249857163080553, + "grad_norm": 4.2266435623168945, + "learning_rate": 0.004621376419498069, + "loss": 8.0563, + "step": 300700 + }, + { + "epoch": 1.2253930943314366, + "grad_norm": 4.127247333526611, + "learning_rate": 0.004621113646311145, + "loss": 8.0616, + "step": 300800 + }, + { + "epoch": 1.2258004723548181, + "grad_norm": 2.9968063831329346, + "learning_rate": 0.0046208507894629275, + "loss": 8.0561, + "step": 300900 + }, + { + "epoch": 1.2262078503781995, + "grad_norm": 5.844603538513184, + "learning_rate": 0.004620587848963815, + "loss": 8.0466, + "step": 301000 + }, + { + "epoch": 1.2262078503781995, + "eval_MaskedAccuracy": 0.4911941245461271, + "eval_loss": 1.6918003559112549, + "eval_runtime": 154.9656, + "eval_samples_per_second": 409.613, + "eval_steps_per_second": 1.6, + "step": 301000 + }, + { + "epoch": 1.226615228401581, + "grad_norm": 4.7555060386657715, + "learning_rate": 0.004620324824824195, + "loss": 8.0266, + "step": 301100 + }, + { + "epoch": 1.2270226064249625, + "grad_norm": 3.523132801055908, + "learning_rate": 0.00462006171705448, + "loss": 8.0484, + "step": 301200 + }, + { + "epoch": 1.227429984448344, + "grad_norm": 1.8597171306610107, + "learning_rate": 0.004619798525665065, + "loss": 8.0113, + "step": 301300 + }, + { + "epoch": 1.2278373624717254, + "grad_norm": 8.603140830993652, + "learning_rate": 0.004619535250666346, + "loss": 8.1079, + "step": 301400 + }, + { + "epoch": 1.228244740495107, + "grad_norm": 6.364404201507568, + "learning_rate": 0.004619271892068739, + "loss": 8.1175, + "step": 301500 + }, + { + "epoch": 1.2286521185184884, + "grad_norm": 2.263643503189087, + "learning_rate": 0.0046190084498826545, + "loss": 8.1233, + "step": 301600 + }, + { + "epoch": 1.2290594965418697, + "grad_norm": 0.9943809509277344, + "learning_rate": 0.004618744924118511, + "loss": 8.11, + "step": 301700 + }, + { + "epoch": 1.2294668745652513, + "grad_norm": 1.856001853942871, + "learning_rate": 0.004618481314786725, + "loss": 8.131, + "step": 301800 + }, + { + "epoch": 1.2298742525886328, + "grad_norm": 3.258477210998535, + "learning_rate": 0.004618217621897724, + "loss": 8.1214, + "step": 301900 + }, + { + "epoch": 1.2302816306120141, + "grad_norm": 1.1988614797592163, + "learning_rate": 0.0046179538454619255, + "loss": 8.0814, + "step": 302000 + }, + { + "epoch": 1.2302816306120141, + "eval_MaskedAccuracy": 0.48597322669550147, + "eval_loss": 1.714728593826294, + "eval_runtime": 179.9806, + "eval_samples_per_second": 352.682, + "eval_steps_per_second": 1.378, + "step": 302000 + }, + { + "epoch": 1.2306890086353957, + "grad_norm": 2.050732374191284, + "learning_rate": 0.00461768998548976, + "loss": 8.108, + "step": 302100 + }, + { + "epoch": 1.2310963866587772, + "grad_norm": 3.5282022953033447, + "learning_rate": 0.00461742604199166, + "loss": 8.0959, + "step": 302200 + }, + { + "epoch": 1.2315037646821585, + "grad_norm": 3.6474194526672363, + "learning_rate": 0.004617162014978059, + "loss": 8.0581, + "step": 302300 + }, + { + "epoch": 1.23191114270554, + "grad_norm": 5.273232460021973, + "learning_rate": 0.004616897904459396, + "loss": 8.0409, + "step": 302400 + }, + { + "epoch": 1.2323185207289216, + "grad_norm": 5.792709827423096, + "learning_rate": 0.0046166337104461205, + "loss": 8.0111, + "step": 302500 + }, + { + "epoch": 1.2327258987523029, + "grad_norm": 4.636826038360596, + "learning_rate": 0.004616369432948664, + "loss": 8.083, + "step": 302600 + }, + { + "epoch": 1.2331332767756844, + "grad_norm": 2.228703022003174, + "learning_rate": 0.004616105071977488, + "loss": 8.0733, + "step": 302700 + }, + { + "epoch": 1.233540654799066, + "grad_norm": 2.250886917114258, + "learning_rate": 0.004615840627543034, + "loss": 8.1061, + "step": 302800 + }, + { + "epoch": 1.2339480328224472, + "grad_norm": 2.698251962661743, + "learning_rate": 0.004615576099655761, + "loss": 8.1026, + "step": 302900 + }, + { + "epoch": 1.2343554108458288, + "grad_norm": 3.854100227355957, + "learning_rate": 0.004615311488326126, + "loss": 8.0714, + "step": 303000 + }, + { + "epoch": 1.2343554108458288, + "eval_MaskedAccuracy": 0.4881060796935515, + "eval_loss": 1.6926175355911255, + "eval_runtime": 155.3429, + "eval_samples_per_second": 408.619, + "eval_steps_per_second": 1.596, + "step": 303000 + }, + { + "epoch": 1.2347627888692103, + "grad_norm": 2.0090067386627197, + "learning_rate": 0.004615046793564592, + "loss": 8.0752, + "step": 303100 + }, + { + "epoch": 1.2351701668925918, + "grad_norm": 2.734537363052368, + "learning_rate": 0.004614782015381621, + "loss": 8.0923, + "step": 303200 + }, + { + "epoch": 1.2355775449159732, + "grad_norm": 15.769610404968262, + "learning_rate": 0.0046145171537876825, + "loss": 8.119, + "step": 303300 + }, + { + "epoch": 1.2359849229393547, + "grad_norm": 3.1276133060455322, + "learning_rate": 0.00461425220879325, + "loss": 8.14, + "step": 303400 + }, + { + "epoch": 1.236392300962736, + "grad_norm": 5.444284915924072, + "learning_rate": 0.004613987180408806, + "loss": 8.1267, + "step": 303500 + }, + { + "epoch": 1.2367996789861175, + "grad_norm": 5.557637691497803, + "learning_rate": 0.0046137220686448075, + "loss": 8.1384, + "step": 303600 + }, + { + "epoch": 1.237207057009499, + "grad_norm": 1.9772955179214478, + "learning_rate": 0.004613456873511747, + "loss": 8.0422, + "step": 303700 + }, + { + "epoch": 1.2376144350328806, + "grad_norm": 3.7055411338806152, + "learning_rate": 0.004613191595020113, + "loss": 8.1321, + "step": 303800 + }, + { + "epoch": 1.238021813056262, + "grad_norm": 3.4126334190368652, + "learning_rate": 0.004612926233180385, + "loss": 8.1559, + "step": 303900 + }, + { + "epoch": 1.2384291910796434, + "grad_norm": 4.730792045593262, + "learning_rate": 0.004612660788003063, + "loss": 8.0798, + "step": 304000 + }, + { + "epoch": 1.2384291910796434, + "eval_MaskedAccuracy": 0.48978617182119905, + "eval_loss": 1.7045177221298218, + "eval_runtime": 193.263, + "eval_samples_per_second": 328.444, + "eval_steps_per_second": 1.283, + "step": 304000 + }, + { + "epoch": 1.238836569103025, + "grad_norm": 1.8504356145858765, + "learning_rate": 0.004612395259498635, + "loss": 8.0589, + "step": 304100 + }, + { + "epoch": 1.2392439471264063, + "grad_norm": 3.34515118598938, + "learning_rate": 0.004612129647677609, + "loss": 8.1066, + "step": 304200 + }, + { + "epoch": 1.2396513251497878, + "grad_norm": 1.174383282661438, + "learning_rate": 0.004611863952550479, + "loss": 8.0642, + "step": 304300 + }, + { + "epoch": 1.2400587031731694, + "grad_norm": 5.638028621673584, + "learning_rate": 0.004611598174127743, + "loss": 8.1238, + "step": 304400 + }, + { + "epoch": 1.2404660811965507, + "grad_norm": 2.1906986236572266, + "learning_rate": 0.004611332312419916, + "loss": 8.1202, + "step": 304500 + }, + { + "epoch": 1.2408734592199322, + "grad_norm": 3.808988332748413, + "learning_rate": 0.004611066367437503, + "loss": 8.1227, + "step": 304600 + }, + { + "epoch": 1.2412808372433137, + "grad_norm": 1.7970826625823975, + "learning_rate": 0.004610800339191022, + "loss": 8.0841, + "step": 304700 + }, + { + "epoch": 1.241688215266695, + "grad_norm": 1.9682625532150269, + "learning_rate": 0.004610534227690988, + "loss": 8.1265, + "step": 304800 + }, + { + "epoch": 1.2420955932900766, + "grad_norm": 5.152673244476318, + "learning_rate": 0.004610268032947929, + "loss": 8.0917, + "step": 304900 + }, + { + "epoch": 1.242502971313458, + "grad_norm": 0.8323341012001038, + "learning_rate": 0.004610001754972367, + "loss": 8.133, + "step": 305000 + }, + { + "epoch": 1.242502971313458, + "eval_MaskedAccuracy": 0.48686285156172704, + "eval_loss": 1.7085505723953247, + "eval_runtime": 161.2778, + "eval_samples_per_second": 393.582, + "eval_steps_per_second": 1.538, + "step": 305000 + }, + { + "epoch": 1.2429103493368394, + "grad_norm": 2.9382216930389404, + "learning_rate": 0.004609735393774827, + "loss": 8.1191, + "step": 305100 + }, + { + "epoch": 1.243317727360221, + "grad_norm": 2.5187878608703613, + "learning_rate": 0.004609468949365838, + "loss": 8.1362, + "step": 305200 + }, + { + "epoch": 1.2437251053836025, + "grad_norm": 4.52233362197876, + "learning_rate": 0.00460920242175594, + "loss": 8.1266, + "step": 305300 + }, + { + "epoch": 1.2441324834069838, + "grad_norm": 5.111374855041504, + "learning_rate": 0.004608935810955663, + "loss": 8.1116, + "step": 305400 + }, + { + "epoch": 1.2445398614303653, + "grad_norm": 1.2241135835647583, + "learning_rate": 0.004608669116975555, + "loss": 8.0831, + "step": 305500 + }, + { + "epoch": 1.2449472394537469, + "grad_norm": 2.461332321166992, + "learning_rate": 0.004608402339826152, + "loss": 8.0891, + "step": 305600 + }, + { + "epoch": 1.2453546174771284, + "grad_norm": 6.2857208251953125, + "learning_rate": 0.0046081354795180045, + "loss": 8.0607, + "step": 305700 + }, + { + "epoch": 1.2457619955005097, + "grad_norm": 1.3130388259887695, + "learning_rate": 0.004607868536061661, + "loss": 8.1015, + "step": 305800 + }, + { + "epoch": 1.2461693735238912, + "grad_norm": 1.7209711074829102, + "learning_rate": 0.00460760150946768, + "loss": 8.0705, + "step": 305900 + }, + { + "epoch": 1.2465767515472725, + "grad_norm": 3.74379301071167, + "learning_rate": 0.00460733439974661, + "loss": 8.1352, + "step": 306000 + }, + { + "epoch": 1.2465767515472725, + "eval_MaskedAccuracy": 0.4850375812619579, + "eval_loss": 1.7195402383804321, + "eval_runtime": 164.8649, + "eval_samples_per_second": 385.018, + "eval_steps_per_second": 1.504, + "step": 306000 + }, + { + "epoch": 1.246984129570654, + "grad_norm": 3.218581199645996, + "learning_rate": 0.004607067206909027, + "loss": 8.1156, + "step": 306100 + }, + { + "epoch": 1.2473915075940356, + "grad_norm": 2.4101579189300537, + "learning_rate": 0.004606799930965481, + "loss": 8.093, + "step": 306200 + }, + { + "epoch": 1.2477988856174171, + "grad_norm": 3.4861109256744385, + "learning_rate": 0.004606532571926543, + "loss": 8.1189, + "step": 306300 + }, + { + "epoch": 1.2482062636407985, + "grad_norm": 4.77168083190918, + "learning_rate": 0.004606265129802781, + "loss": 8.1026, + "step": 306400 + }, + { + "epoch": 1.24861364166418, + "grad_norm": 0.6300526261329651, + "learning_rate": 0.004605997604604765, + "loss": 8.1177, + "step": 306500 + }, + { + "epoch": 1.2490210196875615, + "grad_norm": 1.2264113426208496, + "learning_rate": 0.0046057299963430855, + "loss": 8.1051, + "step": 306600 + }, + { + "epoch": 1.2494283977109428, + "grad_norm": 2.0417797565460205, + "learning_rate": 0.0046054623050283084, + "loss": 8.0656, + "step": 306700 + }, + { + "epoch": 1.2498357757343244, + "grad_norm": 2.3916094303131104, + "learning_rate": 0.0046051945306710305, + "loss": 8.1169, + "step": 306800 + }, + { + "epoch": 1.250243153757706, + "grad_norm": 6.253011226654053, + "learning_rate": 0.004604926673281828, + "loss": 8.0892, + "step": 306900 + }, + { + "epoch": 1.2506505317810872, + "grad_norm": 2.1163041591644287, + "learning_rate": 0.004604658732871291, + "loss": 8.1115, + "step": 307000 + }, + { + "epoch": 1.2506505317810872, + "eval_MaskedAccuracy": 0.48697341643848935, + "eval_loss": 1.7105181217193604, + "eval_runtime": 196.2697, + "eval_samples_per_second": 323.412, + "eval_steps_per_second": 1.264, + "step": 307000 + }, + { + "epoch": 1.2510579098044687, + "grad_norm": 2.6035571098327637, + "learning_rate": 0.004604390709450008, + "loss": 8.1378, + "step": 307100 + }, + { + "epoch": 1.2514652878278503, + "grad_norm": 5.0959248542785645, + "learning_rate": 0.004604122603028583, + "loss": 8.134, + "step": 307200 + }, + { + "epoch": 1.2518726658512316, + "grad_norm": 2.674523115158081, + "learning_rate": 0.004603854413617617, + "loss": 8.0982, + "step": 307300 + }, + { + "epoch": 1.2522800438746131, + "grad_norm": 1.7357358932495117, + "learning_rate": 0.00460358614122771, + "loss": 8.0884, + "step": 307400 + }, + { + "epoch": 1.2526874218979946, + "grad_norm": 4.28224515914917, + "learning_rate": 0.00460331778586947, + "loss": 8.1212, + "step": 307500 + }, + { + "epoch": 1.2530947999213762, + "grad_norm": 3.870980978012085, + "learning_rate": 0.004603049347553501, + "loss": 8.1202, + "step": 307600 + }, + { + "epoch": 1.2535021779447575, + "grad_norm": 2.9729363918304443, + "learning_rate": 0.0046027808262904275, + "loss": 8.109, + "step": 307700 + }, + { + "epoch": 1.253909555968139, + "grad_norm": 3.985447645187378, + "learning_rate": 0.004602512222090859, + "loss": 8.0672, + "step": 307800 + }, + { + "epoch": 1.2543169339915203, + "grad_norm": 7.9126505851745605, + "learning_rate": 0.0046022435349654086, + "loss": 8.0686, + "step": 307900 + }, + { + "epoch": 1.2547243120149019, + "grad_norm": 3.8987011909484863, + "learning_rate": 0.00460197476492471, + "loss": 8.1029, + "step": 308000 + }, + { + "epoch": 1.2547243120149019, + "eval_MaskedAccuracy": 0.4900233862981613, + "eval_loss": 1.7020424604415894, + "eval_runtime": 187.7355, + "eval_samples_per_second": 338.114, + "eval_steps_per_second": 1.321, + "step": 308000 + }, + { + "epoch": 1.2551316900382834, + "grad_norm": 3.613825559616089, + "learning_rate": 0.004601705911979381, + "loss": 8.0801, + "step": 308100 + }, + { + "epoch": 1.255539068061665, + "grad_norm": 1.4558838605880737, + "learning_rate": 0.004601436976140047, + "loss": 8.1215, + "step": 308200 + }, + { + "epoch": 1.2559464460850462, + "grad_norm": 7.969110488891602, + "learning_rate": 0.004601167957417355, + "loss": 8.1459, + "step": 308300 + }, + { + "epoch": 1.2563538241084278, + "grad_norm": 5.945659160614014, + "learning_rate": 0.004600898855821931, + "loss": 8.108, + "step": 308400 + }, + { + "epoch": 1.256761202131809, + "grad_norm": 2.5267322063446045, + "learning_rate": 0.004600629671364405, + "loss": 8.0766, + "step": 308500 + }, + { + "epoch": 1.2571685801551906, + "grad_norm": 2.464092493057251, + "learning_rate": 0.0046003604040554405, + "loss": 8.1208, + "step": 308600 + }, + { + "epoch": 1.2575759581785722, + "grad_norm": 3.6171298027038574, + "learning_rate": 0.004600091053905665, + "loss": 8.1159, + "step": 308700 + }, + { + "epoch": 1.2579833362019537, + "grad_norm": 3.425246477127075, + "learning_rate": 0.004599821620925738, + "loss": 8.1051, + "step": 308800 + }, + { + "epoch": 1.258390714225335, + "grad_norm": 2.6326539516448975, + "learning_rate": 0.0045995521051263154, + "loss": 8.0713, + "step": 308900 + }, + { + "epoch": 1.2587980922487165, + "grad_norm": 3.3350260257720947, + "learning_rate": 0.004599282506518043, + "loss": 8.0321, + "step": 309000 + }, + { + "epoch": 1.2587980922487165, + "eval_MaskedAccuracy": 0.4913877225593287, + "eval_loss": 1.684166431427002, + "eval_runtime": 175.0026, + "eval_samples_per_second": 362.715, + "eval_steps_per_second": 1.417, + "step": 309000 + }, + { + "epoch": 1.2592054702720978, + "grad_norm": 4.280453681945801, + "learning_rate": 0.004599012825111581, + "loss": 8.055, + "step": 309100 + }, + { + "epoch": 1.2596128482954794, + "grad_norm": 2.7680320739746094, + "learning_rate": 0.004598743060917593, + "loss": 8.1195, + "step": 309200 + }, + { + "epoch": 1.260020226318861, + "grad_norm": 1.9460463523864746, + "learning_rate": 0.004598473213946741, + "loss": 8.0987, + "step": 309300 + }, + { + "epoch": 1.2604276043422424, + "grad_norm": 1.9638036489486694, + "learning_rate": 0.0045982032842097, + "loss": 8.0552, + "step": 309400 + }, + { + "epoch": 1.2608349823656237, + "grad_norm": 3.210782289505005, + "learning_rate": 0.004597933271717138, + "loss": 8.0821, + "step": 309500 + }, + { + "epoch": 1.2612423603890053, + "grad_norm": 4.542831897735596, + "learning_rate": 0.004597663176479727, + "loss": 8.0303, + "step": 309600 + }, + { + "epoch": 1.2616497384123868, + "grad_norm": 4.670233726501465, + "learning_rate": 0.004597392998508145, + "loss": 8.0116, + "step": 309700 + }, + { + "epoch": 1.2620571164357681, + "grad_norm": 2.5082669258117676, + "learning_rate": 0.00459712273781308, + "loss": 8.0405, + "step": 309800 + }, + { + "epoch": 1.2624644944591497, + "grad_norm": 2.2756545543670654, + "learning_rate": 0.004596852394405212, + "loss": 7.9797, + "step": 309900 + }, + { + "epoch": 1.2628718724825312, + "grad_norm": 2.471325159072876, + "learning_rate": 0.004596581968295235, + "loss": 8.0666, + "step": 310000 + }, + { + "epoch": 1.2628718724825312, + "eval_MaskedAccuracy": 0.4880205762552702, + "eval_loss": 1.6943151950836182, + "eval_runtime": 160.5475, + "eval_samples_per_second": 395.372, + "eval_steps_per_second": 1.545, + "step": 310000 + }, + { + "epoch": 1.2632792505059127, + "grad_norm": 2.194175958633423, + "learning_rate": 0.004596311459493836, + "loss": 8.0875, + "step": 310100 + }, + { + "epoch": 1.263686628529294, + "grad_norm": 2.4820425510406494, + "learning_rate": 0.00459604086801171, + "loss": 8.1294, + "step": 310200 + }, + { + "epoch": 1.2640940065526756, + "grad_norm": 0.9932268857955933, + "learning_rate": 0.004595770193859551, + "loss": 8.0846, + "step": 310300 + }, + { + "epoch": 1.2645013845760569, + "grad_norm": 5.9561381340026855, + "learning_rate": 0.004595499437048069, + "loss": 8.145, + "step": 310400 + }, + { + "epoch": 1.2649087625994384, + "grad_norm": 4.604516983032227, + "learning_rate": 0.0045952285975879555, + "loss": 8.098, + "step": 310500 + }, + { + "epoch": 1.26531614062282, + "grad_norm": 4.713510513305664, + "learning_rate": 0.004594957675489932, + "loss": 8.1127, + "step": 310600 + }, + { + "epoch": 1.2657235186462015, + "grad_norm": 0.9178376793861389, + "learning_rate": 0.004594686670764699, + "loss": 8.1282, + "step": 310700 + }, + { + "epoch": 1.2661308966695828, + "grad_norm": 1.6705297231674194, + "learning_rate": 0.004594415583422981, + "loss": 8.1677, + "step": 310800 + }, + { + "epoch": 1.2665382746929643, + "grad_norm": 5.615497589111328, + "learning_rate": 0.004594144413475491, + "loss": 8.1144, + "step": 310900 + }, + { + "epoch": 1.2669456527163456, + "grad_norm": 3.187138319015503, + "learning_rate": 0.0045938731609329415, + "loss": 8.0699, + "step": 311000 + }, + { + "epoch": 1.2669456527163456, + "eval_MaskedAccuracy": 0.4898853434538884, + "eval_loss": 1.6914721727371216, + "eval_runtime": 168.9928, + "eval_samples_per_second": 375.614, + "eval_steps_per_second": 1.468, + "step": 311000 + }, + { + "epoch": 1.2673530307397272, + "grad_norm": 3.255194902420044, + "learning_rate": 0.004593601825806063, + "loss": 8.0868, + "step": 311100 + }, + { + "epoch": 1.2677604087631087, + "grad_norm": 4.374510288238525, + "learning_rate": 0.004593330408105596, + "loss": 8.1538, + "step": 311200 + }, + { + "epoch": 1.2681677867864902, + "grad_norm": 3.63806414604187, + "learning_rate": 0.004593058907842255, + "loss": 8.1031, + "step": 311300 + }, + { + "epoch": 1.2685751648098715, + "grad_norm": 2.7120907306671143, + "learning_rate": 0.0045927873250267755, + "loss": 8.0774, + "step": 311400 + }, + { + "epoch": 1.268982542833253, + "grad_norm": 4.321403503417969, + "learning_rate": 0.004592515659669892, + "loss": 8.0922, + "step": 311500 + }, + { + "epoch": 1.2693899208566344, + "grad_norm": 2.5028560161590576, + "learning_rate": 0.004592243911782355, + "loss": 8.1523, + "step": 311600 + }, + { + "epoch": 1.269797298880016, + "grad_norm": 3.913832426071167, + "learning_rate": 0.004591972081374904, + "loss": 8.0677, + "step": 311700 + }, + { + "epoch": 1.2702046769033974, + "grad_norm": 2.202146530151367, + "learning_rate": 0.004591700168458283, + "loss": 8.0916, + "step": 311800 + }, + { + "epoch": 1.270612054926779, + "grad_norm": 1.159952163696289, + "learning_rate": 0.004591428173043244, + "loss": 8.0811, + "step": 311900 + }, + { + "epoch": 1.2710194329501603, + "grad_norm": 2.1494171619415283, + "learning_rate": 0.0045911560951405386, + "loss": 8.1072, + "step": 312000 + }, + { + "epoch": 1.2710194329501603, + "eval_MaskedAccuracy": 0.48879268615180604, + "eval_loss": 1.6904964447021484, + "eval_runtime": 164.6769, + "eval_samples_per_second": 385.458, + "eval_steps_per_second": 1.506, + "step": 312000 + }, + { + "epoch": 1.2714268109735418, + "grad_norm": 3.1282689571380615, + "learning_rate": 0.004590883934760925, + "loss": 8.0268, + "step": 312100 + }, + { + "epoch": 1.2718341889969234, + "grad_norm": 2.633990526199341, + "learning_rate": 0.004590611691915167, + "loss": 8.0739, + "step": 312200 + }, + { + "epoch": 1.2722415670203047, + "grad_norm": 2.560723066329956, + "learning_rate": 0.004590339366614029, + "loss": 8.0921, + "step": 312300 + }, + { + "epoch": 1.2726489450436862, + "grad_norm": 5.123629093170166, + "learning_rate": 0.004590066958868263, + "loss": 8.0859, + "step": 312400 + }, + { + "epoch": 1.2730563230670677, + "grad_norm": 5.334412097930908, + "learning_rate": 0.004589794468688652, + "loss": 8.0871, + "step": 312500 + }, + { + "epoch": 1.2734637010904493, + "grad_norm": 4.66272497177124, + "learning_rate": 0.004589521896085961, + "loss": 8.0924, + "step": 312600 + }, + { + "epoch": 1.2738710791138306, + "grad_norm": 2.968400716781616, + "learning_rate": 0.004589249241070973, + "loss": 8.0869, + "step": 312700 + }, + { + "epoch": 1.274278457137212, + "grad_norm": 4.876562118530273, + "learning_rate": 0.004588976503654471, + "loss": 8.03, + "step": 312800 + }, + { + "epoch": 1.2746858351605934, + "grad_norm": 3.0390806198120117, + "learning_rate": 0.004588703683847226, + "loss": 8.0704, + "step": 312900 + }, + { + "epoch": 1.275093213183975, + "grad_norm": 0.9052464962005615, + "learning_rate": 0.004588430781660034, + "loss": 8.0532, + "step": 313000 + }, + { + "epoch": 1.275093213183975, + "eval_MaskedAccuracy": 0.4882918460567083, + "eval_loss": 1.7067179679870605, + "eval_runtime": 183.803, + "eval_samples_per_second": 345.348, + "eval_steps_per_second": 1.349, + "step": 313000 + }, + { + "epoch": 1.2755005912073565, + "grad_norm": 1.3546417951583862, + "learning_rate": 0.004588157797103682, + "loss": 8.0895, + "step": 313100 + }, + { + "epoch": 1.275907969230738, + "grad_norm": 1.6751666069030762, + "learning_rate": 0.004587884730188957, + "loss": 8.0547, + "step": 313200 + }, + { + "epoch": 1.2763153472541193, + "grad_norm": 3.803537130355835, + "learning_rate": 0.004587611580926658, + "loss": 8.1219, + "step": 313300 + }, + { + "epoch": 1.2767227252775009, + "grad_norm": 1.501133680343628, + "learning_rate": 0.0045873383493275815, + "loss": 8.0677, + "step": 313400 + }, + { + "epoch": 1.2771301033008822, + "grad_norm": 2.1493144035339355, + "learning_rate": 0.004587065035402535, + "loss": 8.0835, + "step": 313500 + }, + { + "epoch": 1.2775374813242637, + "grad_norm": 4.274430751800537, + "learning_rate": 0.004586791639162321, + "loss": 8.0656, + "step": 313600 + }, + { + "epoch": 1.2779448593476452, + "grad_norm": 4.489757061004639, + "learning_rate": 0.0045865181606177495, + "loss": 8.0331, + "step": 313700 + }, + { + "epoch": 1.2783522373710268, + "grad_norm": 4.5486016273498535, + "learning_rate": 0.004586244599779637, + "loss": 7.9875, + "step": 313800 + }, + { + "epoch": 1.278759615394408, + "grad_norm": 3.658928394317627, + "learning_rate": 0.004585970956658794, + "loss": 8.0497, + "step": 313900 + }, + { + "epoch": 1.2791669934177896, + "grad_norm": 1.7318885326385498, + "learning_rate": 0.004585697231266038, + "loss": 8.0245, + "step": 314000 + }, + { + "epoch": 1.2791669934177896, + "eval_MaskedAccuracy": 0.4921710905561849, + "eval_loss": 1.6829789876937866, + "eval_runtime": 178.2729, + "eval_samples_per_second": 356.061, + "eval_steps_per_second": 1.391, + "step": 314000 + }, + { + "epoch": 1.279574371441171, + "grad_norm": 4.326584815979004, + "learning_rate": 0.00458542342361219, + "loss": 8.0065, + "step": 314100 + }, + { + "epoch": 1.2799817494645525, + "grad_norm": 5.520540714263916, + "learning_rate": 0.0045851495337080805, + "loss": 8.033, + "step": 314200 + }, + { + "epoch": 1.280389127487934, + "grad_norm": 3.209029197692871, + "learning_rate": 0.004584875561564537, + "loss": 8.0821, + "step": 314300 + }, + { + "epoch": 1.2807965055113155, + "grad_norm": 5.361316680908203, + "learning_rate": 0.004584601507192383, + "loss": 8.0513, + "step": 314400 + }, + { + "epoch": 1.2812038835346968, + "grad_norm": 0.8805705904960632, + "learning_rate": 0.004584327370602461, + "loss": 8.091, + "step": 314500 + }, + { + "epoch": 1.2816112615580784, + "grad_norm": 3.406642436981201, + "learning_rate": 0.004584053151805604, + "loss": 8.1275, + "step": 314600 + }, + { + "epoch": 1.28201863958146, + "grad_norm": 3.8101279735565186, + "learning_rate": 0.00458377885081265, + "loss": 8.0898, + "step": 314700 + }, + { + "epoch": 1.2824260176048412, + "grad_norm": 2.9716594219207764, + "learning_rate": 0.0045835044676344585, + "loss": 8.0317, + "step": 314800 + }, + { + "epoch": 1.2828333956282227, + "grad_norm": 7.393836498260498, + "learning_rate": 0.004583230002281866, + "loss": 8.0202, + "step": 314900 + }, + { + "epoch": 1.2832407736516043, + "grad_norm": 2.1396756172180176, + "learning_rate": 0.0045829554547657265, + "loss": 8.1281, + "step": 315000 + }, + { + "epoch": 1.2832407736516043, + "eval_MaskedAccuracy": 0.4878806756258978, + "eval_loss": 1.7071367502212524, + "eval_runtime": 238.8459, + "eval_samples_per_second": 265.761, + "eval_steps_per_second": 1.038, + "step": 315000 + }, + { + "epoch": 1.2836481516749856, + "grad_norm": 4.4804816246032715, + "learning_rate": 0.004582680825096893, + "loss": 8.1365, + "step": 315100 + }, + { + "epoch": 1.2840555296983671, + "grad_norm": 2.1836588382720947, + "learning_rate": 0.004582406113286226, + "loss": 8.0849, + "step": 315200 + }, + { + "epoch": 1.2844629077217486, + "grad_norm": 3.1351490020751953, + "learning_rate": 0.004582131319344582, + "loss": 8.0476, + "step": 315300 + }, + { + "epoch": 1.28487028574513, + "grad_norm": 1.471785545349121, + "learning_rate": 0.0045818564432828345, + "loss": 8.0731, + "step": 315400 + }, + { + "epoch": 1.2852776637685115, + "grad_norm": 3.349703073501587, + "learning_rate": 0.004581581485111841, + "loss": 8.084, + "step": 315500 + }, + { + "epoch": 1.285685041791893, + "grad_norm": 2.4420437812805176, + "learning_rate": 0.004581306444842476, + "loss": 8.037, + "step": 315600 + }, + { + "epoch": 1.2860924198152746, + "grad_norm": 3.901373863220215, + "learning_rate": 0.00458103132248561, + "loss": 8.0409, + "step": 315700 + }, + { + "epoch": 1.2864997978386559, + "grad_norm": 3.417895793914795, + "learning_rate": 0.00458075611805213, + "loss": 8.0222, + "step": 315800 + }, + { + "epoch": 1.2869071758620374, + "grad_norm": 4.650550365447998, + "learning_rate": 0.004580480831552905, + "loss": 8.0042, + "step": 315900 + }, + { + "epoch": 1.2873145538854187, + "grad_norm": 3.382446050643921, + "learning_rate": 0.00458020546299882, + "loss": 7.981, + "step": 316000 + }, + { + "epoch": 1.2873145538854187, + "eval_MaskedAccuracy": 0.49159966625540336, + "eval_loss": 1.6857359409332275, + "eval_runtime": 236.8735, + "eval_samples_per_second": 267.974, + "eval_steps_per_second": 1.047, + "step": 316000 + }, + { + "epoch": 1.2877219319088002, + "grad_norm": 4.8859477043151855, + "learning_rate": 0.004579930012400767, + "loss": 8.0121, + "step": 316100 + }, + { + "epoch": 1.2881293099321818, + "grad_norm": 0.9068177342414856, + "learning_rate": 0.004579654479769629, + "loss": 8.0609, + "step": 316200 + }, + { + "epoch": 1.2885366879555633, + "grad_norm": 6.070545196533203, + "learning_rate": 0.0045793788651163, + "loss": 8.089, + "step": 316300 + }, + { + "epoch": 1.2889440659789446, + "grad_norm": 1.946527361869812, + "learning_rate": 0.004579103168451684, + "loss": 8.0639, + "step": 316400 + }, + { + "epoch": 1.2893514440023262, + "grad_norm": 2.040311336517334, + "learning_rate": 0.0045788273897866836, + "loss": 8.0402, + "step": 316500 + }, + { + "epoch": 1.2897588220257075, + "grad_norm": 2.201308012008667, + "learning_rate": 0.004578551529132186, + "loss": 8.0823, + "step": 316600 + }, + { + "epoch": 1.290166200049089, + "grad_norm": 5.859253406524658, + "learning_rate": 0.004578275586499108, + "loss": 8.1054, + "step": 316700 + }, + { + "epoch": 1.2905735780724705, + "grad_norm": 4.747668266296387, + "learning_rate": 0.004577999561898358, + "loss": 8.0821, + "step": 316800 + }, + { + "epoch": 1.290980956095852, + "grad_norm": 2.4468894004821777, + "learning_rate": 0.004577723455340848, + "loss": 8.1113, + "step": 316900 + }, + { + "epoch": 1.2913883341192334, + "grad_norm": 8.025307655334473, + "learning_rate": 0.004577447266837487, + "loss": 8.0874, + "step": 317000 + }, + { + "epoch": 1.2913883341192334, + "eval_MaskedAccuracy": 0.48697882185196195, + "eval_loss": 1.7081642150878906, + "eval_runtime": 192.8795, + "eval_samples_per_second": 329.097, + "eval_steps_per_second": 1.286, + "step": 317000 + }, + { + "epoch": 1.291795712142615, + "grad_norm": 6.981740474700928, + "learning_rate": 0.004577170996399209, + "loss": 8.1121, + "step": 317100 + }, + { + "epoch": 1.2922030901659964, + "grad_norm": 2.590914487838745, + "learning_rate": 0.004576894644036933, + "loss": 8.0975, + "step": 317200 + }, + { + "epoch": 1.2926104681893777, + "grad_norm": 1.8462945222854614, + "learning_rate": 0.004576618209761579, + "loss": 8.0449, + "step": 317300 + }, + { + "epoch": 1.2930178462127593, + "grad_norm": 3.038158416748047, + "learning_rate": 0.004576341693584074, + "loss": 8.0803, + "step": 317400 + }, + { + "epoch": 1.2934252242361408, + "grad_norm": 5.122670650482178, + "learning_rate": 0.004576065095515352, + "loss": 8.041, + "step": 317500 + }, + { + "epoch": 1.2938326022595221, + "grad_norm": 5.177152633666992, + "learning_rate": 0.0045757884155663555, + "loss": 8.1004, + "step": 317600 + }, + { + "epoch": 1.2942399802829037, + "grad_norm": 1.4997923374176025, + "learning_rate": 0.004575511653748008, + "loss": 8.0695, + "step": 317700 + }, + { + "epoch": 1.2946473583062852, + "grad_norm": 4.639394283294678, + "learning_rate": 0.0045752348100712645, + "loss": 8.0935, + "step": 317800 + }, + { + "epoch": 1.2950547363296665, + "grad_norm": 1.499557614326477, + "learning_rate": 0.004574957884547069, + "loss": 8.018, + "step": 317900 + }, + { + "epoch": 1.295462114353048, + "grad_norm": 3.189948558807373, + "learning_rate": 0.004574680877186372, + "loss": 8.1136, + "step": 318000 + }, + { + "epoch": 1.295462114353048, + "eval_MaskedAccuracy": 0.4893474880067361, + "eval_loss": 1.6934936046600342, + "eval_runtime": 197.65, + "eval_samples_per_second": 321.153, + "eval_steps_per_second": 1.255, + "step": 318000 + }, + { + "epoch": 1.2958694923764296, + "grad_norm": 1.9850223064422607, + "learning_rate": 0.004574403788000111, + "loss": 8.0685, + "step": 318100 + }, + { + "epoch": 1.296276870399811, + "grad_norm": 4.261771202087402, + "learning_rate": 0.00457412661699925, + "loss": 8.0733, + "step": 318200 + }, + { + "epoch": 1.2966842484231924, + "grad_norm": 2.2075061798095703, + "learning_rate": 0.004573849364194751, + "loss": 8.0329, + "step": 318300 + }, + { + "epoch": 1.297091626446574, + "grad_norm": 1.7161146402359009, + "learning_rate": 0.004573572029597568, + "loss": 8.0544, + "step": 318400 + }, + { + "epoch": 1.2974990044699553, + "grad_norm": 3.8380091190338135, + "learning_rate": 0.004573294613218673, + "loss": 8.0884, + "step": 318500 + }, + { + "epoch": 1.2979063824933368, + "grad_norm": 2.094470262527466, + "learning_rate": 0.004573017115069026, + "loss": 8.0765, + "step": 318600 + }, + { + "epoch": 1.2983137605167183, + "grad_norm": 2.915336847305298, + "learning_rate": 0.004572739535159604, + "loss": 8.0381, + "step": 318700 + }, + { + "epoch": 1.2987211385400999, + "grad_norm": 3.584977626800537, + "learning_rate": 0.004572461873501381, + "loss": 8.0563, + "step": 318800 + }, + { + "epoch": 1.2991285165634812, + "grad_norm": 3.8318121433258057, + "learning_rate": 0.004572184130105332, + "loss": 8.0355, + "step": 318900 + }, + { + "epoch": 1.2995358945868627, + "grad_norm": 4.111202716827393, + "learning_rate": 0.004571906304982439, + "loss": 8.1071, + "step": 319000 + }, + { + "epoch": 1.2995358945868627, + "eval_MaskedAccuracy": 0.4893559775239622, + "eval_loss": 1.700352668762207, + "eval_runtime": 205.5605, + "eval_samples_per_second": 308.795, + "eval_steps_per_second": 1.206, + "step": 319000 + }, + { + "epoch": 1.299943272610244, + "grad_norm": 0.9857933521270752, + "learning_rate": 0.004571628398143688, + "loss": 8.0685, + "step": 319100 + }, + { + "epoch": 1.3003506506336255, + "grad_norm": 4.459076881408691, + "learning_rate": 0.004571350409600061, + "loss": 8.121, + "step": 319200 + }, + { + "epoch": 1.300758028657007, + "grad_norm": 4.374945640563965, + "learning_rate": 0.004571072339362558, + "loss": 8.0528, + "step": 319300 + }, + { + "epoch": 1.3011654066803886, + "grad_norm": 1.8823482990264893, + "learning_rate": 0.004570794187442155, + "loss": 8.1035, + "step": 319400 + }, + { + "epoch": 1.30157278470377, + "grad_norm": 2.472083568572998, + "learning_rate": 0.004570515953849863, + "loss": 8.0685, + "step": 319500 + }, + { + "epoch": 1.3019801627271514, + "grad_norm": 5.041461944580078, + "learning_rate": 0.004570237638596677, + "loss": 8.0999, + "step": 319600 + }, + { + "epoch": 1.302387540750533, + "grad_norm": 4.218006610870361, + "learning_rate": 0.004569959241693603, + "loss": 8.1125, + "step": 319700 + }, + { + "epoch": 1.3027949187739143, + "grad_norm": 3.5334811210632324, + "learning_rate": 0.004569680763151646, + "loss": 8.0523, + "step": 319800 + }, + { + "epoch": 1.3032022967972958, + "grad_norm": 4.296719074249268, + "learning_rate": 0.004569402202981819, + "loss": 8.0481, + "step": 319900 + }, + { + "epoch": 1.3036096748206774, + "grad_norm": 2.7426321506500244, + "learning_rate": 0.004569123561195129, + "loss": 7.9919, + "step": 320000 + }, + { + "epoch": 1.3036096748206774, + "eval_MaskedAccuracy": 0.4920605854338912, + "eval_loss": 1.6915849447250366, + "eval_runtime": 177.7127, + "eval_samples_per_second": 357.183, + "eval_steps_per_second": 1.396, + "step": 320000 + }, + { + "epoch": 1.3040170528440587, + "grad_norm": 1.2438089847564697, + "learning_rate": 0.0045688448378025945, + "loss": 8.0546, + "step": 320100 + }, + { + "epoch": 1.3044244308674402, + "grad_norm": 4.601132869720459, + "learning_rate": 0.004568566032815233, + "loss": 8.1145, + "step": 320200 + }, + { + "epoch": 1.3048318088908217, + "grad_norm": 3.7675604820251465, + "learning_rate": 0.004568287146244068, + "loss": 8.0729, + "step": 320300 + }, + { + "epoch": 1.305239186914203, + "grad_norm": 2.828606605529785, + "learning_rate": 0.004568008178100126, + "loss": 8.0348, + "step": 320400 + }, + { + "epoch": 1.3056465649375846, + "grad_norm": 5.11243200302124, + "learning_rate": 0.004567729128394441, + "loss": 8.0683, + "step": 320500 + }, + { + "epoch": 1.306053942960966, + "grad_norm": 4.557095050811768, + "learning_rate": 0.004567449997138039, + "loss": 8.0726, + "step": 320600 + }, + { + "epoch": 1.3064613209843476, + "grad_norm": 2.090935707092285, + "learning_rate": 0.0045671707843419635, + "loss": 8.0992, + "step": 320700 + }, + { + "epoch": 1.306868699007729, + "grad_norm": 2.7916462421417236, + "learning_rate": 0.004566891490017243, + "loss": 8.0534, + "step": 320800 + }, + { + "epoch": 1.3072760770311105, + "grad_norm": 3.4025943279266357, + "learning_rate": 0.004566612114174925, + "loss": 8.0205, + "step": 320900 + }, + { + "epoch": 1.3076834550544918, + "grad_norm": 3.246011972427368, + "learning_rate": 0.0045663326568260556, + "loss": 8.0384, + "step": 321000 + }, + { + "epoch": 1.3076834550544918, + "eval_MaskedAccuracy": 0.4923900148265971, + "eval_loss": 1.6762045621871948, + "eval_runtime": 246.8153, + "eval_samples_per_second": 257.18, + "eval_steps_per_second": 1.005, + "step": 321000 + }, + { + "epoch": 1.3080908330778733, + "grad_norm": 2.6321635246276855, + "learning_rate": 0.0045660531179816805, + "loss": 8.0152, + "step": 321100 + }, + { + "epoch": 1.3084982111012549, + "grad_norm": 2.8856520652770996, + "learning_rate": 0.0045657734976528525, + "loss": 8.0505, + "step": 321200 + }, + { + "epoch": 1.3089055891246364, + "grad_norm": 4.007819175720215, + "learning_rate": 0.004565493795850623, + "loss": 8.0754, + "step": 321300 + }, + { + "epoch": 1.3093129671480177, + "grad_norm": 4.678389072418213, + "learning_rate": 0.004565214012586056, + "loss": 8.1385, + "step": 321400 + }, + { + "epoch": 1.3097203451713992, + "grad_norm": 4.059128284454346, + "learning_rate": 0.0045649341478702105, + "loss": 8.1046, + "step": 321500 + }, + { + "epoch": 1.3101277231947805, + "grad_norm": 2.2171413898468018, + "learning_rate": 0.004564654201714153, + "loss": 8.0373, + "step": 321600 + }, + { + "epoch": 1.310535101218162, + "grad_norm": 5.1353678703308105, + "learning_rate": 0.004564374174128946, + "loss": 8.0667, + "step": 321700 + }, + { + "epoch": 1.3109424792415436, + "grad_norm": 1.2548649311065674, + "learning_rate": 0.004564094065125663, + "loss": 8.0762, + "step": 321800 + }, + { + "epoch": 1.3113498572649251, + "grad_norm": 9.941655158996582, + "learning_rate": 0.004563813874715376, + "loss": 8.0995, + "step": 321900 + }, + { + "epoch": 1.3117572352883065, + "grad_norm": 3.532829761505127, + "learning_rate": 0.004563533602909171, + "loss": 8.1438, + "step": 322000 + }, + { + "epoch": 1.3117572352883065, + "eval_MaskedAccuracy": 0.48833868593461993, + "eval_loss": 1.7106949090957642, + "eval_runtime": 174.0025, + "eval_samples_per_second": 364.799, + "eval_steps_per_second": 1.425, + "step": 322000 + }, + { + "epoch": 1.312164613311688, + "grad_norm": 4.559520721435547, + "learning_rate": 0.004563253249718122, + "loss": 8.0733, + "step": 322100 + }, + { + "epoch": 1.3125719913350695, + "grad_norm": 2.4093596935272217, + "learning_rate": 0.004562972815153312, + "loss": 8.1038, + "step": 322200 + }, + { + "epoch": 1.3129793693584508, + "grad_norm": 2.1672794818878174, + "learning_rate": 0.0045626922992258335, + "loss": 8.018, + "step": 322300 + }, + { + "epoch": 1.3133867473818324, + "grad_norm": 4.177785396575928, + "learning_rate": 0.004562411701946769, + "loss": 8.0942, + "step": 322400 + }, + { + "epoch": 1.313794125405214, + "grad_norm": 1.0137476921081543, + "learning_rate": 0.00456213102332722, + "loss": 8.0877, + "step": 322500 + }, + { + "epoch": 1.3142015034285952, + "grad_norm": 4.39864444732666, + "learning_rate": 0.004561850263378278, + "loss": 8.0427, + "step": 322600 + }, + { + "epoch": 1.3146088814519767, + "grad_norm": 4.880063056945801, + "learning_rate": 0.004561569422111041, + "loss": 8.1473, + "step": 322700 + }, + { + "epoch": 1.3150162594753583, + "grad_norm": 0.6708384156227112, + "learning_rate": 0.00456128849953662, + "loss": 8.0599, + "step": 322800 + }, + { + "epoch": 1.3154236374987396, + "grad_norm": 3.5477101802825928, + "learning_rate": 0.004561007495666116, + "loss": 8.0718, + "step": 322900 + }, + { + "epoch": 1.3158310155221211, + "grad_norm": 1.742142915725708, + "learning_rate": 0.004560726410510638, + "loss": 8.0656, + "step": 323000 + }, + { + "epoch": 1.3158310155221211, + "eval_MaskedAccuracy": 0.48881355493981477, + "eval_loss": 1.7019761800765991, + "eval_runtime": 165.7085, + "eval_samples_per_second": 383.058, + "eval_steps_per_second": 1.497, + "step": 323000 + }, + { + "epoch": 1.3162383935455026, + "grad_norm": 3.2039802074432373, + "learning_rate": 0.0045604452440813, + "loss": 8.0663, + "step": 323100 + }, + { + "epoch": 1.3166457715688842, + "grad_norm": 1.3553608655929565, + "learning_rate": 0.0045601639963892176, + "loss": 8.0598, + "step": 323200 + }, + { + "epoch": 1.3170531495922655, + "grad_norm": 3.884082794189453, + "learning_rate": 0.004559882667445508, + "loss": 8.079, + "step": 323300 + }, + { + "epoch": 1.317460527615647, + "grad_norm": 0.9716830253601074, + "learning_rate": 0.004559601257261301, + "loss": 8.0483, + "step": 323400 + }, + { + "epoch": 1.3178679056390283, + "grad_norm": 4.185678482055664, + "learning_rate": 0.004559319765847716, + "loss": 8.0821, + "step": 323500 + }, + { + "epoch": 1.3182752836624099, + "grad_norm": 3.945801258087158, + "learning_rate": 0.0045590381932158865, + "loss": 8.0995, + "step": 323600 + }, + { + "epoch": 1.3186826616857914, + "grad_norm": 3.5799834728240967, + "learning_rate": 0.004558756539376936, + "loss": 8.0337, + "step": 323700 + }, + { + "epoch": 1.319090039709173, + "grad_norm": 3.5952444076538086, + "learning_rate": 0.004558474804342002, + "loss": 8.0947, + "step": 323800 + }, + { + "epoch": 1.3194974177325542, + "grad_norm": 3.2049031257629395, + "learning_rate": 0.004558192988122229, + "loss": 8.1092, + "step": 323900 + }, + { + "epoch": 1.3199047957559358, + "grad_norm": 2.319859504699707, + "learning_rate": 0.004557911090728755, + "loss": 8.0848, + "step": 324000 + }, + { + "epoch": 1.3199047957559358, + "eval_MaskedAccuracy": 0.4901750505678959, + "eval_loss": 1.6923388242721558, + "eval_runtime": 169.8525, + "eval_samples_per_second": 373.712, + "eval_steps_per_second": 1.46, + "step": 324000 + }, + { + "epoch": 1.320312173779317, + "grad_norm": 1.0164000988006592, + "learning_rate": 0.004557629112172727, + "loss": 8.0731, + "step": 324100 + }, + { + "epoch": 1.3207195518026986, + "grad_norm": 1.9091594219207764, + "learning_rate": 0.004557347052465298, + "loss": 8.13, + "step": 324200 + }, + { + "epoch": 1.3211269298260802, + "grad_norm": 1.4779466390609741, + "learning_rate": 0.0045570649116176126, + "loss": 8.1098, + "step": 324300 + }, + { + "epoch": 1.3215343078494617, + "grad_norm": 13.105330467224121, + "learning_rate": 0.004556782689640824, + "loss": 8.0902, + "step": 324400 + }, + { + "epoch": 1.321941685872843, + "grad_norm": 2.3293752670288086, + "learning_rate": 0.004556500386546093, + "loss": 8.0626, + "step": 324500 + }, + { + "epoch": 1.3223490638962245, + "grad_norm": 1.8382813930511475, + "learning_rate": 0.004556218002344574, + "loss": 8.0754, + "step": 324600 + }, + { + "epoch": 1.322756441919606, + "grad_norm": 3.2023203372955322, + "learning_rate": 0.0045559355370474374, + "loss": 8.0802, + "step": 324700 + }, + { + "epoch": 1.3231638199429874, + "grad_norm": 2.298903703689575, + "learning_rate": 0.004555652990665853, + "loss": 8.0578, + "step": 324800 + }, + { + "epoch": 1.323571197966369, + "grad_norm": 1.0970746278762817, + "learning_rate": 0.004555370363210988, + "loss": 8.0446, + "step": 324900 + }, + { + "epoch": 1.3239785759897504, + "grad_norm": 1.624550223350525, + "learning_rate": 0.004555087654694023, + "loss": 8.0969, + "step": 325000 + }, + { + "epoch": 1.3239785759897504, + "eval_MaskedAccuracy": 0.49033513649122273, + "eval_loss": 1.6904529333114624, + "eval_runtime": 245.9911, + "eval_samples_per_second": 258.042, + "eval_steps_per_second": 1.008, + "step": 325000 + }, + { + "epoch": 1.3243859540131317, + "grad_norm": 2.7660932540893555, + "learning_rate": 0.004554804865126121, + "loss": 8.0789, + "step": 325100 + }, + { + "epoch": 1.3247933320365133, + "grad_norm": 1.6017922163009644, + "learning_rate": 0.0045545219945184725, + "loss": 8.0805, + "step": 325200 + }, + { + "epoch": 1.3252007100598948, + "grad_norm": 1.1955177783966064, + "learning_rate": 0.004554239042882257, + "loss": 8.0557, + "step": 325300 + }, + { + "epoch": 1.3256080880832761, + "grad_norm": 1.2732222080230713, + "learning_rate": 0.004553956010228655, + "loss": 8.0826, + "step": 325400 + }, + { + "epoch": 1.3260154661066577, + "grad_norm": 1.009123682975769, + "learning_rate": 0.004553672896568865, + "loss": 8.0644, + "step": 325500 + }, + { + "epoch": 1.3264228441300392, + "grad_norm": 3.8442931175231934, + "learning_rate": 0.00455338970191408, + "loss": 8.052, + "step": 325600 + }, + { + "epoch": 1.3268302221534207, + "grad_norm": 3.865133285522461, + "learning_rate": 0.004553106426275495, + "loss": 8.016, + "step": 325700 + }, + { + "epoch": 1.327237600176802, + "grad_norm": 1.6926275491714478, + "learning_rate": 0.004552823069664311, + "loss": 8.0729, + "step": 325800 + }, + { + "epoch": 1.3276449782001836, + "grad_norm": 3.8086345195770264, + "learning_rate": 0.004552539632091731, + "loss": 8.0186, + "step": 325900 + }, + { + "epoch": 1.3280523562235649, + "grad_norm": 2.3455417156219482, + "learning_rate": 0.004552256113568953, + "loss": 8.0357, + "step": 326000 + }, + { + "epoch": 1.3280523562235649, + "eval_MaskedAccuracy": 0.4922446641348998, + "eval_loss": 1.688795566558838, + "eval_runtime": 223.6127, + "eval_samples_per_second": 283.866, + "eval_steps_per_second": 1.109, + "step": 326000 + }, + { + "epoch": 1.3284597342469464, + "grad_norm": 4.211648464202881, + "learning_rate": 0.004551972514107189, + "loss": 7.9905, + "step": 326100 + }, + { + "epoch": 1.328867112270328, + "grad_norm": 3.3933537006378174, + "learning_rate": 0.004551688833717657, + "loss": 7.9922, + "step": 326200 + }, + { + "epoch": 1.3292744902937095, + "grad_norm": 3.274343252182007, + "learning_rate": 0.004551405072411573, + "loss": 8.0469, + "step": 326300 + }, + { + "epoch": 1.3296818683170908, + "grad_norm": 2.8096282482147217, + "learning_rate": 0.004551121230200147, + "loss": 8.0395, + "step": 326400 + }, + { + "epoch": 1.3300892463404723, + "grad_norm": 2.965846538543701, + "learning_rate": 0.0045508373070946035, + "loss": 8.0873, + "step": 326500 + }, + { + "epoch": 1.3304966243638536, + "grad_norm": 7.503636837005615, + "learning_rate": 0.004550553303106173, + "loss": 8.0563, + "step": 326600 + }, + { + "epoch": 1.3309040023872352, + "grad_norm": 1.167646050453186, + "learning_rate": 0.004550269218246077, + "loss": 8.0396, + "step": 326700 + }, + { + "epoch": 1.3313113804106167, + "grad_norm": 2.967207431793213, + "learning_rate": 0.004549985052525551, + "loss": 8.1156, + "step": 326800 + }, + { + "epoch": 1.3317187584339982, + "grad_norm": 1.8906344175338745, + "learning_rate": 0.004549700805955837, + "loss": 8.0438, + "step": 326900 + }, + { + "epoch": 1.3321261364573795, + "grad_norm": 2.934070348739624, + "learning_rate": 0.004549416478548158, + "loss": 8.0693, + "step": 327000 + }, + { + "epoch": 1.3321261364573795, + "eval_MaskedAccuracy": 0.4904791841240935, + "eval_loss": 1.6997677087783813, + "eval_runtime": 181.7826, + "eval_samples_per_second": 349.186, + "eval_steps_per_second": 1.364, + "step": 327000 + }, + { + "epoch": 1.332533514480761, + "grad_norm": 3.414621353149414, + "learning_rate": 0.004549132070313766, + "loss": 7.9985, + "step": 327100 + }, + { + "epoch": 1.3329408925041426, + "grad_norm": 4.870385646820068, + "learning_rate": 0.004548847581263898, + "loss": 8.0352, + "step": 327200 + }, + { + "epoch": 1.333348270527524, + "grad_norm": 4.610667705535889, + "learning_rate": 0.004548563011409796, + "loss": 8.059, + "step": 327300 + }, + { + "epoch": 1.3337556485509054, + "grad_norm": 5.722850322723389, + "learning_rate": 0.004548278360762722, + "loss": 8.0358, + "step": 327400 + }, + { + "epoch": 1.334163026574287, + "grad_norm": 3.6654059886932373, + "learning_rate": 0.00454799362933393, + "loss": 8.0514, + "step": 327500 + }, + { + "epoch": 1.3345704045976683, + "grad_norm": 0.7920379638671875, + "learning_rate": 0.00454770881713467, + "loss": 8.0352, + "step": 327600 + }, + { + "epoch": 1.3349777826210498, + "grad_norm": 3.1758406162261963, + "learning_rate": 0.004547423924176206, + "loss": 8.0964, + "step": 327700 + }, + { + "epoch": 1.3353851606444314, + "grad_norm": 15.178168296813965, + "learning_rate": 0.004547138950469805, + "loss": 8.0582, + "step": 327800 + }, + { + "epoch": 1.3357925386678127, + "grad_norm": 4.05223274230957, + "learning_rate": 0.00454685389602673, + "loss": 8.0708, + "step": 327900 + }, + { + "epoch": 1.3361999166911942, + "grad_norm": 1.9266265630722046, + "learning_rate": 0.004546568760858243, + "loss": 8.0914, + "step": 328000 + }, + { + "epoch": 1.3361999166911942, + "eval_MaskedAccuracy": 0.4894137960178113, + "eval_loss": 1.7010035514831543, + "eval_runtime": 176.7678, + "eval_samples_per_second": 359.092, + "eval_steps_per_second": 1.403, + "step": 328000 + }, + { + "epoch": 1.3366072947145757, + "grad_norm": 4.658905982971191, + "learning_rate": 0.004546283544975624, + "loss": 8.0473, + "step": 328100 + }, + { + "epoch": 1.3370146727379573, + "grad_norm": 4.361426830291748, + "learning_rate": 0.004545998248390153, + "loss": 8.0613, + "step": 328200 + }, + { + "epoch": 1.3374220507613386, + "grad_norm": 3.9232702255249023, + "learning_rate": 0.004545712871113103, + "loss": 8.0478, + "step": 328300 + }, + { + "epoch": 1.33782942878472, + "grad_norm": 1.5778297185897827, + "learning_rate": 0.004545427413155761, + "loss": 8.0545, + "step": 328400 + }, + { + "epoch": 1.3382368068081014, + "grad_norm": 1.4725537300109863, + "learning_rate": 0.004545141874529407, + "loss": 8.0925, + "step": 328500 + }, + { + "epoch": 1.338644184831483, + "grad_norm": 1.4098880290985107, + "learning_rate": 0.004544856255245334, + "loss": 8.0293, + "step": 328600 + }, + { + "epoch": 1.3390515628548645, + "grad_norm": 3.308492660522461, + "learning_rate": 0.004544570555314832, + "loss": 8.0651, + "step": 328700 + }, + { + "epoch": 1.339458940878246, + "grad_norm": 3.185256242752075, + "learning_rate": 0.004544284774749198, + "loss": 8.0394, + "step": 328800 + }, + { + "epoch": 1.3398663189016273, + "grad_norm": 2.046502113342285, + "learning_rate": 0.004543998913559733, + "loss": 8.012, + "step": 328900 + }, + { + "epoch": 1.3402736969250089, + "grad_norm": 4.595850944519043, + "learning_rate": 0.00454371297175773, + "loss": 8.0634, + "step": 329000 + }, + { + "epoch": 1.3402736969250089, + "eval_MaskedAccuracy": 0.48753905746642684, + "eval_loss": 1.6977170705795288, + "eval_runtime": 173.5205, + "eval_samples_per_second": 365.813, + "eval_steps_per_second": 1.429, + "step": 329000 + }, + { + "epoch": 1.3406810749483902, + "grad_norm": 1.6070823669433594, + "learning_rate": 0.004543426949354504, + "loss": 8.0694, + "step": 329100 + }, + { + "epoch": 1.3410884529717717, + "grad_norm": 0.9883131384849548, + "learning_rate": 0.004543140846361356, + "loss": 8.0704, + "step": 329200 + }, + { + "epoch": 1.3414958309951532, + "grad_norm": 2.9271223545074463, + "learning_rate": 0.004542854662789608, + "loss": 8.0833, + "step": 329300 + }, + { + "epoch": 1.3419032090185348, + "grad_norm": 2.7808384895324707, + "learning_rate": 0.0045425683986505625, + "loss": 8.0299, + "step": 329400 + }, + { + "epoch": 1.342310587041916, + "grad_norm": 3.7802367210388184, + "learning_rate": 0.00454228205395553, + "loss": 8.038, + "step": 329500 + }, + { + "epoch": 1.3427179650652976, + "grad_norm": 2.54129958152771, + "learning_rate": 0.004541995628715848, + "loss": 7.9923, + "step": 329600 + }, + { + "epoch": 1.3431253430886791, + "grad_norm": 3.8371310234069824, + "learning_rate": 0.00454170912294283, + "loss": 8.0065, + "step": 329700 + }, + { + "epoch": 1.3435327211120605, + "grad_norm": 4.309117317199707, + "learning_rate": 0.004541422536647813, + "loss": 8.0154, + "step": 329800 + }, + { + "epoch": 1.343940099135442, + "grad_norm": 6.226172924041748, + "learning_rate": 0.0045411358698421184, + "loss": 8.0002, + "step": 329900 + }, + { + "epoch": 1.3443474771588235, + "grad_norm": 5.053940296173096, + "learning_rate": 0.0045408491225370825, + "loss": 8.0782, + "step": 330000 + }, + { + "epoch": 1.3443474771588235, + "eval_MaskedAccuracy": 0.48996696355953634, + "eval_loss": 1.6935937404632568, + "eval_runtime": 171.1582, + "eval_samples_per_second": 370.862, + "eval_steps_per_second": 1.449, + "step": 330000 + }, + { + "epoch": 1.3447548551822048, + "grad_norm": 3.9390504360198975, + "learning_rate": 0.004540562294744039, + "loss": 8.0451, + "step": 330100 + }, + { + "epoch": 1.3451622332055864, + "grad_norm": 3.181065082550049, + "learning_rate": 0.004540275386474344, + "loss": 8.0572, + "step": 330200 + }, + { + "epoch": 1.345569611228968, + "grad_norm": 3.486276865005493, + "learning_rate": 0.004539988397739318, + "loss": 8.0695, + "step": 330300 + }, + { + "epoch": 1.3459769892523492, + "grad_norm": 2.4831082820892334, + "learning_rate": 0.004539701328550317, + "loss": 8.0066, + "step": 330400 + }, + { + "epoch": 1.3463843672757307, + "grad_norm": 1.9745352268218994, + "learning_rate": 0.004539414178918695, + "loss": 8.0093, + "step": 330500 + }, + { + "epoch": 1.3467917452991123, + "grad_norm": 2.8708128929138184, + "learning_rate": 0.004539126948855803, + "loss": 8.0634, + "step": 330600 + }, + { + "epoch": 1.3471991233224938, + "grad_norm": 2.1291728019714355, + "learning_rate": 0.004538839638372989, + "loss": 8.0565, + "step": 330700 + }, + { + "epoch": 1.3476065013458751, + "grad_norm": 0.8737789988517761, + "learning_rate": 0.004538552247481611, + "loss": 8.0754, + "step": 330800 + }, + { + "epoch": 1.3480138793692567, + "grad_norm": 2.749574899673462, + "learning_rate": 0.0045382647761930345, + "loss": 8.0911, + "step": 330900 + }, + { + "epoch": 1.348421257392638, + "grad_norm": 1.407920479774475, + "learning_rate": 0.004537977224518631, + "loss": 8.0574, + "step": 331000 + }, + { + "epoch": 1.348421257392638, + "eval_MaskedAccuracy": 0.487671073066648, + "eval_loss": 1.7070835828781128, + "eval_runtime": 179.2232, + "eval_samples_per_second": 354.173, + "eval_steps_per_second": 1.384, + "step": 331000 + }, + { + "epoch": 1.3488286354160195, + "grad_norm": 4.601722240447998, + "learning_rate": 0.004537689592469766, + "loss": 8.0862, + "step": 331100 + }, + { + "epoch": 1.349236013439401, + "grad_norm": 3.048001527786255, + "learning_rate": 0.00453740188005781, + "loss": 8.0407, + "step": 331200 + }, + { + "epoch": 1.3496433914627826, + "grad_norm": 3.656520366668701, + "learning_rate": 0.00453711408729414, + "loss": 8.0899, + "step": 331300 + }, + { + "epoch": 1.3500507694861639, + "grad_norm": 2.7314696311950684, + "learning_rate": 0.004536826214190134, + "loss": 8.0933, + "step": 331400 + }, + { + "epoch": 1.3504581475095454, + "grad_norm": 4.379732131958008, + "learning_rate": 0.0045365382607571655, + "loss": 8.0515, + "step": 331500 + }, + { + "epoch": 1.3508655255329267, + "grad_norm": 1.268511414527893, + "learning_rate": 0.004536250227006631, + "loss": 8.0318, + "step": 331600 + }, + { + "epoch": 1.3512729035563082, + "grad_norm": 3.986025094985962, + "learning_rate": 0.0045359621129499065, + "loss": 8.0469, + "step": 331700 + }, + { + "epoch": 1.3516802815796898, + "grad_norm": 5.96984338760376, + "learning_rate": 0.004535673918598391, + "loss": 8.0228, + "step": 331800 + }, + { + "epoch": 1.3520876596030713, + "grad_norm": 3.952479839324951, + "learning_rate": 0.004535385643963468, + "loss": 8.0324, + "step": 331900 + }, + { + "epoch": 1.3524950376264526, + "grad_norm": 2.14145827293396, + "learning_rate": 0.00453509728905654, + "loss": 8.0112, + "step": 332000 + }, + { + "epoch": 1.3524950376264526, + "eval_MaskedAccuracy": 0.488893655978673, + "eval_loss": 1.7075307369232178, + "eval_runtime": 205.2728, + "eval_samples_per_second": 309.228, + "eval_steps_per_second": 1.208, + "step": 332000 + }, + { + "epoch": 1.3529024156498342, + "grad_norm": 3.316530227661133, + "learning_rate": 0.004534808853889009, + "loss": 8.032, + "step": 332100 + }, + { + "epoch": 1.3533097936732157, + "grad_norm": 4.175098896026611, + "learning_rate": 0.004534520338472278, + "loss": 8.0142, + "step": 332200 + }, + { + "epoch": 1.353717171696597, + "grad_norm": 1.8745940923690796, + "learning_rate": 0.004534231742817755, + "loss": 8.0436, + "step": 332300 + }, + { + "epoch": 1.3541245497199785, + "grad_norm": 4.861321449279785, + "learning_rate": 0.00453394306693685, + "loss": 8.0902, + "step": 332400 + }, + { + "epoch": 1.35453192774336, + "grad_norm": 4.840919494628906, + "learning_rate": 0.004533654310840977, + "loss": 8.0495, + "step": 332500 + }, + { + "epoch": 1.3549393057667414, + "grad_norm": 3.8090786933898926, + "learning_rate": 0.004533365474541542, + "loss": 8.0285, + "step": 332600 + }, + { + "epoch": 1.355346683790123, + "grad_norm": 1.8056049346923828, + "learning_rate": 0.004533076558049974, + "loss": 7.9913, + "step": 332700 + }, + { + "epoch": 1.3557540618135044, + "grad_norm": 2.420238733291626, + "learning_rate": 0.004532787561377689, + "loss": 8.0275, + "step": 332800 + }, + { + "epoch": 1.3561614398368858, + "grad_norm": 3.4509689807891846, + "learning_rate": 0.004532498484536121, + "loss": 8.0616, + "step": 332900 + }, + { + "epoch": 1.3565688178602673, + "grad_norm": 1.4320125579833984, + "learning_rate": 0.004532209327536695, + "loss": 8.0347, + "step": 333000 + }, + { + "epoch": 1.3565688178602673, + "eval_MaskedAccuracy": 0.4893542575052335, + "eval_loss": 1.6820272207260132, + "eval_runtime": 191.0192, + "eval_samples_per_second": 332.302, + "eval_steps_per_second": 1.298, + "step": 333000 + }, + { + "epoch": 1.3569761958836488, + "grad_norm": 1.30508553981781, + "learning_rate": 0.004531920090390839, + "loss": 8.0336, + "step": 333100 + }, + { + "epoch": 1.3573835739070303, + "grad_norm": 0.9989843964576721, + "learning_rate": 0.004531630773109995, + "loss": 8.0265, + "step": 333200 + }, + { + "epoch": 1.3577909519304117, + "grad_norm": 1.1286146640777588, + "learning_rate": 0.004531341375705594, + "loss": 8.0547, + "step": 333300 + }, + { + "epoch": 1.3581983299537932, + "grad_norm": 3.9923794269561768, + "learning_rate": 0.004531051898189085, + "loss": 8.0602, + "step": 333400 + }, + { + "epoch": 1.3586057079771745, + "grad_norm": 2.3580663204193115, + "learning_rate": 0.004530762340571909, + "loss": 8.0617, + "step": 333500 + }, + { + "epoch": 1.359013086000556, + "grad_norm": 3.204130172729492, + "learning_rate": 0.004530472702865515, + "loss": 8.0422, + "step": 333600 + }, + { + "epoch": 1.3594204640239376, + "grad_norm": 3.7657577991485596, + "learning_rate": 0.00453018298508135, + "loss": 8.0668, + "step": 333700 + }, + { + "epoch": 1.359827842047319, + "grad_norm": 2.150792360305786, + "learning_rate": 0.0045298931872308715, + "loss": 8.0803, + "step": 333800 + }, + { + "epoch": 1.3602352200707004, + "grad_norm": 2.5532517433166504, + "learning_rate": 0.004529603309325526, + "loss": 8.0441, + "step": 333900 + }, + { + "epoch": 1.360642598094082, + "grad_norm": 3.6992883682250977, + "learning_rate": 0.004529313351376784, + "loss": 8.0215, + "step": 334000 + }, + { + "epoch": 1.360642598094082, + "eval_MaskedAccuracy": 0.49045791943351175, + "eval_loss": 1.703776478767395, + "eval_runtime": 197.5733, + "eval_samples_per_second": 321.278, + "eval_steps_per_second": 1.255, + "step": 334000 + }, + { + "epoch": 1.3610499761174633, + "grad_norm": 4.939713478088379, + "learning_rate": 0.004529023313396113, + "loss": 8.0213, + "step": 334100 + }, + { + "epoch": 1.3614573541408448, + "grad_norm": 4.058901309967041, + "learning_rate": 0.004528733195394972, + "loss": 8.0415, + "step": 334200 + }, + { + "epoch": 1.3618647321642263, + "grad_norm": 3.4482951164245605, + "learning_rate": 0.0045284429973848385, + "loss": 8.043, + "step": 334300 + }, + { + "epoch": 1.3622721101876079, + "grad_norm": 4.013343811035156, + "learning_rate": 0.004528152719377181, + "loss": 8.049, + "step": 334400 + }, + { + "epoch": 1.3626794882109892, + "grad_norm": 5.67153263092041, + "learning_rate": 0.004527862361383474, + "loss": 8.0046, + "step": 334500 + }, + { + "epoch": 1.3630868662343707, + "grad_norm": 1.1400394439697266, + "learning_rate": 0.004527571923415196, + "loss": 8.0832, + "step": 334600 + }, + { + "epoch": 1.3634942442577522, + "grad_norm": 1.0227196216583252, + "learning_rate": 0.004527281405483833, + "loss": 8.1128, + "step": 334700 + }, + { + "epoch": 1.3639016222811335, + "grad_norm": 4.858122825622559, + "learning_rate": 0.004526990807600877, + "loss": 8.0738, + "step": 334800 + }, + { + "epoch": 1.364309000304515, + "grad_norm": 4.523569107055664, + "learning_rate": 0.004526700129777813, + "loss": 8.027, + "step": 334900 + }, + { + "epoch": 1.3647163783278966, + "grad_norm": 12.582639694213867, + "learning_rate": 0.004526409372026125, + "loss": 7.9837, + "step": 335000 + }, + { + "epoch": 1.3647163783278966, + "eval_MaskedAccuracy": 0.48998230648758867, + "eval_loss": 1.6911903619766235, + "eval_runtime": 420.5773, + "eval_samples_per_second": 150.926, + "eval_steps_per_second": 0.59, + "step": 335000 + }, + { + "epoch": 1.365123756351278, + "grad_norm": 2.2136664390563965, + "learning_rate": 0.00452611853435731, + "loss": 8.0276, + "step": 335100 + }, + { + "epoch": 1.3655311343746594, + "grad_norm": 3.2456421852111816, + "learning_rate": 0.004525827616782868, + "loss": 8.0373, + "step": 335200 + }, + { + "epoch": 1.365938512398041, + "grad_norm": 4.780536651611328, + "learning_rate": 0.004525536619314304, + "loss": 8.004, + "step": 335300 + }, + { + "epoch": 1.3663458904214223, + "grad_norm": 3.3634612560272217, + "learning_rate": 0.004525245541963124, + "loss": 8.0008, + "step": 335400 + }, + { + "epoch": 1.3667532684448038, + "grad_norm": 5.064257621765137, + "learning_rate": 0.0045249543847408315, + "loss": 7.9679, + "step": 335500 + }, + { + "epoch": 1.3671606464681854, + "grad_norm": 4.003411293029785, + "learning_rate": 0.004524663147658934, + "loss": 8.0352, + "step": 335600 + }, + { + "epoch": 1.367568024491567, + "grad_norm": 2.0102012157440186, + "learning_rate": 0.004524371830728952, + "loss": 8.041, + "step": 335700 + }, + { + "epoch": 1.3679754025149482, + "grad_norm": 2.133049726486206, + "learning_rate": 0.004524080433962406, + "loss": 8.0463, + "step": 335800 + }, + { + "epoch": 1.3683827805383297, + "grad_norm": 1.0200728178024292, + "learning_rate": 0.004523788957370809, + "loss": 8.0384, + "step": 335900 + }, + { + "epoch": 1.368790158561711, + "grad_norm": 5.349991798400879, + "learning_rate": 0.004523497400965693, + "loss": 8.0657, + "step": 336000 + }, + { + "epoch": 1.368790158561711, + "eval_MaskedAccuracy": 0.48971874947821503, + "eval_loss": 1.6930047273635864, + "eval_runtime": 210.3323, + "eval_samples_per_second": 301.789, + "eval_steps_per_second": 1.179, + "step": 336000 + }, + { + "epoch": 1.3691975365850926, + "grad_norm": 4.166293621063232, + "learning_rate": 0.004523205764758574, + "loss": 8.0336, + "step": 336100 + }, + { + "epoch": 1.3696049146084741, + "grad_norm": 5.182121276855469, + "learning_rate": 0.004522914048760986, + "loss": 8.0082, + "step": 336200 + }, + { + "epoch": 1.3700122926318556, + "grad_norm": 2.6781444549560547, + "learning_rate": 0.004522622252984457, + "loss": 7.967, + "step": 336300 + }, + { + "epoch": 1.370419670655237, + "grad_norm": 1.211844801902771, + "learning_rate": 0.004522330377440533, + "loss": 8.0258, + "step": 336400 + }, + { + "epoch": 1.3708270486786185, + "grad_norm": 0.9275456070899963, + "learning_rate": 0.004522038422140751, + "loss": 8.0486, + "step": 336500 + }, + { + "epoch": 1.3712344267019998, + "grad_norm": 4.825027942657471, + "learning_rate": 0.004521746387096655, + "loss": 8.0614, + "step": 336600 + }, + { + "epoch": 1.3716418047253813, + "grad_norm": 4.791982173919678, + "learning_rate": 0.00452145427231979, + "loss": 8.0858, + "step": 336700 + }, + { + "epoch": 1.3720491827487629, + "grad_norm": 4.076387882232666, + "learning_rate": 0.0045211620778217, + "loss": 8.0703, + "step": 336800 + }, + { + "epoch": 1.3724565607721444, + "grad_norm": 5.908453941345215, + "learning_rate": 0.004520869803613938, + "loss": 8.0356, + "step": 336900 + }, + { + "epoch": 1.3728639387955257, + "grad_norm": 3.02319598197937, + "learning_rate": 0.0045205774497080594, + "loss": 8.0423, + "step": 337000 + }, + { + "epoch": 1.3728639387955257, + "eval_MaskedAccuracy": 0.4881793270387368, + "eval_loss": 1.7019963264465332, + "eval_runtime": 203.1019, + "eval_samples_per_second": 312.533, + "eval_steps_per_second": 1.221, + "step": 337000 + }, + { + "epoch": 1.3732713168189072, + "grad_norm": 2.5091326236724854, + "learning_rate": 0.00452028501611563, + "loss": 8.0723, + "step": 337100 + }, + { + "epoch": 1.3736786948422888, + "grad_norm": 1.8626298904418945, + "learning_rate": 0.004519992502848197, + "loss": 8.095, + "step": 337200 + }, + { + "epoch": 1.37408607286567, + "grad_norm": 2.8279223442077637, + "learning_rate": 0.004519699909917331, + "loss": 8.0275, + "step": 337300 + }, + { + "epoch": 1.3744934508890516, + "grad_norm": 1.8525398969650269, + "learning_rate": 0.00451940723733461, + "loss": 8.003, + "step": 337400 + }, + { + "epoch": 1.3749008289124331, + "grad_norm": 2.949223041534424, + "learning_rate": 0.0045191144851115935, + "loss": 8.0568, + "step": 337500 + }, + { + "epoch": 1.3753082069358145, + "grad_norm": 4.134367942810059, + "learning_rate": 0.004518821653259865, + "loss": 8.0503, + "step": 337600 + }, + { + "epoch": 1.375715584959196, + "grad_norm": 2.8016107082366943, + "learning_rate": 0.0045185287417909906, + "loss": 8.0944, + "step": 337700 + }, + { + "epoch": 1.3761229629825775, + "grad_norm": 2.1527552604675293, + "learning_rate": 0.004518235750716557, + "loss": 8.0467, + "step": 337800 + }, + { + "epoch": 1.3765303410059588, + "grad_norm": 4.885125637054443, + "learning_rate": 0.004517942680048152, + "loss": 8.0484, + "step": 337900 + }, + { + "epoch": 1.3769377190293404, + "grad_norm": 4.211520195007324, + "learning_rate": 0.004517649529797351, + "loss": 8.0575, + "step": 338000 + }, + { + "epoch": 1.3769377190293404, + "eval_MaskedAccuracy": 0.4918551946198786, + "eval_loss": 1.6849377155303955, + "eval_runtime": 247.7347, + "eval_samples_per_second": 256.226, + "eval_steps_per_second": 1.001, + "step": 338000 + }, + { + "epoch": 1.377345097052722, + "grad_norm": 3.0525062084198, + "learning_rate": 0.004517356299975756, + "loss": 8.0429, + "step": 338100 + }, + { + "epoch": 1.3777524750761034, + "grad_norm": 3.92828369140625, + "learning_rate": 0.00451706299059495, + "loss": 7.9908, + "step": 338200 + }, + { + "epoch": 1.3781598530994847, + "grad_norm": 3.10221529006958, + "learning_rate": 0.004516769601666532, + "loss": 8.037, + "step": 338300 + }, + { + "epoch": 1.3785672311228663, + "grad_norm": 2.6689865589141846, + "learning_rate": 0.004516476133202104, + "loss": 8.0976, + "step": 338400 + }, + { + "epoch": 1.3789746091462476, + "grad_norm": 1.3879767656326294, + "learning_rate": 0.004516182585213266, + "loss": 8.0572, + "step": 338500 + }, + { + "epoch": 1.3793819871696291, + "grad_norm": 1.174448013305664, + "learning_rate": 0.004515888957711625, + "loss": 8.0771, + "step": 338600 + }, + { + "epoch": 1.3797893651930107, + "grad_norm": 5.28508186340332, + "learning_rate": 0.004515595250708782, + "loss": 8.0546, + "step": 338700 + }, + { + "epoch": 1.3801967432163922, + "grad_norm": 4.878987789154053, + "learning_rate": 0.004515301464216359, + "loss": 8.0572, + "step": 338800 + }, + { + "epoch": 1.3806041212397735, + "grad_norm": 6.071109771728516, + "learning_rate": 0.004515007598245968, + "loss": 8.0044, + "step": 338900 + }, + { + "epoch": 1.381011499263155, + "grad_norm": 2.3168129920959473, + "learning_rate": 0.004514713652809226, + "loss": 8.0468, + "step": 339000 + }, + { + "epoch": 1.381011499263155, + "eval_MaskedAccuracy": 0.4877877929554792, + "eval_loss": 1.7033212184906006, + "eval_runtime": 185.87, + "eval_samples_per_second": 341.507, + "eval_steps_per_second": 1.334, + "step": 339000 + }, + { + "epoch": 1.3814188772865363, + "grad_norm": 1.4501903057098389, + "learning_rate": 0.004514419627917752, + "loss": 8.0691, + "step": 339100 + }, + { + "epoch": 1.3818262553099179, + "grad_norm": 12.737586975097656, + "learning_rate": 0.004514125523583172, + "loss": 8.0547, + "step": 339200 + }, + { + "epoch": 1.3822336333332994, + "grad_norm": 2.92000412940979, + "learning_rate": 0.004513831339817122, + "loss": 8.051, + "step": 339300 + }, + { + "epoch": 1.382641011356681, + "grad_norm": 5.565245628356934, + "learning_rate": 0.004513537076631216, + "loss": 8.0789, + "step": 339400 + }, + { + "epoch": 1.3830483893800622, + "grad_norm": 5.634011268615723, + "learning_rate": 0.0045132427340370965, + "loss": 8.068, + "step": 339500 + }, + { + "epoch": 1.3834557674034438, + "grad_norm": 1.9182945489883423, + "learning_rate": 0.004512948312046409, + "loss": 8.0151, + "step": 339600 + }, + { + "epoch": 1.3838631454268253, + "grad_norm": 4.284234523773193, + "learning_rate": 0.004512653810670781, + "loss": 7.9576, + "step": 339700 + }, + { + "epoch": 1.3842705234502066, + "grad_norm": 3.367342710494995, + "learning_rate": 0.004512359229921861, + "loss": 8.0757, + "step": 339800 + }, + { + "epoch": 1.3846779014735882, + "grad_norm": 4.452456474304199, + "learning_rate": 0.004512064569811299, + "loss": 8.0776, + "step": 339900 + }, + { + "epoch": 1.3850852794969697, + "grad_norm": 4.0282883644104, + "learning_rate": 0.004511769830350738, + "loss": 8.0499, + "step": 340000 + }, + { + "epoch": 1.3850852794969697, + "eval_MaskedAccuracy": 0.49261641937426026, + "eval_loss": 1.6836434602737427, + "eval_runtime": 183.9162, + "eval_samples_per_second": 345.135, + "eval_steps_per_second": 1.348, + "step": 340000 + }, + { + "epoch": 1.385492657520351, + "grad_norm": 1.8705769777297974, + "learning_rate": 0.0045114750115518324, + "loss": 7.9889, + "step": 340100 + }, + { + "epoch": 1.3859000355437325, + "grad_norm": 3.2162158489227295, + "learning_rate": 0.004511180113426241, + "loss": 8.0753, + "step": 340200 + }, + { + "epoch": 1.386307413567114, + "grad_norm": 3.431786298751831, + "learning_rate": 0.004510885135985616, + "loss": 8.0001, + "step": 340300 + }, + { + "epoch": 1.3867147915904954, + "grad_norm": 2.138399362564087, + "learning_rate": 0.004510590079241629, + "loss": 8.0156, + "step": 340400 + }, + { + "epoch": 1.387122169613877, + "grad_norm": 3.040599822998047, + "learning_rate": 0.004510294943205934, + "loss": 8.0638, + "step": 340500 + }, + { + "epoch": 1.3875295476372584, + "grad_norm": 12.077610969543457, + "learning_rate": 0.004509999727890208, + "loss": 8.0502, + "step": 340600 + }, + { + "epoch": 1.38793692566064, + "grad_norm": 5.232182025909424, + "learning_rate": 0.004509704433306117, + "loss": 8.0552, + "step": 340700 + }, + { + "epoch": 1.3883443036840213, + "grad_norm": 6.518558502197266, + "learning_rate": 0.004509409059465335, + "loss": 8.0078, + "step": 340800 + }, + { + "epoch": 1.3887516817074028, + "grad_norm": 1.897269368171692, + "learning_rate": 0.004509113606379544, + "loss": 8.0856, + "step": 340900 + }, + { + "epoch": 1.3891590597307841, + "grad_norm": 3.018871784210205, + "learning_rate": 0.004508818074060427, + "loss": 8.021, + "step": 341000 + }, + { + "epoch": 1.3891590597307841, + "eval_MaskedAccuracy": 0.49240115959551306, + "eval_loss": 1.686929702758789, + "eval_runtime": 171.6369, + "eval_samples_per_second": 369.827, + "eval_steps_per_second": 1.445, + "step": 341000 + }, + { + "epoch": 1.3895664377541657, + "grad_norm": 4.302035331726074, + "learning_rate": 0.004508522462519665, + "loss": 7.9861, + "step": 341100 + }, + { + "epoch": 1.3899738157775472, + "grad_norm": 1.3537317514419556, + "learning_rate": 0.004508226771768944, + "loss": 8.0481, + "step": 341200 + }, + { + "epoch": 1.3903811938009287, + "grad_norm": 2.8749494552612305, + "learning_rate": 0.004507931001819948, + "loss": 8.0595, + "step": 341300 + }, + { + "epoch": 1.39078857182431, + "grad_norm": 5.485761642456055, + "learning_rate": 0.004507635152684382, + "loss": 8.0556, + "step": 341400 + }, + { + "epoch": 1.3911959498476916, + "grad_norm": 4.461055278778076, + "learning_rate": 0.004507339224373937, + "loss": 8.0743, + "step": 341500 + }, + { + "epoch": 1.3916033278710729, + "grad_norm": 5.363972187042236, + "learning_rate": 0.004507043216900309, + "loss": 8.0532, + "step": 341600 + }, + { + "epoch": 1.3920107058944544, + "grad_norm": 5.1722588539123535, + "learning_rate": 0.004506747130275201, + "loss": 7.9847, + "step": 341700 + }, + { + "epoch": 1.392418083917836, + "grad_norm": 3.0832786560058594, + "learning_rate": 0.004506450964510331, + "loss": 8.0637, + "step": 341800 + }, + { + "epoch": 1.3928254619412175, + "grad_norm": 1.4068841934204102, + "learning_rate": 0.004506154719617401, + "loss": 8.0129, + "step": 341900 + }, + { + "epoch": 1.3932328399645988, + "grad_norm": 4.5246052742004395, + "learning_rate": 0.004505858395608122, + "loss": 8.0463, + "step": 342000 + }, + { + "epoch": 1.3932328399645988, + "eval_MaskedAccuracy": 0.488171587673146, + "eval_loss": 1.7060898542404175, + "eval_runtime": 173.3484, + "eval_samples_per_second": 366.176, + "eval_steps_per_second": 1.431, + "step": 342000 + }, + { + "epoch": 1.3936402179879803, + "grad_norm": 2.04990291595459, + "learning_rate": 0.004505561992494208, + "loss": 8.0104, + "step": 342100 + }, + { + "epoch": 1.3940475960113619, + "grad_norm": 7.8837199211120605, + "learning_rate": 0.0045052655102873745, + "loss": 8.0434, + "step": 342200 + }, + { + "epoch": 1.3944549740347432, + "grad_norm": 3.021195888519287, + "learning_rate": 0.004504968948999357, + "loss": 8.0437, + "step": 342300 + }, + { + "epoch": 1.3948623520581247, + "grad_norm": 2.564603090286255, + "learning_rate": 0.004504672308641867, + "loss": 8.0236, + "step": 342400 + }, + { + "epoch": 1.3952697300815062, + "grad_norm": 0.9281584024429321, + "learning_rate": 0.0045043755892266325, + "loss": 8.058, + "step": 342500 + }, + { + "epoch": 1.3956771081048875, + "grad_norm": 2.436694383621216, + "learning_rate": 0.00450407879076539, + "loss": 8.0409, + "step": 342600 + }, + { + "epoch": 1.396084486128269, + "grad_norm": 2.731529712677002, + "learning_rate": 0.004503781913269866, + "loss": 8.0601, + "step": 342700 + }, + { + "epoch": 1.3964918641516506, + "grad_norm": 2.8823978900909424, + "learning_rate": 0.004503484956751801, + "loss": 8.0132, + "step": 342800 + }, + { + "epoch": 1.396899242175032, + "grad_norm": 8.806553840637207, + "learning_rate": 0.004503187921222941, + "loss": 7.992, + "step": 342900 + }, + { + "epoch": 1.3973066201984135, + "grad_norm": 0.9807514548301697, + "learning_rate": 0.0045028908066950275, + "loss": 8.0146, + "step": 343000 + }, + { + "epoch": 1.3973066201984135, + "eval_MaskedAccuracy": 0.4901069446463299, + "eval_loss": 1.6997807025909424, + "eval_runtime": 264.8787, + "eval_samples_per_second": 239.642, + "eval_steps_per_second": 0.936, + "step": 343000 + }, + { + "epoch": 1.397713998221795, + "grad_norm": 4.19398832321167, + "learning_rate": 0.004502593613179801, + "loss": 8.0688, + "step": 343100 + }, + { + "epoch": 1.3981213762451765, + "grad_norm": 3.5427639484405518, + "learning_rate": 0.004502296340689015, + "loss": 8.0185, + "step": 343200 + }, + { + "epoch": 1.3985287542685578, + "grad_norm": 4.444757461547852, + "learning_rate": 0.004501998989234425, + "loss": 8.0288, + "step": 343300 + }, + { + "epoch": 1.3989361322919394, + "grad_norm": 3.28558349609375, + "learning_rate": 0.00450170155882778, + "loss": 8.0218, + "step": 343400 + }, + { + "epoch": 1.3993435103153207, + "grad_norm": 3.0191750526428223, + "learning_rate": 0.004501404049480848, + "loss": 7.9866, + "step": 343500 + }, + { + "epoch": 1.3997508883387022, + "grad_norm": 2.12580943107605, + "learning_rate": 0.004501106461205386, + "loss": 8.0421, + "step": 343600 + }, + { + "epoch": 1.4001582663620837, + "grad_norm": 2.5206921100616455, + "learning_rate": 0.004500808794013157, + "loss": 8.0331, + "step": 343700 + }, + { + "epoch": 1.4005656443854653, + "grad_norm": 4.923537731170654, + "learning_rate": 0.0045005110479159356, + "loss": 8.0711, + "step": 343800 + }, + { + "epoch": 1.4009730224088466, + "grad_norm": 4.230342388153076, + "learning_rate": 0.004500213222925488, + "loss": 8.0021, + "step": 343900 + }, + { + "epoch": 1.4013804004322281, + "grad_norm": 2.248675584793091, + "learning_rate": 0.004499915319053593, + "loss": 8.0087, + "step": 344000 + }, + { + "epoch": 1.4013804004322281, + "eval_MaskedAccuracy": 0.49225142805141237, + "eval_loss": 1.6809008121490479, + "eval_runtime": 172.1493, + "eval_samples_per_second": 368.726, + "eval_steps_per_second": 1.441, + "step": 344000 + }, + { + "epoch": 1.4017877784556094, + "grad_norm": 6.397619247436523, + "learning_rate": 0.004499617336312023, + "loss": 8.0279, + "step": 344100 + }, + { + "epoch": 1.402195156478991, + "grad_norm": 6.686524868011475, + "learning_rate": 0.004499319274712562, + "loss": 8.0732, + "step": 344200 + }, + { + "epoch": 1.4026025345023725, + "grad_norm": 2.8127522468566895, + "learning_rate": 0.004499021134266992, + "loss": 8.0659, + "step": 344300 + }, + { + "epoch": 1.403009912525754, + "grad_norm": 1.8352597951889038, + "learning_rate": 0.004498722914987101, + "loss": 8.0295, + "step": 344400 + }, + { + "epoch": 1.4034172905491353, + "grad_norm": 3.4253170490264893, + "learning_rate": 0.004498424616884684, + "loss": 8.0607, + "step": 344500 + }, + { + "epoch": 1.4038246685725169, + "grad_norm": 2.7556371688842773, + "learning_rate": 0.00449812623997153, + "loss": 8.0144, + "step": 344600 + }, + { + "epoch": 1.4042320465958982, + "grad_norm": 3.0799689292907715, + "learning_rate": 0.004497827784259437, + "loss": 8.0261, + "step": 344700 + }, + { + "epoch": 1.4046394246192797, + "grad_norm": 3.8346197605133057, + "learning_rate": 0.004497529249760206, + "loss": 7.9727, + "step": 344800 + }, + { + "epoch": 1.4050468026426612, + "grad_norm": 5.817394733428955, + "learning_rate": 0.00449723063648563, + "loss": 8.0071, + "step": 344900 + }, + { + "epoch": 1.4054541806660428, + "grad_norm": 3.7115495204925537, + "learning_rate": 0.004496931944447526, + "loss": 7.9977, + "step": 345000 + }, + { + "epoch": 1.4054541806660428, + "eval_MaskedAccuracy": 0.4927991312882558, + "eval_loss": 1.6924138069152832, + "eval_runtime": 164.8575, + "eval_samples_per_second": 385.036, + "eval_steps_per_second": 1.504, + "step": 345000 + }, + { + "epoch": 1.405861558689424, + "grad_norm": 2.293860912322998, + "learning_rate": 0.004496633173657698, + "loss": 7.9953, + "step": 345100 + }, + { + "epoch": 1.4062689367128056, + "grad_norm": 2.226228713989258, + "learning_rate": 0.004496334324127954, + "loss": 8.0028, + "step": 345200 + }, + { + "epoch": 1.4066763147361871, + "grad_norm": 1.7654036283493042, + "learning_rate": 0.004496035395870117, + "loss": 8.0259, + "step": 345300 + }, + { + "epoch": 1.4070836927595685, + "grad_norm": 3.241719961166382, + "learning_rate": 0.004495736388896001, + "loss": 8.0116, + "step": 345400 + }, + { + "epoch": 1.40749107078295, + "grad_norm": 6.492445468902588, + "learning_rate": 0.00449543730321742, + "loss": 8.0326, + "step": 345500 + }, + { + "epoch": 1.4078984488063315, + "grad_norm": 2.9684269428253174, + "learning_rate": 0.004495138138846212, + "loss": 8.0502, + "step": 345600 + }, + { + "epoch": 1.408305826829713, + "grad_norm": 1.2709413766860962, + "learning_rate": 0.0044948388957942054, + "loss": 8.0654, + "step": 345700 + }, + { + "epoch": 1.4087132048530944, + "grad_norm": 5.379870891571045, + "learning_rate": 0.004494539574073218, + "loss": 8.0678, + "step": 345800 + }, + { + "epoch": 1.409120582876476, + "grad_norm": 3.2984161376953125, + "learning_rate": 0.0044942401736951, + "loss": 8.0417, + "step": 345900 + }, + { + "epoch": 1.4095279608998572, + "grad_norm": 1.9095920324325562, + "learning_rate": 0.0044939406946716744, + "loss": 8.0622, + "step": 346000 + }, + { + "epoch": 1.4095279608998572, + "eval_MaskedAccuracy": 0.48976976075929307, + "eval_loss": 1.6921502351760864, + "eval_runtime": 224.8006, + "eval_samples_per_second": 282.366, + "eval_steps_per_second": 1.103, + "step": 346000 + }, + { + "epoch": 1.4099353389232387, + "grad_norm": 2.1624269485473633, + "learning_rate": 0.004493641137014782, + "loss": 8.0521, + "step": 346100 + }, + { + "epoch": 1.4103427169466203, + "grad_norm": 2.690730571746826, + "learning_rate": 0.004493341500736279, + "loss": 8.0555, + "step": 346200 + }, + { + "epoch": 1.4107500949700018, + "grad_norm": 6.256383895874023, + "learning_rate": 0.004493041785847994, + "loss": 8.0565, + "step": 346300 + }, + { + "epoch": 1.4111574729933831, + "grad_norm": 2.366687536239624, + "learning_rate": 0.004492741992361785, + "loss": 8.0496, + "step": 346400 + }, + { + "epoch": 1.4115648510167647, + "grad_norm": 7.474885940551758, + "learning_rate": 0.004492442120289507, + "loss": 8.0452, + "step": 346500 + }, + { + "epoch": 1.411972229040146, + "grad_norm": 1.8844860792160034, + "learning_rate": 0.004492142169643017, + "loss": 8.056, + "step": 346600 + }, + { + "epoch": 1.4123796070635275, + "grad_norm": 3.0277657508850098, + "learning_rate": 0.004491842140434164, + "loss": 8.0266, + "step": 346700 + }, + { + "epoch": 1.412786985086909, + "grad_norm": 4.008308410644531, + "learning_rate": 0.004491542032674812, + "loss": 8.0023, + "step": 346800 + }, + { + "epoch": 1.4131943631102906, + "grad_norm": 1.7533228397369385, + "learning_rate": 0.004491241846376829, + "loss": 8.0353, + "step": 346900 + }, + { + "epoch": 1.4136017411336719, + "grad_norm": 3.0283710956573486, + "learning_rate": 0.004490941581552086, + "loss": 8.0078, + "step": 347000 + }, + { + "epoch": 1.4136017411336719, + "eval_MaskedAccuracy": 0.48967784953301846, + "eval_loss": 1.7042174339294434, + "eval_runtime": 185.5618, + "eval_samples_per_second": 342.075, + "eval_steps_per_second": 1.336, + "step": 347000 + }, + { + "epoch": 1.4140091191570534, + "grad_norm": 3.1316170692443848, + "learning_rate": 0.004490641238212449, + "loss": 8.0283, + "step": 347100 + }, + { + "epoch": 1.4144164971804347, + "grad_norm": 2.7969183921813965, + "learning_rate": 0.004490340816369788, + "loss": 8.0212, + "step": 347200 + }, + { + "epoch": 1.4148238752038163, + "grad_norm": 3.6290698051452637, + "learning_rate": 0.004490040316035995, + "loss": 8.0161, + "step": 347300 + }, + { + "epoch": 1.4152312532271978, + "grad_norm": 1.1475740671157837, + "learning_rate": 0.004489739737222939, + "loss": 8.0271, + "step": 347400 + }, + { + "epoch": 1.4156386312505793, + "grad_norm": 5.602312088012695, + "learning_rate": 0.004489439079942507, + "loss": 8.0837, + "step": 347500 + }, + { + "epoch": 1.4160460092739606, + "grad_norm": 0.7871115803718567, + "learning_rate": 0.00448913834420659, + "loss": 8.0143, + "step": 347600 + }, + { + "epoch": 1.4164533872973422, + "grad_norm": 5.471859455108643, + "learning_rate": 0.004488837530027066, + "loss": 8.066, + "step": 347700 + }, + { + "epoch": 1.4168607653207237, + "grad_norm": 1.3712109327316284, + "learning_rate": 0.004488536637415832, + "loss": 8.0948, + "step": 347800 + }, + { + "epoch": 1.417268143344105, + "grad_norm": 1.636563777923584, + "learning_rate": 0.004488235666384787, + "loss": 8.0749, + "step": 347900 + }, + { + "epoch": 1.4176755213674865, + "grad_norm": 4.104404449462891, + "learning_rate": 0.004487934616945828, + "loss": 8.049, + "step": 348000 + }, + { + "epoch": 1.4176755213674865, + "eval_MaskedAccuracy": 0.49094555395382244, + "eval_loss": 1.6923339366912842, + "eval_runtime": 168.3756, + "eval_samples_per_second": 376.99, + "eval_steps_per_second": 1.473, + "step": 348000 + }, + { + "epoch": 1.418082899390868, + "grad_norm": 1.3405181169509888, + "learning_rate": 0.00448763348911086, + "loss": 8.0409, + "step": 348100 + }, + { + "epoch": 1.4184902774142496, + "grad_norm": 0.7778652310371399, + "learning_rate": 0.004487332282891787, + "loss": 8.0562, + "step": 348200 + }, + { + "epoch": 1.418897655437631, + "grad_norm": 1.7618839740753174, + "learning_rate": 0.004487030998300513, + "loss": 8.0204, + "step": 348300 + }, + { + "epoch": 1.4193050334610124, + "grad_norm": 4.535425186157227, + "learning_rate": 0.004486729635348955, + "loss": 8.0057, + "step": 348400 + }, + { + "epoch": 1.4197124114843938, + "grad_norm": 5.359290599822998, + "learning_rate": 0.004486428194049023, + "loss": 8.0211, + "step": 348500 + }, + { + "epoch": 1.4201197895077753, + "grad_norm": 2.8018269538879395, + "learning_rate": 0.0044861266744126395, + "loss": 8.044, + "step": 348600 + }, + { + "epoch": 1.4205271675311568, + "grad_norm": 6.559733867645264, + "learning_rate": 0.004485825076451726, + "loss": 8.0027, + "step": 348700 + }, + { + "epoch": 1.4209345455545384, + "grad_norm": 2.3456294536590576, + "learning_rate": 0.004485523400178198, + "loss": 8.0445, + "step": 348800 + }, + { + "epoch": 1.4213419235779197, + "grad_norm": 2.961416482925415, + "learning_rate": 0.0044852216456039914, + "loss": 8.0284, + "step": 348900 + }, + { + "epoch": 1.4217493016013012, + "grad_norm": 4.189509391784668, + "learning_rate": 0.0044849198127410285, + "loss": 7.9987, + "step": 349000 + }, + { + "epoch": 1.4217493016013012, + "eval_MaskedAccuracy": 0.4919331122905063, + "eval_loss": 1.686869740486145, + "eval_runtime": 245.5479, + "eval_samples_per_second": 258.508, + "eval_steps_per_second": 1.01, + "step": 349000 + }, + { + "epoch": 1.4221566796246825, + "grad_norm": 4.080605506896973, + "learning_rate": 0.004484617901601247, + "loss": 8.0049, + "step": 349100 + }, + { + "epoch": 1.422564057648064, + "grad_norm": 1.869256615638733, + "learning_rate": 0.004484315912196584, + "loss": 8.0349, + "step": 349200 + }, + { + "epoch": 1.4229714356714456, + "grad_norm": 1.7196160554885864, + "learning_rate": 0.004484013844538975, + "loss": 8.0591, + "step": 349300 + }, + { + "epoch": 1.423378813694827, + "grad_norm": 6.398401260375977, + "learning_rate": 0.004483711698640363, + "loss": 8.0571, + "step": 349400 + }, + { + "epoch": 1.4237861917182084, + "grad_norm": 6.66635799407959, + "learning_rate": 0.0044834094745126955, + "loss": 8.0254, + "step": 349500 + }, + { + "epoch": 1.42419356974159, + "grad_norm": 1.418727993965149, + "learning_rate": 0.0044831071721679215, + "loss": 8.0496, + "step": 349600 + }, + { + "epoch": 1.4246009477649713, + "grad_norm": 2.5015978813171387, + "learning_rate": 0.0044828047916179985, + "loss": 8.0475, + "step": 349700 + }, + { + "epoch": 1.4250083257883528, + "grad_norm": 1.1977105140686035, + "learning_rate": 0.004482502332874867, + "loss": 8.0083, + "step": 349800 + }, + { + "epoch": 1.4254157038117343, + "grad_norm": 4.282104015350342, + "learning_rate": 0.004482199795950487, + "loss": 7.9986, + "step": 349900 + }, + { + "epoch": 1.4258230818351159, + "grad_norm": 1.0990618467330933, + "learning_rate": 0.004481897180856828, + "loss": 8.0481, + "step": 350000 + }, + { + "epoch": 1.4258230818351159, + "eval_MaskedAccuracy": 0.4905801972207007, + "eval_loss": 1.6971557140350342, + "eval_runtime": 232.8802, + "eval_samples_per_second": 272.569, + "eval_steps_per_second": 1.065, + "step": 350000 + }, + { + "epoch": 1.4262304598584972, + "grad_norm": 2.1344125270843506, + "learning_rate": 0.004481594487605856, + "loss": 8.0476, + "step": 350100 + }, + { + "epoch": 1.4266378378818787, + "grad_norm": 2.339507579803467, + "learning_rate": 0.004481291716209531, + "loss": 8.0124, + "step": 350200 + }, + { + "epoch": 1.4270452159052602, + "grad_norm": 1.9598414897918701, + "learning_rate": 0.004480988866679821, + "loss": 7.977, + "step": 350300 + }, + { + "epoch": 1.4274525939286415, + "grad_norm": 2.153442859649658, + "learning_rate": 0.004480685939028701, + "loss": 8.0344, + "step": 350400 + }, + { + "epoch": 1.427859971952023, + "grad_norm": 3.163754463195801, + "learning_rate": 0.004480382933268155, + "loss": 8.0101, + "step": 350500 + }, + { + "epoch": 1.4282673499754046, + "grad_norm": 0.8315991163253784, + "learning_rate": 0.004480079849410152, + "loss": 8.0767, + "step": 350600 + }, + { + "epoch": 1.4286747279987861, + "grad_norm": 0.9263890385627747, + "learning_rate": 0.00447977668746668, + "loss": 8.0548, + "step": 350700 + }, + { + "epoch": 1.4290821060221675, + "grad_norm": 2.093311071395874, + "learning_rate": 0.0044794734474497225, + "loss": 8.0203, + "step": 350800 + }, + { + "epoch": 1.429489484045549, + "grad_norm": 5.483651638031006, + "learning_rate": 0.004479170129371276, + "loss": 8.048, + "step": 350900 + }, + { + "epoch": 1.4298968620689303, + "grad_norm": 1.5579036474227905, + "learning_rate": 0.004478866733243319, + "loss": 8.0299, + "step": 351000 + }, + { + "epoch": 1.4298968620689303, + "eval_MaskedAccuracy": 0.49019804182762994, + "eval_loss": 1.6904391050338745, + "eval_runtime": 483.3898, + "eval_samples_per_second": 131.314, + "eval_steps_per_second": 0.513, + "step": 351000 + }, + { + "epoch": 1.4303042400923118, + "grad_norm": 2.5887644290924072, + "learning_rate": 0.0044785632590778526, + "loss": 8.0062, + "step": 351100 + }, + { + "epoch": 1.4307116181156934, + "grad_norm": 4.814824104309082, + "learning_rate": 0.0044782597068868755, + "loss": 8.0206, + "step": 351200 + }, + { + "epoch": 1.431118996139075, + "grad_norm": 0.8506344556808472, + "learning_rate": 0.004477956076682388, + "loss": 8.0234, + "step": 351300 + }, + { + "epoch": 1.4315263741624562, + "grad_norm": 2.3943898677825928, + "learning_rate": 0.004477652368476392, + "loss": 8.0364, + "step": 351400 + }, + { + "epoch": 1.4319337521858377, + "grad_norm": 1.785700798034668, + "learning_rate": 0.004477348582280897, + "loss": 8.0473, + "step": 351500 + }, + { + "epoch": 1.432341130209219, + "grad_norm": 3.400876045227051, + "learning_rate": 0.00447704471810792, + "loss": 8.1064, + "step": 351600 + }, + { + "epoch": 1.4327485082326006, + "grad_norm": 2.6870365142822266, + "learning_rate": 0.004476740775969462, + "loss": 8.0594, + "step": 351700 + }, + { + "epoch": 1.4331558862559821, + "grad_norm": 1.729146122932434, + "learning_rate": 0.004476436755877546, + "loss": 8.0338, + "step": 351800 + }, + { + "epoch": 1.4335632642793636, + "grad_norm": 1.1923850774765015, + "learning_rate": 0.00447613265784419, + "loss": 8.0226, + "step": 351900 + }, + { + "epoch": 1.433970642302745, + "grad_norm": 2.12165904045105, + "learning_rate": 0.004475828481881418, + "loss": 8.02, + "step": 352000 + }, + { + "epoch": 1.433970642302745, + "eval_MaskedAccuracy": 0.4926593809327485, + "eval_loss": 1.6839932203292847, + "eval_runtime": 168.0703, + "eval_samples_per_second": 377.675, + "eval_steps_per_second": 1.476, + "step": 352000 + }, + { + "epoch": 1.4343780203261265, + "grad_norm": 3.7795934677124023, + "learning_rate": 0.004475524228001263, + "loss": 7.9862, + "step": 352100 + }, + { + "epoch": 1.4347853983495078, + "grad_norm": 4.6571574211120605, + "learning_rate": 0.004475219896215741, + "loss": 7.9855, + "step": 352200 + }, + { + "epoch": 1.4351927763728893, + "grad_norm": 4.111793518066406, + "learning_rate": 0.004474915486536891, + "loss": 8.0121, + "step": 352300 + }, + { + "epoch": 1.4356001543962709, + "grad_norm": 1.6061862707138062, + "learning_rate": 0.004474610998976745, + "loss": 7.983, + "step": 352400 + }, + { + "epoch": 1.4360075324196524, + "grad_norm": 1.8522026538848877, + "learning_rate": 0.004474306433547344, + "loss": 8.01, + "step": 352500 + }, + { + "epoch": 1.4364149104430337, + "grad_norm": 2.6164515018463135, + "learning_rate": 0.00447400179026072, + "loss": 8.0455, + "step": 352600 + }, + { + "epoch": 1.4368222884664152, + "grad_norm": 3.0808162689208984, + "learning_rate": 0.00447369706912893, + "loss": 8.0419, + "step": 352700 + }, + { + "epoch": 1.4372296664897968, + "grad_norm": 5.4039812088012695, + "learning_rate": 0.00447339227016402, + "loss": 8.0127, + "step": 352800 + }, + { + "epoch": 1.437637044513178, + "grad_norm": 1.473721981048584, + "learning_rate": 0.004473087393378034, + "loss": 8.0568, + "step": 352900 + }, + { + "epoch": 1.4380444225365596, + "grad_norm": 2.6627840995788574, + "learning_rate": 0.0044727824387830175, + "loss": 8.037, + "step": 353000 + }, + { + "epoch": 1.4380444225365596, + "eval_MaskedAccuracy": 0.49035301162574, + "eval_loss": 1.6992812156677246, + "eval_runtime": 171.5562, + "eval_samples_per_second": 370.001, + "eval_steps_per_second": 1.446, + "step": 353000 + }, + { + "epoch": 1.4384518005599412, + "grad_norm": 1.1248985528945923, + "learning_rate": 0.004472477406391048, + "loss": 8.0544, + "step": 353100 + }, + { + "epoch": 1.4388591785833227, + "grad_norm": 3.9796719551086426, + "learning_rate": 0.004472172296214165, + "loss": 8.027, + "step": 353200 + }, + { + "epoch": 1.439266556606704, + "grad_norm": 2.6582648754119873, + "learning_rate": 0.004471867108264448, + "loss": 8.0448, + "step": 353300 + }, + { + "epoch": 1.4396739346300855, + "grad_norm": 1.5737559795379639, + "learning_rate": 0.004471561842553949, + "loss": 7.9911, + "step": 353400 + }, + { + "epoch": 1.4400813126534668, + "grad_norm": 1.3840856552124023, + "learning_rate": 0.004471256499094746, + "loss": 8.0141, + "step": 353500 + }, + { + "epoch": 1.4404886906768484, + "grad_norm": 1.7901232242584229, + "learning_rate": 0.004470951077898908, + "loss": 8.0552, + "step": 353600 + }, + { + "epoch": 1.44089606870023, + "grad_norm": 2.631977081298828, + "learning_rate": 0.004470645578978506, + "loss": 8.0119, + "step": 353700 + }, + { + "epoch": 1.4413034467236114, + "grad_norm": 3.0884792804718018, + "learning_rate": 0.004470340002345623, + "loss": 8.0113, + "step": 353800 + }, + { + "epoch": 1.4417108247469927, + "grad_norm": 2.1293485164642334, + "learning_rate": 0.0044700343480123415, + "loss": 7.9717, + "step": 353900 + }, + { + "epoch": 1.4421182027703743, + "grad_norm": 2.5049309730529785, + "learning_rate": 0.0044697286159907385, + "loss": 8.0098, + "step": 354000 + }, + { + "epoch": 1.4421182027703743, + "eval_MaskedAccuracy": 0.49255391146316696, + "eval_loss": 1.674404501914978, + "eval_runtime": 162.3384, + "eval_samples_per_second": 391.01, + "eval_steps_per_second": 1.528, + "step": 354000 + }, + { + "epoch": 1.4425255807937556, + "grad_norm": 4.3438191413879395, + "learning_rate": 0.004469422806292906, + "loss": 8.0276, + "step": 354100 + }, + { + "epoch": 1.4429329588171371, + "grad_norm": 2.7198264598846436, + "learning_rate": 0.004469116918930937, + "loss": 8.0444, + "step": 354200 + }, + { + "epoch": 1.4433403368405187, + "grad_norm": 2.4105312824249268, + "learning_rate": 0.0044688109539169196, + "loss": 8.0014, + "step": 354300 + }, + { + "epoch": 1.4437477148639002, + "grad_norm": 3.454275131225586, + "learning_rate": 0.0044685049112629495, + "loss": 8.0121, + "step": 354400 + }, + { + "epoch": 1.4441550928872815, + "grad_norm": 5.2834153175354, + "learning_rate": 0.004468198790981126, + "loss": 8.0129, + "step": 354500 + }, + { + "epoch": 1.444562470910663, + "grad_norm": 4.581850528717041, + "learning_rate": 0.004467892593083559, + "loss": 8.0206, + "step": 354600 + }, + { + "epoch": 1.4449698489340443, + "grad_norm": 3.5770163536071777, + "learning_rate": 0.004467586317582348, + "loss": 8.0247, + "step": 354700 + }, + { + "epoch": 1.4453772269574259, + "grad_norm": 3.2041547298431396, + "learning_rate": 0.004467279964489606, + "loss": 8.0195, + "step": 354800 + }, + { + "epoch": 1.4457846049808074, + "grad_norm": 3.5487098693847656, + "learning_rate": 0.004466973533817442, + "loss": 8.0313, + "step": 354900 + }, + { + "epoch": 1.446191983004189, + "grad_norm": 3.3173043727874756, + "learning_rate": 0.004466667025577968, + "loss": 8.0444, + "step": 355000 + }, + { + "epoch": 1.446191983004189, + "eval_MaskedAccuracy": 0.4920171257509205, + "eval_loss": 1.688253402709961, + "eval_runtime": 207.1816, + "eval_samples_per_second": 306.379, + "eval_steps_per_second": 1.197, + "step": 355000 + }, + { + "epoch": 1.4465993610275703, + "grad_norm": 7.595371246337891, + "learning_rate": 0.004466360439783299, + "loss": 8.0146, + "step": 355100 + }, + { + "epoch": 1.4470067390509518, + "grad_norm": 2.420732259750366, + "learning_rate": 0.004466053776445564, + "loss": 8.0344, + "step": 355200 + }, + { + "epoch": 1.4474141170743333, + "grad_norm": 1.8066754341125488, + "learning_rate": 0.004465747035576886, + "loss": 7.9927, + "step": 355300 + }, + { + "epoch": 1.4478214950977146, + "grad_norm": 2.813026189804077, + "learning_rate": 0.004465440217189393, + "loss": 8.0021, + "step": 355400 + }, + { + "epoch": 1.4482288731210962, + "grad_norm": 1.8070228099822998, + "learning_rate": 0.0044651333212952105, + "loss": 7.9979, + "step": 355500 + }, + { + "epoch": 1.4486362511444777, + "grad_norm": 3.373173713684082, + "learning_rate": 0.004464826347906479, + "loss": 8.0092, + "step": 355600 + }, + { + "epoch": 1.4490436291678592, + "grad_norm": 1.6938971281051636, + "learning_rate": 0.004464519297035334, + "loss": 8.0332, + "step": 355700 + }, + { + "epoch": 1.4494510071912405, + "grad_norm": 2.879782199859619, + "learning_rate": 0.004464212168693899, + "loss": 8.0537, + "step": 355800 + }, + { + "epoch": 1.449858385214622, + "grad_norm": 3.2696335315704346, + "learning_rate": 0.004463904962894334, + "loss": 8.0623, + "step": 355900 + }, + { + "epoch": 1.4502657632380034, + "grad_norm": 3.788130521774292, + "learning_rate": 0.004463597679648781, + "loss": 8.0341, + "step": 356000 + }, + { + "epoch": 1.4502657632380034, + "eval_MaskedAccuracy": 0.4898133608425122, + "eval_loss": 1.6999173164367676, + "eval_runtime": 182.9109, + "eval_samples_per_second": 347.032, + "eval_steps_per_second": 1.356, + "step": 356000 + }, + { + "epoch": 1.450673141261385, + "grad_norm": 4.155025005340576, + "learning_rate": 0.004463290318969386, + "loss": 8.0388, + "step": 356100 + }, + { + "epoch": 1.4510805192847664, + "grad_norm": 5.0551676750183105, + "learning_rate": 0.004462982880868299, + "loss": 7.9944, + "step": 356200 + }, + { + "epoch": 1.451487897308148, + "grad_norm": 3.779100179672241, + "learning_rate": 0.004462675365357674, + "loss": 8.0413, + "step": 356300 + }, + { + "epoch": 1.4518952753315293, + "grad_norm": 3.9221200942993164, + "learning_rate": 0.00446236777244968, + "loss": 7.9957, + "step": 356400 + }, + { + "epoch": 1.4523026533549108, + "grad_norm": 4.456353187561035, + "learning_rate": 0.004462060102156466, + "loss": 7.9959, + "step": 356500 + }, + { + "epoch": 1.4527100313782921, + "grad_norm": 4.030895709991455, + "learning_rate": 0.004461752354490196, + "loss": 7.9711, + "step": 356600 + }, + { + "epoch": 1.4531174094016737, + "grad_norm": 4.752027988433838, + "learning_rate": 0.004461444529463043, + "loss": 7.9674, + "step": 356700 + }, + { + "epoch": 1.4535247874250552, + "grad_norm": 4.187869548797607, + "learning_rate": 0.004461136627087173, + "loss": 7.9886, + "step": 356800 + }, + { + "epoch": 1.4539321654484367, + "grad_norm": 2.539093017578125, + "learning_rate": 0.004460828647374755, + "loss": 7.9884, + "step": 356900 + }, + { + "epoch": 1.454339543471818, + "grad_norm": 6.668517589569092, + "learning_rate": 0.004460520590337975, + "loss": 8.0211, + "step": 357000 + }, + { + "epoch": 1.454339543471818, + "eval_MaskedAccuracy": 0.48822290326397216, + "eval_loss": 1.6956151723861694, + "eval_runtime": 230.3796, + "eval_samples_per_second": 275.528, + "eval_steps_per_second": 1.076, + "step": 357000 + }, + { + "epoch": 1.4547469214951996, + "grad_norm": 2.4861207008361816, + "learning_rate": 0.004460212455989005, + "loss": 8.0312, + "step": 357100 + }, + { + "epoch": 1.4551542995185809, + "grad_norm": 3.670663833618164, + "learning_rate": 0.0044599042443400315, + "loss": 8.0086, + "step": 357200 + }, + { + "epoch": 1.4555616775419624, + "grad_norm": 2.3406200408935547, + "learning_rate": 0.004459595955403238, + "loss": 8.003, + "step": 357300 + }, + { + "epoch": 1.455969055565344, + "grad_norm": 1.8610780239105225, + "learning_rate": 0.004459287589190809, + "loss": 7.9989, + "step": 357400 + }, + { + "epoch": 1.4563764335887255, + "grad_norm": 1.2680808305740356, + "learning_rate": 0.004458979145714939, + "loss": 8.0225, + "step": 357500 + }, + { + "epoch": 1.4567838116121068, + "grad_norm": 4.370670318603516, + "learning_rate": 0.004458670624987823, + "loss": 7.9976, + "step": 357600 + }, + { + "epoch": 1.4571911896354883, + "grad_norm": 4.670384883880615, + "learning_rate": 0.004458362027021667, + "loss": 8.0201, + "step": 357700 + }, + { + "epoch": 1.4575985676588699, + "grad_norm": 2.720135450363159, + "learning_rate": 0.004458053351828655, + "loss": 8.0347, + "step": 357800 + }, + { + "epoch": 1.4580059456822512, + "grad_norm": 2.43333101272583, + "learning_rate": 0.0044577445994209985, + "loss": 7.977, + "step": 357900 + }, + { + "epoch": 1.4584133237056327, + "grad_norm": 1.0447818040847778, + "learning_rate": 0.0044574357698109034, + "loss": 8.0378, + "step": 358000 + }, + { + "epoch": 1.4584133237056327, + "eval_MaskedAccuracy": 0.4895248275655935, + "eval_loss": 1.6998313665390015, + "eval_runtime": 178.2771, + "eval_samples_per_second": 356.052, + "eval_steps_per_second": 1.391, + "step": 358000 + }, + { + "epoch": 1.4588207017290142, + "grad_norm": 3.34891414642334, + "learning_rate": 0.00445712686301058, + "loss": 8.0238, + "step": 358100 + }, + { + "epoch": 1.4592280797523958, + "grad_norm": 2.1483287811279297, + "learning_rate": 0.004456817879032248, + "loss": 8.0378, + "step": 358200 + }, + { + "epoch": 1.459635457775777, + "grad_norm": 7.006333351135254, + "learning_rate": 0.004456508817888111, + "loss": 7.9972, + "step": 358300 + }, + { + "epoch": 1.4600428357991586, + "grad_norm": 0.8216266632080078, + "learning_rate": 0.004456199679590388, + "loss": 8.042, + "step": 358400 + }, + { + "epoch": 1.46045021382254, + "grad_norm": 4.701449394226074, + "learning_rate": 0.004455890464151309, + "loss": 8.0216, + "step": 358500 + }, + { + "epoch": 1.4608575918459215, + "grad_norm": 3.473137855529785, + "learning_rate": 0.004455581171583096, + "loss": 8.0026, + "step": 358600 + }, + { + "epoch": 1.461264969869303, + "grad_norm": 6.207772254943848, + "learning_rate": 0.0044552718018979805, + "loss": 8.0614, + "step": 358700 + }, + { + "epoch": 1.4616723478926845, + "grad_norm": 2.560777425765991, + "learning_rate": 0.004454962355108189, + "loss": 8.0169, + "step": 358800 + }, + { + "epoch": 1.4620797259160658, + "grad_norm": 1.8854252099990845, + "learning_rate": 0.004454652831225954, + "loss": 8.0301, + "step": 358900 + }, + { + "epoch": 1.4624871039394474, + "grad_norm": 4.9735188484191895, + "learning_rate": 0.004454343230263511, + "loss": 8.003, + "step": 359000 + }, + { + "epoch": 1.4624871039394474, + "eval_MaskedAccuracy": 0.48814781740974716, + "eval_loss": 1.6938985586166382, + "eval_runtime": 279.0939, + "eval_samples_per_second": 227.436, + "eval_steps_per_second": 0.889, + "step": 359000 + }, + { + "epoch": 1.4628944819628287, + "grad_norm": 2.5394465923309326, + "learning_rate": 0.004454033552233106, + "loss": 8.0566, + "step": 359100 + }, + { + "epoch": 1.4633018599862102, + "grad_norm": 4.700394153594971, + "learning_rate": 0.004453723797146984, + "loss": 8.0367, + "step": 359200 + }, + { + "epoch": 1.4637092380095917, + "grad_norm": 3.184756278991699, + "learning_rate": 0.004453413965017392, + "loss": 8.0044, + "step": 359300 + }, + { + "epoch": 1.4641166160329733, + "grad_norm": 3.3647334575653076, + "learning_rate": 0.0044531040558565705, + "loss": 7.9719, + "step": 359400 + }, + { + "epoch": 1.4645239940563546, + "grad_norm": 3.093984603881836, + "learning_rate": 0.004452794069676775, + "loss": 8.021, + "step": 359500 + }, + { + "epoch": 1.4649313720797361, + "grad_norm": 1.4502638578414917, + "learning_rate": 0.0044524840064902645, + "loss": 7.9956, + "step": 359600 + }, + { + "epoch": 1.4653387501031174, + "grad_norm": 4.267673015594482, + "learning_rate": 0.004452173866309295, + "loss": 8.0186, + "step": 359700 + }, + { + "epoch": 1.465746128126499, + "grad_norm": 8.67224407196045, + "learning_rate": 0.004451863649146126, + "loss": 8.0116, + "step": 359800 + }, + { + "epoch": 1.4661535061498805, + "grad_norm": 3.730142116546631, + "learning_rate": 0.004451553355013028, + "loss": 8.0154, + "step": 359900 + }, + { + "epoch": 1.466560884173262, + "grad_norm": 2.6251790523529053, + "learning_rate": 0.00445124298392226, + "loss": 7.9937, + "step": 360000 + }, + { + "epoch": 1.466560884173262, + "eval_MaskedAccuracy": 0.4928633534091767, + "eval_loss": 1.688250184059143, + "eval_runtime": 179.4596, + "eval_samples_per_second": 353.706, + "eval_steps_per_second": 1.382, + "step": 360000 + }, + { + "epoch": 1.4669682621966433, + "grad_norm": 4.280770301818848, + "learning_rate": 0.004450932535886099, + "loss": 8.0097, + "step": 360100 + }, + { + "epoch": 1.4673756402200249, + "grad_norm": 10.832054138183594, + "learning_rate": 0.004450622010916824, + "loss": 7.9814, + "step": 360200 + }, + { + "epoch": 1.4677830182434064, + "grad_norm": 4.457239151000977, + "learning_rate": 0.004450311409026702, + "loss": 8.0166, + "step": 360300 + }, + { + "epoch": 1.4681903962667877, + "grad_norm": 5.342009544372559, + "learning_rate": 0.004450000730228014, + "loss": 8.0293, + "step": 360400 + }, + { + "epoch": 1.4685977742901692, + "grad_norm": 5.352954387664795, + "learning_rate": 0.004449689974533045, + "loss": 8.0095, + "step": 360500 + }, + { + "epoch": 1.4690051523135508, + "grad_norm": 6.877140998840332, + "learning_rate": 0.004449379141954084, + "loss": 7.9923, + "step": 360600 + }, + { + "epoch": 1.4694125303369323, + "grad_norm": 2.1410417556762695, + "learning_rate": 0.004449068232503411, + "loss": 8.0304, + "step": 360700 + }, + { + "epoch": 1.4698199083603136, + "grad_norm": 1.1315438747406006, + "learning_rate": 0.004448757246193324, + "loss": 8.0495, + "step": 360800 + }, + { + "epoch": 1.4702272863836952, + "grad_norm": 2.2842488288879395, + "learning_rate": 0.004448446183036121, + "loss": 8.021, + "step": 360900 + }, + { + "epoch": 1.4706346644070765, + "grad_norm": 2.7832038402557373, + "learning_rate": 0.0044481350430440915, + "loss": 7.9993, + "step": 361000 + }, + { + "epoch": 1.4706346644070765, + "eval_MaskedAccuracy": 0.49182212582889695, + "eval_loss": 1.680207371711731, + "eval_runtime": 224.7039, + "eval_samples_per_second": 282.487, + "eval_steps_per_second": 1.104, + "step": 361000 + }, + { + "epoch": 1.471042042430458, + "grad_norm": 2.329878807067871, + "learning_rate": 0.0044478238262295475, + "loss": 8.0567, + "step": 361100 + }, + { + "epoch": 1.4714494204538395, + "grad_norm": 1.3917580842971802, + "learning_rate": 0.004447512532604777, + "loss": 8.0241, + "step": 361200 + }, + { + "epoch": 1.471856798477221, + "grad_norm": 3.8327815532684326, + "learning_rate": 0.004447201162182104, + "loss": 8.0266, + "step": 361300 + }, + { + "epoch": 1.4722641765006024, + "grad_norm": 6.694658279418945, + "learning_rate": 0.004446889714973832, + "loss": 8.0424, + "step": 361400 + }, + { + "epoch": 1.472671554523984, + "grad_norm": 3.1367924213409424, + "learning_rate": 0.00444657819099227, + "loss": 8.003, + "step": 361500 + }, + { + "epoch": 1.4730789325473652, + "grad_norm": 3.245832920074463, + "learning_rate": 0.0044462665902497405, + "loss": 8.0443, + "step": 361600 + }, + { + "epoch": 1.4734863105707467, + "grad_norm": 4.566452503204346, + "learning_rate": 0.004445954912758557, + "loss": 8.0364, + "step": 361700 + }, + { + "epoch": 1.4738936885941283, + "grad_norm": 1.9279391765594482, + "learning_rate": 0.004445643158531048, + "loss": 8.0695, + "step": 361800 + }, + { + "epoch": 1.4743010666175098, + "grad_norm": 3.4402527809143066, + "learning_rate": 0.004445331327579535, + "loss": 8.0358, + "step": 361900 + }, + { + "epoch": 1.4747084446408911, + "grad_norm": 4.117164611816406, + "learning_rate": 0.004445019419916347, + "loss": 8.0099, + "step": 362000 + }, + { + "epoch": 1.4747084446408911, + "eval_MaskedAccuracy": 0.49113621811736785, + "eval_loss": 1.6913385391235352, + "eval_runtime": 199.2783, + "eval_samples_per_second": 318.529, + "eval_steps_per_second": 1.244, + "step": 362000 + }, + { + "epoch": 1.4751158226642727, + "grad_norm": 2.392179012298584, + "learning_rate": 0.004444707435553814, + "loss": 8.0438, + "step": 362100 + }, + { + "epoch": 1.475523200687654, + "grad_norm": 2.4375250339508057, + "learning_rate": 0.0044443953745042656, + "loss": 8.0186, + "step": 362200 + }, + { + "epoch": 1.4759305787110355, + "grad_norm": 1.4311915636062622, + "learning_rate": 0.004444083236780047, + "loss": 8.0084, + "step": 362300 + }, + { + "epoch": 1.476337956734417, + "grad_norm": 4.306060791015625, + "learning_rate": 0.004443771022393504, + "loss": 8.0104, + "step": 362400 + }, + { + "epoch": 1.4767453347577986, + "grad_norm": 3.543065071105957, + "learning_rate": 0.004443458731356971, + "loss": 7.9741, + "step": 362500 + }, + { + "epoch": 1.4771527127811799, + "grad_norm": 4.2835798263549805, + "learning_rate": 0.004443146363682799, + "loss": 7.9911, + "step": 362600 + }, + { + "epoch": 1.4775600908045614, + "grad_norm": 1.9651813507080078, + "learning_rate": 0.004442833919383335, + "loss": 8.0053, + "step": 362700 + }, + { + "epoch": 1.477967468827943, + "grad_norm": 3.219658851623535, + "learning_rate": 0.00444252139847093, + "loss": 8.0154, + "step": 362800 + }, + { + "epoch": 1.4783748468513243, + "grad_norm": 4.614295959472656, + "learning_rate": 0.0044422088009579445, + "loss": 7.9951, + "step": 362900 + }, + { + "epoch": 1.4787822248747058, + "grad_norm": 5.121086597442627, + "learning_rate": 0.004441896126856728, + "loss": 8.0194, + "step": 363000 + }, + { + "epoch": 1.4787822248747058, + "eval_MaskedAccuracy": 0.49050609975653975, + "eval_loss": 1.6859321594238281, + "eval_runtime": 204.7747, + "eval_samples_per_second": 309.98, + "eval_steps_per_second": 1.211, + "step": 363000 + }, + { + "epoch": 1.4791896028980873, + "grad_norm": 5.913979530334473, + "learning_rate": 0.004441583376179654, + "loss": 8.0509, + "step": 363100 + }, + { + "epoch": 1.4795969809214689, + "grad_norm": 4.092564582824707, + "learning_rate": 0.004441270548939076, + "loss": 8.0319, + "step": 363200 + }, + { + "epoch": 1.4800043589448502, + "grad_norm": 1.9727094173431396, + "learning_rate": 0.004440957645147368, + "loss": 7.981, + "step": 363300 + }, + { + "epoch": 1.4804117369682317, + "grad_norm": 2.3226356506347656, + "learning_rate": 0.004440644664816902, + "loss": 7.9914, + "step": 363400 + }, + { + "epoch": 1.480819114991613, + "grad_norm": 5.634875297546387, + "learning_rate": 0.004440331607960047, + "loss": 7.978, + "step": 363500 + }, + { + "epoch": 1.4812264930149945, + "grad_norm": 1.1887385845184326, + "learning_rate": 0.004440018474589184, + "loss": 8.0349, + "step": 363600 + }, + { + "epoch": 1.481633871038376, + "grad_norm": 3.598142147064209, + "learning_rate": 0.004439705264716694, + "loss": 8.033, + "step": 363700 + }, + { + "epoch": 1.4820412490617576, + "grad_norm": 1.807037115097046, + "learning_rate": 0.004439391978354955, + "loss": 8.0413, + "step": 363800 + }, + { + "epoch": 1.482448627085139, + "grad_norm": 3.9683003425598145, + "learning_rate": 0.004439078615516356, + "loss": 8.0459, + "step": 363900 + }, + { + "epoch": 1.4828560051085204, + "grad_norm": 5.336919784545898, + "learning_rate": 0.004438765176213286, + "loss": 8.0241, + "step": 364000 + }, + { + "epoch": 1.4828560051085204, + "eval_MaskedAccuracy": 0.4891500509316086, + "eval_loss": 1.691572904586792, + "eval_runtime": 172.8148, + "eval_samples_per_second": 367.306, + "eval_steps_per_second": 1.435, + "step": 364000 + }, + { + "epoch": 1.4832633831319018, + "grad_norm": 2.170383930206299, + "learning_rate": 0.004438451660458131, + "loss": 7.9796, + "step": 364100 + }, + { + "epoch": 1.4836707611552833, + "grad_norm": 1.0361725091934204, + "learning_rate": 0.0044381380682632955, + "loss": 8.0232, + "step": 364200 + }, + { + "epoch": 1.4840781391786648, + "grad_norm": 2.4752933979034424, + "learning_rate": 0.004437824399641173, + "loss": 8.0235, + "step": 364300 + }, + { + "epoch": 1.4844855172020464, + "grad_norm": 2.0901341438293457, + "learning_rate": 0.004437510654604171, + "loss": 8.0367, + "step": 364400 + }, + { + "epoch": 1.4848928952254277, + "grad_norm": 3.661705255508423, + "learning_rate": 0.004437196833164682, + "loss": 8.0061, + "step": 364500 + }, + { + "epoch": 1.4853002732488092, + "grad_norm": 1.7246965169906616, + "learning_rate": 0.004436882935335121, + "loss": 8.0055, + "step": 364600 + }, + { + "epoch": 1.4857076512721905, + "grad_norm": 2.910794258117676, + "learning_rate": 0.004436568961127888, + "loss": 8.0041, + "step": 364700 + }, + { + "epoch": 1.486115029295572, + "grad_norm": 2.564167022705078, + "learning_rate": 0.004436254910555408, + "loss": 7.9882, + "step": 364800 + }, + { + "epoch": 1.4865224073189536, + "grad_norm": 1.9633032083511353, + "learning_rate": 0.00443594078363009, + "loss": 8.0237, + "step": 364900 + }, + { + "epoch": 1.486929785342335, + "grad_norm": 1.481297492980957, + "learning_rate": 0.004435626580364358, + "loss": 8.0262, + "step": 365000 + }, + { + "epoch": 1.486929785342335, + "eval_MaskedAccuracy": 0.4919499401216122, + "eval_loss": 1.6813246011734009, + "eval_runtime": 202.8804, + "eval_samples_per_second": 312.874, + "eval_steps_per_second": 1.222, + "step": 365000 + }, + { + "epoch": 1.4873371633657164, + "grad_norm": 2.3210670948028564, + "learning_rate": 0.004435312300770635, + "loss": 8.0464, + "step": 365100 + }, + { + "epoch": 1.487744541389098, + "grad_norm": 4.390432834625244, + "learning_rate": 0.004434997944861334, + "loss": 8.0521, + "step": 365200 + }, + { + "epoch": 1.4881519194124795, + "grad_norm": 1.854626178741455, + "learning_rate": 0.004434683512648892, + "loss": 8.0288, + "step": 365300 + }, + { + "epoch": 1.4885592974358608, + "grad_norm": 1.3176974058151245, + "learning_rate": 0.00443436900414574, + "loss": 8.0144, + "step": 365400 + }, + { + "epoch": 1.4889666754592423, + "grad_norm": 1.8628607988357544, + "learning_rate": 0.00443405441936432, + "loss": 7.9832, + "step": 365500 + }, + { + "epoch": 1.4893740534826239, + "grad_norm": 2.8778035640716553, + "learning_rate": 0.004433739758317057, + "loss": 8.0054, + "step": 365600 + }, + { + "epoch": 1.4897814315060054, + "grad_norm": 3.7729175090789795, + "learning_rate": 0.004433425021016393, + "loss": 8.0231, + "step": 365700 + }, + { + "epoch": 1.4901888095293867, + "grad_norm": 2.486950397491455, + "learning_rate": 0.004433110207474778, + "loss": 7.9779, + "step": 365800 + }, + { + "epoch": 1.4905961875527682, + "grad_norm": 1.9985967874526978, + "learning_rate": 0.004432795317704664, + "loss": 8.0286, + "step": 365900 + }, + { + "epoch": 1.4910035655761495, + "grad_norm": 1.8332232236862183, + "learning_rate": 0.004432480351718478, + "loss": 8.0113, + "step": 366000 + }, + { + "epoch": 1.4910035655761495, + "eval_MaskedAccuracy": 0.49197204558073937, + "eval_loss": 1.6859568357467651, + "eval_runtime": 165.0848, + "eval_samples_per_second": 384.506, + "eval_steps_per_second": 1.502, + "step": 366000 + }, + { + "epoch": 1.491410943599531, + "grad_norm": 3.6302599906921387, + "learning_rate": 0.004432165309528689, + "loss": 8.0024, + "step": 366100 + }, + { + "epoch": 1.4918183216229126, + "grad_norm": 4.383216857910156, + "learning_rate": 0.004431850191147746, + "loss": 7.9696, + "step": 366200 + }, + { + "epoch": 1.4922256996462941, + "grad_norm": 3.673788070678711, + "learning_rate": 0.004431534996588106, + "loss": 8.0354, + "step": 366300 + }, + { + "epoch": 1.4926330776696755, + "grad_norm": 2.768078565597534, + "learning_rate": 0.004431219725862236, + "loss": 8.0335, + "step": 366400 + }, + { + "epoch": 1.493040455693057, + "grad_norm": 4.210900783538818, + "learning_rate": 0.004430904378982597, + "loss": 7.9617, + "step": 366500 + }, + { + "epoch": 1.4934478337164383, + "grad_norm": 5.263278484344482, + "learning_rate": 0.004430588955961656, + "loss": 7.9852, + "step": 366600 + }, + { + "epoch": 1.4938552117398198, + "grad_norm": 1.806351900100708, + "learning_rate": 0.004430273456811887, + "loss": 8.016, + "step": 366700 + }, + { + "epoch": 1.4942625897632014, + "grad_norm": 7.461493968963623, + "learning_rate": 0.004429957881545764, + "loss": 8.0104, + "step": 366800 + }, + { + "epoch": 1.494669967786583, + "grad_norm": 1.170372724533081, + "learning_rate": 0.00442964223017576, + "loss": 8.0552, + "step": 366900 + }, + { + "epoch": 1.4950773458099642, + "grad_norm": 2.6418955326080322, + "learning_rate": 0.004429326502714353, + "loss": 8.0102, + "step": 367000 + }, + { + "epoch": 1.4950773458099642, + "eval_MaskedAccuracy": 0.4934207540611734, + "eval_loss": 1.673243761062622, + "eval_runtime": 262.6899, + "eval_samples_per_second": 241.639, + "eval_steps_per_second": 0.944, + "step": 367000 + }, + { + "epoch": 1.4954847238333457, + "grad_norm": 1.5339170694351196, + "learning_rate": 0.004429010699174032, + "loss": 8.0161, + "step": 367100 + }, + { + "epoch": 1.495892101856727, + "grad_norm": 3.244283676147461, + "learning_rate": 0.004428694819567276, + "loss": 7.9964, + "step": 367200 + }, + { + "epoch": 1.4962994798801086, + "grad_norm": 3.9228017330169678, + "learning_rate": 0.004428378863906572, + "loss": 8.0588, + "step": 367300 + }, + { + "epoch": 1.4967068579034901, + "grad_norm": 4.916393756866455, + "learning_rate": 0.004428062832204412, + "loss": 8.0278, + "step": 367400 + }, + { + "epoch": 1.4971142359268717, + "grad_norm": 6.785499095916748, + "learning_rate": 0.004427746724473295, + "loss": 8.0251, + "step": 367500 + }, + { + "epoch": 1.497521613950253, + "grad_norm": 1.6612313985824585, + "learning_rate": 0.004427430540725714, + "loss": 8.0086, + "step": 367600 + }, + { + "epoch": 1.4979289919736345, + "grad_norm": 3.4228081703186035, + "learning_rate": 0.0044271142809741764, + "loss": 8.0555, + "step": 367700 + }, + { + "epoch": 1.498336369997016, + "grad_norm": 5.840153217315674, + "learning_rate": 0.004426797945231178, + "loss": 7.9652, + "step": 367800 + }, + { + "epoch": 1.4987437480203973, + "grad_norm": 5.135745048522949, + "learning_rate": 0.004426481533509228, + "loss": 8.0224, + "step": 367900 + }, + { + "epoch": 1.4991511260437789, + "grad_norm": 6.537227153778076, + "learning_rate": 0.004426165045820837, + "loss": 8.0252, + "step": 368000 + }, + { + "epoch": 1.4991511260437789, + "eval_MaskedAccuracy": 0.488887823402608, + "eval_loss": 1.701386570930481, + "eval_runtime": 204.1211, + "eval_samples_per_second": 310.972, + "eval_steps_per_second": 1.215, + "step": 368000 + }, + { + "epoch": 1.4995585040671604, + "grad_norm": 4.002716541290283, + "learning_rate": 0.004425848482178517, + "loss": 8.0073, + "step": 368100 + }, + { + "epoch": 1.499965882090542, + "grad_norm": 3.9338126182556152, + "learning_rate": 0.0044255318425947855, + "loss": 8.0021, + "step": 368200 + }, + { + "epoch": 1.5003732601139232, + "grad_norm": 1.7457914352416992, + "learning_rate": 0.004425215127082153, + "loss": 8.0395, + "step": 368300 + }, + { + "epoch": 1.5007806381373048, + "grad_norm": 2.6222951412200928, + "learning_rate": 0.004424898335653149, + "loss": 8.0063, + "step": 368400 + }, + { + "epoch": 1.501188016160686, + "grad_norm": 2.925283193588257, + "learning_rate": 0.004424581468320298, + "loss": 8.0165, + "step": 368500 + }, + { + "epoch": 1.5015953941840676, + "grad_norm": 1.6769073009490967, + "learning_rate": 0.004424264525096125, + "loss": 8.0312, + "step": 368600 + }, + { + "epoch": 1.5020027722074492, + "grad_norm": 3.1565895080566406, + "learning_rate": 0.004423947505993157, + "loss": 8.0026, + "step": 368700 + }, + { + "epoch": 1.5024101502308307, + "grad_norm": 2.3908684253692627, + "learning_rate": 0.004423630411023935, + "loss": 8.0234, + "step": 368800 + }, + { + "epoch": 1.502817528254212, + "grad_norm": 3.7312817573547363, + "learning_rate": 0.00442331324020099, + "loss": 8.0064, + "step": 368900 + }, + { + "epoch": 1.5032249062775935, + "grad_norm": 3.0165467262268066, + "learning_rate": 0.004422995993536863, + "loss": 8.0322, + "step": 369000 + }, + { + "epoch": 1.5032249062775935, + "eval_MaskedAccuracy": 0.4903636892160923, + "eval_loss": 1.6973485946655273, + "eval_runtime": 170.173, + "eval_samples_per_second": 373.009, + "eval_steps_per_second": 1.457, + "step": 369000 + }, + { + "epoch": 1.5036322843009748, + "grad_norm": 1.907222867012024, + "learning_rate": 0.004422678671044098, + "loss": 7.9781, + "step": 369100 + }, + { + "epoch": 1.5040396623243564, + "grad_norm": 5.6811065673828125, + "learning_rate": 0.004422361272735235, + "loss": 8.0059, + "step": 369200 + }, + { + "epoch": 1.504447040347738, + "grad_norm": 3.2971274852752686, + "learning_rate": 0.004422043798622829, + "loss": 8.0166, + "step": 369300 + }, + { + "epoch": 1.5048544183711194, + "grad_norm": 1.2491105794906616, + "learning_rate": 0.004421726248719429, + "loss": 7.9775, + "step": 369400 + }, + { + "epoch": 1.5052617963945008, + "grad_norm": 3.353694200515747, + "learning_rate": 0.0044214086230375885, + "loss": 8.0086, + "step": 369500 + }, + { + "epoch": 1.5056691744178823, + "grad_norm": 1.343021273612976, + "learning_rate": 0.0044210909215898694, + "loss": 8.0073, + "step": 369600 + }, + { + "epoch": 1.5060765524412636, + "grad_norm": 4.270023345947266, + "learning_rate": 0.004420773144388821, + "loss": 8.0072, + "step": 369700 + }, + { + "epoch": 1.5064839304646451, + "grad_norm": 3.398383855819702, + "learning_rate": 0.00442045529144702, + "loss": 7.9974, + "step": 369800 + }, + { + "epoch": 1.5068913084880267, + "grad_norm": 5.190047264099121, + "learning_rate": 0.004420137362777025, + "loss": 8.0058, + "step": 369900 + }, + { + "epoch": 1.5072986865114082, + "grad_norm": 6.810204982757568, + "learning_rate": 0.004419819358391406, + "loss": 8.008, + "step": 370000 + }, + { + "epoch": 1.5072986865114082, + "eval_MaskedAccuracy": 0.4904813325585378, + "eval_loss": 1.6939561367034912, + "eval_runtime": 181.4744, + "eval_samples_per_second": 349.779, + "eval_steps_per_second": 1.367, + "step": 370000 + }, + { + "epoch": 1.5077060645347897, + "grad_norm": 1.2358757257461548, + "learning_rate": 0.004419501278302739, + "loss": 7.9975, + "step": 370100 + }, + { + "epoch": 1.508113442558171, + "grad_norm": 1.7502859830856323, + "learning_rate": 0.004419183122523591, + "loss": 7.9901, + "step": 370200 + }, + { + "epoch": 1.5085208205815523, + "grad_norm": 3.5640110969543457, + "learning_rate": 0.004418864891066549, + "loss": 8.0361, + "step": 370300 + }, + { + "epoch": 1.5089281986049339, + "grad_norm": 7.9837493896484375, + "learning_rate": 0.00441854658394419, + "loss": 8.0115, + "step": 370400 + }, + { + "epoch": 1.5093355766283154, + "grad_norm": 4.1491780281066895, + "learning_rate": 0.004418228201169104, + "loss": 8.0016, + "step": 370500 + }, + { + "epoch": 1.509742954651697, + "grad_norm": 5.766002655029297, + "learning_rate": 0.004417909742753878, + "loss": 7.989, + "step": 370600 + }, + { + "epoch": 1.5101503326750785, + "grad_norm": 3.899489164352417, + "learning_rate": 0.004417591208711096, + "loss": 8.0149, + "step": 370700 + }, + { + "epoch": 1.5105577106984598, + "grad_norm": 8.60437297821045, + "learning_rate": 0.004417272599053358, + "loss": 7.9866, + "step": 370800 + }, + { + "epoch": 1.5109650887218413, + "grad_norm": 1.5715206861495972, + "learning_rate": 0.00441695391379325, + "loss": 7.9908, + "step": 370900 + }, + { + "epoch": 1.5113724667452226, + "grad_norm": 1.131524920463562, + "learning_rate": 0.004416635152943376, + "loss": 8.0339, + "step": 371000 + }, + { + "epoch": 1.5113724667452226, + "eval_MaskedAccuracy": 0.49164760725542445, + "eval_loss": 1.6915628910064697, + "eval_runtime": 292.0873, + "eval_samples_per_second": 217.319, + "eval_steps_per_second": 0.849, + "step": 371000 + }, + { + "epoch": 1.5117798447686042, + "grad_norm": 1.6420693397521973, + "learning_rate": 0.004416316316516341, + "loss": 8.0173, + "step": 371100 + }, + { + "epoch": 1.5121872227919857, + "grad_norm": 4.51385498046875, + "learning_rate": 0.004415997404524749, + "loss": 7.9963, + "step": 371200 + }, + { + "epoch": 1.5125946008153672, + "grad_norm": 1.2989581823349, + "learning_rate": 0.004415678416981209, + "loss": 7.9935, + "step": 371300 + }, + { + "epoch": 1.5130019788387485, + "grad_norm": 1.5589622259140015, + "learning_rate": 0.004415359353898335, + "loss": 7.9899, + "step": 371400 + }, + { + "epoch": 1.51340935686213, + "grad_norm": 2.448767900466919, + "learning_rate": 0.004415040215288737, + "loss": 8.0232, + "step": 371500 + }, + { + "epoch": 1.5138167348855114, + "grad_norm": 4.266506671905518, + "learning_rate": 0.00441472100116503, + "loss": 7.9833, + "step": 371600 + }, + { + "epoch": 1.514224112908893, + "grad_norm": 2.878375291824341, + "learning_rate": 0.0044144017115398445, + "loss": 8.0197, + "step": 371700 + }, + { + "epoch": 1.5146314909322744, + "grad_norm": 3.721202850341797, + "learning_rate": 0.004414082346425793, + "loss": 7.9689, + "step": 371800 + }, + { + "epoch": 1.515038868955656, + "grad_norm": 2.0374796390533447, + "learning_rate": 0.004413762905835502, + "loss": 7.9809, + "step": 371900 + }, + { + "epoch": 1.5154462469790373, + "grad_norm": 4.74337911605835, + "learning_rate": 0.004413443389781608, + "loss": 8.0431, + "step": 372000 + }, + { + "epoch": 1.5154462469790373, + "eval_MaskedAccuracy": 0.4893654390312909, + "eval_loss": 1.6991828680038452, + "eval_runtime": 175.6731, + "eval_samples_per_second": 361.33, + "eval_steps_per_second": 1.412, + "step": 372000 + }, + { + "epoch": 1.5158536250024188, + "grad_norm": 2.3870112895965576, + "learning_rate": 0.004413123798276741, + "loss": 7.9965, + "step": 372100 + }, + { + "epoch": 1.5162610030258001, + "grad_norm": 5.8625712394714355, + "learning_rate": 0.004412804131333538, + "loss": 8.0064, + "step": 372200 + }, + { + "epoch": 1.5166683810491817, + "grad_norm": 5.180693626403809, + "learning_rate": 0.0044124843889646265, + "loss": 8.0432, + "step": 372300 + }, + { + "epoch": 1.5170757590725632, + "grad_norm": 3.3647851943969727, + "learning_rate": 0.0044121645711826625, + "loss": 7.989, + "step": 372400 + }, + { + "epoch": 1.5174831370959447, + "grad_norm": 2.1196417808532715, + "learning_rate": 0.004411844678000285, + "loss": 8.0252, + "step": 372500 + }, + { + "epoch": 1.5178905151193263, + "grad_norm": 3.921891689300537, + "learning_rate": 0.004411524709430136, + "loss": 7.9924, + "step": 372600 + }, + { + "epoch": 1.5182978931427076, + "grad_norm": 3.548800230026245, + "learning_rate": 0.0044112046654848636, + "loss": 7.967, + "step": 372700 + }, + { + "epoch": 1.5187052711660889, + "grad_norm": 3.645210027694702, + "learning_rate": 0.004410884546177128, + "loss": 7.9845, + "step": 372800 + }, + { + "epoch": 1.5191126491894704, + "grad_norm": 5.26247501373291, + "learning_rate": 0.004410564351519583, + "loss": 7.9699, + "step": 372900 + }, + { + "epoch": 1.519520027212852, + "grad_norm": 2.6062309741973877, + "learning_rate": 0.0044102440815248925, + "loss": 7.9899, + "step": 373000 + }, + { + "epoch": 1.519520027212852, + "eval_MaskedAccuracy": 0.49075348218722203, + "eval_loss": 1.6940529346466064, + "eval_runtime": 235.0784, + "eval_samples_per_second": 270.021, + "eval_steps_per_second": 1.055, + "step": 373000 + }, + { + "epoch": 1.5199274052362335, + "grad_norm": 2.387111186981201, + "learning_rate": 0.00440992373620571, + "loss": 8.0044, + "step": 373100 + }, + { + "epoch": 1.520334783259615, + "grad_norm": 0.986452043056488, + "learning_rate": 0.004409603315574701, + "loss": 7.9815, + "step": 373200 + }, + { + "epoch": 1.5207421612829963, + "grad_norm": 5.4955010414123535, + "learning_rate": 0.004409282819644539, + "loss": 8.0191, + "step": 373300 + }, + { + "epoch": 1.5211495393063779, + "grad_norm": 2.1645195484161377, + "learning_rate": 0.004408962248427899, + "loss": 8.0379, + "step": 373400 + }, + { + "epoch": 1.5215569173297592, + "grad_norm": 4.374330043792725, + "learning_rate": 0.004408641601937445, + "loss": 7.9843, + "step": 373500 + }, + { + "epoch": 1.5219642953531407, + "grad_norm": 5.132680892944336, + "learning_rate": 0.004408320880185853, + "loss": 7.9998, + "step": 373600 + }, + { + "epoch": 1.5223716733765222, + "grad_norm": 7.35249137878418, + "learning_rate": 0.004408000083185806, + "loss": 8.0068, + "step": 373700 + }, + { + "epoch": 1.5227790513999038, + "grad_norm": 1.830296516418457, + "learning_rate": 0.004407679210949991, + "loss": 8.0002, + "step": 373800 + }, + { + "epoch": 1.523186429423285, + "grad_norm": 3.0734915733337402, + "learning_rate": 0.0044073582634910876, + "loss": 8.0164, + "step": 373900 + }, + { + "epoch": 1.5235938074466666, + "grad_norm": 2.2600815296173096, + "learning_rate": 0.0044070372408217835, + "loss": 7.9963, + "step": 374000 + }, + { + "epoch": 1.5235938074466666, + "eval_MaskedAccuracy": 0.4904077083014307, + "eval_loss": 1.7035925388336182, + "eval_runtime": 178.9778, + "eval_samples_per_second": 354.659, + "eval_steps_per_second": 1.386, + "step": 374000 + }, + { + "epoch": 1.524001185470048, + "grad_norm": 3.5932393074035645, + "learning_rate": 0.004406716142954784, + "loss": 7.9975, + "step": 374100 + }, + { + "epoch": 1.5244085634934295, + "grad_norm": 1.704676866531372, + "learning_rate": 0.004406394969902773, + "loss": 8.0119, + "step": 374200 + }, + { + "epoch": 1.524815941516811, + "grad_norm": 1.0707470178604126, + "learning_rate": 0.004406073721678446, + "loss": 8.0245, + "step": 374300 + }, + { + "epoch": 1.5252233195401925, + "grad_norm": 3.7876572608947754, + "learning_rate": 0.004405752398294502, + "loss": 7.9867, + "step": 374400 + }, + { + "epoch": 1.5256306975635738, + "grad_norm": 1.4847164154052734, + "learning_rate": 0.004405430999763656, + "loss": 8.0224, + "step": 374500 + }, + { + "epoch": 1.5260380755869554, + "grad_norm": 1.2850916385650635, + "learning_rate": 0.004405109526098606, + "loss": 8.0137, + "step": 374600 + }, + { + "epoch": 1.5264454536103367, + "grad_norm": 1.354112148284912, + "learning_rate": 0.0044047879773120675, + "loss": 8.021, + "step": 374700 + }, + { + "epoch": 1.5268528316337182, + "grad_norm": 2.893594741821289, + "learning_rate": 0.004404466353416749, + "loss": 7.997, + "step": 374800 + }, + { + "epoch": 1.5272602096570997, + "grad_norm": 2.0816569328308105, + "learning_rate": 0.00440414465442537, + "loss": 7.9995, + "step": 374900 + }, + { + "epoch": 1.5276675876804813, + "grad_norm": 1.5227657556533813, + "learning_rate": 0.00440382288035064, + "loss": 8.0081, + "step": 375000 + }, + { + "epoch": 1.5276675876804813, + "eval_MaskedAccuracy": 0.4910730598327578, + "eval_loss": 1.6873279809951782, + "eval_runtime": 172.7911, + "eval_samples_per_second": 367.357, + "eval_steps_per_second": 1.435, + "step": 375000 + }, + { + "epoch": 1.5280749657038628, + "grad_norm": 2.5384185314178467, + "learning_rate": 0.004403501031205286, + "loss": 8.0438, + "step": 375100 + }, + { + "epoch": 1.5284823437272441, + "grad_norm": 3.016969680786133, + "learning_rate": 0.004403179107002036, + "loss": 7.999, + "step": 375200 + }, + { + "epoch": 1.5288897217506254, + "grad_norm": 3.4094536304473877, + "learning_rate": 0.0044028571077536095, + "loss": 7.9872, + "step": 375300 + }, + { + "epoch": 1.529297099774007, + "grad_norm": 3.460625410079956, + "learning_rate": 0.004402535033472744, + "loss": 8.0281, + "step": 375400 + }, + { + "epoch": 1.5297044777973885, + "grad_norm": 2.0465474128723145, + "learning_rate": 0.004402212884172171, + "loss": 7.9687, + "step": 375500 + }, + { + "epoch": 1.53011185582077, + "grad_norm": 3.2641730308532715, + "learning_rate": 0.004401890659864625, + "loss": 7.9884, + "step": 375600 + }, + { + "epoch": 1.5305192338441516, + "grad_norm": 0.8561846613883972, + "learning_rate": 0.004401568360562846, + "loss": 7.9817, + "step": 375700 + }, + { + "epoch": 1.5309266118675329, + "grad_norm": 7.27733850479126, + "learning_rate": 0.004401245986279574, + "loss": 8.0236, + "step": 375800 + }, + { + "epoch": 1.5313339898909144, + "grad_norm": 1.1665247678756714, + "learning_rate": 0.004400923537027569, + "loss": 8.0193, + "step": 375900 + }, + { + "epoch": 1.5317413679142957, + "grad_norm": 1.3204345703125, + "learning_rate": 0.0044006010128195515, + "loss": 8.0183, + "step": 376000 + }, + { + "epoch": 1.5317413679142957, + "eval_MaskedAccuracy": 0.49200622618186474, + "eval_loss": 1.6839189529418945, + "eval_runtime": 257.4429, + "eval_samples_per_second": 246.563, + "eval_steps_per_second": 0.963, + "step": 376000 + }, + { + "epoch": 1.5321487459376772, + "grad_norm": 3.4822580814361572, + "learning_rate": 0.004400278413668291, + "loss": 8.0098, + "step": 376100 + }, + { + "epoch": 1.5325561239610588, + "grad_norm": 1.6631346940994263, + "learning_rate": 0.004399955739586542, + "loss": 7.9757, + "step": 376200 + }, + { + "epoch": 1.5329635019844403, + "grad_norm": 7.522161483764648, + "learning_rate": 0.004399632990587055, + "loss": 8.0237, + "step": 376300 + }, + { + "epoch": 1.5333708800078216, + "grad_norm": 1.9211970567703247, + "learning_rate": 0.004399310166682584, + "loss": 8.0303, + "step": 376400 + }, + { + "epoch": 1.5337782580312032, + "grad_norm": 4.876394271850586, + "learning_rate": 0.004398987267885908, + "loss": 8.044, + "step": 376500 + }, + { + "epoch": 1.5341856360545845, + "grad_norm": 1.8055191040039062, + "learning_rate": 0.004398664294209784, + "loss": 7.9903, + "step": 376600 + }, + { + "epoch": 1.534593014077966, + "grad_norm": 3.4431655406951904, + "learning_rate": 0.0043983412456669825, + "loss": 7.9903, + "step": 376700 + }, + { + "epoch": 1.5350003921013475, + "grad_norm": 3.2290079593658447, + "learning_rate": 0.004398018122270274, + "loss": 7.9868, + "step": 376800 + }, + { + "epoch": 1.535407770124729, + "grad_norm": 0.8165335655212402, + "learning_rate": 0.004397694924032439, + "loss": 8.0685, + "step": 376900 + }, + { + "epoch": 1.5358151481481104, + "grad_norm": 1.4743595123291016, + "learning_rate": 0.004397371650966245, + "loss": 8.0165, + "step": 377000 + }, + { + "epoch": 1.5358151481481104, + "eval_MaskedAccuracy": 0.4923337604931521, + "eval_loss": 1.687532901763916, + "eval_runtime": 177.2597, + "eval_samples_per_second": 358.096, + "eval_steps_per_second": 1.399, + "step": 377000 + }, + { + "epoch": 1.536222526171492, + "grad_norm": 4.091436862945557, + "learning_rate": 0.0043970483030844835, + "loss": 7.9861, + "step": 377100 + }, + { + "epoch": 1.5366299041948732, + "grad_norm": 2.774745225906372, + "learning_rate": 0.00439672488039993, + "loss": 7.9904, + "step": 377200 + }, + { + "epoch": 1.5370372822182548, + "grad_norm": 4.360829830169678, + "learning_rate": 0.004396401382925373, + "loss": 8.0045, + "step": 377300 + }, + { + "epoch": 1.5374446602416363, + "grad_norm": 2.3972575664520264, + "learning_rate": 0.004396077810673604, + "loss": 8.0261, + "step": 377400 + }, + { + "epoch": 1.5378520382650178, + "grad_norm": 3.517026424407959, + "learning_rate": 0.004395754163657418, + "loss": 7.9857, + "step": 377500 + }, + { + "epoch": 1.5382594162883994, + "grad_norm": 6.994585037231445, + "learning_rate": 0.004395430441889607, + "loss": 7.9608, + "step": 377600 + }, + { + "epoch": 1.5386667943117807, + "grad_norm": 2.0977888107299805, + "learning_rate": 0.004395106645382972, + "loss": 7.9971, + "step": 377700 + }, + { + "epoch": 1.539074172335162, + "grad_norm": 4.672280311584473, + "learning_rate": 0.004394782774150305, + "loss": 7.9997, + "step": 377800 + }, + { + "epoch": 1.5394815503585435, + "grad_norm": 3.41923189163208, + "learning_rate": 0.004394458828204427, + "loss": 7.9821, + "step": 377900 + }, + { + "epoch": 1.539888928381925, + "grad_norm": 1.2892951965332031, + "learning_rate": 0.004394134807558128, + "loss": 7.9444, + "step": 378000 + }, + { + "epoch": 1.539888928381925, + "eval_MaskedAccuracy": 0.49300857328657016, + "eval_loss": 1.6862092018127441, + "eval_runtime": 186.9825, + "eval_samples_per_second": 339.476, + "eval_steps_per_second": 1.326, + "step": 378000 + }, + { + "epoch": 1.5402963064053066, + "grad_norm": 2.5519824028015137, + "learning_rate": 0.004393810712224229, + "loss": 7.9852, + "step": 378100 + }, + { + "epoch": 1.540703684428688, + "grad_norm": 5.804073810577393, + "learning_rate": 0.004393486542215543, + "loss": 7.9899, + "step": 378200 + }, + { + "epoch": 1.5411110624520694, + "grad_norm": 3.2780373096466064, + "learning_rate": 0.004393162297544885, + "loss": 7.9598, + "step": 378300 + }, + { + "epoch": 1.541518440475451, + "grad_norm": 3.6317949295043945, + "learning_rate": 0.0043928379782250645, + "loss": 8.0093, + "step": 378400 + }, + { + "epoch": 1.5419258184988323, + "grad_norm": 1.9392168521881104, + "learning_rate": 0.004392513584268915, + "loss": 7.9733, + "step": 378500 + }, + { + "epoch": 1.5423331965222138, + "grad_norm": 3.0337905883789062, + "learning_rate": 0.004392189115689258, + "loss": 8.016, + "step": 378600 + }, + { + "epoch": 1.5427405745455953, + "grad_norm": 2.9371018409729004, + "learning_rate": 0.004391864572498925, + "loss": 7.993, + "step": 378700 + }, + { + "epoch": 1.5431479525689769, + "grad_norm": 4.97229528427124, + "learning_rate": 0.004391539954710744, + "loss": 8.0001, + "step": 378800 + }, + { + "epoch": 1.5435553305923582, + "grad_norm": 1.364279866218567, + "learning_rate": 0.0043912152623375505, + "loss": 8.0293, + "step": 378900 + }, + { + "epoch": 1.5439627086157397, + "grad_norm": 2.0546860694885254, + "learning_rate": 0.0043908904953921775, + "loss": 7.9817, + "step": 379000 + }, + { + "epoch": 1.5439627086157397, + "eval_MaskedAccuracy": 0.4925978571629836, + "eval_loss": 1.6845088005065918, + "eval_runtime": 193.6522, + "eval_samples_per_second": 327.784, + "eval_steps_per_second": 1.281, + "step": 379000 + }, + { + "epoch": 1.544370086639121, + "grad_norm": 3.2605416774749756, + "learning_rate": 0.004390565653887459, + "loss": 7.9926, + "step": 379100 + }, + { + "epoch": 1.5447774646625025, + "grad_norm": 1.0459586381912231, + "learning_rate": 0.0043902407378362505, + "loss": 7.9852, + "step": 379200 + }, + { + "epoch": 1.545184842685884, + "grad_norm": 3.6447129249572754, + "learning_rate": 0.004389915747251395, + "loss": 7.9771, + "step": 379300 + }, + { + "epoch": 1.5455922207092656, + "grad_norm": 4.3206706047058105, + "learning_rate": 0.004389590682145738, + "loss": 7.9652, + "step": 379400 + }, + { + "epoch": 1.545999598732647, + "grad_norm": 1.4588992595672607, + "learning_rate": 0.00438926554253213, + "loss": 7.9142, + "step": 379500 + }, + { + "epoch": 1.5464069767560285, + "grad_norm": 5.171288967132568, + "learning_rate": 0.004388940328423422, + "loss": 8.0281, + "step": 379600 + }, + { + "epoch": 1.5468143547794098, + "grad_norm": 1.0709530115127563, + "learning_rate": 0.004388615039832481, + "loss": 7.9797, + "step": 379700 + }, + { + "epoch": 1.5472217328027913, + "grad_norm": 1.3747026920318604, + "learning_rate": 0.004388289676772165, + "loss": 7.988, + "step": 379800 + }, + { + "epoch": 1.5476291108261728, + "grad_norm": 4.58999490737915, + "learning_rate": 0.0043879642392553415, + "loss": 8.0068, + "step": 379900 + }, + { + "epoch": 1.5480364888495544, + "grad_norm": 4.6510009765625, + "learning_rate": 0.004387638727294866, + "loss": 8.0183, + "step": 380000 + }, + { + "epoch": 1.5480364888495544, + "eval_MaskedAccuracy": 0.492262019536217, + "eval_loss": 1.6800833940505981, + "eval_runtime": 167.5096, + "eval_samples_per_second": 378.94, + "eval_steps_per_second": 1.481, + "step": 380000 + }, + { + "epoch": 1.548443866872936, + "grad_norm": 4.6116156578063965, + "learning_rate": 0.004387313140903613, + "loss": 7.9816, + "step": 380100 + }, + { + "epoch": 1.5488512448963172, + "grad_norm": 1.628583312034607, + "learning_rate": 0.004386987480094455, + "loss": 7.9637, + "step": 380200 + }, + { + "epoch": 1.5492586229196985, + "grad_norm": 3.180694818496704, + "learning_rate": 0.004386661744880261, + "loss": 8.0141, + "step": 380300 + }, + { + "epoch": 1.54966600094308, + "grad_norm": 3.0045862197875977, + "learning_rate": 0.004386335935273921, + "loss": 7.9867, + "step": 380400 + }, + { + "epoch": 1.5500733789664616, + "grad_norm": 4.71543025970459, + "learning_rate": 0.004386010051288303, + "loss": 7.9974, + "step": 380500 + }, + { + "epoch": 1.5504807569898431, + "grad_norm": 4.103104591369629, + "learning_rate": 0.004385684092936303, + "loss": 8.0233, + "step": 380600 + }, + { + "epoch": 1.5508881350132246, + "grad_norm": 5.594005107879639, + "learning_rate": 0.004385358060230802, + "loss": 7.9874, + "step": 380700 + }, + { + "epoch": 1.551295513036606, + "grad_norm": 2.153127670288086, + "learning_rate": 0.004385031953184691, + "loss": 8.0051, + "step": 380800 + }, + { + "epoch": 1.5517028910599873, + "grad_norm": 6.683218002319336, + "learning_rate": 0.004384705771810855, + "loss": 7.9949, + "step": 380900 + }, + { + "epoch": 1.5521102690833688, + "grad_norm": 7.274106502532959, + "learning_rate": 0.004384379516122195, + "loss": 7.976, + "step": 381000 + }, + { + "epoch": 1.5521102690833688, + "eval_MaskedAccuracy": 0.49259958121987407, + "eval_loss": 1.6832220554351807, + "eval_runtime": 247.5638, + "eval_samples_per_second": 256.403, + "eval_steps_per_second": 1.002, + "step": 381000 + }, + { + "epoch": 1.5525176471067503, + "grad_norm": 2.6610333919525146, + "learning_rate": 0.004384053186131619, + "loss": 7.9938, + "step": 381100 + }, + { + "epoch": 1.5529250251301319, + "grad_norm": 2.1829004287719727, + "learning_rate": 0.004383726781852016, + "loss": 7.997, + "step": 381200 + }, + { + "epoch": 1.5533324031535134, + "grad_norm": 4.432714939117432, + "learning_rate": 0.004383400303296297, + "loss": 7.9341, + "step": 381300 + }, + { + "epoch": 1.5537397811768947, + "grad_norm": 5.071836948394775, + "learning_rate": 0.004383073750477368, + "loss": 7.9738, + "step": 381400 + }, + { + "epoch": 1.5541471592002762, + "grad_norm": 2.290721893310547, + "learning_rate": 0.004382747123408135, + "loss": 8.0268, + "step": 381500 + }, + { + "epoch": 1.5545545372236576, + "grad_norm": 4.035449981689453, + "learning_rate": 0.004382420422101519, + "loss": 8.0091, + "step": 381600 + }, + { + "epoch": 1.554961915247039, + "grad_norm": 5.090133190155029, + "learning_rate": 0.004382093646570438, + "loss": 8.0049, + "step": 381700 + }, + { + "epoch": 1.5553692932704206, + "grad_norm": 1.8367220163345337, + "learning_rate": 0.004381766796827804, + "loss": 7.9489, + "step": 381800 + }, + { + "epoch": 1.5557766712938021, + "grad_norm": 1.850050926208496, + "learning_rate": 0.004381439872886538, + "loss": 7.9628, + "step": 381900 + }, + { + "epoch": 1.5561840493171835, + "grad_norm": 3.262934446334839, + "learning_rate": 0.0043811128747595655, + "loss": 7.9987, + "step": 382000 + }, + { + "epoch": 1.5561840493171835, + "eval_MaskedAccuracy": 0.49243849720880073, + "eval_loss": 1.679644227027893, + "eval_runtime": 217.7531, + "eval_samples_per_second": 291.504, + "eval_steps_per_second": 1.139, + "step": 382000 + }, + { + "epoch": 1.556591427340565, + "grad_norm": 4.958277225494385, + "learning_rate": 0.004380785802459817, + "loss": 7.9732, + "step": 382100 + }, + { + "epoch": 1.5569988053639463, + "grad_norm": 8.351605415344238, + "learning_rate": 0.004380458656000227, + "loss": 7.9613, + "step": 382200 + }, + { + "epoch": 1.5574061833873278, + "grad_norm": 1.5247186422348022, + "learning_rate": 0.00438013143539372, + "loss": 7.9877, + "step": 382300 + }, + { + "epoch": 1.5578135614107094, + "grad_norm": 3.52302885055542, + "learning_rate": 0.0043798041406532436, + "loss": 7.9944, + "step": 382400 + }, + { + "epoch": 1.558220939434091, + "grad_norm": 5.48790168762207, + "learning_rate": 0.004379476771791734, + "loss": 7.9529, + "step": 382500 + }, + { + "epoch": 1.5586283174574724, + "grad_norm": 1.6266847848892212, + "learning_rate": 0.004379149328822129, + "loss": 8.005, + "step": 382600 + }, + { + "epoch": 1.5590356954808537, + "grad_norm": 3.445540189743042, + "learning_rate": 0.00437882181175738, + "loss": 7.9733, + "step": 382700 + }, + { + "epoch": 1.559443073504235, + "grad_norm": 6.589896202087402, + "learning_rate": 0.004378494220610433, + "loss": 7.9937, + "step": 382800 + }, + { + "epoch": 1.5598504515276166, + "grad_norm": 4.113238334655762, + "learning_rate": 0.0043781665553942395, + "loss": 7.9968, + "step": 382900 + }, + { + "epoch": 1.5602578295509981, + "grad_norm": 4.015130996704102, + "learning_rate": 0.004377838816121752, + "loss": 7.9891, + "step": 383000 + }, + { + "epoch": 1.5602578295509981, + "eval_MaskedAccuracy": 0.49189015147263415, + "eval_loss": 1.689454436302185, + "eval_runtime": 259.1091, + "eval_samples_per_second": 244.978, + "eval_steps_per_second": 0.957, + "step": 383000 + }, + { + "epoch": 1.5606652075743797, + "grad_norm": 2.397777557373047, + "learning_rate": 0.004377511002805935, + "loss": 8.006, + "step": 383100 + }, + { + "epoch": 1.5610725855977612, + "grad_norm": 6.622825622558594, + "learning_rate": 0.004377183115459738, + "loss": 8.0003, + "step": 383200 + }, + { + "epoch": 1.5614799636211425, + "grad_norm": 3.1762149333953857, + "learning_rate": 0.004376855154096129, + "loss": 7.984, + "step": 383300 + }, + { + "epoch": 1.5618873416445238, + "grad_norm": 1.678264856338501, + "learning_rate": 0.004376527118728077, + "loss": 7.9757, + "step": 383400 + }, + { + "epoch": 1.5622947196679053, + "grad_norm": 3.344902992248535, + "learning_rate": 0.004376199009368541, + "loss": 8.0187, + "step": 383500 + }, + { + "epoch": 1.5627020976912869, + "grad_norm": 4.5006022453308105, + "learning_rate": 0.004375870826030505, + "loss": 7.961, + "step": 383600 + }, + { + "epoch": 1.5631094757146684, + "grad_norm": 1.239322304725647, + "learning_rate": 0.0043755425687269376, + "loss": 7.9776, + "step": 383700 + }, + { + "epoch": 1.56351685373805, + "grad_norm": 4.102320194244385, + "learning_rate": 0.004375214237470823, + "loss": 7.9709, + "step": 383800 + }, + { + "epoch": 1.5639242317614312, + "grad_norm": 3.3950605392456055, + "learning_rate": 0.0043748858322751345, + "loss": 8.0008, + "step": 383900 + }, + { + "epoch": 1.5643316097848128, + "grad_norm": 1.621116280555725, + "learning_rate": 0.0043745573531528545, + "loss": 7.9943, + "step": 384000 + }, + { + "epoch": 1.5643316097848128, + "eval_MaskedAccuracy": 0.4919575874356232, + "eval_loss": 1.6874390840530396, + "eval_runtime": 217.0269, + "eval_samples_per_second": 292.48, + "eval_steps_per_second": 1.143, + "step": 384000 + }, + { + "epoch": 1.564738987808194, + "grad_norm": 4.499731063842773, + "learning_rate": 0.0043742288001169805, + "loss": 7.954, + "step": 384100 + }, + { + "epoch": 1.5651463658315756, + "grad_norm": 2.5246453285217285, + "learning_rate": 0.004373900173180485, + "loss": 7.9966, + "step": 384200 + }, + { + "epoch": 1.5655537438549572, + "grad_norm": 2.9872934818267822, + "learning_rate": 0.0043735714723563725, + "loss": 7.9967, + "step": 384300 + }, + { + "epoch": 1.5659611218783387, + "grad_norm": 1.435693383216858, + "learning_rate": 0.004373242697657639, + "loss": 7.971, + "step": 384400 + }, + { + "epoch": 1.56636849990172, + "grad_norm": 7.620283126831055, + "learning_rate": 0.0043729138490972855, + "loss": 7.9908, + "step": 384500 + }, + { + "epoch": 1.5667758779251015, + "grad_norm": 3.052788496017456, + "learning_rate": 0.004372584926688302, + "loss": 7.9748, + "step": 384600 + }, + { + "epoch": 1.5671832559484828, + "grad_norm": 3.5217275619506836, + "learning_rate": 0.004372255930443699, + "loss": 7.9848, + "step": 384700 + }, + { + "epoch": 1.5675906339718644, + "grad_norm": 5.30711030960083, + "learning_rate": 0.0043719268603764775, + "loss": 8.0149, + "step": 384800 + }, + { + "epoch": 1.567998011995246, + "grad_norm": 2.9961659908294678, + "learning_rate": 0.004371597716499653, + "loss": 7.9639, + "step": 384900 + }, + { + "epoch": 1.5684053900186274, + "grad_norm": 3.3438985347747803, + "learning_rate": 0.004371268498826239, + "loss": 7.9791, + "step": 385000 + }, + { + "epoch": 1.5684053900186274, + "eval_MaskedAccuracy": 0.4931344547286357, + "eval_loss": 1.679723858833313, + "eval_runtime": 248.0602, + "eval_samples_per_second": 255.889, + "eval_steps_per_second": 1.0, + "step": 385000 + }, + { + "epoch": 1.568812768042009, + "grad_norm": 2.019742250442505, + "learning_rate": 0.004370939207369248, + "loss": 7.9797, + "step": 385100 + }, + { + "epoch": 1.5692201460653903, + "grad_norm": 3.772763967514038, + "learning_rate": 0.004370609842141702, + "loss": 8.0025, + "step": 385200 + }, + { + "epoch": 1.5696275240887716, + "grad_norm": 3.6090774536132812, + "learning_rate": 0.004370280403156618, + "loss": 7.9975, + "step": 385300 + }, + { + "epoch": 1.5700349021121531, + "grad_norm": 1.7758287191390991, + "learning_rate": 0.0043699508904270256, + "loss": 8.0111, + "step": 385400 + }, + { + "epoch": 1.5704422801355347, + "grad_norm": 5.4468913078308105, + "learning_rate": 0.004369621303965948, + "loss": 7.9821, + "step": 385500 + }, + { + "epoch": 1.5708496581589162, + "grad_norm": 1.8788725137710571, + "learning_rate": 0.004369291643786418, + "loss": 7.9887, + "step": 385600 + }, + { + "epoch": 1.5712570361822977, + "grad_norm": 3.531351089477539, + "learning_rate": 0.004368961909901478, + "loss": 7.9929, + "step": 385700 + }, + { + "epoch": 1.571664414205679, + "grad_norm": 1.8444750308990479, + "learning_rate": 0.004368632102324148, + "loss": 7.9473, + "step": 385800 + }, + { + "epoch": 1.5720717922290603, + "grad_norm": 3.00870418548584, + "learning_rate": 0.004368302221067471, + "loss": 7.9732, + "step": 385900 + }, + { + "epoch": 1.5724791702524419, + "grad_norm": 2.3317511081695557, + "learning_rate": 0.004367972266144491, + "loss": 7.9927, + "step": 386000 + }, + { + "epoch": 1.5724791702524419, + "eval_MaskedAccuracy": 0.49240926965755605, + "eval_loss": 1.6796014308929443, + "eval_runtime": 175.2543, + "eval_samples_per_second": 362.194, + "eval_steps_per_second": 1.415, + "step": 386000 + }, + { + "epoch": 1.5728865482758234, + "grad_norm": 4.185787677764893, + "learning_rate": 0.004367642237568251, + "loss": 8.0055, + "step": 386100 + }, + { + "epoch": 1.573293926299205, + "grad_norm": 2.6901581287384033, + "learning_rate": 0.004367312135351802, + "loss": 8.0233, + "step": 386200 + }, + { + "epoch": 1.5737013043225865, + "grad_norm": 2.5258898735046387, + "learning_rate": 0.004366981959508197, + "loss": 7.9875, + "step": 386300 + }, + { + "epoch": 1.5741086823459678, + "grad_norm": 1.9751198291778564, + "learning_rate": 0.004366651710050486, + "loss": 8.0142, + "step": 386400 + }, + { + "epoch": 1.5745160603693493, + "grad_norm": 3.2172818183898926, + "learning_rate": 0.0043663213869917315, + "loss": 7.978, + "step": 386500 + }, + { + "epoch": 1.5749234383927306, + "grad_norm": 2.6670279502868652, + "learning_rate": 0.00436599099034499, + "loss": 7.992, + "step": 386600 + }, + { + "epoch": 1.5753308164161122, + "grad_norm": 1.5565224885940552, + "learning_rate": 0.004365660520123314, + "loss": 7.9866, + "step": 386700 + }, + { + "epoch": 1.5757381944394937, + "grad_norm": 1.5170857906341553, + "learning_rate": 0.004365329976339782, + "loss": 7.9695, + "step": 386800 + }, + { + "epoch": 1.5761455724628752, + "grad_norm": 5.241313457489014, + "learning_rate": 0.004364999359007448, + "loss": 7.9856, + "step": 386900 + }, + { + "epoch": 1.5765529504862565, + "grad_norm": 2.3493220806121826, + "learning_rate": 0.004364668668139396, + "loss": 8.0067, + "step": 387000 + }, + { + "epoch": 1.5765529504862565, + "eval_MaskedAccuracy": 0.4925615120398001, + "eval_loss": 1.68255615234375, + "eval_runtime": 184.6857, + "eval_samples_per_second": 343.697, + "eval_steps_per_second": 1.343, + "step": 387000 + }, + { + "epoch": 1.576960328509638, + "grad_norm": 1.3695019483566284, + "learning_rate": 0.0043643379037486985, + "loss": 8.0047, + "step": 387100 + }, + { + "epoch": 1.5773677065330194, + "grad_norm": 3.1875879764556885, + "learning_rate": 0.004364007065848422, + "loss": 7.9819, + "step": 387200 + }, + { + "epoch": 1.577775084556401, + "grad_norm": 2.9773612022399902, + "learning_rate": 0.0043636761544516576, + "loss": 7.9542, + "step": 387300 + }, + { + "epoch": 1.5781824625797825, + "grad_norm": 3.6195931434631348, + "learning_rate": 0.004363345169571481, + "loss": 7.9477, + "step": 387400 + }, + { + "epoch": 1.578589840603164, + "grad_norm": 0.8657932281494141, + "learning_rate": 0.004363014111220982, + "loss": 7.9695, + "step": 387500 + }, + { + "epoch": 1.5789972186265455, + "grad_norm": 2.1034436225891113, + "learning_rate": 0.004362682979413249, + "loss": 7.9899, + "step": 387600 + }, + { + "epoch": 1.5794045966499268, + "grad_norm": 2.825350761413574, + "learning_rate": 0.004362351774161368, + "loss": 7.9802, + "step": 387700 + }, + { + "epoch": 1.5798119746733081, + "grad_norm": 1.7635798454284668, + "learning_rate": 0.004362020495478441, + "loss": 7.9105, + "step": 387800 + }, + { + "epoch": 1.5802193526966897, + "grad_norm": 1.4848153591156006, + "learning_rate": 0.0043616891433775635, + "loss": 7.9555, + "step": 387900 + }, + { + "epoch": 1.5806267307200712, + "grad_norm": 2.003793954849243, + "learning_rate": 0.004361357717871833, + "loss": 8.0073, + "step": 388000 + }, + { + "epoch": 1.5806267307200712, + "eval_MaskedAccuracy": 0.49248137653424673, + "eval_loss": 1.6808550357818604, + "eval_runtime": 252.9763, + "eval_samples_per_second": 250.917, + "eval_steps_per_second": 0.98, + "step": 388000 + }, + { + "epoch": 1.5810341087434527, + "grad_norm": 4.4342145919799805, + "learning_rate": 0.004361026218974351, + "loss": 7.9984, + "step": 388100 + }, + { + "epoch": 1.5814414867668343, + "grad_norm": 5.26987361907959, + "learning_rate": 0.004360694646698225, + "loss": 7.9808, + "step": 388200 + }, + { + "epoch": 1.5818488647902156, + "grad_norm": 3.508080244064331, + "learning_rate": 0.004360363001056569, + "loss": 7.9658, + "step": 388300 + }, + { + "epoch": 1.582256242813597, + "grad_norm": 1.4259755611419678, + "learning_rate": 0.0043600312820624805, + "loss": 7.992, + "step": 388400 + }, + { + "epoch": 1.5826636208369784, + "grad_norm": 2.7010579109191895, + "learning_rate": 0.004359699489729094, + "loss": 7.9874, + "step": 388500 + }, + { + "epoch": 1.58307099886036, + "grad_norm": 4.352523326873779, + "learning_rate": 0.004359367624069517, + "loss": 7.997, + "step": 388600 + }, + { + "epoch": 1.5834783768837415, + "grad_norm": 0.9584260582923889, + "learning_rate": 0.004359035685096869, + "loss": 8.0089, + "step": 388700 + }, + { + "epoch": 1.583885754907123, + "grad_norm": 1.1232610940933228, + "learning_rate": 0.004358703672824273, + "loss": 7.985, + "step": 388800 + }, + { + "epoch": 1.5842931329305043, + "grad_norm": 2.125617742538452, + "learning_rate": 0.0043583715872648554, + "loss": 8.0019, + "step": 388900 + }, + { + "epoch": 1.5847005109538859, + "grad_norm": 1.381211280822754, + "learning_rate": 0.004358039428431752, + "loss": 7.9937, + "step": 389000 + }, + { + "epoch": 1.5847005109538859, + "eval_MaskedAccuracy": 0.4935783608289307, + "eval_loss": 1.6715673208236694, + "eval_runtime": 197.0436, + "eval_samples_per_second": 322.142, + "eval_steps_per_second": 1.259, + "step": 389000 + }, + { + "epoch": 1.5851078889772672, + "grad_norm": 0.8518221378326416, + "learning_rate": 0.004357707196338086, + "loss": 7.9801, + "step": 389100 + }, + { + "epoch": 1.5855152670006487, + "grad_norm": 1.8795791864395142, + "learning_rate": 0.004357374890996995, + "loss": 7.9729, + "step": 389200 + }, + { + "epoch": 1.5859226450240302, + "grad_norm": 2.8003125190734863, + "learning_rate": 0.0043570425124216175, + "loss": 8.003, + "step": 389300 + }, + { + "epoch": 1.5863300230474118, + "grad_norm": 2.7544593811035156, + "learning_rate": 0.004356710060625093, + "loss": 7.9742, + "step": 389400 + }, + { + "epoch": 1.586737401070793, + "grad_norm": 1.9319356679916382, + "learning_rate": 0.00435637753562057, + "loss": 7.9684, + "step": 389500 + }, + { + "epoch": 1.5871447790941746, + "grad_norm": 2.1396312713623047, + "learning_rate": 0.004356044937421187, + "loss": 7.9813, + "step": 389600 + }, + { + "epoch": 1.587552157117556, + "grad_norm": 2.89149808883667, + "learning_rate": 0.004355712266040101, + "loss": 8.0062, + "step": 389700 + }, + { + "epoch": 1.5879595351409375, + "grad_norm": 4.6878132820129395, + "learning_rate": 0.004355379521490456, + "loss": 8.047, + "step": 389800 + }, + { + "epoch": 1.588366913164319, + "grad_norm": 1.7978068590164185, + "learning_rate": 0.0043550467037854115, + "loss": 7.974, + "step": 389900 + }, + { + "epoch": 1.5887742911877005, + "grad_norm": 2.9096691608428955, + "learning_rate": 0.004354713812938128, + "loss": 7.9832, + "step": 390000 + }, + { + "epoch": 1.5887742911877005, + "eval_MaskedAccuracy": 0.49209820537622284, + "eval_loss": 1.678334355354309, + "eval_runtime": 185.1042, + "eval_samples_per_second": 342.92, + "eval_steps_per_second": 1.34, + "step": 390000 + }, + { + "epoch": 1.589181669211082, + "grad_norm": 2.5478997230529785, + "learning_rate": 0.0043543808489617634, + "loss": 7.9452, + "step": 390100 + }, + { + "epoch": 1.5895890472344634, + "grad_norm": 1.4829764366149902, + "learning_rate": 0.004354047811869491, + "loss": 7.9529, + "step": 390200 + }, + { + "epoch": 1.5899964252578447, + "grad_norm": 4.051734447479248, + "learning_rate": 0.004353714701674461, + "loss": 7.9611, + "step": 390300 + }, + { + "epoch": 1.5904038032812262, + "grad_norm": 2.479869842529297, + "learning_rate": 0.004353381518389854, + "loss": 7.9544, + "step": 390400 + }, + { + "epoch": 1.5908111813046077, + "grad_norm": 1.2474099397659302, + "learning_rate": 0.004353048262028836, + "loss": 7.9865, + "step": 390500 + }, + { + "epoch": 1.5912185593279893, + "grad_norm": 3.6086440086364746, + "learning_rate": 0.00435271493260459, + "loss": 7.9947, + "step": 390600 + }, + { + "epoch": 1.5916259373513708, + "grad_norm": 5.260233402252197, + "learning_rate": 0.0043523815301302875, + "loss": 7.9664, + "step": 390700 + }, + { + "epoch": 1.5920333153747521, + "grad_norm": 3.744760036468506, + "learning_rate": 0.004352048054619116, + "loss": 7.9917, + "step": 390800 + }, + { + "epoch": 1.5924406933981334, + "grad_norm": 3.6280267238616943, + "learning_rate": 0.004351714506084258, + "loss": 7.9795, + "step": 390900 + }, + { + "epoch": 1.592848071421515, + "grad_norm": 1.0758583545684814, + "learning_rate": 0.0043513808845389, + "loss": 7.9759, + "step": 391000 + }, + { + "epoch": 1.592848071421515, + "eval_MaskedAccuracy": 0.49325010018277465, + "eval_loss": 1.6883565187454224, + "eval_runtime": 164.2987, + "eval_samples_per_second": 386.345, + "eval_steps_per_second": 1.509, + "step": 391000 + }, + { + "epoch": 1.5932554494448965, + "grad_norm": 4.809948921203613, + "learning_rate": 0.004351047189996223, + "loss": 8.0092, + "step": 391100 + }, + { + "epoch": 1.593662827468278, + "grad_norm": 1.1911133527755737, + "learning_rate": 0.004350713422469431, + "loss": 8.0217, + "step": 391200 + }, + { + "epoch": 1.5940702054916596, + "grad_norm": 2.9028546810150146, + "learning_rate": 0.004350379581971718, + "loss": 7.9676, + "step": 391300 + }, + { + "epoch": 1.5944775835150409, + "grad_norm": 4.913288593292236, + "learning_rate": 0.004350045668516274, + "loss": 7.9644, + "step": 391400 + }, + { + "epoch": 1.5948849615384224, + "grad_norm": 2.528618574142456, + "learning_rate": 0.004349711682116308, + "loss": 7.9554, + "step": 391500 + }, + { + "epoch": 1.5952923395618037, + "grad_norm": 4.065859794616699, + "learning_rate": 0.004349377622785024, + "loss": 7.9782, + "step": 391600 + }, + { + "epoch": 1.5956997175851853, + "grad_norm": 1.9169384241104126, + "learning_rate": 0.004349043490535629, + "loss": 7.982, + "step": 391700 + }, + { + "epoch": 1.5961070956085668, + "grad_norm": 3.5128958225250244, + "learning_rate": 0.00434870928538133, + "loss": 7.9852, + "step": 391800 + }, + { + "epoch": 1.5965144736319483, + "grad_norm": 1.8897231817245483, + "learning_rate": 0.004348375007335346, + "loss": 7.9819, + "step": 391900 + }, + { + "epoch": 1.5969218516553296, + "grad_norm": 2.6008095741271973, + "learning_rate": 0.004348040656410888, + "loss": 7.987, + "step": 392000 + }, + { + "epoch": 1.5969218516553296, + "eval_MaskedAccuracy": 0.4921028195568469, + "eval_loss": 1.6823772192001343, + "eval_runtime": 167.8952, + "eval_samples_per_second": 378.069, + "eval_steps_per_second": 1.477, + "step": 392000 + }, + { + "epoch": 1.5973292296787112, + "grad_norm": 3.952047109603882, + "learning_rate": 0.004347706232621176, + "loss": 7.9643, + "step": 392100 + }, + { + "epoch": 1.5977366077020925, + "grad_norm": 1.9097968339920044, + "learning_rate": 0.004347371735979424, + "loss": 7.9793, + "step": 392200 + }, + { + "epoch": 1.598143985725474, + "grad_norm": 1.9411801099777222, + "learning_rate": 0.004347037166498867, + "loss": 7.9616, + "step": 392300 + }, + { + "epoch": 1.5985513637488555, + "grad_norm": 1.4001778364181519, + "learning_rate": 0.00434670252419273, + "loss": 7.9738, + "step": 392400 + }, + { + "epoch": 1.598958741772237, + "grad_norm": 1.7431035041809082, + "learning_rate": 0.004346367809074245, + "loss": 7.9926, + "step": 392500 + }, + { + "epoch": 1.5993661197956186, + "grad_norm": 2.230560541152954, + "learning_rate": 0.004346033021156636, + "loss": 7.9617, + "step": 392600 + }, + { + "epoch": 1.599773497819, + "grad_norm": 1.0867366790771484, + "learning_rate": 0.00434569816045315, + "loss": 7.9819, + "step": 392700 + }, + { + "epoch": 1.6001808758423812, + "grad_norm": 3.5999398231506348, + "learning_rate": 0.0043453632269770184, + "loss": 8.0082, + "step": 392800 + }, + { + "epoch": 1.6005882538657628, + "grad_norm": 2.177280902862549, + "learning_rate": 0.0043450282207414875, + "loss": 7.9843, + "step": 392900 + }, + { + "epoch": 1.6009956318891443, + "grad_norm": 1.610917568206787, + "learning_rate": 0.004344693141759801, + "loss": 7.9563, + "step": 393000 + }, + { + "epoch": 1.6009956318891443, + "eval_MaskedAccuracy": 0.49194733052811196, + "eval_loss": 1.6850906610488892, + "eval_runtime": 159.2594, + "eval_samples_per_second": 398.57, + "eval_steps_per_second": 1.557, + "step": 393000 + }, + { + "epoch": 1.6014030099125258, + "grad_norm": 4.063870429992676, + "learning_rate": 0.004344357990045204, + "loss": 7.9919, + "step": 393100 + }, + { + "epoch": 1.6018103879359074, + "grad_norm": 1.9149118661880493, + "learning_rate": 0.004344022765610947, + "loss": 7.9467, + "step": 393200 + }, + { + "epoch": 1.6022177659592887, + "grad_norm": 5.278997421264648, + "learning_rate": 0.0043436874684702875, + "loss": 7.9724, + "step": 393300 + }, + { + "epoch": 1.60262514398267, + "grad_norm": 2.6950948238372803, + "learning_rate": 0.004343352098636476, + "loss": 7.9997, + "step": 393400 + }, + { + "epoch": 1.6030325220060515, + "grad_norm": 3.0038623809814453, + "learning_rate": 0.004343016656122776, + "loss": 7.9972, + "step": 393500 + }, + { + "epoch": 1.603439900029433, + "grad_norm": 1.5131049156188965, + "learning_rate": 0.004342681140942442, + "loss": 8.0109, + "step": 393600 + }, + { + "epoch": 1.6038472780528146, + "grad_norm": 2.710996150970459, + "learning_rate": 0.004342345553108749, + "loss": 7.946, + "step": 393700 + }, + { + "epoch": 1.604254656076196, + "grad_norm": 3.3355636596679688, + "learning_rate": 0.004342009892634961, + "loss": 7.9358, + "step": 393800 + }, + { + "epoch": 1.6046620340995774, + "grad_norm": 4.113181114196777, + "learning_rate": 0.004341674159534346, + "loss": 7.9612, + "step": 393900 + }, + { + "epoch": 1.605069412122959, + "grad_norm": 4.498475551605225, + "learning_rate": 0.004341338353820176, + "loss": 7.9791, + "step": 394000 + }, + { + "epoch": 1.605069412122959, + "eval_MaskedAccuracy": 0.4912583135960985, + "eval_loss": 1.6878175735473633, + "eval_runtime": 162.9635, + "eval_samples_per_second": 389.511, + "eval_steps_per_second": 1.522, + "step": 394000 + }, + { + "epoch": 1.6054767901463403, + "grad_norm": 6.954246520996094, + "learning_rate": 0.004341002475505732, + "loss": 7.9823, + "step": 394100 + }, + { + "epoch": 1.6058841681697218, + "grad_norm": 4.334616184234619, + "learning_rate": 0.0043406665246042905, + "loss": 7.9726, + "step": 394200 + }, + { + "epoch": 1.6062915461931033, + "grad_norm": 4.763188362121582, + "learning_rate": 0.004340330501129129, + "loss": 7.9971, + "step": 394300 + }, + { + "epoch": 1.6066989242164849, + "grad_norm": 1.9894366264343262, + "learning_rate": 0.004339994405093543, + "loss": 7.9599, + "step": 394400 + }, + { + "epoch": 1.6071063022398662, + "grad_norm": 5.279543876647949, + "learning_rate": 0.004339658236510814, + "loss": 7.9878, + "step": 394500 + }, + { + "epoch": 1.6075136802632477, + "grad_norm": 1.0106977224349976, + "learning_rate": 0.004339321995394225, + "loss": 7.9433, + "step": 394600 + }, + { + "epoch": 1.607921058286629, + "grad_norm": 2.0923564434051514, + "learning_rate": 0.004338985681757076, + "loss": 7.9819, + "step": 394700 + }, + { + "epoch": 1.6083284363100105, + "grad_norm": 3.0404632091522217, + "learning_rate": 0.004338649295612668, + "loss": 7.99, + "step": 394800 + }, + { + "epoch": 1.608735814333392, + "grad_norm": 3.663414716720581, + "learning_rate": 0.004338312836974292, + "loss": 7.9619, + "step": 394900 + }, + { + "epoch": 1.6091431923567736, + "grad_norm": 11.241720199584961, + "learning_rate": 0.00433797630585526, + "loss": 7.9876, + "step": 395000 + }, + { + "epoch": 1.6091431923567736, + "eval_MaskedAccuracy": 0.4917487531948813, + "eval_loss": 1.6852360963821411, + "eval_runtime": 185.0984, + "eval_samples_per_second": 342.931, + "eval_steps_per_second": 1.34, + "step": 395000 + }, + { + "epoch": 1.6095505703801551, + "grad_norm": 4.754985809326172, + "learning_rate": 0.004337639702268867, + "loss": 7.9595, + "step": 395100 + }, + { + "epoch": 1.6099579484035365, + "grad_norm": 5.527943134307861, + "learning_rate": 0.004337303026228427, + "loss": 7.9756, + "step": 395200 + }, + { + "epoch": 1.6103653264269178, + "grad_norm": 1.6219241619110107, + "learning_rate": 0.004336966277747251, + "loss": 8.0081, + "step": 395300 + }, + { + "epoch": 1.6107727044502993, + "grad_norm": 4.942535400390625, + "learning_rate": 0.004336629456838651, + "loss": 7.9625, + "step": 395400 + }, + { + "epoch": 1.6111800824736808, + "grad_norm": 1.6272914409637451, + "learning_rate": 0.004336292563515942, + "loss": 7.9781, + "step": 395500 + }, + { + "epoch": 1.6115874604970624, + "grad_norm": 3.4605345726013184, + "learning_rate": 0.004335955597792442, + "loss": 7.9501, + "step": 395600 + }, + { + "epoch": 1.611994838520444, + "grad_norm": 1.3869355916976929, + "learning_rate": 0.004335618559681468, + "loss": 7.95, + "step": 395700 + }, + { + "epoch": 1.6124022165438252, + "grad_norm": 5.522362232208252, + "learning_rate": 0.004335281449196358, + "loss": 7.9842, + "step": 395800 + }, + { + "epoch": 1.6128095945672065, + "grad_norm": 2.6254281997680664, + "learning_rate": 0.004334944266350435, + "loss": 7.937, + "step": 395900 + }, + { + "epoch": 1.613216972590588, + "grad_norm": 2.4742610454559326, + "learning_rate": 0.004334607011157026, + "loss": 7.9573, + "step": 396000 + }, + { + "epoch": 1.613216972590588, + "eval_MaskedAccuracy": 0.4950616120567586, + "eval_loss": 1.6649715900421143, + "eval_runtime": 164.3113, + "eval_samples_per_second": 386.316, + "eval_steps_per_second": 1.509, + "step": 396000 + }, + { + "epoch": 1.6136243506139696, + "grad_norm": 3.2119338512420654, + "learning_rate": 0.004334269683629464, + "loss": 7.9016, + "step": 396100 + }, + { + "epoch": 1.6140317286373511, + "grad_norm": 1.6203646659851074, + "learning_rate": 0.004333932283781086, + "loss": 7.9583, + "step": 396200 + }, + { + "epoch": 1.6144391066607326, + "grad_norm": 7.14988374710083, + "learning_rate": 0.004333594811625235, + "loss": 7.9748, + "step": 396300 + }, + { + "epoch": 1.614846484684114, + "grad_norm": 2.0653839111328125, + "learning_rate": 0.004333257267175256, + "loss": 7.966, + "step": 396400 + }, + { + "epoch": 1.6152538627074955, + "grad_norm": 3.278308153152466, + "learning_rate": 0.00433291965044449, + "loss": 7.9845, + "step": 396500 + }, + { + "epoch": 1.6156612407308768, + "grad_norm": 3.3761467933654785, + "learning_rate": 0.004332581961446283, + "loss": 7.9792, + "step": 396600 + }, + { + "epoch": 1.6160686187542583, + "grad_norm": 4.255733013153076, + "learning_rate": 0.004332244200193985, + "loss": 7.9835, + "step": 396700 + }, + { + "epoch": 1.6164759967776399, + "grad_norm": 2.899632692337036, + "learning_rate": 0.004331906366700951, + "loss": 7.9676, + "step": 396800 + }, + { + "epoch": 1.6168833748010214, + "grad_norm": 1.675042986869812, + "learning_rate": 0.0043315684609805324, + "loss": 7.9631, + "step": 396900 + }, + { + "epoch": 1.6172907528244027, + "grad_norm": 1.578526496887207, + "learning_rate": 0.0043312304830461, + "loss": 7.9974, + "step": 397000 + }, + { + "epoch": 1.6172907528244027, + "eval_MaskedAccuracy": 0.49271030592332854, + "eval_loss": 1.6789170503616333, + "eval_runtime": 193.6599, + "eval_samples_per_second": 327.771, + "eval_steps_per_second": 1.281, + "step": 397000 + }, + { + "epoch": 1.6176981308477842, + "grad_norm": 1.3485881090164185, + "learning_rate": 0.004330892432911004, + "loss": 7.9684, + "step": 397100 + }, + { + "epoch": 1.6181055088711656, + "grad_norm": 3.4496870040893555, + "learning_rate": 0.004330554310588616, + "loss": 7.9862, + "step": 397200 + }, + { + "epoch": 1.618512886894547, + "grad_norm": 4.903573513031006, + "learning_rate": 0.004330216116092305, + "loss": 7.9935, + "step": 397300 + }, + { + "epoch": 1.6189202649179286, + "grad_norm": 2.4433298110961914, + "learning_rate": 0.004329877849435437, + "loss": 7.979, + "step": 397400 + }, + { + "epoch": 1.6193276429413102, + "grad_norm": 5.1005120277404785, + "learning_rate": 0.004329539510631386, + "loss": 7.9972, + "step": 397500 + }, + { + "epoch": 1.6197350209646917, + "grad_norm": 3.9642107486724854, + "learning_rate": 0.004329201099693534, + "loss": 7.9366, + "step": 397600 + }, + { + "epoch": 1.620142398988073, + "grad_norm": 2.4458343982696533, + "learning_rate": 0.004328862616635256, + "loss": 7.9604, + "step": 397700 + }, + { + "epoch": 1.6205497770114543, + "grad_norm": 2.9090120792388916, + "learning_rate": 0.004328524061469932, + "loss": 8.0138, + "step": 397800 + }, + { + "epoch": 1.6209571550348358, + "grad_norm": 2.2161173820495605, + "learning_rate": 0.0043281854342109485, + "loss": 7.9771, + "step": 397900 + }, + { + "epoch": 1.6213645330582174, + "grad_norm": 7.317041397094727, + "learning_rate": 0.004327846734871696, + "loss": 7.9441, + "step": 398000 + }, + { + "epoch": 1.6213645330582174, + "eval_MaskedAccuracy": 0.49359894427607975, + "eval_loss": 1.6811603307724, + "eval_runtime": 275.6596, + "eval_samples_per_second": 230.269, + "eval_steps_per_second": 0.9, + "step": 398000 + }, + { + "epoch": 1.621771911081599, + "grad_norm": 4.609094142913818, + "learning_rate": 0.004327507963465556, + "loss": 7.9808, + "step": 398100 + }, + { + "epoch": 1.6221792891049804, + "grad_norm": 1.264139175415039, + "learning_rate": 0.00432716912000593, + "loss": 7.9551, + "step": 398200 + }, + { + "epoch": 1.6225866671283617, + "grad_norm": 3.328643321990967, + "learning_rate": 0.004326830204506212, + "loss": 7.9815, + "step": 398300 + }, + { + "epoch": 1.622994045151743, + "grad_norm": 2.0682921409606934, + "learning_rate": 0.004326491216979807, + "loss": 7.9274, + "step": 398400 + }, + { + "epoch": 1.6234014231751246, + "grad_norm": 2.3350448608398438, + "learning_rate": 0.004326152157440109, + "loss": 7.9593, + "step": 398500 + }, + { + "epoch": 1.6238088011985061, + "grad_norm": 2.5045876502990723, + "learning_rate": 0.004325813025900526, + "loss": 7.9423, + "step": 398600 + }, + { + "epoch": 1.6242161792218877, + "grad_norm": 3.6239988803863525, + "learning_rate": 0.004325473822374469, + "loss": 7.9834, + "step": 398700 + }, + { + "epoch": 1.6246235572452692, + "grad_norm": 5.0391130447387695, + "learning_rate": 0.004325134546875345, + "loss": 7.9711, + "step": 398800 + }, + { + "epoch": 1.6250309352686505, + "grad_norm": 4.182869911193848, + "learning_rate": 0.004324795199416563, + "loss": 7.9867, + "step": 398900 + }, + { + "epoch": 1.625438313292032, + "grad_norm": 1.795639157295227, + "learning_rate": 0.0043244557800115446, + "loss": 7.9402, + "step": 399000 + }, + { + "epoch": 1.625438313292032, + "eval_MaskedAccuracy": 0.4933457146622044, + "eval_loss": 1.6800826787948608, + "eval_runtime": 198.8368, + "eval_samples_per_second": 319.237, + "eval_steps_per_second": 1.247, + "step": 399000 + }, + { + "epoch": 1.6258456913154133, + "grad_norm": 1.998484492301941, + "learning_rate": 0.004324116288673711, + "loss": 7.9877, + "step": 399100 + }, + { + "epoch": 1.6262530693387949, + "grad_norm": 5.772905349731445, + "learning_rate": 0.0043237767254164835, + "loss": 7.9615, + "step": 399200 + }, + { + "epoch": 1.6266604473621764, + "grad_norm": 2.0624938011169434, + "learning_rate": 0.004323437090253278, + "loss": 7.971, + "step": 399300 + }, + { + "epoch": 1.627067825385558, + "grad_norm": 3.218437671661377, + "learning_rate": 0.004323097383197533, + "loss": 7.9829, + "step": 399400 + }, + { + "epoch": 1.6274752034089393, + "grad_norm": 4.9599103927612305, + "learning_rate": 0.004322757604262674, + "loss": 7.9524, + "step": 399500 + }, + { + "epoch": 1.6278825814323208, + "grad_norm": 2.601705312728882, + "learning_rate": 0.004322417753462135, + "loss": 7.9643, + "step": 399600 + }, + { + "epoch": 1.628289959455702, + "grad_norm": 3.322831869125366, + "learning_rate": 0.004322077830809351, + "loss": 7.9489, + "step": 399700 + }, + { + "epoch": 1.6286973374790836, + "grad_norm": 8.045740127563477, + "learning_rate": 0.004321737836317761, + "loss": 7.9575, + "step": 399800 + }, + { + "epoch": 1.6291047155024652, + "grad_norm": 2.5539023876190186, + "learning_rate": 0.004321397770000811, + "loss": 7.9826, + "step": 399900 + }, + { + "epoch": 1.6295120935258467, + "grad_norm": 2.9093823432922363, + "learning_rate": 0.00432105763187194, + "loss": 7.9627, + "step": 400000 + }, + { + "epoch": 1.6295120935258467, + "eval_MaskedAccuracy": 0.4935417284178504, + "eval_loss": 1.6767427921295166, + "eval_runtime": 177.7659, + "eval_samples_per_second": 357.076, + "eval_steps_per_second": 1.395, + "step": 400000 + }, + { + "epoch": 1.6299194715492282, + "grad_norm": 5.491323471069336, + "learning_rate": 0.004320717421944603, + "loss": 7.9637, + "step": 400100 + }, + { + "epoch": 1.6303268495726095, + "grad_norm": 4.1802239418029785, + "learning_rate": 0.004320377140232242, + "loss": 7.9297, + "step": 400200 + }, + { + "epoch": 1.6307342275959908, + "grad_norm": 2.1961169242858887, + "learning_rate": 0.004320036786748309, + "loss": 7.9707, + "step": 400300 + }, + { + "epoch": 1.6311416056193724, + "grad_norm": 4.688284873962402, + "learning_rate": 0.004319696361506266, + "loss": 7.953, + "step": 400400 + }, + { + "epoch": 1.631548983642754, + "grad_norm": 3.7482964992523193, + "learning_rate": 0.00431935586451957, + "loss": 7.9678, + "step": 400500 + }, + { + "epoch": 1.6319563616661354, + "grad_norm": 3.6271839141845703, + "learning_rate": 0.004319015295801683, + "loss": 7.9599, + "step": 400600 + }, + { + "epoch": 1.632363739689517, + "grad_norm": 1.394652247428894, + "learning_rate": 0.004318674655366075, + "loss": 7.9605, + "step": 400700 + }, + { + "epoch": 1.6327711177128983, + "grad_norm": 6.607354164123535, + "learning_rate": 0.004318333943226209, + "loss": 7.9698, + "step": 400800 + }, + { + "epoch": 1.6331784957362796, + "grad_norm": 4.040886402130127, + "learning_rate": 0.004317993159395542, + "loss": 7.9767, + "step": 400900 + }, + { + "epoch": 1.6335858737596611, + "grad_norm": 1.9131618738174438, + "learning_rate": 0.004317652303887569, + "loss": 7.9528, + "step": 401000 + }, + { + "epoch": 1.6335858737596611, + "eval_MaskedAccuracy": 0.4931281655770838, + "eval_loss": 1.6782045364379883, + "eval_runtime": 223.652, + "eval_samples_per_second": 283.816, + "eval_steps_per_second": 1.109, + "step": 401000 + }, + { + "epoch": 1.6339932517830427, + "grad_norm": 2.304086208343506, + "learning_rate": 0.004317311376715756, + "loss": 7.9797, + "step": 401100 + }, + { + "epoch": 1.6344006298064242, + "grad_norm": 2.0475404262542725, + "learning_rate": 0.00431697037789358, + "loss": 7.9691, + "step": 401200 + }, + { + "epoch": 1.6348080078298057, + "grad_norm": 1.0804282426834106, + "learning_rate": 0.004316629307434522, + "loss": 7.9735, + "step": 401300 + }, + { + "epoch": 1.635215385853187, + "grad_norm": 1.3630620241165161, + "learning_rate": 0.004316288165352066, + "loss": 7.9923, + "step": 401400 + }, + { + "epoch": 1.6356227638765686, + "grad_norm": 1.6740797758102417, + "learning_rate": 0.0043159469516597035, + "loss": 7.9888, + "step": 401500 + }, + { + "epoch": 1.6360301418999499, + "grad_norm": 2.8491854667663574, + "learning_rate": 0.004315605666370924, + "loss": 7.9797, + "step": 401600 + }, + { + "epoch": 1.6364375199233314, + "grad_norm": 1.6639002561569214, + "learning_rate": 0.004315264309499222, + "loss": 7.9679, + "step": 401700 + }, + { + "epoch": 1.636844897946713, + "grad_norm": 4.907125949859619, + "learning_rate": 0.004314922881058082, + "loss": 7.9838, + "step": 401800 + }, + { + "epoch": 1.6372522759700945, + "grad_norm": 1.5400975942611694, + "learning_rate": 0.004314581381061013, + "loss": 7.9144, + "step": 401900 + }, + { + "epoch": 1.6376596539934758, + "grad_norm": 1.6475751399993896, + "learning_rate": 0.004314239809521518, + "loss": 7.9528, + "step": 402000 + }, + { + "epoch": 1.6376596539934758, + "eval_MaskedAccuracy": 0.49255158440846286, + "eval_loss": 1.6747674942016602, + "eval_runtime": 234.5816, + "eval_samples_per_second": 270.592, + "eval_steps_per_second": 1.057, + "step": 402000 + }, + { + "epoch": 1.6380670320168573, + "grad_norm": 2.1660892963409424, + "learning_rate": 0.00431389816645309, + "loss": 7.9743, + "step": 402100 + }, + { + "epoch": 1.6384744100402386, + "grad_norm": 3.7078475952148438, + "learning_rate": 0.004313556451869249, + "loss": 7.9838, + "step": 402200 + }, + { + "epoch": 1.6388817880636202, + "grad_norm": 5.297366142272949, + "learning_rate": 0.004313214665783495, + "loss": 7.9534, + "step": 402300 + }, + { + "epoch": 1.6392891660870017, + "grad_norm": 6.336480617523193, + "learning_rate": 0.00431287280820935, + "loss": 7.9774, + "step": 402400 + }, + { + "epoch": 1.6396965441103832, + "grad_norm": 3.608903169631958, + "learning_rate": 0.004312530879160318, + "loss": 7.956, + "step": 402500 + }, + { + "epoch": 1.6401039221337648, + "grad_norm": 8.459395408630371, + "learning_rate": 0.0043121888786499234, + "loss": 7.9813, + "step": 402600 + }, + { + "epoch": 1.640511300157146, + "grad_norm": 2.347891330718994, + "learning_rate": 0.004311846806691691, + "loss": 7.9465, + "step": 402700 + }, + { + "epoch": 1.6409186781805274, + "grad_norm": 4.714664459228516, + "learning_rate": 0.004311504663299135, + "loss": 7.965, + "step": 402800 + }, + { + "epoch": 1.641326056203909, + "grad_norm": 1.5120912790298462, + "learning_rate": 0.004311162448485785, + "loss": 7.9713, + "step": 402900 + }, + { + "epoch": 1.6417334342272905, + "grad_norm": 4.471959590911865, + "learning_rate": 0.004310820162265182, + "loss": 7.9685, + "step": 403000 + }, + { + "epoch": 1.6417334342272905, + "eval_MaskedAccuracy": 0.4934505209408353, + "eval_loss": 1.6755223274230957, + "eval_runtime": 211.8507, + "eval_samples_per_second": 299.626, + "eval_steps_per_second": 1.171, + "step": 403000 + }, + { + "epoch": 1.642140812250672, + "grad_norm": 4.829403400421143, + "learning_rate": 0.0043104778046508495, + "loss": 7.9463, + "step": 403100 + }, + { + "epoch": 1.6425481902740535, + "grad_norm": 2.266538381576538, + "learning_rate": 0.004310135375656314, + "loss": 7.9841, + "step": 403200 + }, + { + "epoch": 1.6429555682974348, + "grad_norm": 2.3340885639190674, + "learning_rate": 0.004309792875295126, + "loss": 7.9527, + "step": 403300 + }, + { + "epoch": 1.6433629463208161, + "grad_norm": 7.065990924835205, + "learning_rate": 0.004309450303580828, + "loss": 7.9651, + "step": 403400 + }, + { + "epoch": 1.6437703243441977, + "grad_norm": 2.794461965560913, + "learning_rate": 0.004309107660526959, + "loss": 7.958, + "step": 403500 + }, + { + "epoch": 1.6441777023675792, + "grad_norm": 4.081377029418945, + "learning_rate": 0.00430876494614706, + "loss": 7.9748, + "step": 403600 + }, + { + "epoch": 1.6445850803909607, + "grad_norm": 3.606553316116333, + "learning_rate": 0.004308422160454688, + "loss": 7.9587, + "step": 403700 + }, + { + "epoch": 1.6449924584143423, + "grad_norm": 5.445745468139648, + "learning_rate": 0.004308079303463388, + "loss": 7.9171, + "step": 403800 + }, + { + "epoch": 1.6453998364377236, + "grad_norm": 4.294501781463623, + "learning_rate": 0.004307736375186721, + "loss": 7.9594, + "step": 403900 + }, + { + "epoch": 1.6458072144611051, + "grad_norm": 1.0839147567749023, + "learning_rate": 0.004307393375638248, + "loss": 7.9425, + "step": 404000 + }, + { + "epoch": 1.6458072144611051, + "eval_MaskedAccuracy": 0.4937758301783506, + "eval_loss": 1.675153136253357, + "eval_runtime": 185.3521, + "eval_samples_per_second": 342.462, + "eval_steps_per_second": 1.338, + "step": 404000 + }, + { + "epoch": 1.6462145924844864, + "grad_norm": 5.004493236541748, + "learning_rate": 0.004307050304831517, + "loss": 7.9765, + "step": 404100 + }, + { + "epoch": 1.646621970507868, + "grad_norm": 3.2074897289276123, + "learning_rate": 0.004306707162780103, + "loss": 7.9565, + "step": 404200 + }, + { + "epoch": 1.6470293485312495, + "grad_norm": 1.7683274745941162, + "learning_rate": 0.004306363949497569, + "loss": 7.985, + "step": 404300 + }, + { + "epoch": 1.647436726554631, + "grad_norm": 2.3448328971862793, + "learning_rate": 0.0043060206649974805, + "loss": 7.955, + "step": 404400 + }, + { + "epoch": 1.6478441045780123, + "grad_norm": 2.96193528175354, + "learning_rate": 0.0043056773092934155, + "loss": 7.9635, + "step": 404500 + }, + { + "epoch": 1.6482514826013939, + "grad_norm": 1.4944769144058228, + "learning_rate": 0.004305333882398952, + "loss": 7.958, + "step": 404600 + }, + { + "epoch": 1.6486588606247752, + "grad_norm": 2.9811477661132812, + "learning_rate": 0.004304990384327652, + "loss": 7.9884, + "step": 404700 + }, + { + "epoch": 1.6490662386481567, + "grad_norm": 13.158381462097168, + "learning_rate": 0.004304646815093105, + "loss": 7.9626, + "step": 404800 + }, + { + "epoch": 1.6494736166715382, + "grad_norm": 4.048529148101807, + "learning_rate": 0.0043043031747089, + "loss": 7.958, + "step": 404900 + }, + { + "epoch": 1.6498809946949198, + "grad_norm": 1.8619483709335327, + "learning_rate": 0.004303959463188612, + "loss": 7.9605, + "step": 405000 + }, + { + "epoch": 1.6498809946949198, + "eval_MaskedAccuracy": 0.49467404513256763, + "eval_loss": 1.6699923276901245, + "eval_runtime": 266.4221, + "eval_samples_per_second": 238.254, + "eval_steps_per_second": 0.931, + "step": 405000 + }, + { + "epoch": 1.6502883727183013, + "grad_norm": 4.210719585418701, + "learning_rate": 0.004303615680545833, + "loss": 7.9587, + "step": 405100 + }, + { + "epoch": 1.6506957507416826, + "grad_norm": 6.177710056304932, + "learning_rate": 0.004303271826794163, + "loss": 7.9308, + "step": 405200 + }, + { + "epoch": 1.651103128765064, + "grad_norm": 8.642091751098633, + "learning_rate": 0.00430292790194719, + "loss": 7.9682, + "step": 405300 + }, + { + "epoch": 1.6515105067884455, + "grad_norm": 4.50748348236084, + "learning_rate": 0.004302583906018505, + "loss": 7.9616, + "step": 405400 + }, + { + "epoch": 1.651917884811827, + "grad_norm": 4.075202465057373, + "learning_rate": 0.004302239839021721, + "loss": 7.9338, + "step": 405500 + }, + { + "epoch": 1.6523252628352085, + "grad_norm": 1.6843070983886719, + "learning_rate": 0.004301895700970431, + "loss": 7.978, + "step": 405600 + }, + { + "epoch": 1.65273264085859, + "grad_norm": 2.934196710586548, + "learning_rate": 0.0043015514918782445, + "loss": 7.9676, + "step": 405700 + }, + { + "epoch": 1.6531400188819714, + "grad_norm": 2.3354573249816895, + "learning_rate": 0.004301207211758769, + "loss": 7.9966, + "step": 405800 + }, + { + "epoch": 1.6535473969053527, + "grad_norm": 3.1194190979003906, + "learning_rate": 0.00430086286062561, + "loss": 7.985, + "step": 405900 + }, + { + "epoch": 1.6539547749287342, + "grad_norm": 1.0679525136947632, + "learning_rate": 0.004300518438492392, + "loss": 7.9865, + "step": 406000 + }, + { + "epoch": 1.6539547749287342, + "eval_MaskedAccuracy": 0.49296178046008887, + "eval_loss": 1.6834198236465454, + "eval_runtime": 170.1924, + "eval_samples_per_second": 372.966, + "eval_steps_per_second": 1.457, + "step": 406000 + }, + { + "epoch": 1.6543621529521157, + "grad_norm": 3.0432701110839844, + "learning_rate": 0.004300173945372724, + "loss": 7.9259, + "step": 406100 + }, + { + "epoch": 1.6547695309754973, + "grad_norm": 2.8257131576538086, + "learning_rate": 0.004299829381280237, + "loss": 7.9596, + "step": 406200 + }, + { + "epoch": 1.6551769089988788, + "grad_norm": 1.2833287715911865, + "learning_rate": 0.004299484746228541, + "loss": 7.9542, + "step": 406300 + }, + { + "epoch": 1.6555842870222601, + "grad_norm": 1.386589527130127, + "learning_rate": 0.004299140040231264, + "loss": 7.9351, + "step": 406400 + }, + { + "epoch": 1.6559916650456417, + "grad_norm": 1.9801644086837769, + "learning_rate": 0.004298795263302037, + "loss": 7.9559, + "step": 406500 + }, + { + "epoch": 1.656399043069023, + "grad_norm": 2.5966262817382812, + "learning_rate": 0.0042984504154544865, + "loss": 7.9901, + "step": 406600 + }, + { + "epoch": 1.6568064210924045, + "grad_norm": 1.6363015174865723, + "learning_rate": 0.004298105496702247, + "loss": 7.9713, + "step": 406700 + }, + { + "epoch": 1.657213799115786, + "grad_norm": 4.371832847595215, + "learning_rate": 0.004297760507058954, + "loss": 7.9641, + "step": 406800 + }, + { + "epoch": 1.6576211771391676, + "grad_norm": 1.9847220182418823, + "learning_rate": 0.004297415446538257, + "loss": 7.9447, + "step": 406900 + }, + { + "epoch": 1.6580285551625489, + "grad_norm": 1.4259852170944214, + "learning_rate": 0.004297070315153793, + "loss": 7.9332, + "step": 407000 + }, + { + "epoch": 1.6580285551625489, + "eval_MaskedAccuracy": 0.49389236265893344, + "eval_loss": 1.6760696172714233, + "eval_runtime": 283.6492, + "eval_samples_per_second": 223.783, + "eval_steps_per_second": 0.874, + "step": 407000 + }, + { + "epoch": 1.6584359331859304, + "grad_norm": 3.0803794860839844, + "learning_rate": 0.004296725112919201, + "loss": 7.9529, + "step": 407100 + }, + { + "epoch": 1.6588433112093117, + "grad_norm": 1.4253218173980713, + "learning_rate": 0.004296379839848138, + "loss": 7.9313, + "step": 407200 + }, + { + "epoch": 1.6592506892326933, + "grad_norm": 1.1467676162719727, + "learning_rate": 0.004296034495954242, + "loss": 7.9871, + "step": 407300 + }, + { + "epoch": 1.6596580672560748, + "grad_norm": 1.4785820245742798, + "learning_rate": 0.004295689081251178, + "loss": 8.0032, + "step": 407400 + }, + { + "epoch": 1.6600654452794563, + "grad_norm": 4.422873020172119, + "learning_rate": 0.004295343595752593, + "loss": 7.9378, + "step": 407500 + }, + { + "epoch": 1.6604728233028379, + "grad_norm": 5.220428943634033, + "learning_rate": 0.004294998039472155, + "loss": 7.9202, + "step": 407600 + }, + { + "epoch": 1.6608802013262192, + "grad_norm": 8.391969680786133, + "learning_rate": 0.0042946524124235184, + "loss": 7.9858, + "step": 407700 + }, + { + "epoch": 1.6612875793496005, + "grad_norm": 3.050560474395752, + "learning_rate": 0.004294306714620354, + "loss": 7.9808, + "step": 407800 + }, + { + "epoch": 1.661694957372982, + "grad_norm": 3.4746363162994385, + "learning_rate": 0.004293960946076323, + "loss": 7.9493, + "step": 407900 + }, + { + "epoch": 1.6621023353963635, + "grad_norm": 1.5520859956741333, + "learning_rate": 0.004293615106805094, + "loss": 7.9309, + "step": 408000 + }, + { + "epoch": 1.6621023353963635, + "eval_MaskedAccuracy": 0.4951752125082972, + "eval_loss": 1.66789972782135, + "eval_runtime": 230.7829, + "eval_samples_per_second": 275.046, + "eval_steps_per_second": 1.075, + "step": 408000 + }, + { + "epoch": 1.662509713419745, + "grad_norm": 7.703237056732178, + "learning_rate": 0.004293269196820345, + "loss": 7.9612, + "step": 408100 + }, + { + "epoch": 1.6629170914431266, + "grad_norm": 2.1739680767059326, + "learning_rate": 0.004292923216135756, + "loss": 7.9304, + "step": 408200 + }, + { + "epoch": 1.663324469466508, + "grad_norm": 3.6314687728881836, + "learning_rate": 0.004292577164764995, + "loss": 7.95, + "step": 408300 + }, + { + "epoch": 1.6637318474898892, + "grad_norm": 1.4672982692718506, + "learning_rate": 0.004292231042721748, + "loss": 7.9747, + "step": 408400 + }, + { + "epoch": 1.6641392255132708, + "grad_norm": 2.5027122497558594, + "learning_rate": 0.004291884850019704, + "loss": 7.96, + "step": 408500 + }, + { + "epoch": 1.6645466035366523, + "grad_norm": 1.6916136741638184, + "learning_rate": 0.004291538586672537, + "loss": 7.9663, + "step": 408600 + }, + { + "epoch": 1.6649539815600338, + "grad_norm": 1.5934362411499023, + "learning_rate": 0.004291192252693948, + "loss": 7.9469, + "step": 408700 + }, + { + "epoch": 1.6653613595834154, + "grad_norm": 2.4465174674987793, + "learning_rate": 0.0042908458480976274, + "loss": 7.9432, + "step": 408800 + }, + { + "epoch": 1.6657687376067967, + "grad_norm": 4.356081008911133, + "learning_rate": 0.004290499372897264, + "loss": 7.984, + "step": 408900 + }, + { + "epoch": 1.6661761156301782, + "grad_norm": 3.9348583221435547, + "learning_rate": 0.004290152827106566, + "loss": 7.9373, + "step": 409000 + }, + { + "epoch": 1.6661761156301782, + "eval_MaskedAccuracy": 0.4956848225531449, + "eval_loss": 1.672873854637146, + "eval_runtime": 177.0871, + "eval_samples_per_second": 358.445, + "eval_steps_per_second": 1.4, + "step": 409000 + }, + { + "epoch": 1.6665834936535595, + "grad_norm": 2.170700788497925, + "learning_rate": 0.004289806210739226, + "loss": 7.9574, + "step": 409100 + }, + { + "epoch": 1.666990871676941, + "grad_norm": 6.48043155670166, + "learning_rate": 0.0042894595238089465, + "loss": 7.9671, + "step": 409200 + }, + { + "epoch": 1.6673982497003226, + "grad_norm": 3.2680306434631348, + "learning_rate": 0.0042891127663294455, + "loss": 7.9648, + "step": 409300 + }, + { + "epoch": 1.667805627723704, + "grad_norm": 7.81300687789917, + "learning_rate": 0.004288765938314423, + "loss": 7.9988, + "step": 409400 + }, + { + "epoch": 1.6682130057470854, + "grad_norm": 2.928682327270508, + "learning_rate": 0.0042884190397775935, + "loss": 7.94, + "step": 409500 + }, + { + "epoch": 1.668620383770467, + "grad_norm": 5.415159225463867, + "learning_rate": 0.004288072070732674, + "loss": 7.9688, + "step": 409600 + }, + { + "epoch": 1.6690277617938483, + "grad_norm": 2.4109930992126465, + "learning_rate": 0.004287725031193375, + "loss": 7.9196, + "step": 409700 + }, + { + "epoch": 1.6694351398172298, + "grad_norm": 3.1917884349823, + "learning_rate": 0.004287377921173422, + "loss": 7.9317, + "step": 409800 + }, + { + "epoch": 1.6698425178406113, + "grad_norm": 2.7681193351745605, + "learning_rate": 0.004287030740686535, + "loss": 7.956, + "step": 409900 + }, + { + "epoch": 1.6702498958639929, + "grad_norm": 4.684969425201416, + "learning_rate": 0.004286683489746447, + "loss": 7.9379, + "step": 410000 + }, + { + "epoch": 1.6702498958639929, + "eval_MaskedAccuracy": 0.49399434037778567, + "eval_loss": 1.6739771366119385, + "eval_runtime": 249.0096, + "eval_samples_per_second": 254.914, + "eval_steps_per_second": 0.996, + "step": 410000 + }, + { + "epoch": 1.6706572738873744, + "grad_norm": 3.6070775985717773, + "learning_rate": 0.004286336168366876, + "loss": 7.9518, + "step": 410100 + }, + { + "epoch": 1.6710646519107557, + "grad_norm": 6.102087497711182, + "learning_rate": 0.004285988776561558, + "loss": 7.9334, + "step": 410200 + }, + { + "epoch": 1.671472029934137, + "grad_norm": 7.099484920501709, + "learning_rate": 0.00428564131434423, + "loss": 7.9412, + "step": 410300 + }, + { + "epoch": 1.6718794079575185, + "grad_norm": 2.260991096496582, + "learning_rate": 0.004285293781728627, + "loss": 7.943, + "step": 410400 + }, + { + "epoch": 1.6722867859809, + "grad_norm": 5.115174293518066, + "learning_rate": 0.00428494617872849, + "loss": 7.9473, + "step": 410500 + }, + { + "epoch": 1.6726941640042816, + "grad_norm": 2.993117094039917, + "learning_rate": 0.004284598505357562, + "loss": 7.9453, + "step": 410600 + }, + { + "epoch": 1.6731015420276631, + "grad_norm": 3.088998556137085, + "learning_rate": 0.004284250761629587, + "loss": 7.9341, + "step": 410700 + }, + { + "epoch": 1.6735089200510445, + "grad_norm": 3.6426403522491455, + "learning_rate": 0.004283902947558312, + "loss": 7.9126, + "step": 410800 + }, + { + "epoch": 1.6739162980744258, + "grad_norm": 2.7926712036132812, + "learning_rate": 0.004283555063157483, + "loss": 7.9481, + "step": 410900 + }, + { + "epoch": 1.6743236760978073, + "grad_norm": 4.062406539916992, + "learning_rate": 0.00428320710844086, + "loss": 7.989, + "step": 411000 + }, + { + "epoch": 1.6743236760978073, + "eval_MaskedAccuracy": 0.49204426507389587, + "eval_loss": 1.6853642463684082, + "eval_runtime": 162.4687, + "eval_samples_per_second": 390.697, + "eval_steps_per_second": 1.526, + "step": 411000 + }, + { + "epoch": 1.6747310541211888, + "grad_norm": 3.0080082416534424, + "learning_rate": 0.004282859083422203, + "loss": 8.0003, + "step": 411100 + }, + { + "epoch": 1.6751384321445704, + "grad_norm": 3.0320534706115723, + "learning_rate": 0.0042825109881152726, + "loss": 7.9293, + "step": 411200 + }, + { + "epoch": 1.675545810167952, + "grad_norm": 6.334164142608643, + "learning_rate": 0.004282162822533823, + "loss": 7.9709, + "step": 411300 + }, + { + "epoch": 1.6759531881913332, + "grad_norm": 1.5587489604949951, + "learning_rate": 0.004281814586691617, + "loss": 7.9611, + "step": 411400 + }, + { + "epoch": 1.6763605662147147, + "grad_norm": 3.1051578521728516, + "learning_rate": 0.004281466280602431, + "loss": 7.9527, + "step": 411500 + }, + { + "epoch": 1.676767944238096, + "grad_norm": 5.084553241729736, + "learning_rate": 0.0042811179042800355, + "loss": 7.9475, + "step": 411600 + }, + { + "epoch": 1.6771753222614776, + "grad_norm": 4.1257243156433105, + "learning_rate": 0.004280769457738201, + "loss": 7.9343, + "step": 411700 + }, + { + "epoch": 1.6775827002848591, + "grad_norm": 1.3054580688476562, + "learning_rate": 0.004280420940990705, + "loss": 7.976, + "step": 411800 + }, + { + "epoch": 1.6779900783082407, + "grad_norm": 4.228930473327637, + "learning_rate": 0.004280072354051323, + "loss": 7.9528, + "step": 411900 + }, + { + "epoch": 1.678397456331622, + "grad_norm": 3.6710877418518066, + "learning_rate": 0.004279723696933832, + "loss": 7.9711, + "step": 412000 + }, + { + "epoch": 1.678397456331622, + "eval_MaskedAccuracy": 0.4943088891483813, + "eval_loss": 1.6768760681152344, + "eval_runtime": 214.2433, + "eval_samples_per_second": 296.28, + "eval_steps_per_second": 1.158, + "step": 412000 + }, + { + "epoch": 1.6788048343550035, + "grad_norm": 6.524370193481445, + "learning_rate": 0.0042793749696520165, + "loss": 7.9817, + "step": 412100 + }, + { + "epoch": 1.6792122123783848, + "grad_norm": 2.379122495651245, + "learning_rate": 0.004279026172219673, + "loss": 7.9257, + "step": 412200 + }, + { + "epoch": 1.6796195904017663, + "grad_norm": 2.0603673458099365, + "learning_rate": 0.004278677304650586, + "loss": 7.9422, + "step": 412300 + }, + { + "epoch": 1.6800269684251479, + "grad_norm": 1.7358134984970093, + "learning_rate": 0.00427832836695855, + "loss": 7.9817, + "step": 412400 + }, + { + "epoch": 1.6804343464485294, + "grad_norm": 3.6427125930786133, + "learning_rate": 0.004277979359157362, + "loss": 7.9542, + "step": 412500 + }, + { + "epoch": 1.680841724471911, + "grad_norm": 1.3322529792785645, + "learning_rate": 0.004277630281260814, + "loss": 7.998, + "step": 412600 + }, + { + "epoch": 1.6812491024952922, + "grad_norm": 3.6892545223236084, + "learning_rate": 0.004277281133282713, + "loss": 7.9742, + "step": 412700 + }, + { + "epoch": 1.6816564805186736, + "grad_norm": 1.3000271320343018, + "learning_rate": 0.004276931915236861, + "loss": 7.9364, + "step": 412800 + }, + { + "epoch": 1.682063858542055, + "grad_norm": 4.525582790374756, + "learning_rate": 0.004276582627137063, + "loss": 7.9527, + "step": 412900 + }, + { + "epoch": 1.6824712365654366, + "grad_norm": 2.7989420890808105, + "learning_rate": 0.0042762332689971275, + "loss": 7.9654, + "step": 413000 + }, + { + "epoch": 1.6824712365654366, + "eval_MaskedAccuracy": 0.49499718782733126, + "eval_loss": 1.671339988708496, + "eval_runtime": 231.7647, + "eval_samples_per_second": 273.881, + "eval_steps_per_second": 1.07, + "step": 413000 + }, + { + "epoch": 1.6828786145888182, + "grad_norm": 5.067391395568848, + "learning_rate": 0.004275883840830866, + "loss": 7.9547, + "step": 413100 + }, + { + "epoch": 1.6832859926121997, + "grad_norm": 1.8825346231460571, + "learning_rate": 0.0042755343426520995, + "loss": 7.9453, + "step": 413200 + }, + { + "epoch": 1.683693370635581, + "grad_norm": 3.201878786087036, + "learning_rate": 0.004275184774474639, + "loss": 7.9486, + "step": 413300 + }, + { + "epoch": 1.6841007486589623, + "grad_norm": 1.6017849445343018, + "learning_rate": 0.004274835136312307, + "loss": 7.9322, + "step": 413400 + }, + { + "epoch": 1.6845081266823438, + "grad_norm": 2.9095780849456787, + "learning_rate": 0.004274485428178923, + "loss": 7.9611, + "step": 413500 + }, + { + "epoch": 1.6849155047057254, + "grad_norm": 1.9746508598327637, + "learning_rate": 0.00427413565008831, + "loss": 7.9572, + "step": 413600 + }, + { + "epoch": 1.685322882729107, + "grad_norm": 4.102407932281494, + "learning_rate": 0.004273785802054307, + "loss": 7.9506, + "step": 413700 + }, + { + "epoch": 1.6857302607524884, + "grad_norm": 1.4797166585922241, + "learning_rate": 0.004273435884090749, + "loss": 7.9547, + "step": 413800 + }, + { + "epoch": 1.6861376387758698, + "grad_norm": 1.2575322389602661, + "learning_rate": 0.004273085896211464, + "loss": 7.943, + "step": 413900 + }, + { + "epoch": 1.6865450167992513, + "grad_norm": 2.634089231491089, + "learning_rate": 0.004272735838430278, + "loss": 7.9474, + "step": 414000 + }, + { + "epoch": 1.6865450167992513, + "eval_MaskedAccuracy": 0.4948957822579699, + "eval_loss": 1.6674184799194336, + "eval_runtime": 180.886, + "eval_samples_per_second": 350.917, + "eval_steps_per_second": 1.371, + "step": 414000 + }, + { + "epoch": 1.6869523948226326, + "grad_norm": 2.0500218868255615, + "learning_rate": 0.004272385710761043, + "loss": 7.9274, + "step": 414100 + }, + { + "epoch": 1.6873597728460141, + "grad_norm": 6.2046356201171875, + "learning_rate": 0.0042720355132175964, + "loss": 7.9405, + "step": 414200 + }, + { + "epoch": 1.6877671508693957, + "grad_norm": 4.2807936668396, + "learning_rate": 0.00427168524581379, + "loss": 7.9614, + "step": 414300 + }, + { + "epoch": 1.6881745288927772, + "grad_norm": 2.766878604888916, + "learning_rate": 0.004271334908563461, + "loss": 7.9708, + "step": 414400 + }, + { + "epoch": 1.6885819069161585, + "grad_norm": 1.8761850595474243, + "learning_rate": 0.004270984501480462, + "loss": 7.9459, + "step": 414500 + }, + { + "epoch": 1.68898928493954, + "grad_norm": 2.0470542907714844, + "learning_rate": 0.00427063402457866, + "loss": 7.9384, + "step": 414600 + }, + { + "epoch": 1.6893966629629213, + "grad_norm": 2.438100576400757, + "learning_rate": 0.004270283477871898, + "loss": 7.9537, + "step": 414700 + }, + { + "epoch": 1.6898040409863029, + "grad_norm": 4.45005464553833, + "learning_rate": 0.00426993286137404, + "loss": 7.928, + "step": 414800 + }, + { + "epoch": 1.6902114190096844, + "grad_norm": 2.664546489715576, + "learning_rate": 0.00426958217509895, + "loss": 7.9209, + "step": 414900 + }, + { + "epoch": 1.690618797033066, + "grad_norm": 1.286209225654602, + "learning_rate": 0.004269231419060486, + "loss": 7.955, + "step": 415000 + }, + { + "epoch": 1.690618797033066, + "eval_MaskedAccuracy": 0.49416102800073225, + "eval_loss": 1.6761102676391602, + "eval_runtime": 181.4399, + "eval_samples_per_second": 349.846, + "eval_steps_per_second": 1.367, + "step": 415000 + }, + { + "epoch": 1.6910261750564475, + "grad_norm": 3.3695225715637207, + "learning_rate": 0.004268880593272521, + "loss": 7.9491, + "step": 415100 + }, + { + "epoch": 1.6914335530798288, + "grad_norm": 1.6755496263504028, + "learning_rate": 0.004268529697748922, + "loss": 7.9383, + "step": 415200 + }, + { + "epoch": 1.69184093110321, + "grad_norm": 2.594820022583008, + "learning_rate": 0.004268178732503559, + "loss": 7.9261, + "step": 415300 + }, + { + "epoch": 1.6922483091265916, + "grad_norm": 3.90124773979187, + "learning_rate": 0.004267827697550309, + "loss": 7.9241, + "step": 415400 + }, + { + "epoch": 1.6926556871499732, + "grad_norm": 5.081125259399414, + "learning_rate": 0.004267476592903052, + "loss": 7.9661, + "step": 415500 + }, + { + "epoch": 1.6930630651733547, + "grad_norm": 2.1485257148742676, + "learning_rate": 0.004267125418575669, + "loss": 7.9798, + "step": 415600 + }, + { + "epoch": 1.6934704431967362, + "grad_norm": 1.2827850580215454, + "learning_rate": 0.004266774174582044, + "loss": 7.9659, + "step": 415700 + }, + { + "epoch": 1.6938778212201175, + "grad_norm": 6.2516984939575195, + "learning_rate": 0.004266422860936063, + "loss": 7.9508, + "step": 415800 + }, + { + "epoch": 1.6942851992434989, + "grad_norm": 4.668462753295898, + "learning_rate": 0.004266071477651613, + "loss": 7.9435, + "step": 415900 + }, + { + "epoch": 1.6946925772668804, + "grad_norm": 2.5708775520324707, + "learning_rate": 0.004265720024742591, + "loss": 7.9611, + "step": 416000 + }, + { + "epoch": 1.6946925772668804, + "eval_MaskedAccuracy": 0.49421570324190384, + "eval_loss": 1.6862056255340576, + "eval_runtime": 275.3313, + "eval_samples_per_second": 230.544, + "eval_steps_per_second": 0.901, + "step": 416000 + }, + { + "epoch": 1.695099955290262, + "grad_norm": 5.651008129119873, + "learning_rate": 0.004265368502222889, + "loss": 7.9128, + "step": 416100 + }, + { + "epoch": 1.6955073333136434, + "grad_norm": 4.246887683868408, + "learning_rate": 0.004265016910106409, + "loss": 7.9104, + "step": 416200 + }, + { + "epoch": 1.695914711337025, + "grad_norm": 3.257011890411377, + "learning_rate": 0.004264665248407044, + "loss": 7.9553, + "step": 416300 + }, + { + "epoch": 1.6963220893604063, + "grad_norm": 4.464527130126953, + "learning_rate": 0.004264313517138699, + "loss": 7.9419, + "step": 416400 + }, + { + "epoch": 1.6967294673837878, + "grad_norm": 1.7570393085479736, + "learning_rate": 0.0042639617163152755, + "loss": 7.9531, + "step": 416500 + }, + { + "epoch": 1.6971368454071691, + "grad_norm": 2.615110397338867, + "learning_rate": 0.004263609845950698, + "loss": 7.9562, + "step": 416600 + }, + { + "epoch": 1.6975442234305507, + "grad_norm": 1.7686477899551392, + "learning_rate": 0.004263257906058863, + "loss": 7.965, + "step": 416700 + }, + { + "epoch": 1.6979516014539322, + "grad_norm": 1.298539638519287, + "learning_rate": 0.004262905896653691, + "loss": 7.9437, + "step": 416800 + }, + { + "epoch": 1.6983589794773137, + "grad_norm": 4.391310214996338, + "learning_rate": 0.004262553817749098, + "loss": 7.9452, + "step": 416900 + }, + { + "epoch": 1.698766357500695, + "grad_norm": 5.422664165496826, + "learning_rate": 0.0042622016693590024, + "loss": 7.9323, + "step": 417000 + }, + { + "epoch": 1.698766357500695, + "eval_MaskedAccuracy": 0.49353902698434704, + "eval_loss": 1.6833051443099976, + "eval_runtime": 200.6313, + "eval_samples_per_second": 316.381, + "eval_steps_per_second": 1.236, + "step": 417000 + }, + { + "epoch": 1.6991737355240766, + "grad_norm": 5.574663162231445, + "learning_rate": 0.004261849451497324, + "loss": 7.9334, + "step": 417100 + }, + { + "epoch": 1.699581113547458, + "grad_norm": 1.7768208980560303, + "learning_rate": 0.004261497164177991, + "loss": 7.9233, + "step": 417200 + }, + { + "epoch": 1.6999884915708394, + "grad_norm": 3.46205472946167, + "learning_rate": 0.004261144807414934, + "loss": 7.9454, + "step": 417300 + }, + { + "epoch": 1.700395869594221, + "grad_norm": 8.98598575592041, + "learning_rate": 0.00426079238122208, + "loss": 7.9507, + "step": 417400 + }, + { + "epoch": 1.7008032476176025, + "grad_norm": 2.936594009399414, + "learning_rate": 0.004260439885613358, + "loss": 7.982, + "step": 417500 + }, + { + "epoch": 1.701210625640984, + "grad_norm": 3.1874189376831055, + "learning_rate": 0.004260087320602715, + "loss": 7.9595, + "step": 417600 + }, + { + "epoch": 1.7016180036643653, + "grad_norm": 5.177585601806641, + "learning_rate": 0.004259734686204081, + "loss": 7.9514, + "step": 417700 + }, + { + "epoch": 1.7020253816877466, + "grad_norm": 5.808584690093994, + "learning_rate": 0.004259381982431398, + "loss": 7.9618, + "step": 417800 + }, + { + "epoch": 1.7024327597111282, + "grad_norm": 2.695436954498291, + "learning_rate": 0.004259029209298612, + "loss": 7.9535, + "step": 417900 + }, + { + "epoch": 1.7028401377345097, + "grad_norm": 5.604297637939453, + "learning_rate": 0.004258676366819673, + "loss": 7.9603, + "step": 418000 + }, + { + "epoch": 1.7028401377345097, + "eval_MaskedAccuracy": 0.49453569524485747, + "eval_loss": 1.6734776496887207, + "eval_runtime": 289.2535, + "eval_samples_per_second": 219.448, + "eval_steps_per_second": 0.857, + "step": 418000 + }, + { + "epoch": 1.7032475157578912, + "grad_norm": 2.5026907920837402, + "learning_rate": 0.004258323455008524, + "loss": 7.975, + "step": 418100 + }, + { + "epoch": 1.7036548937812728, + "grad_norm": 2.7050795555114746, + "learning_rate": 0.004257970473879123, + "loss": 7.9621, + "step": 418200 + }, + { + "epoch": 1.704062271804654, + "grad_norm": 2.320899248123169, + "learning_rate": 0.0042576174234454185, + "loss": 7.9488, + "step": 418300 + }, + { + "epoch": 1.7044696498280354, + "grad_norm": 2.0656144618988037, + "learning_rate": 0.004257264303721376, + "loss": 7.8987, + "step": 418400 + }, + { + "epoch": 1.704877027851417, + "grad_norm": 4.858016490936279, + "learning_rate": 0.004256911114720962, + "loss": 7.9145, + "step": 418500 + }, + { + "epoch": 1.7052844058747985, + "grad_norm": 3.3390238285064697, + "learning_rate": 0.004256557856458122, + "loss": 7.9105, + "step": 418600 + }, + { + "epoch": 1.70569178389818, + "grad_norm": 1.1924008131027222, + "learning_rate": 0.004256204528946833, + "loss": 7.9466, + "step": 418700 + }, + { + "epoch": 1.7060991619215615, + "grad_norm": 2.3600430488586426, + "learning_rate": 0.004255851132201069, + "loss": 7.9802, + "step": 418800 + }, + { + "epoch": 1.7065065399449428, + "grad_norm": 3.2106237411499023, + "learning_rate": 0.004255497666234791, + "loss": 7.9314, + "step": 418900 + }, + { + "epoch": 1.7069139179683244, + "grad_norm": 2.626401662826538, + "learning_rate": 0.004255144131061982, + "loss": 7.953, + "step": 419000 + }, + { + "epoch": 1.7069139179683244, + "eval_MaskedAccuracy": 0.4944812553546451, + "eval_loss": 1.6595406532287598, + "eval_runtime": 214.6741, + "eval_samples_per_second": 295.685, + "eval_steps_per_second": 1.155, + "step": 419000 + }, + { + "epoch": 1.7073212959917057, + "grad_norm": 3.147618055343628, + "learning_rate": 0.004254790526696614, + "loss": 7.9468, + "step": 419100 + }, + { + "epoch": 1.7077286740150872, + "grad_norm": 4.491914749145508, + "learning_rate": 0.004254436853152674, + "loss": 7.9405, + "step": 419200 + }, + { + "epoch": 1.7081360520384687, + "grad_norm": 7.588096618652344, + "learning_rate": 0.004254083110444123, + "loss": 7.9451, + "step": 419300 + }, + { + "epoch": 1.7085434300618503, + "grad_norm": 2.622401714324951, + "learning_rate": 0.004253729298584968, + "loss": 7.9366, + "step": 419400 + }, + { + "epoch": 1.7089508080852316, + "grad_norm": 3.4954352378845215, + "learning_rate": 0.004253375417589195, + "loss": 7.9665, + "step": 419500 + }, + { + "epoch": 1.7093581861086131, + "grad_norm": 5.299036026000977, + "learning_rate": 0.00425302146747079, + "loss": 7.8988, + "step": 419600 + }, + { + "epoch": 1.7097655641319944, + "grad_norm": 1.441435694694519, + "learning_rate": 0.00425266744824375, + "loss": 7.9282, + "step": 419700 + }, + { + "epoch": 1.710172942155376, + "grad_norm": 6.788598537445068, + "learning_rate": 0.004252313359922064, + "loss": 7.9406, + "step": 419800 + }, + { + "epoch": 1.7105803201787575, + "grad_norm": 2.6914849281311035, + "learning_rate": 0.00425195920251974, + "loss": 7.9587, + "step": 419900 + }, + { + "epoch": 1.710987698202139, + "grad_norm": 1.9702292680740356, + "learning_rate": 0.0042516049760507775, + "loss": 7.9498, + "step": 420000 + }, + { + "epoch": 1.710987698202139, + "eval_MaskedAccuracy": 0.4946215008836926, + "eval_loss": 1.671088695526123, + "eval_runtime": 258.7533, + "eval_samples_per_second": 245.315, + "eval_steps_per_second": 0.958, + "step": 420000 + }, + { + "epoch": 1.7113950762255206, + "grad_norm": 7.365056037902832, + "learning_rate": 0.00425125068052917, + "loss": 7.9604, + "step": 420100 + }, + { + "epoch": 1.7118024542489019, + "grad_norm": 8.19728946685791, + "learning_rate": 0.004250896315968938, + "loss": 7.9543, + "step": 420200 + }, + { + "epoch": 1.7122098322722832, + "grad_norm": 2.184723138809204, + "learning_rate": 0.004250541882384093, + "loss": 7.9384, + "step": 420300 + }, + { + "epoch": 1.7126172102956647, + "grad_norm": 2.058471202850342, + "learning_rate": 0.004250187379788638, + "loss": 7.941, + "step": 420400 + }, + { + "epoch": 1.7130245883190462, + "grad_norm": 1.7724852561950684, + "learning_rate": 0.004249832808196584, + "loss": 7.9573, + "step": 420500 + }, + { + "epoch": 1.7134319663424278, + "grad_norm": 10.145926475524902, + "learning_rate": 0.004249478167621958, + "loss": 7.9009, + "step": 420600 + }, + { + "epoch": 1.7138393443658093, + "grad_norm": 6.171712398529053, + "learning_rate": 0.004249123458078784, + "loss": 7.9543, + "step": 420700 + }, + { + "epoch": 1.7142467223891906, + "grad_norm": 4.679589748382568, + "learning_rate": 0.004248768679581081, + "loss": 7.937, + "step": 420800 + }, + { + "epoch": 1.714654100412572, + "grad_norm": 7.17892599105835, + "learning_rate": 0.004248413832142874, + "loss": 7.9342, + "step": 420900 + }, + { + "epoch": 1.7150614784359535, + "grad_norm": 4.674642562866211, + "learning_rate": 0.004248058915778193, + "loss": 7.9379, + "step": 421000 + }, + { + "epoch": 1.7150614784359535, + "eval_MaskedAccuracy": 0.49462195343410076, + "eval_loss": 1.6705025434494019, + "eval_runtime": 249.3654, + "eval_samples_per_second": 254.55, + "eval_steps_per_second": 0.995, + "step": 421000 + }, + { + "epoch": 1.715468856459335, + "grad_norm": 1.7300959825515747, + "learning_rate": 0.004247703930501073, + "loss": 7.9063, + "step": 421100 + }, + { + "epoch": 1.7158762344827165, + "grad_norm": 9.26630687713623, + "learning_rate": 0.004247348876325544, + "loss": 7.9698, + "step": 421200 + }, + { + "epoch": 1.716283612506098, + "grad_norm": 2.407864809036255, + "learning_rate": 0.004246993753265649, + "loss": 7.9343, + "step": 421300 + }, + { + "epoch": 1.7166909905294794, + "grad_norm": 3.963239908218384, + "learning_rate": 0.004246638561335424, + "loss": 7.9579, + "step": 421400 + }, + { + "epoch": 1.717098368552861, + "grad_norm": 3.381040096282959, + "learning_rate": 0.004246283300548916, + "loss": 7.9266, + "step": 421500 + }, + { + "epoch": 1.7175057465762422, + "grad_norm": 4.004113674163818, + "learning_rate": 0.0042459279709201525, + "loss": 7.9318, + "step": 421600 + }, + { + "epoch": 1.7179131245996238, + "grad_norm": 4.8017168045043945, + "learning_rate": 0.004245572572463203, + "loss": 7.9577, + "step": 421700 + }, + { + "epoch": 1.7183205026230053, + "grad_norm": 3.4024367332458496, + "learning_rate": 0.00424521710519211, + "loss": 7.9593, + "step": 421800 + }, + { + "epoch": 1.7187278806463868, + "grad_norm": 4.002869129180908, + "learning_rate": 0.0042448615691209265, + "loss": 7.9441, + "step": 421900 + }, + { + "epoch": 1.7191352586697681, + "grad_norm": 2.9499707221984863, + "learning_rate": 0.004244505964263711, + "loss": 7.939, + "step": 422000 + }, + { + "epoch": 1.7191352586697681, + "eval_MaskedAccuracy": 0.4945934567815738, + "eval_loss": 1.6787841320037842, + "eval_runtime": 342.0885, + "eval_samples_per_second": 185.554, + "eval_steps_per_second": 0.725, + "step": 422000 + }, + { + "epoch": 1.7195426366931497, + "grad_norm": 2.5295894145965576, + "learning_rate": 0.004244150290634521, + "loss": 7.9285, + "step": 422100 + }, + { + "epoch": 1.719950014716531, + "grad_norm": 2.591674327850342, + "learning_rate": 0.004243794548247419, + "loss": 7.9027, + "step": 422200 + }, + { + "epoch": 1.7203573927399125, + "grad_norm": 3.139409303665161, + "learning_rate": 0.004243438737116469, + "loss": 7.9523, + "step": 422300 + }, + { + "epoch": 1.720764770763294, + "grad_norm": 6.544683933258057, + "learning_rate": 0.0042430828572557365, + "loss": 7.9359, + "step": 422400 + }, + { + "epoch": 1.7211721487866756, + "grad_norm": 2.374436140060425, + "learning_rate": 0.004242726908679294, + "loss": 7.9489, + "step": 422500 + }, + { + "epoch": 1.721579526810057, + "grad_norm": 2.56363582611084, + "learning_rate": 0.004242370891401212, + "loss": 7.9068, + "step": 422600 + }, + { + "epoch": 1.7219869048334384, + "grad_norm": 5.707026481628418, + "learning_rate": 0.004242014805435568, + "loss": 7.9544, + "step": 422700 + }, + { + "epoch": 1.7223942828568197, + "grad_norm": 1.828076958656311, + "learning_rate": 0.004241658650796436, + "loss": 7.9206, + "step": 422800 + }, + { + "epoch": 1.7228016608802013, + "grad_norm": 1.7943181991577148, + "learning_rate": 0.0042413024274979, + "loss": 7.9149, + "step": 422900 + }, + { + "epoch": 1.7232090389035828, + "grad_norm": 5.432728290557861, + "learning_rate": 0.004240946135554039, + "loss": 7.9043, + "step": 423000 + }, + { + "epoch": 1.7232090389035828, + "eval_MaskedAccuracy": 0.49358708868547657, + "eval_loss": 1.6763889789581299, + "eval_runtime": 164.9674, + "eval_samples_per_second": 384.779, + "eval_steps_per_second": 1.503, + "step": 423000 + }, + { + "epoch": 1.7236164169269643, + "grad_norm": 2.6649765968322754, + "learning_rate": 0.004240589774978941, + "loss": 7.9519, + "step": 423100 + }, + { + "epoch": 1.7240237949503459, + "grad_norm": 4.455107688903809, + "learning_rate": 0.0042402333457867035, + "loss": 7.9516, + "step": 423200 + }, + { + "epoch": 1.7244311729737272, + "grad_norm": 5.428768634796143, + "learning_rate": 0.004239876847991411, + "loss": 7.9421, + "step": 423300 + }, + { + "epoch": 1.7248385509971085, + "grad_norm": 5.04478120803833, + "learning_rate": 0.00423952028160716, + "loss": 7.9131, + "step": 423400 + }, + { + "epoch": 1.72524592902049, + "grad_norm": 4.8387932777404785, + "learning_rate": 0.004239163646648039, + "loss": 7.9496, + "step": 423500 + }, + { + "epoch": 1.7256533070438715, + "grad_norm": 4.750764846801758, + "learning_rate": 0.004238806943128157, + "loss": 7.9309, + "step": 423600 + }, + { + "epoch": 1.726060685067253, + "grad_norm": 3.087653160095215, + "learning_rate": 0.004238450171061615, + "loss": 7.9352, + "step": 423700 + }, + { + "epoch": 1.7264680630906346, + "grad_norm": 6.936778545379639, + "learning_rate": 0.004238093330462516, + "loss": 7.9261, + "step": 423800 + }, + { + "epoch": 1.726875441114016, + "grad_norm": 4.899052619934082, + "learning_rate": 0.004237736421344969, + "loss": 7.9258, + "step": 423900 + }, + { + "epoch": 1.7272828191373975, + "grad_norm": 7.004554748535156, + "learning_rate": 0.004237379443723085, + "loss": 7.9139, + "step": 424000 + }, + { + "epoch": 1.7272828191373975, + "eval_MaskedAccuracy": 0.49389334649659283, + "eval_loss": 1.671739101409912, + "eval_runtime": 160.3116, + "eval_samples_per_second": 395.954, + "eval_steps_per_second": 1.547, + "step": 424000 + }, + { + "epoch": 1.7276901971607788, + "grad_norm": 1.0757733583450317, + "learning_rate": 0.004237022397610971, + "loss": 7.9485, + "step": 424100 + }, + { + "epoch": 1.7280975751841603, + "grad_norm": 3.9845657348632812, + "learning_rate": 0.004236665283022757, + "loss": 7.9496, + "step": 424200 + }, + { + "epoch": 1.7285049532075418, + "grad_norm": 2.754138708114624, + "learning_rate": 0.00423630809997255, + "loss": 7.932, + "step": 424300 + }, + { + "epoch": 1.7289123312309234, + "grad_norm": 3.6710002422332764, + "learning_rate": 0.00423595084847447, + "loss": 7.9567, + "step": 424400 + }, + { + "epoch": 1.7293197092543047, + "grad_norm": 3.097033739089966, + "learning_rate": 0.004235593528542646, + "loss": 7.9502, + "step": 424500 + }, + { + "epoch": 1.7297270872776862, + "grad_norm": 2.262523651123047, + "learning_rate": 0.004235236140191199, + "loss": 7.9608, + "step": 424600 + }, + { + "epoch": 1.7301344653010675, + "grad_norm": 3.1382665634155273, + "learning_rate": 0.004234878683434268, + "loss": 7.922, + "step": 424700 + }, + { + "epoch": 1.730541843324449, + "grad_norm": 1.1934833526611328, + "learning_rate": 0.004234521158285986, + "loss": 7.9379, + "step": 424800 + }, + { + "epoch": 1.7309492213478306, + "grad_norm": 4.094176292419434, + "learning_rate": 0.00423416356476048, + "loss": 7.9662, + "step": 424900 + }, + { + "epoch": 1.7313565993712121, + "grad_norm": 4.058340072631836, + "learning_rate": 0.004233805902871889, + "loss": 7.9328, + "step": 425000 + }, + { + "epoch": 1.7313565993712121, + "eval_MaskedAccuracy": 0.493221795250412, + "eval_loss": 1.6672805547714233, + "eval_runtime": 167.1428, + "eval_samples_per_second": 379.771, + "eval_steps_per_second": 1.484, + "step": 425000 + }, + { + "epoch": 1.7317639773945936, + "grad_norm": 4.574451446533203, + "learning_rate": 0.004233448172634353, + "loss": 7.9546, + "step": 425100 + }, + { + "epoch": 1.732171355417975, + "grad_norm": 6.09646463394165, + "learning_rate": 0.004233090374062018, + "loss": 7.9528, + "step": 425200 + }, + { + "epoch": 1.7325787334413563, + "grad_norm": 1.7354048490524292, + "learning_rate": 0.004232732507169023, + "loss": 7.9722, + "step": 425300 + }, + { + "epoch": 1.7329861114647378, + "grad_norm": 3.8242075443267822, + "learning_rate": 0.004232374571969524, + "loss": 7.925, + "step": 425400 + }, + { + "epoch": 1.7333934894881193, + "grad_norm": 2.1813385486602783, + "learning_rate": 0.004232016568477671, + "loss": 7.9431, + "step": 425500 + }, + { + "epoch": 1.7338008675115009, + "grad_norm": 2.1258385181427, + "learning_rate": 0.004231658496707613, + "loss": 7.9297, + "step": 425600 + }, + { + "epoch": 1.7342082455348824, + "grad_norm": 1.3778568506240845, + "learning_rate": 0.004231300356673509, + "loss": 7.9432, + "step": 425700 + }, + { + "epoch": 1.7346156235582637, + "grad_norm": 2.5177571773529053, + "learning_rate": 0.004230942148389524, + "loss": 7.9426, + "step": 425800 + }, + { + "epoch": 1.735023001581645, + "grad_norm": 1.6045327186584473, + "learning_rate": 0.004230583871869813, + "loss": 7.9286, + "step": 425900 + }, + { + "epoch": 1.7354303796050266, + "grad_norm": 4.004244327545166, + "learning_rate": 0.004230225527128542, + "loss": 7.9145, + "step": 426000 + }, + { + "epoch": 1.7354303796050266, + "eval_MaskedAccuracy": 0.49541676722671146, + "eval_loss": 1.6673568487167358, + "eval_runtime": 279.7464, + "eval_samples_per_second": 226.905, + "eval_steps_per_second": 0.887, + "step": 426000 + }, + { + "epoch": 1.735837757628408, + "grad_norm": 1.5896828174591064, + "learning_rate": 0.004229867114179879, + "loss": 7.9383, + "step": 426100 + }, + { + "epoch": 1.7362451356517896, + "grad_norm": 4.541566848754883, + "learning_rate": 0.004229508633037998, + "loss": 7.9762, + "step": 426200 + }, + { + "epoch": 1.7366525136751711, + "grad_norm": 3.481902837753296, + "learning_rate": 0.0042291500837170655, + "loss": 7.9363, + "step": 426300 + }, + { + "epoch": 1.7370598916985525, + "grad_norm": 1.3038591146469116, + "learning_rate": 0.00422879146623126, + "loss": 7.9506, + "step": 426400 + }, + { + "epoch": 1.737467269721934, + "grad_norm": 2.6721527576446533, + "learning_rate": 0.004228432780594757, + "loss": 7.8815, + "step": 426500 + }, + { + "epoch": 1.7378746477453153, + "grad_norm": 2.730501413345337, + "learning_rate": 0.004228074026821736, + "loss": 7.9677, + "step": 426600 + }, + { + "epoch": 1.7382820257686968, + "grad_norm": 4.208138942718506, + "learning_rate": 0.004227715204926386, + "loss": 7.9414, + "step": 426700 + }, + { + "epoch": 1.7386894037920784, + "grad_norm": 1.6396872997283936, + "learning_rate": 0.004227356314922886, + "loss": 7.9424, + "step": 426800 + }, + { + "epoch": 1.73909678181546, + "grad_norm": 1.0482051372528076, + "learning_rate": 0.004226997356825428, + "loss": 7.9356, + "step": 426900 + }, + { + "epoch": 1.7395041598388412, + "grad_norm": 3.294713258743286, + "learning_rate": 0.004226638330648208, + "loss": 7.9513, + "step": 427000 + }, + { + "epoch": 1.7395041598388412, + "eval_MaskedAccuracy": 0.49443078227574316, + "eval_loss": 1.6736928224563599, + "eval_runtime": 150.1969, + "eval_samples_per_second": 422.619, + "eval_steps_per_second": 1.651, + "step": 427000 + }, + { + "epoch": 1.7399115378622227, + "grad_norm": 2.7487411499023438, + "learning_rate": 0.004226279236405414, + "loss": 7.9685, + "step": 427100 + }, + { + "epoch": 1.740318915885604, + "grad_norm": 4.569572448730469, + "learning_rate": 0.004225920074111246, + "loss": 7.9265, + "step": 427200 + }, + { + "epoch": 1.7407262939089856, + "grad_norm": 2.7802586555480957, + "learning_rate": 0.004225560843779905, + "loss": 7.9348, + "step": 427300 + }, + { + "epoch": 1.7411336719323671, + "grad_norm": 2.1910150051116943, + "learning_rate": 0.0042252015454255915, + "loss": 7.9242, + "step": 427400 + }, + { + "epoch": 1.7415410499557487, + "grad_norm": 4.726769924163818, + "learning_rate": 0.004224842179062509, + "loss": 7.9273, + "step": 427500 + }, + { + "epoch": 1.7419484279791302, + "grad_norm": 2.721534013748169, + "learning_rate": 0.004224482744704869, + "loss": 7.9336, + "step": 427600 + }, + { + "epoch": 1.7423558060025115, + "grad_norm": 1.8914376497268677, + "learning_rate": 0.004224123242366877, + "loss": 7.9219, + "step": 427700 + }, + { + "epoch": 1.7427631840258928, + "grad_norm": 1.1209359169006348, + "learning_rate": 0.00422376367206275, + "loss": 7.9069, + "step": 427800 + }, + { + "epoch": 1.7431705620492743, + "grad_norm": 1.9058761596679688, + "learning_rate": 0.004223404033806703, + "loss": 7.9621, + "step": 427900 + }, + { + "epoch": 1.7435779400726559, + "grad_norm": 1.6395440101623535, + "learning_rate": 0.0042230443276129615, + "loss": 7.9118, + "step": 428000 + }, + { + "epoch": 1.7435779400726559, + "eval_MaskedAccuracy": 0.4951851500268164, + "eval_loss": 1.6693843603134155, + "eval_runtime": 235.7121, + "eval_samples_per_second": 269.295, + "eval_steps_per_second": 1.052, + "step": 428000 + }, + { + "epoch": 1.7439853180960374, + "grad_norm": 5.839138984680176, + "learning_rate": 0.004222684553495732, + "loss": 7.8909, + "step": 428100 + }, + { + "epoch": 1.744392696119419, + "grad_norm": 2.5658891201019287, + "learning_rate": 0.004222324711469246, + "loss": 7.9404, + "step": 428200 + }, + { + "epoch": 1.7448000741428003, + "grad_norm": 1.5509271621704102, + "learning_rate": 0.004221964801547729, + "loss": 7.9292, + "step": 428300 + }, + { + "epoch": 1.7452074521661816, + "grad_norm": 5.172556400299072, + "learning_rate": 0.004221604823745414, + "loss": 7.8855, + "step": 428400 + }, + { + "epoch": 1.745614830189563, + "grad_norm": 6.072554111480713, + "learning_rate": 0.004221244778076527, + "loss": 7.9796, + "step": 428500 + }, + { + "epoch": 1.7460222082129446, + "grad_norm": 5.145614147186279, + "learning_rate": 0.004220884664555305, + "loss": 7.8932, + "step": 428600 + }, + { + "epoch": 1.7464295862363262, + "grad_norm": 3.9118359088897705, + "learning_rate": 0.004220524483195977, + "loss": 7.9148, + "step": 428700 + }, + { + "epoch": 1.7468369642597077, + "grad_norm": 7.136172771453857, + "learning_rate": 0.0042201642340128015, + "loss": 7.9739, + "step": 428800 + }, + { + "epoch": 1.747244342283089, + "grad_norm": 1.8290817737579346, + "learning_rate": 0.004219803917020007, + "loss": 7.9246, + "step": 428900 + }, + { + "epoch": 1.7476517203064705, + "grad_norm": 4.136419296264648, + "learning_rate": 0.0042194435322318435, + "loss": 7.8832, + "step": 429000 + }, + { + "epoch": 1.7476517203064705, + "eval_MaskedAccuracy": 0.494982637445449, + "eval_loss": 1.6747286319732666, + "eval_runtime": 256.3837, + "eval_samples_per_second": 247.582, + "eval_steps_per_second": 0.967, + "step": 429000 + }, + { + "epoch": 1.7480590983298518, + "grad_norm": 4.4007062911987305, + "learning_rate": 0.0042190830796625545, + "loss": 7.905, + "step": 429100 + }, + { + "epoch": 1.7484664763532334, + "grad_norm": 4.226682662963867, + "learning_rate": 0.004218722559326396, + "loss": 7.9215, + "step": 429200 + }, + { + "epoch": 1.748873854376615, + "grad_norm": 1.05141282081604, + "learning_rate": 0.004218361971237617, + "loss": 7.9446, + "step": 429300 + }, + { + "epoch": 1.7492812323999964, + "grad_norm": 3.943563461303711, + "learning_rate": 0.004218001315410477, + "loss": 7.9563, + "step": 429400 + }, + { + "epoch": 1.7496886104233778, + "grad_norm": 1.5362974405288696, + "learning_rate": 0.004217640591859233, + "loss": 7.9339, + "step": 429500 + }, + { + "epoch": 1.7500959884467593, + "grad_norm": 1.417009949684143, + "learning_rate": 0.004217279800598145, + "loss": 7.9275, + "step": 429600 + }, + { + "epoch": 1.7505033664701406, + "grad_norm": 2.833244562149048, + "learning_rate": 0.0042169189416414775, + "loss": 7.9469, + "step": 429700 + }, + { + "epoch": 1.7509107444935221, + "grad_norm": 3.411043882369995, + "learning_rate": 0.0042165580150035015, + "loss": 7.9252, + "step": 429800 + }, + { + "epoch": 1.7513181225169037, + "grad_norm": 2.143798828125, + "learning_rate": 0.004216197020698485, + "loss": 7.9404, + "step": 429900 + }, + { + "epoch": 1.7517255005402852, + "grad_norm": 2.492276430130005, + "learning_rate": 0.004215835958740691, + "loss": 7.9048, + "step": 430000 + }, + { + "epoch": 1.7517255005402852, + "eval_MaskedAccuracy": 0.49476444358347443, + "eval_loss": 1.6649599075317383, + "eval_runtime": 168.453, + "eval_samples_per_second": 376.817, + "eval_steps_per_second": 1.472, + "step": 430000 + }, + { + "epoch": 1.7521328785636667, + "grad_norm": 1.5841550827026367, + "learning_rate": 0.004215474829144404, + "loss": 7.9557, + "step": 430100 + }, + { + "epoch": 1.752540256587048, + "grad_norm": 1.9986809492111206, + "learning_rate": 0.0042151136319239, + "loss": 7.9075, + "step": 430200 + }, + { + "epoch": 1.7529476346104294, + "grad_norm": 5.14298677444458, + "learning_rate": 0.004214752367093458, + "loss": 7.9479, + "step": 430300 + }, + { + "epoch": 1.7533550126338109, + "grad_norm": 4.060118198394775, + "learning_rate": 0.004214391034667361, + "loss": 7.9276, + "step": 430400 + }, + { + "epoch": 1.7537623906571924, + "grad_norm": 6.250939846038818, + "learning_rate": 0.004214029634659891, + "loss": 7.9274, + "step": 430500 + }, + { + "epoch": 1.754169768680574, + "grad_norm": 1.6329816579818726, + "learning_rate": 0.0042136681670853484, + "loss": 7.9203, + "step": 430600 + }, + { + "epoch": 1.7545771467039555, + "grad_norm": 2.1676533222198486, + "learning_rate": 0.0042133066319580085, + "loss": 7.9316, + "step": 430700 + }, + { + "epoch": 1.7549845247273368, + "grad_norm": 3.689631700515747, + "learning_rate": 0.00421294502929217, + "loss": 7.9029, + "step": 430800 + }, + { + "epoch": 1.755391902750718, + "grad_norm": 4.070959091186523, + "learning_rate": 0.00421258335910213, + "loss": 7.9356, + "step": 430900 + }, + { + "epoch": 1.7557992807740996, + "grad_norm": 2.2822155952453613, + "learning_rate": 0.004212221621402184, + "loss": 7.935, + "step": 431000 + }, + { + "epoch": 1.7557992807740996, + "eval_MaskedAccuracy": 0.49453676369755617, + "eval_loss": 1.674303650856018, + "eval_runtime": 160.2692, + "eval_samples_per_second": 396.059, + "eval_steps_per_second": 1.547, + "step": 431000 + }, + { + "epoch": 1.7562066587974812, + "grad_norm": 2.610569953918457, + "learning_rate": 0.004211859816206645, + "loss": 7.93, + "step": 431100 + }, + { + "epoch": 1.7566140368208627, + "grad_norm": 2.744990825653076, + "learning_rate": 0.004211497943529805, + "loss": 7.9149, + "step": 431200 + }, + { + "epoch": 1.7570214148442442, + "grad_norm": 3.835881233215332, + "learning_rate": 0.004211136003385974, + "loss": 7.9793, + "step": 431300 + }, + { + "epoch": 1.7574287928676255, + "grad_norm": 1.9926955699920654, + "learning_rate": 0.004210773995789458, + "loss": 7.9155, + "step": 431400 + }, + { + "epoch": 1.757836170891007, + "grad_norm": 4.1470112800598145, + "learning_rate": 0.00421041192075458, + "loss": 7.9223, + "step": 431500 + }, + { + "epoch": 1.7582435489143884, + "grad_norm": 4.6428937911987305, + "learning_rate": 0.004210049778295644, + "loss": 7.9092, + "step": 431600 + }, + { + "epoch": 1.75865092693777, + "grad_norm": 1.5294262170791626, + "learning_rate": 0.004209687568426969, + "loss": 7.9197, + "step": 431700 + }, + { + "epoch": 1.7590583049611515, + "grad_norm": 2.488340377807617, + "learning_rate": 0.004209325291162881, + "loss": 7.9236, + "step": 431800 + }, + { + "epoch": 1.759465682984533, + "grad_norm": 1.558867335319519, + "learning_rate": 0.004208962946517693, + "loss": 7.9305, + "step": 431900 + }, + { + "epoch": 1.7598730610079143, + "grad_norm": 1.2909411191940308, + "learning_rate": 0.004208600534505739, + "loss": 7.91, + "step": 432000 + }, + { + "epoch": 1.7598730610079143, + "eval_MaskedAccuracy": 0.49530208904980155, + "eval_loss": 1.6677933931350708, + "eval_runtime": 226.9379, + "eval_samples_per_second": 279.706, + "eval_steps_per_second": 1.093, + "step": 432000 + }, + { + "epoch": 1.7602804390312958, + "grad_norm": 4.529399394989014, + "learning_rate": 0.004208238055141342, + "loss": 7.9588, + "step": 432100 + }, + { + "epoch": 1.7606878170546771, + "grad_norm": 6.088457107543945, + "learning_rate": 0.004207875508438836, + "loss": 7.9371, + "step": 432200 + }, + { + "epoch": 1.7610951950780587, + "grad_norm": 4.843735694885254, + "learning_rate": 0.004207512894412555, + "loss": 7.8799, + "step": 432300 + }, + { + "epoch": 1.7615025731014402, + "grad_norm": 1.129140019416809, + "learning_rate": 0.004207150213076832, + "loss": 7.9252, + "step": 432400 + }, + { + "epoch": 1.7619099511248217, + "grad_norm": 2.9595580101013184, + "learning_rate": 0.0042067874644460095, + "loss": 7.9051, + "step": 432500 + }, + { + "epoch": 1.7623173291482033, + "grad_norm": 12.065353393554688, + "learning_rate": 0.004206424648534424, + "loss": 7.9098, + "step": 432600 + }, + { + "epoch": 1.7627247071715846, + "grad_norm": 3.2894842624664307, + "learning_rate": 0.004206061765356425, + "loss": 7.9287, + "step": 432700 + }, + { + "epoch": 1.763132085194966, + "grad_norm": 2.2475671768188477, + "learning_rate": 0.00420569881492636, + "loss": 7.9415, + "step": 432800 + }, + { + "epoch": 1.7635394632183474, + "grad_norm": 4.475327014923096, + "learning_rate": 0.004205335797258577, + "loss": 7.9633, + "step": 432900 + }, + { + "epoch": 1.763946841241729, + "grad_norm": 1.3904350996017456, + "learning_rate": 0.004204972712367418, + "loss": 7.9322, + "step": 433000 + }, + { + "epoch": 1.763946841241729, + "eval_MaskedAccuracy": 0.4948103956917094, + "eval_loss": 1.6721564531326294, + "eval_runtime": 202.4683, + "eval_samples_per_second": 313.511, + "eval_steps_per_second": 1.225, + "step": 433000 + }, + { + "epoch": 1.7643542192651105, + "grad_norm": 4.464630126953125, + "learning_rate": 0.004204609560267249, + "loss": 7.9406, + "step": 433100 + }, + { + "epoch": 1.764761597288492, + "grad_norm": 4.836032867431641, + "learning_rate": 0.004204246340972421, + "loss": 7.9282, + "step": 433200 + }, + { + "epoch": 1.7651689753118733, + "grad_norm": 3.3716859817504883, + "learning_rate": 0.004203883054497301, + "loss": 7.8613, + "step": 433300 + }, + { + "epoch": 1.7655763533352546, + "grad_norm": 3.8334603309631348, + "learning_rate": 0.004203519700856246, + "loss": 7.8941, + "step": 433400 + }, + { + "epoch": 1.7659837313586362, + "grad_norm": 3.0918352603912354, + "learning_rate": 0.0042031562800636186, + "loss": 7.9405, + "step": 433500 + }, + { + "epoch": 1.7663911093820177, + "grad_norm": 4.291378498077393, + "learning_rate": 0.004202792792133799, + "loss": 7.9402, + "step": 433600 + }, + { + "epoch": 1.7667984874053992, + "grad_norm": 7.552089214324951, + "learning_rate": 0.004202429237081146, + "loss": 7.9158, + "step": 433700 + }, + { + "epoch": 1.7672058654287808, + "grad_norm": 3.02734375, + "learning_rate": 0.004202065614920036, + "loss": 7.955, + "step": 433800 + }, + { + "epoch": 1.767613243452162, + "grad_norm": 4.439110279083252, + "learning_rate": 0.0042017019256648525, + "loss": 7.9121, + "step": 433900 + }, + { + "epoch": 1.7680206214755436, + "grad_norm": 3.56619930267334, + "learning_rate": 0.0042013381693299596, + "loss": 7.9461, + "step": 434000 + }, + { + "epoch": 1.7680206214755436, + "eval_MaskedAccuracy": 0.49425697454076883, + "eval_loss": 1.674866795539856, + "eval_runtime": 168.5457, + "eval_samples_per_second": 376.61, + "eval_steps_per_second": 1.471, + "step": 434000 + }, + { + "epoch": 1.768427999498925, + "grad_norm": 6.718011856079102, + "learning_rate": 0.004200974345929748, + "loss": 7.9562, + "step": 434100 + }, + { + "epoch": 1.7688353775223065, + "grad_norm": 3.7811686992645264, + "learning_rate": 0.004200610455478597, + "loss": 7.9433, + "step": 434200 + }, + { + "epoch": 1.769242755545688, + "grad_norm": 3.410982847213745, + "learning_rate": 0.004200246497990897, + "loss": 7.9246, + "step": 434300 + }, + { + "epoch": 1.7696501335690695, + "grad_norm": 2.2441630363464355, + "learning_rate": 0.004199882473481041, + "loss": 7.9174, + "step": 434400 + }, + { + "epoch": 1.7700575115924508, + "grad_norm": 5.297140121459961, + "learning_rate": 0.004199518381963409, + "loss": 7.9172, + "step": 434500 + }, + { + "epoch": 1.7704648896158324, + "grad_norm": 1.750809907913208, + "learning_rate": 0.004199154223452406, + "loss": 7.9102, + "step": 434600 + }, + { + "epoch": 1.7708722676392137, + "grad_norm": 5.267836093902588, + "learning_rate": 0.004198789997962424, + "loss": 7.9104, + "step": 434700 + }, + { + "epoch": 1.7712796456625952, + "grad_norm": 3.8845372200012207, + "learning_rate": 0.004198425705507866, + "loss": 7.9406, + "step": 434800 + }, + { + "epoch": 1.7716870236859767, + "grad_norm": 5.069277763366699, + "learning_rate": 0.004198061346103134, + "loss": 7.9008, + "step": 434900 + }, + { + "epoch": 1.7720944017093583, + "grad_norm": 1.8418042659759521, + "learning_rate": 0.004197696919762629, + "loss": 7.9198, + "step": 435000 + }, + { + "epoch": 1.7720944017093583, + "eval_MaskedAccuracy": 0.49537977874568173, + "eval_loss": 1.6728644371032715, + "eval_runtime": 194.3036, + "eval_samples_per_second": 326.685, + "eval_steps_per_second": 1.276, + "step": 435000 + }, + { + "epoch": 1.7725017797327398, + "grad_norm": 2.8373005390167236, + "learning_rate": 0.00419733242650076, + "loss": 7.9016, + "step": 435100 + }, + { + "epoch": 1.7729091577561211, + "grad_norm": 3.8157289028167725, + "learning_rate": 0.004196967866331938, + "loss": 7.8901, + "step": 435200 + }, + { + "epoch": 1.7733165357795024, + "grad_norm": 3.1491050720214844, + "learning_rate": 0.004196603239270578, + "loss": 7.925, + "step": 435300 + }, + { + "epoch": 1.773723913802884, + "grad_norm": 3.543290376663208, + "learning_rate": 0.004196238545331096, + "loss": 7.9499, + "step": 435400 + }, + { + "epoch": 1.7741312918262655, + "grad_norm": 1.5468164682388306, + "learning_rate": 0.00419587378452791, + "loss": 7.9073, + "step": 435500 + }, + { + "epoch": 1.774538669849647, + "grad_norm": 1.449621319770813, + "learning_rate": 0.00419550895687544, + "loss": 7.9334, + "step": 435600 + }, + { + "epoch": 1.7749460478730286, + "grad_norm": 2.957839012145996, + "learning_rate": 0.004195144062388103, + "loss": 7.9364, + "step": 435700 + }, + { + "epoch": 1.7753534258964099, + "grad_norm": 1.856015682220459, + "learning_rate": 0.00419477910108034, + "loss": 7.9517, + "step": 435800 + }, + { + "epoch": 1.7757608039197912, + "grad_norm": 2.15834641456604, + "learning_rate": 0.004194414072966572, + "loss": 7.9036, + "step": 435900 + }, + { + "epoch": 1.7761681819431727, + "grad_norm": 5.483026504516602, + "learning_rate": 0.004194048978061223, + "loss": 7.8789, + "step": 436000 + }, + { + "epoch": 1.7761681819431727, + "eval_MaskedAccuracy": 0.4955424357984903, + "eval_loss": 1.6680715084075928, + "eval_runtime": 333.1299, + "eval_samples_per_second": 190.544, + "eval_steps_per_second": 0.744, + "step": 436000 + }, + { + "epoch": 1.7765755599665543, + "grad_norm": 3.4480948448181152, + "learning_rate": 0.004193683816378737, + "loss": 7.9173, + "step": 436100 + }, + { + "epoch": 1.7769829379899358, + "grad_norm": 1.7107717990875244, + "learning_rate": 0.004193318587933548, + "loss": 7.9448, + "step": 436200 + }, + { + "epoch": 1.7773903160133173, + "grad_norm": 6.747818946838379, + "learning_rate": 0.004192953292740097, + "loss": 7.9291, + "step": 436300 + }, + { + "epoch": 1.7777976940366986, + "grad_norm": 3.750035047531128, + "learning_rate": 0.004192587930812824, + "loss": 7.9411, + "step": 436400 + }, + { + "epoch": 1.7782050720600802, + "grad_norm": 1.7927358150482178, + "learning_rate": 0.004192222502166175, + "loss": 7.9195, + "step": 436500 + }, + { + "epoch": 1.7786124500834615, + "grad_norm": 1.3702152967453003, + "learning_rate": 0.004191857006814593, + "loss": 7.938, + "step": 436600 + }, + { + "epoch": 1.779019828106843, + "grad_norm": 2.765705108642578, + "learning_rate": 0.0041914914447725386, + "loss": 7.9257, + "step": 436700 + }, + { + "epoch": 1.7794272061302245, + "grad_norm": 2.7728800773620605, + "learning_rate": 0.004191125816054456, + "loss": 7.9118, + "step": 436800 + }, + { + "epoch": 1.779834584153606, + "grad_norm": 4.654808044433594, + "learning_rate": 0.004190760120674799, + "loss": 7.9745, + "step": 436900 + }, + { + "epoch": 1.7802419621769874, + "grad_norm": 5.276312351226807, + "learning_rate": 0.004190394358648029, + "loss": 7.9196, + "step": 437000 + }, + { + "epoch": 1.7802419621769874, + "eval_MaskedAccuracy": 0.49464101729552723, + "eval_loss": 1.6730901002883911, + "eval_runtime": 180.536, + "eval_samples_per_second": 351.597, + "eval_steps_per_second": 1.374, + "step": 437000 + }, + { + "epoch": 1.780649340200369, + "grad_norm": 3.10459566116333, + "learning_rate": 0.004190028529988608, + "loss": 7.9061, + "step": 437100 + }, + { + "epoch": 1.7810567182237502, + "grad_norm": 2.878385066986084, + "learning_rate": 0.004189662634710997, + "loss": 7.9194, + "step": 437200 + }, + { + "epoch": 1.7814640962471318, + "grad_norm": 4.35692024230957, + "learning_rate": 0.004189296672829658, + "loss": 7.9196, + "step": 437300 + }, + { + "epoch": 1.7818714742705133, + "grad_norm": 2.9915459156036377, + "learning_rate": 0.0041889306443590665, + "loss": 7.9393, + "step": 437400 + }, + { + "epoch": 1.7822788522938948, + "grad_norm": 6.092617034912109, + "learning_rate": 0.0041885645493136905, + "loss": 7.9078, + "step": 437500 + }, + { + "epoch": 1.7826862303172764, + "grad_norm": 5.489753246307373, + "learning_rate": 0.004188198387708, + "loss": 7.9125, + "step": 437600 + }, + { + "epoch": 1.7830936083406577, + "grad_norm": 4.809166431427002, + "learning_rate": 0.004187832159556479, + "loss": 7.9268, + "step": 437700 + }, + { + "epoch": 1.783500986364039, + "grad_norm": 2.3220558166503906, + "learning_rate": 0.004187465864873599, + "loss": 7.9213, + "step": 437800 + }, + { + "epoch": 1.7839083643874205, + "grad_norm": 4.348724365234375, + "learning_rate": 0.004187099503673847, + "loss": 7.9026, + "step": 437900 + }, + { + "epoch": 1.784315742410802, + "grad_norm": 1.4235539436340332, + "learning_rate": 0.004186733075971701, + "loss": 7.911, + "step": 438000 + }, + { + "epoch": 1.784315742410802, + "eval_MaskedAccuracy": 0.4959748050379288, + "eval_loss": 1.666916012763977, + "eval_runtime": 389.5847, + "eval_samples_per_second": 162.932, + "eval_steps_per_second": 0.637, + "step": 438000 + }, + { + "epoch": 1.7847231204341836, + "grad_norm": 4.689061641693115, + "learning_rate": 0.004186366581781657, + "loss": 7.9296, + "step": 438100 + }, + { + "epoch": 1.785130498457565, + "grad_norm": 1.690250039100647, + "learning_rate": 0.004186000021118195, + "loss": 7.9129, + "step": 438200 + }, + { + "epoch": 1.7855378764809464, + "grad_norm": 2.6211435794830322, + "learning_rate": 0.004185633393995812, + "loss": 7.9078, + "step": 438300 + }, + { + "epoch": 1.7859452545043277, + "grad_norm": 1.5534968376159668, + "learning_rate": 0.004185266700429004, + "loss": 7.8942, + "step": 438400 + }, + { + "epoch": 1.7863526325277093, + "grad_norm": 3.143200397491455, + "learning_rate": 0.004184899940432262, + "loss": 7.9078, + "step": 438500 + }, + { + "epoch": 1.7867600105510908, + "grad_norm": 2.1769890785217285, + "learning_rate": 0.004184533114020093, + "loss": 7.9193, + "step": 438600 + }, + { + "epoch": 1.7871673885744723, + "grad_norm": 7.959547519683838, + "learning_rate": 0.004184166221207, + "loss": 7.9189, + "step": 438700 + }, + { + "epoch": 1.7875747665978539, + "grad_norm": 2.8897268772125244, + "learning_rate": 0.004183799262007484, + "loss": 7.9022, + "step": 438800 + }, + { + "epoch": 1.7879821446212352, + "grad_norm": 3.0990419387817383, + "learning_rate": 0.004183432236436056, + "loss": 7.8757, + "step": 438900 + }, + { + "epoch": 1.7883895226446167, + "grad_norm": 3.6900036334991455, + "learning_rate": 0.0041830651445072245, + "loss": 7.9574, + "step": 439000 + }, + { + "epoch": 1.7883895226446167, + "eval_MaskedAccuracy": 0.49563555786652236, + "eval_loss": 1.673840880393982, + "eval_runtime": 176.5906, + "eval_samples_per_second": 359.453, + "eval_steps_per_second": 1.404, + "step": 439000 + }, + { + "epoch": 1.788796900667998, + "grad_norm": 2.1039018630981445, + "learning_rate": 0.004182697986235501, + "loss": 7.9234, + "step": 439100 + }, + { + "epoch": 1.7892042786913795, + "grad_norm": 4.542923927307129, + "learning_rate": 0.004182330761635402, + "loss": 7.9206, + "step": 439200 + }, + { + "epoch": 1.789611656714761, + "grad_norm": 2.3067336082458496, + "learning_rate": 0.00418196347072145, + "loss": 7.9344, + "step": 439300 + }, + { + "epoch": 1.7900190347381426, + "grad_norm": 1.9165045022964478, + "learning_rate": 0.004181596113508158, + "loss": 7.9013, + "step": 439400 + }, + { + "epoch": 1.790426412761524, + "grad_norm": 10.063130378723145, + "learning_rate": 0.004181228690010059, + "loss": 7.8963, + "step": 439500 + }, + { + "epoch": 1.7908337907849055, + "grad_norm": 7.03554105758667, + "learning_rate": 0.004180861200241673, + "loss": 7.9011, + "step": 439600 + }, + { + "epoch": 1.7912411688082868, + "grad_norm": 2.953171730041504, + "learning_rate": 0.0041804936442175365, + "loss": 7.9171, + "step": 439700 + }, + { + "epoch": 1.7916485468316683, + "grad_norm": 1.5528123378753662, + "learning_rate": 0.0041801260219521685, + "loss": 7.9011, + "step": 439800 + }, + { + "epoch": 1.7920559248550498, + "grad_norm": 3.7841126918792725, + "learning_rate": 0.004179758333460114, + "loss": 7.9069, + "step": 439900 + }, + { + "epoch": 1.7924633028784314, + "grad_norm": 3.153717279434204, + "learning_rate": 0.004179390578755899, + "loss": 7.9109, + "step": 440000 + }, + { + "epoch": 1.7924633028784314, + "eval_MaskedAccuracy": 0.4956089167070561, + "eval_loss": 1.660947561264038, + "eval_runtime": 168.1323, + "eval_samples_per_second": 377.536, + "eval_steps_per_second": 1.475, + "step": 440000 + }, + { + "epoch": 1.792870680901813, + "grad_norm": 8.45737361907959, + "learning_rate": 0.004179022757854072, + "loss": 7.8934, + "step": 440100 + }, + { + "epoch": 1.7932780589251942, + "grad_norm": 2.9157402515411377, + "learning_rate": 0.004178654870769175, + "loss": 7.9247, + "step": 440200 + }, + { + "epoch": 1.7936854369485755, + "grad_norm": 4.986844062805176, + "learning_rate": 0.0041782869175157515, + "loss": 7.9019, + "step": 440300 + }, + { + "epoch": 1.794092814971957, + "grad_norm": 3.2577908039093018, + "learning_rate": 0.004177918898108344, + "loss": 7.898, + "step": 440400 + }, + { + "epoch": 1.7945001929953386, + "grad_norm": 5.337949752807617, + "learning_rate": 0.0041775508125615046, + "loss": 7.941, + "step": 440500 + }, + { + "epoch": 1.7949075710187201, + "grad_norm": 4.633530139923096, + "learning_rate": 0.004177182660889788, + "loss": 7.9278, + "step": 440600 + }, + { + "epoch": 1.7953149490421016, + "grad_norm": 5.676455974578857, + "learning_rate": 0.004176814443107745, + "loss": 7.9107, + "step": 440700 + }, + { + "epoch": 1.795722327065483, + "grad_norm": 3.3541319370269775, + "learning_rate": 0.004176446159229939, + "loss": 7.9266, + "step": 440800 + }, + { + "epoch": 1.7961297050888643, + "grad_norm": 8.209612846374512, + "learning_rate": 0.004176077809270926, + "loss": 7.9407, + "step": 440900 + }, + { + "epoch": 1.7965370831122458, + "grad_norm": 2.504899263381958, + "learning_rate": 0.00417570939324527, + "loss": 7.9232, + "step": 441000 + }, + { + "epoch": 1.7965370831122458, + "eval_MaskedAccuracy": 0.49530813754566594, + "eval_loss": 1.6687226295471191, + "eval_runtime": 227.0884, + "eval_samples_per_second": 279.521, + "eval_steps_per_second": 1.092, + "step": 441000 + }, + { + "epoch": 1.7969444611356273, + "grad_norm": 3.49796199798584, + "learning_rate": 0.0041753409111675395, + "loss": 7.8928, + "step": 441100 + }, + { + "epoch": 1.7973518391590089, + "grad_norm": 1.6105597019195557, + "learning_rate": 0.004174972363052291, + "loss": 7.8804, + "step": 441200 + }, + { + "epoch": 1.7977592171823904, + "grad_norm": 1.1660513877868652, + "learning_rate": 0.0041746037489141, + "loss": 7.9187, + "step": 441300 + }, + { + "epoch": 1.7981665952057717, + "grad_norm": 3.5011515617370605, + "learning_rate": 0.00417423506876755, + "loss": 7.9022, + "step": 441400 + }, + { + "epoch": 1.7985739732291532, + "grad_norm": 4.8468403816223145, + "learning_rate": 0.004173866322627205, + "loss": 7.9085, + "step": 441500 + }, + { + "epoch": 1.7989813512525346, + "grad_norm": 4.980172634124756, + "learning_rate": 0.004173497510507649, + "loss": 7.9135, + "step": 441600 + }, + { + "epoch": 1.799388729275916, + "grad_norm": 1.7658847570419312, + "learning_rate": 0.004173128632423463, + "loss": 7.8693, + "step": 441700 + }, + { + "epoch": 1.7997961072992976, + "grad_norm": 3.6838791370391846, + "learning_rate": 0.004172759688389234, + "loss": 7.916, + "step": 441800 + }, + { + "epoch": 1.8002034853226792, + "grad_norm": 2.0375492572784424, + "learning_rate": 0.0041723906784195425, + "loss": 7.921, + "step": 441900 + }, + { + "epoch": 1.8006108633460605, + "grad_norm": 6.014378547668457, + "learning_rate": 0.0041720216025289796, + "loss": 7.8806, + "step": 442000 + }, + { + "epoch": 1.8006108633460605, + "eval_MaskedAccuracy": 0.49440137812230833, + "eval_loss": 1.6746090650558472, + "eval_runtime": 176.9757, + "eval_samples_per_second": 358.671, + "eval_steps_per_second": 1.401, + "step": 442000 + }, + { + "epoch": 1.801018241369442, + "grad_norm": 2.6855621337890625, + "learning_rate": 0.004171652460732139, + "loss": 7.932, + "step": 442100 + }, + { + "epoch": 1.8014256193928233, + "grad_norm": 3.248476028442383, + "learning_rate": 0.004171283253043618, + "loss": 7.8913, + "step": 442200 + }, + { + "epoch": 1.8018329974162048, + "grad_norm": 2.8973610401153564, + "learning_rate": 0.004170913979478002, + "loss": 7.9527, + "step": 442300 + }, + { + "epoch": 1.8022403754395864, + "grad_norm": 2.320071220397949, + "learning_rate": 0.0041705446400498985, + "loss": 7.9403, + "step": 442400 + }, + { + "epoch": 1.802647753462968, + "grad_norm": 5.010457992553711, + "learning_rate": 0.0041701752347739094, + "loss": 7.914, + "step": 442500 + }, + { + "epoch": 1.8030551314863492, + "grad_norm": 3.502883195877075, + "learning_rate": 0.004169805763664631, + "loss": 7.9174, + "step": 442600 + }, + { + "epoch": 1.8034625095097307, + "grad_norm": 1.849295973777771, + "learning_rate": 0.004169436226736683, + "loss": 7.9389, + "step": 442700 + }, + { + "epoch": 1.803869887533112, + "grad_norm": 2.2292819023132324, + "learning_rate": 0.004169066624004665, + "loss": 7.8988, + "step": 442800 + }, + { + "epoch": 1.8042772655564936, + "grad_norm": 1.607640027999878, + "learning_rate": 0.004168696955483185, + "loss": 7.8778, + "step": 442900 + }, + { + "epoch": 1.8046846435798751, + "grad_norm": 3.704303741455078, + "learning_rate": 0.004168327221186878, + "loss": 7.9121, + "step": 443000 + }, + { + "epoch": 1.8046846435798751, + "eval_MaskedAccuracy": 0.4949538749421863, + "eval_loss": 1.6765481233596802, + "eval_runtime": 174.5788, + "eval_samples_per_second": 363.595, + "eval_steps_per_second": 1.421, + "step": 443000 + }, + { + "epoch": 1.8050920216032567, + "grad_norm": 3.1414365768432617, + "learning_rate": 0.0041679574211303475, + "loss": 7.9105, + "step": 443100 + }, + { + "epoch": 1.8054993996266382, + "grad_norm": 1.1742734909057617, + "learning_rate": 0.004167587555328217, + "loss": 7.9042, + "step": 443200 + }, + { + "epoch": 1.8059067776500195, + "grad_norm": 4.276191234588623, + "learning_rate": 0.004167217623795106, + "loss": 7.9248, + "step": 443300 + }, + { + "epoch": 1.8063141556734008, + "grad_norm": 3.426767587661743, + "learning_rate": 0.004166847626545641, + "loss": 7.9353, + "step": 443400 + }, + { + "epoch": 1.8067215336967823, + "grad_norm": 3.6210439205169678, + "learning_rate": 0.004166477563594457, + "loss": 7.9257, + "step": 443500 + }, + { + "epoch": 1.8071289117201639, + "grad_norm": 3.102713108062744, + "learning_rate": 0.004166107434956181, + "loss": 7.8792, + "step": 443600 + }, + { + "epoch": 1.8075362897435454, + "grad_norm": 4.327525615692139, + "learning_rate": 0.004165737240645438, + "loss": 7.8969, + "step": 443700 + }, + { + "epoch": 1.807943667766927, + "grad_norm": 2.9925131797790527, + "learning_rate": 0.0041653669806768654, + "loss": 7.9257, + "step": 443800 + }, + { + "epoch": 1.8083510457903083, + "grad_norm": 4.600265979766846, + "learning_rate": 0.00416499665506511, + "loss": 7.9311, + "step": 443900 + }, + { + "epoch": 1.8087584238136898, + "grad_norm": 2.469639778137207, + "learning_rate": 0.0041646262638248025, + "loss": 7.9158, + "step": 444000 + }, + { + "epoch": 1.8087584238136898, + "eval_MaskedAccuracy": 0.49692060364473667, + "eval_loss": 1.665111780166626, + "eval_runtime": 186.8741, + "eval_samples_per_second": 339.672, + "eval_steps_per_second": 1.327, + "step": 444000 + }, + { + "epoch": 1.809165801837071, + "grad_norm": 1.4323352575302124, + "learning_rate": 0.004164255806970597, + "loss": 7.8947, + "step": 444100 + }, + { + "epoch": 1.8095731798604526, + "grad_norm": 1.980042815208435, + "learning_rate": 0.004163885284517136, + "loss": 7.9259, + "step": 444200 + }, + { + "epoch": 1.8099805578838342, + "grad_norm": 4.067874908447266, + "learning_rate": 0.0041635146964790724, + "loss": 7.9005, + "step": 444300 + }, + { + "epoch": 1.8103879359072157, + "grad_norm": 2.5142414569854736, + "learning_rate": 0.0041631440428710476, + "loss": 7.8821, + "step": 444400 + }, + { + "epoch": 1.810795313930597, + "grad_norm": 3.3953969478607178, + "learning_rate": 0.0041627733237077184, + "loss": 7.9353, + "step": 444500 + }, + { + "epoch": 1.8112026919539785, + "grad_norm": 2.7303261756896973, + "learning_rate": 0.004162402539003742, + "loss": 7.8995, + "step": 444600 + }, + { + "epoch": 1.8116100699773598, + "grad_norm": 5.998344898223877, + "learning_rate": 0.004162031688773784, + "loss": 7.9338, + "step": 444700 + }, + { + "epoch": 1.8120174480007414, + "grad_norm": 2.08536696434021, + "learning_rate": 0.004161660773032501, + "loss": 7.9245, + "step": 444800 + }, + { + "epoch": 1.812424826024123, + "grad_norm": 5.068397521972656, + "learning_rate": 0.004161289791794559, + "loss": 7.9051, + "step": 444900 + }, + { + "epoch": 1.8128322040475044, + "grad_norm": 3.3044345378875732, + "learning_rate": 0.00416091874507462, + "loss": 7.8989, + "step": 445000 + }, + { + "epoch": 1.8128322040475044, + "eval_MaskedAccuracy": 0.49567050833286097, + "eval_loss": 1.6687381267547607, + "eval_runtime": 262.1765, + "eval_samples_per_second": 242.112, + "eval_steps_per_second": 0.946, + "step": 445000 + }, + { + "epoch": 1.8132395820708858, + "grad_norm": 3.2127983570098877, + "learning_rate": 0.00416054763288736, + "loss": 7.9026, + "step": 445100 + }, + { + "epoch": 1.8136469600942673, + "grad_norm": 2.71120548248291, + "learning_rate": 0.004160176455247446, + "loss": 7.8831, + "step": 445200 + }, + { + "epoch": 1.8140543381176486, + "grad_norm": 3.842801094055176, + "learning_rate": 0.004159805212169553, + "loss": 7.8957, + "step": 445300 + }, + { + "epoch": 1.8144617161410301, + "grad_norm": 3.2258753776550293, + "learning_rate": 0.0041594339036683595, + "loss": 7.8848, + "step": 445400 + }, + { + "epoch": 1.8148690941644117, + "grad_norm": 1.8990269899368286, + "learning_rate": 0.004159062529758543, + "loss": 7.9124, + "step": 445500 + }, + { + "epoch": 1.8152764721877932, + "grad_norm": 3.415687322616577, + "learning_rate": 0.004158691090454788, + "loss": 7.9171, + "step": 445600 + }, + { + "epoch": 1.8156838502111747, + "grad_norm": 4.696331977844238, + "learning_rate": 0.004158319585771781, + "loss": 7.9059, + "step": 445700 + }, + { + "epoch": 1.816091228234556, + "grad_norm": 7.733753681182861, + "learning_rate": 0.004157948015724208, + "loss": 7.9152, + "step": 445800 + }, + { + "epoch": 1.8164986062579374, + "grad_norm": 5.251039505004883, + "learning_rate": 0.004157576380326765, + "loss": 7.9275, + "step": 445900 + }, + { + "epoch": 1.8169059842813189, + "grad_norm": 1.9722418785095215, + "learning_rate": 0.004157204679594133, + "loss": 7.8827, + "step": 446000 + }, + { + "epoch": 1.8169059842813189, + "eval_MaskedAccuracy": 0.4959173168432975, + "eval_loss": 1.6723440885543823, + "eval_runtime": 190.1191, + "eval_samples_per_second": 333.875, + "eval_steps_per_second": 1.304, + "step": 446000 + }, + { + "epoch": 1.8173133623047004, + "grad_norm": 3.0915658473968506, + "learning_rate": 0.004156832913541016, + "loss": 7.9025, + "step": 446100 + }, + { + "epoch": 1.817720740328082, + "grad_norm": 2.651003837585449, + "learning_rate": 0.004156461082182111, + "loss": 7.8733, + "step": 446200 + }, + { + "epoch": 1.8181281183514635, + "grad_norm": 1.6605842113494873, + "learning_rate": 0.004156089185532116, + "loss": 7.8446, + "step": 446300 + }, + { + "epoch": 1.8185354963748448, + "grad_norm": 2.606069326400757, + "learning_rate": 0.004155717223605732, + "loss": 7.8974, + "step": 446400 + }, + { + "epoch": 1.8189428743982263, + "grad_norm": 1.5527215003967285, + "learning_rate": 0.004155345196417669, + "loss": 7.8842, + "step": 446500 + }, + { + "epoch": 1.8193502524216076, + "grad_norm": 3.269723653793335, + "learning_rate": 0.004154973103982632, + "loss": 7.8867, + "step": 446600 + }, + { + "epoch": 1.8197576304449892, + "grad_norm": 3.024127960205078, + "learning_rate": 0.0041546009463153365, + "loss": 7.8656, + "step": 446700 + }, + { + "epoch": 1.8201650084683707, + "grad_norm": 3.980433225631714, + "learning_rate": 0.004154228723430491, + "loss": 7.8814, + "step": 446800 + }, + { + "epoch": 1.8205723864917522, + "grad_norm": 2.666149377822876, + "learning_rate": 0.004153856435342808, + "loss": 7.9254, + "step": 446900 + }, + { + "epoch": 1.8209797645151335, + "grad_norm": 1.558209776878357, + "learning_rate": 0.004153484082067014, + "loss": 7.9242, + "step": 447000 + }, + { + "epoch": 1.8209797645151335, + "eval_MaskedAccuracy": 0.4950336656005353, + "eval_loss": 1.6678202152252197, + "eval_runtime": 164.6074, + "eval_samples_per_second": 385.621, + "eval_steps_per_second": 1.507, + "step": 447000 + }, + { + "epoch": 1.821387142538515, + "grad_norm": 5.244185924530029, + "learning_rate": 0.00415311166361783, + "loss": 7.9113, + "step": 447100 + }, + { + "epoch": 1.8217945205618964, + "grad_norm": 2.0754573345184326, + "learning_rate": 0.0041527391800099765, + "loss": 7.9222, + "step": 447200 + }, + { + "epoch": 1.822201898585278, + "grad_norm": 2.7704737186431885, + "learning_rate": 0.004152366631258185, + "loss": 7.9639, + "step": 447300 + }, + { + "epoch": 1.8226092766086595, + "grad_norm": 8.123115539550781, + "learning_rate": 0.004151994017377173, + "loss": 7.942, + "step": 447400 + }, + { + "epoch": 1.823016654632041, + "grad_norm": 2.6104109287261963, + "learning_rate": 0.004151621338381678, + "loss": 7.9107, + "step": 447500 + }, + { + "epoch": 1.8234240326554223, + "grad_norm": 1.7403087615966797, + "learning_rate": 0.0041512485942864375, + "loss": 7.9298, + "step": 447600 + }, + { + "epoch": 1.8238314106788038, + "grad_norm": 2.766556978225708, + "learning_rate": 0.0041508757851061884, + "loss": 7.8635, + "step": 447700 + }, + { + "epoch": 1.8242387887021851, + "grad_norm": 4.337071895599365, + "learning_rate": 0.004150502910855666, + "loss": 7.9125, + "step": 447800 + }, + { + "epoch": 1.8246461667255667, + "grad_norm": 2.609506130218506, + "learning_rate": 0.004150129971549611, + "loss": 7.9264, + "step": 447900 + }, + { + "epoch": 1.8250535447489482, + "grad_norm": 3.140911102294922, + "learning_rate": 0.004149756967202771, + "loss": 7.8923, + "step": 448000 + }, + { + "epoch": 1.8250535447489482, + "eval_MaskedAccuracy": 0.495953354737639, + "eval_loss": 1.6645445823669434, + "eval_runtime": 178.6428, + "eval_samples_per_second": 355.324, + "eval_steps_per_second": 1.388, + "step": 448000 + }, + { + "epoch": 1.8254609227723297, + "grad_norm": 3.31099271774292, + "learning_rate": 0.0041493838978298924, + "loss": 7.9058, + "step": 448100 + }, + { + "epoch": 1.8258683007957113, + "grad_norm": 1.3624597787857056, + "learning_rate": 0.0041490107634457206, + "loss": 7.9254, + "step": 448200 + }, + { + "epoch": 1.8262756788190926, + "grad_norm": 2.529301404953003, + "learning_rate": 0.00414863756406501, + "loss": 7.9183, + "step": 448300 + }, + { + "epoch": 1.826683056842474, + "grad_norm": 6.022654056549072, + "learning_rate": 0.004148264299702512, + "loss": 7.9112, + "step": 448400 + }, + { + "epoch": 1.8270904348658554, + "grad_norm": 4.083413600921631, + "learning_rate": 0.004147890970372994, + "loss": 7.905, + "step": 448500 + }, + { + "epoch": 1.827497812889237, + "grad_norm": 1.5772367715835571, + "learning_rate": 0.004147517576091207, + "loss": 7.9241, + "step": 448600 + }, + { + "epoch": 1.8279051909126185, + "grad_norm": 6.243408203125, + "learning_rate": 0.004147144116871915, + "loss": 7.9068, + "step": 448700 + }, + { + "epoch": 1.828312568936, + "grad_norm": 3.281571865081787, + "learning_rate": 0.004146770592729879, + "loss": 7.8934, + "step": 448800 + }, + { + "epoch": 1.8287199469593813, + "grad_norm": 5.442648887634277, + "learning_rate": 0.004146397003679876, + "loss": 7.8683, + "step": 448900 + }, + { + "epoch": 1.8291273249827629, + "grad_norm": 2.891380548477173, + "learning_rate": 0.004146023349736674, + "loss": 7.9045, + "step": 449000 + }, + { + "epoch": 1.8291273249827629, + "eval_MaskedAccuracy": 0.49615328179758333, + "eval_loss": 1.6711750030517578, + "eval_runtime": 227.1922, + "eval_samples_per_second": 279.393, + "eval_steps_per_second": 1.092, + "step": 449000 + }, + { + "epoch": 1.8295347030061442, + "grad_norm": 4.703136920928955, + "learning_rate": 0.0041456496309150385, + "loss": 7.8932, + "step": 449100 + }, + { + "epoch": 1.8299420810295257, + "grad_norm": 1.8629577159881592, + "learning_rate": 0.0041452758472297485, + "loss": 7.9143, + "step": 449200 + }, + { + "epoch": 1.8303494590529072, + "grad_norm": 2.245495319366455, + "learning_rate": 0.004144901998695584, + "loss": 7.8996, + "step": 449300 + }, + { + "epoch": 1.8307568370762888, + "grad_norm": 4.677794456481934, + "learning_rate": 0.004144528085327317, + "loss": 7.9425, + "step": 449400 + }, + { + "epoch": 1.83116421509967, + "grad_norm": 3.489626407623291, + "learning_rate": 0.004144154107139741, + "loss": 7.9098, + "step": 449500 + }, + { + "epoch": 1.8315715931230516, + "grad_norm": 2.117542028427124, + "learning_rate": 0.004143780064147631, + "loss": 7.8836, + "step": 449600 + }, + { + "epoch": 1.831978971146433, + "grad_norm": 2.0413732528686523, + "learning_rate": 0.00414340595636578, + "loss": 7.8868, + "step": 449700 + }, + { + "epoch": 1.8323863491698145, + "grad_norm": 2.2001099586486816, + "learning_rate": 0.004143031783808978, + "loss": 7.8777, + "step": 449800 + }, + { + "epoch": 1.832793727193196, + "grad_norm": 3.2869749069213867, + "learning_rate": 0.004142657546492024, + "loss": 7.892, + "step": 449900 + }, + { + "epoch": 1.8332011052165775, + "grad_norm": 3.861534595489502, + "learning_rate": 0.004142283244429706, + "loss": 7.941, + "step": 450000 + }, + { + "epoch": 1.8332011052165775, + "eval_MaskedAccuracy": 0.4951523046993051, + "eval_loss": 1.661887764930725, + "eval_runtime": 213.694, + "eval_samples_per_second": 297.042, + "eval_steps_per_second": 1.161, + "step": 450000 + }, + { + "epoch": 1.8336084832399588, + "grad_norm": 6.784153938293457, + "learning_rate": 0.004141908877636822, + "loss": 7.9454, + "step": 450100 + }, + { + "epoch": 1.8340158612633404, + "grad_norm": 3.286696434020996, + "learning_rate": 0.004141534446128179, + "loss": 7.9103, + "step": 450200 + }, + { + "epoch": 1.8344232392867217, + "grad_norm": 2.0969045162200928, + "learning_rate": 0.00414115994991858, + "loss": 7.9031, + "step": 450300 + }, + { + "epoch": 1.8348306173101032, + "grad_norm": 2.1802902221679688, + "learning_rate": 0.0041407853890228245, + "loss": 7.9219, + "step": 450400 + }, + { + "epoch": 1.8352379953334848, + "grad_norm": 11.451210021972656, + "learning_rate": 0.0041404107634557255, + "loss": 7.8825, + "step": 450500 + }, + { + "epoch": 1.8356453733568663, + "grad_norm": 2.645416259765625, + "learning_rate": 0.004140036073232091, + "loss": 7.8948, + "step": 450600 + }, + { + "epoch": 1.8360527513802478, + "grad_norm": 2.332348108291626, + "learning_rate": 0.004139661318366737, + "loss": 7.8975, + "step": 450700 + }, + { + "epoch": 1.8364601294036291, + "grad_norm": 2.727524757385254, + "learning_rate": 0.0041392864988744745, + "loss": 7.9111, + "step": 450800 + }, + { + "epoch": 1.8368675074270104, + "grad_norm": 2.6782305240631104, + "learning_rate": 0.0041389116147701275, + "loss": 7.8691, + "step": 450900 + }, + { + "epoch": 1.837274885450392, + "grad_norm": 3.461545705795288, + "learning_rate": 0.0041385366660685145, + "loss": 7.9214, + "step": 451000 + }, + { + "epoch": 1.837274885450392, + "eval_MaskedAccuracy": 0.4965261188255572, + "eval_loss": 1.6573883295059204, + "eval_runtime": 220.9948, + "eval_samples_per_second": 287.228, + "eval_steps_per_second": 1.122, + "step": 451000 + }, + { + "epoch": 1.8376822634737735, + "grad_norm": 1.2859609127044678, + "learning_rate": 0.00413816165278446, + "loss": 7.8879, + "step": 451100 + }, + { + "epoch": 1.838089641497155, + "grad_norm": 5.609396934509277, + "learning_rate": 0.004137786574932793, + "loss": 7.9193, + "step": 451200 + }, + { + "epoch": 1.8384970195205366, + "grad_norm": 3.979440212249756, + "learning_rate": 0.004137411432528341, + "loss": 7.8831, + "step": 451300 + }, + { + "epoch": 1.8389043975439179, + "grad_norm": 4.6923346519470215, + "learning_rate": 0.00413703622558593, + "loss": 7.9044, + "step": 451400 + }, + { + "epoch": 1.8393117755672994, + "grad_norm": 5.283751487731934, + "learning_rate": 0.004136660954120404, + "loss": 7.9141, + "step": 451500 + }, + { + "epoch": 1.8397191535906807, + "grad_norm": 5.3510422706604, + "learning_rate": 0.004136285618146588, + "loss": 7.922, + "step": 451600 + }, + { + "epoch": 1.8401265316140623, + "grad_norm": 8.144210815429688, + "learning_rate": 0.004135910217679331, + "loss": 7.8678, + "step": 451700 + }, + { + "epoch": 1.8405339096374438, + "grad_norm": 2.038489818572998, + "learning_rate": 0.004135534752733466, + "loss": 7.9156, + "step": 451800 + }, + { + "epoch": 1.8409412876608253, + "grad_norm": 11.392291069030762, + "learning_rate": 0.004135159223323846, + "loss": 7.9059, + "step": 451900 + }, + { + "epoch": 1.8413486656842066, + "grad_norm": 5.011401653289795, + "learning_rate": 0.004134783629465308, + "loss": 7.9042, + "step": 452000 + }, + { + "epoch": 1.8413486656842066, + "eval_MaskedAccuracy": 0.4961874772093576, + "eval_loss": 1.6660220623016357, + "eval_runtime": 159.7033, + "eval_samples_per_second": 397.462, + "eval_steps_per_second": 1.553, + "step": 452000 + }, + { + "epoch": 1.8417560437075882, + "grad_norm": 3.6451973915100098, + "learning_rate": 0.004134407971172704, + "loss": 7.8707, + "step": 452100 + }, + { + "epoch": 1.8421634217309695, + "grad_norm": 4.899003505706787, + "learning_rate": 0.004134032248460891, + "loss": 7.8941, + "step": 452200 + }, + { + "epoch": 1.842570799754351, + "grad_norm": 4.606278896331787, + "learning_rate": 0.004133656461344721, + "loss": 7.9084, + "step": 452300 + }, + { + "epoch": 1.8429781777777325, + "grad_norm": 2.0069868564605713, + "learning_rate": 0.004133280609839054, + "loss": 7.9013, + "step": 452400 + }, + { + "epoch": 1.843385555801114, + "grad_norm": 2.7358434200286865, + "learning_rate": 0.004132904693958741, + "loss": 7.8769, + "step": 452500 + }, + { + "epoch": 1.8437929338244954, + "grad_norm": 1.9440211057662964, + "learning_rate": 0.004132528713718655, + "loss": 7.917, + "step": 452600 + }, + { + "epoch": 1.844200311847877, + "grad_norm": 4.830276966094971, + "learning_rate": 0.004132152669133653, + "loss": 7.923, + "step": 452700 + }, + { + "epoch": 1.8446076898712582, + "grad_norm": 1.4976750612258911, + "learning_rate": 0.004131776560218604, + "loss": 7.9158, + "step": 452800 + }, + { + "epoch": 1.8450150678946398, + "grad_norm": 2.558852195739746, + "learning_rate": 0.0041314003869883705, + "loss": 7.8968, + "step": 452900 + }, + { + "epoch": 1.8454224459180213, + "grad_norm": 3.535841464996338, + "learning_rate": 0.004131024149457828, + "loss": 7.8685, + "step": 453000 + }, + { + "epoch": 1.8454224459180213, + "eval_MaskedAccuracy": 0.4963606994584085, + "eval_loss": 1.6634576320648193, + "eval_runtime": 153.1638, + "eval_samples_per_second": 414.432, + "eval_steps_per_second": 1.619, + "step": 453000 + }, + { + "epoch": 1.8458298239414028, + "grad_norm": 4.483730792999268, + "learning_rate": 0.004130647847641857, + "loss": 7.8971, + "step": 453100 + }, + { + "epoch": 1.8462372019647844, + "grad_norm": 5.824855327606201, + "learning_rate": 0.004130271481555332, + "loss": 7.8685, + "step": 453200 + }, + { + "epoch": 1.8466445799881657, + "grad_norm": 4.613622665405273, + "learning_rate": 0.00412989505121314, + "loss": 7.9381, + "step": 453300 + }, + { + "epoch": 1.847051958011547, + "grad_norm": 4.366022109985352, + "learning_rate": 0.004129518556630148, + "loss": 7.8921, + "step": 453400 + }, + { + "epoch": 1.8474593360349285, + "grad_norm": 2.6968445777893066, + "learning_rate": 0.004129141997821257, + "loss": 7.8926, + "step": 453500 + }, + { + "epoch": 1.84786671405831, + "grad_norm": 4.72305965423584, + "learning_rate": 0.004128765374801345, + "loss": 7.9136, + "step": 453600 + }, + { + "epoch": 1.8482740920816916, + "grad_norm": 3.338348150253296, + "learning_rate": 0.004128388687585303, + "loss": 7.894, + "step": 453700 + }, + { + "epoch": 1.848681470105073, + "grad_norm": 7.44401216506958, + "learning_rate": 0.004128011936188016, + "loss": 7.9059, + "step": 453800 + }, + { + "epoch": 1.8490888481284544, + "grad_norm": 4.553011417388916, + "learning_rate": 0.0041276351206243934, + "loss": 7.9385, + "step": 453900 + }, + { + "epoch": 1.849496226151836, + "grad_norm": 2.3087642192840576, + "learning_rate": 0.0041272582409093254, + "loss": 7.891, + "step": 454000 + }, + { + "epoch": 1.849496226151836, + "eval_MaskedAccuracy": 0.4961357549464838, + "eval_loss": 1.6584280729293823, + "eval_runtime": 230.442, + "eval_samples_per_second": 275.453, + "eval_steps_per_second": 1.076, + "step": 454000 + }, + { + "epoch": 1.8499036041752173, + "grad_norm": 3.6199896335601807, + "learning_rate": 0.004126881297057713, + "loss": 7.9171, + "step": 454100 + }, + { + "epoch": 1.8503109821985988, + "grad_norm": 4.018579006195068, + "learning_rate": 0.004126504289084462, + "loss": 7.9041, + "step": 454200 + }, + { + "epoch": 1.8507183602219803, + "grad_norm": 3.0367114543914795, + "learning_rate": 0.004126127217004473, + "loss": 7.8894, + "step": 454300 + }, + { + "epoch": 1.8511257382453619, + "grad_norm": 4.969758987426758, + "learning_rate": 0.004125750080832656, + "loss": 7.8856, + "step": 454400 + }, + { + "epoch": 1.8515331162687432, + "grad_norm": 2.2117040157318115, + "learning_rate": 0.004125372880583914, + "loss": 7.8793, + "step": 454500 + }, + { + "epoch": 1.8519404942921247, + "grad_norm": 4.151038646697998, + "learning_rate": 0.004124995616273167, + "loss": 7.8797, + "step": 454600 + }, + { + "epoch": 1.852347872315506, + "grad_norm": 1.7162184715270996, + "learning_rate": 0.004124618287915334, + "loss": 7.8996, + "step": 454700 + }, + { + "epoch": 1.8527552503388875, + "grad_norm": 1.5427240133285522, + "learning_rate": 0.004124240895525328, + "loss": 7.9155, + "step": 454800 + }, + { + "epoch": 1.853162628362269, + "grad_norm": 3.322561502456665, + "learning_rate": 0.004123863439118065, + "loss": 7.8792, + "step": 454900 + }, + { + "epoch": 1.8535700063856506, + "grad_norm": 8.798599243164062, + "learning_rate": 0.004123485918708471, + "loss": 7.8988, + "step": 455000 + }, + { + "epoch": 1.8535700063856506, + "eval_MaskedAccuracy": 0.4965567334365523, + "eval_loss": 1.6644086837768555, + "eval_runtime": 287.5826, + "eval_samples_per_second": 220.723, + "eval_steps_per_second": 0.862, + "step": 455000 + }, + { + "epoch": 1.853977384409032, + "grad_norm": 1.1775785684585571, + "learning_rate": 0.004123108334311479, + "loss": 7.924, + "step": 455100 + }, + { + "epoch": 1.8543847624324135, + "grad_norm": 3.676361560821533, + "learning_rate": 0.004122730685942003, + "loss": 7.9139, + "step": 455200 + }, + { + "epoch": 1.8547921404557948, + "grad_norm": 1.9800739288330078, + "learning_rate": 0.004122352973614984, + "loss": 7.895, + "step": 455300 + }, + { + "epoch": 1.8551995184791763, + "grad_norm": 2.143259048461914, + "learning_rate": 0.004121975197345349, + "loss": 7.8782, + "step": 455400 + }, + { + "epoch": 1.8556068965025578, + "grad_norm": 1.4323902130126953, + "learning_rate": 0.004121597357148036, + "loss": 7.8973, + "step": 455500 + }, + { + "epoch": 1.8560142745259394, + "grad_norm": 1.667303204536438, + "learning_rate": 0.004121219453037981, + "loss": 7.9141, + "step": 455600 + }, + { + "epoch": 1.856421652549321, + "grad_norm": 2.2656047344207764, + "learning_rate": 0.00412084148503013, + "loss": 7.9419, + "step": 455700 + }, + { + "epoch": 1.8568290305727022, + "grad_norm": 2.6010971069335938, + "learning_rate": 0.004120463453139422, + "loss": 7.9153, + "step": 455800 + }, + { + "epoch": 1.8572364085960835, + "grad_norm": 8.667475700378418, + "learning_rate": 0.004120085357380807, + "loss": 7.9037, + "step": 455900 + }, + { + "epoch": 1.857643786619465, + "grad_norm": 1.9657787084579468, + "learning_rate": 0.004119707197769229, + "loss": 7.9178, + "step": 456000 + }, + { + "epoch": 1.857643786619465, + "eval_MaskedAccuracy": 0.496212023289223, + "eval_loss": 1.6610968112945557, + "eval_runtime": 372.771, + "eval_samples_per_second": 170.281, + "eval_steps_per_second": 0.665, + "step": 456000 + }, + { + "epoch": 1.8580511646428466, + "grad_norm": 1.846003770828247, + "learning_rate": 0.004119328974319637, + "loss": 7.8931, + "step": 456100 + }, + { + "epoch": 1.8584585426662281, + "grad_norm": 4.922017574310303, + "learning_rate": 0.004118950687046992, + "loss": 7.865, + "step": 456200 + }, + { + "epoch": 1.8588659206896097, + "grad_norm": 5.104384422302246, + "learning_rate": 0.004118572335966235, + "loss": 7.8998, + "step": 456300 + }, + { + "epoch": 1.859273298712991, + "grad_norm": 3.1299219131469727, + "learning_rate": 0.004118193921092341, + "loss": 7.8815, + "step": 456400 + }, + { + "epoch": 1.8596806767363725, + "grad_norm": 3.355180501937866, + "learning_rate": 0.004117815442440259, + "loss": 7.9069, + "step": 456500 + }, + { + "epoch": 1.8600880547597538, + "grad_norm": 4.386593341827393, + "learning_rate": 0.004117436900024955, + "loss": 7.8773, + "step": 456600 + }, + { + "epoch": 1.8604954327831353, + "grad_norm": 3.226004123687744, + "learning_rate": 0.004117058293861398, + "loss": 7.8986, + "step": 456700 + }, + { + "epoch": 1.8609028108065169, + "grad_norm": 1.5177836418151855, + "learning_rate": 0.004116679623964555, + "loss": 7.8996, + "step": 456800 + }, + { + "epoch": 1.8613101888298984, + "grad_norm": 2.3991305828094482, + "learning_rate": 0.004116300890349398, + "loss": 7.9361, + "step": 456900 + }, + { + "epoch": 1.8617175668532797, + "grad_norm": 2.6616926193237305, + "learning_rate": 0.004115922093030896, + "loss": 7.8761, + "step": 457000 + }, + { + "epoch": 1.8617175668532797, + "eval_MaskedAccuracy": 0.4976523812898243, + "eval_loss": 1.6497547626495361, + "eval_runtime": 179.1595, + "eval_samples_per_second": 354.299, + "eval_steps_per_second": 1.384, + "step": 457000 + }, + { + "epoch": 1.8621249448766612, + "grad_norm": 3.0376691818237305, + "learning_rate": 0.004115543232024025, + "loss": 7.9175, + "step": 457100 + }, + { + "epoch": 1.8625323229000426, + "grad_norm": 3.201857805252075, + "learning_rate": 0.004115164307343773, + "loss": 7.853, + "step": 457200 + }, + { + "epoch": 1.862939700923424, + "grad_norm": 2.287074089050293, + "learning_rate": 0.004114785319005115, + "loss": 7.8973, + "step": 457300 + }, + { + "epoch": 1.8633470789468056, + "grad_norm": 4.273191928863525, + "learning_rate": 0.004114406267023028, + "loss": 7.9027, + "step": 457400 + }, + { + "epoch": 1.8637544569701872, + "grad_norm": 2.2495110034942627, + "learning_rate": 0.004114027151412505, + "loss": 7.8978, + "step": 457500 + }, + { + "epoch": 1.8641618349935685, + "grad_norm": 5.090861797332764, + "learning_rate": 0.004113647972188537, + "loss": 7.8977, + "step": 457600 + }, + { + "epoch": 1.86456921301695, + "grad_norm": 2.9712412357330322, + "learning_rate": 0.004113268729366111, + "loss": 7.914, + "step": 457700 + }, + { + "epoch": 1.8649765910403313, + "grad_norm": 1.534548044204712, + "learning_rate": 0.004112889422960217, + "loss": 7.8898, + "step": 457800 + }, + { + "epoch": 1.8653839690637128, + "grad_norm": 1.4076720476150513, + "learning_rate": 0.004112510052985856, + "loss": 7.8964, + "step": 457900 + }, + { + "epoch": 1.8657913470870944, + "grad_norm": 3.8278603553771973, + "learning_rate": 0.004112130619458025, + "loss": 7.8828, + "step": 458000 + }, + { + "epoch": 1.8657913470870944, + "eval_MaskedAccuracy": 0.4966755756615507, + "eval_loss": 1.6578221321105957, + "eval_runtime": 178.832, + "eval_samples_per_second": 354.948, + "eval_steps_per_second": 1.387, + "step": 458000 + }, + { + "epoch": 1.866198725110476, + "grad_norm": 5.5682454109191895, + "learning_rate": 0.004111751122391722, + "loss": 7.8974, + "step": 458100 + }, + { + "epoch": 1.8666061031338574, + "grad_norm": 2.8451478481292725, + "learning_rate": 0.004111371561801957, + "loss": 7.8977, + "step": 458200 + }, + { + "epoch": 1.8670134811572388, + "grad_norm": 3.2823026180267334, + "learning_rate": 0.004110991937703733, + "loss": 7.8918, + "step": 458300 + }, + { + "epoch": 1.86742085918062, + "grad_norm": 6.824443340301514, + "learning_rate": 0.0041106122501120635, + "loss": 7.8635, + "step": 458400 + }, + { + "epoch": 1.8678282372040016, + "grad_norm": 6.437410354614258, + "learning_rate": 0.004110232499041946, + "loss": 7.9025, + "step": 458500 + }, + { + "epoch": 1.8682356152273831, + "grad_norm": 5.972446918487549, + "learning_rate": 0.004109852684508407, + "loss": 7.8893, + "step": 458600 + }, + { + "epoch": 1.8686429932507647, + "grad_norm": 3.5736684799194336, + "learning_rate": 0.004109472806526464, + "loss": 7.8966, + "step": 458700 + }, + { + "epoch": 1.8690503712741462, + "grad_norm": 3.444176197052002, + "learning_rate": 0.0041090928651111224, + "loss": 7.8729, + "step": 458800 + }, + { + "epoch": 1.8694577492975275, + "grad_norm": 7.170397758483887, + "learning_rate": 0.004108712860277412, + "loss": 7.8855, + "step": 458900 + }, + { + "epoch": 1.869865127320909, + "grad_norm": 3.737161159515381, + "learning_rate": 0.004108332792040354, + "loss": 7.9061, + "step": 459000 + }, + { + "epoch": 1.869865127320909, + "eval_MaskedAccuracy": 0.4972140486354773, + "eval_loss": 1.6708306074142456, + "eval_runtime": 187.0603, + "eval_samples_per_second": 339.334, + "eval_steps_per_second": 1.326, + "step": 459000 + }, + { + "epoch": 1.8702725053442903, + "grad_norm": 5.48805570602417, + "learning_rate": 0.004107952660414973, + "loss": 7.8729, + "step": 459100 + }, + { + "epoch": 1.8706798833676719, + "grad_norm": 6.40994930267334, + "learning_rate": 0.004107572465416302, + "loss": 7.9006, + "step": 459200 + }, + { + "epoch": 1.8710872613910534, + "grad_norm": 3.788102626800537, + "learning_rate": 0.004107192207059372, + "loss": 7.8982, + "step": 459300 + }, + { + "epoch": 1.871494639414435, + "grad_norm": 1.819547176361084, + "learning_rate": 0.004106811885359216, + "loss": 7.8924, + "step": 459400 + }, + { + "epoch": 1.8719020174378163, + "grad_norm": 6.286037445068359, + "learning_rate": 0.004106431500330866, + "loss": 7.9078, + "step": 459500 + }, + { + "epoch": 1.8723093954611978, + "grad_norm": 2.6707632541656494, + "learning_rate": 0.004106051051989363, + "loss": 7.8837, + "step": 459600 + }, + { + "epoch": 1.872716773484579, + "grad_norm": 2.4007577896118164, + "learning_rate": 0.004105670540349755, + "loss": 7.8965, + "step": 459700 + }, + { + "epoch": 1.8731241515079606, + "grad_norm": 2.9365899562835693, + "learning_rate": 0.0041052899654270706, + "loss": 7.8642, + "step": 459800 + }, + { + "epoch": 1.8735315295313422, + "grad_norm": 5.498284339904785, + "learning_rate": 0.004104909327236368, + "loss": 7.9186, + "step": 459900 + }, + { + "epoch": 1.8739389075547237, + "grad_norm": 2.893430233001709, + "learning_rate": 0.004104528625792691, + "loss": 7.8869, + "step": 460000 + }, + { + "epoch": 1.8739389075547237, + "eval_MaskedAccuracy": 0.49687180917215784, + "eval_loss": 1.670817494392395, + "eval_runtime": 215.0947, + "eval_samples_per_second": 295.107, + "eval_steps_per_second": 1.153, + "step": 460000 + }, + { + "epoch": 1.874346285578105, + "grad_norm": 7.116004943847656, + "learning_rate": 0.004104147861111094, + "loss": 7.8856, + "step": 460100 + }, + { + "epoch": 1.8747536636014865, + "grad_norm": 4.6221208572387695, + "learning_rate": 0.004103767033206633, + "loss": 7.8595, + "step": 460200 + }, + { + "epoch": 1.8751610416248679, + "grad_norm": 4.07310676574707, + "learning_rate": 0.004103386142094354, + "loss": 7.8936, + "step": 460300 + }, + { + "epoch": 1.8755684196482494, + "grad_norm": 1.851848840713501, + "learning_rate": 0.00410300518778932, + "loss": 7.9016, + "step": 460400 + }, + { + "epoch": 1.875975797671631, + "grad_norm": 4.325085163116455, + "learning_rate": 0.0041026241703066004, + "loss": 7.8796, + "step": 460500 + }, + { + "epoch": 1.8763831756950125, + "grad_norm": 1.8312495946884155, + "learning_rate": 0.004102243089661252, + "loss": 7.8858, + "step": 460600 + }, + { + "epoch": 1.876790553718394, + "grad_norm": 5.251564979553223, + "learning_rate": 0.004101861945868342, + "loss": 7.8825, + "step": 460700 + }, + { + "epoch": 1.8771979317417753, + "grad_norm": 6.409215927124023, + "learning_rate": 0.004101480738942937, + "loss": 7.8694, + "step": 460800 + }, + { + "epoch": 1.8776053097651566, + "grad_norm": 3.440920829772949, + "learning_rate": 0.004101099468900114, + "loss": 7.8854, + "step": 460900 + }, + { + "epoch": 1.8780126877885381, + "grad_norm": 2.75022292137146, + "learning_rate": 0.004100718135754942, + "loss": 7.9107, + "step": 461000 + }, + { + "epoch": 1.8780126877885381, + "eval_MaskedAccuracy": 0.4972361158192994, + "eval_loss": 1.656812071800232, + "eval_runtime": 181.3266, + "eval_samples_per_second": 350.064, + "eval_steps_per_second": 1.368, + "step": 461000 + }, + { + "epoch": 1.8784200658119197, + "grad_norm": 1.467983603477478, + "learning_rate": 0.004100336739522496, + "loss": 7.9035, + "step": 461100 + }, + { + "epoch": 1.8788274438353012, + "grad_norm": 4.852626323699951, + "learning_rate": 0.004099955280217857, + "loss": 7.8854, + "step": 461200 + }, + { + "epoch": 1.8792348218586827, + "grad_norm": 3.740487575531006, + "learning_rate": 0.004099573757856104, + "loss": 7.9009, + "step": 461300 + }, + { + "epoch": 1.879642199882064, + "grad_norm": 3.978701114654541, + "learning_rate": 0.0040991921724523196, + "loss": 7.9037, + "step": 461400 + }, + { + "epoch": 1.8800495779054456, + "grad_norm": 2.751621723175049, + "learning_rate": 0.0040988105240215966, + "loss": 7.8742, + "step": 461500 + }, + { + "epoch": 1.880456955928827, + "grad_norm": 2.0758726596832275, + "learning_rate": 0.004098428812579016, + "loss": 7.8796, + "step": 461600 + }, + { + "epoch": 1.8808643339522084, + "grad_norm": 4.8139495849609375, + "learning_rate": 0.004098047038139674, + "loss": 7.8671, + "step": 461700 + }, + { + "epoch": 1.88127171197559, + "grad_norm": 3.995478868484497, + "learning_rate": 0.004097665200718666, + "loss": 7.8402, + "step": 461800 + }, + { + "epoch": 1.8816790899989715, + "grad_norm": 5.929808616638184, + "learning_rate": 0.004097283300331081, + "loss": 7.9057, + "step": 461900 + }, + { + "epoch": 1.8820864680223528, + "grad_norm": 4.032388687133789, + "learning_rate": 0.004096901336992025, + "loss": 7.867, + "step": 462000 + }, + { + "epoch": 1.8820864680223528, + "eval_MaskedAccuracy": 0.4966355526760221, + "eval_loss": 1.6544089317321777, + "eval_runtime": 233.0379, + "eval_samples_per_second": 272.385, + "eval_steps_per_second": 1.064, + "step": 462000 + }, + { + "epoch": 1.8824938460457343, + "grad_norm": 1.7739145755767822, + "learning_rate": 0.004096519310716591, + "loss": 7.8774, + "step": 462100 + }, + { + "epoch": 1.8829012240691156, + "grad_norm": 1.3551661968231201, + "learning_rate": 0.004096137221519888, + "loss": 7.914, + "step": 462200 + }, + { + "epoch": 1.8833086020924972, + "grad_norm": 7.672719478607178, + "learning_rate": 0.004095755069417018, + "loss": 7.89, + "step": 462300 + }, + { + "epoch": 1.8837159801158787, + "grad_norm": 5.353662967681885, + "learning_rate": 0.004095372854423089, + "loss": 7.8856, + "step": 462400 + }, + { + "epoch": 1.8841233581392602, + "grad_norm": 3.249173641204834, + "learning_rate": 0.004094990576553222, + "loss": 7.8568, + "step": 462500 + }, + { + "epoch": 1.8845307361626416, + "grad_norm": 3.1244406700134277, + "learning_rate": 0.004094608235822522, + "loss": 7.8715, + "step": 462600 + }, + { + "epoch": 1.884938114186023, + "grad_norm": 2.498183012008667, + "learning_rate": 0.0040942258322461044, + "loss": 7.879, + "step": 462700 + }, + { + "epoch": 1.8853454922094044, + "grad_norm": 2.7141098976135254, + "learning_rate": 0.004093843365839088, + "loss": 7.8946, + "step": 462800 + }, + { + "epoch": 1.885752870232786, + "grad_norm": 2.811753511428833, + "learning_rate": 0.004093460836616596, + "loss": 7.8782, + "step": 462900 + }, + { + "epoch": 1.8861602482561675, + "grad_norm": 7.897110939025879, + "learning_rate": 0.004093078244593749, + "loss": 7.8573, + "step": 463000 + }, + { + "epoch": 1.8861602482561675, + "eval_MaskedAccuracy": 0.49610789181503145, + "eval_loss": 1.6706278324127197, + "eval_runtime": 185.2555, + "eval_samples_per_second": 342.64, + "eval_steps_per_second": 1.339, + "step": 463000 + }, + { + "epoch": 1.886567626279549, + "grad_norm": 4.551334857940674, + "learning_rate": 0.004092695589785684, + "loss": 7.8957, + "step": 463100 + }, + { + "epoch": 1.8869750043029305, + "grad_norm": 2.587092876434326, + "learning_rate": 0.004092312872207516, + "loss": 7.8956, + "step": 463200 + }, + { + "epoch": 1.8873823823263118, + "grad_norm": 2.0081067085266113, + "learning_rate": 0.004091930091874383, + "loss": 7.877, + "step": 463300 + }, + { + "epoch": 1.8877897603496931, + "grad_norm": 2.8725426197052, + "learning_rate": 0.004091547248801414, + "loss": 7.8703, + "step": 463400 + }, + { + "epoch": 1.8881971383730747, + "grad_norm": 4.049835205078125, + "learning_rate": 0.004091164343003749, + "loss": 7.9268, + "step": 463500 + }, + { + "epoch": 1.8886045163964562, + "grad_norm": 5.329404354095459, + "learning_rate": 0.004090781374496521, + "loss": 7.8657, + "step": 463600 + }, + { + "epoch": 1.8890118944198377, + "grad_norm": 3.372039556503296, + "learning_rate": 0.004090398343294875, + "loss": 7.8803, + "step": 463700 + }, + { + "epoch": 1.8894192724432193, + "grad_norm": 4.426126003265381, + "learning_rate": 0.004090015249413954, + "loss": 7.8952, + "step": 463800 + }, + { + "epoch": 1.8898266504666006, + "grad_norm": 2.545288324356079, + "learning_rate": 0.0040896320928689054, + "loss": 7.9167, + "step": 463900 + }, + { + "epoch": 1.890234028489982, + "grad_norm": 1.8516414165496826, + "learning_rate": 0.004089248873674872, + "loss": 7.8635, + "step": 464000 + }, + { + "epoch": 1.890234028489982, + "eval_MaskedAccuracy": 0.4977357268850934, + "eval_loss": 1.646777868270874, + "eval_runtime": 179.0391, + "eval_samples_per_second": 354.537, + "eval_steps_per_second": 1.385, + "step": 464000 + }, + { + "epoch": 1.8906414065133634, + "grad_norm": 2.3388543128967285, + "learning_rate": 0.004088865591847009, + "loss": 7.9046, + "step": 464100 + }, + { + "epoch": 1.891048784536745, + "grad_norm": 2.0005366802215576, + "learning_rate": 0.004088482247400468, + "loss": 7.8999, + "step": 464200 + }, + { + "epoch": 1.8914561625601265, + "grad_norm": 4.26557731628418, + "learning_rate": 0.0040880988403504005, + "loss": 7.8838, + "step": 464300 + }, + { + "epoch": 1.891863540583508, + "grad_norm": 2.6730682849884033, + "learning_rate": 0.004087715370711972, + "loss": 7.8887, + "step": 464400 + }, + { + "epoch": 1.8922709186068893, + "grad_norm": 5.512271404266357, + "learning_rate": 0.0040873318385003365, + "loss": 7.9034, + "step": 464500 + }, + { + "epoch": 1.8926782966302709, + "grad_norm": 4.042498588562012, + "learning_rate": 0.004086948243730663, + "loss": 7.8898, + "step": 464600 + }, + { + "epoch": 1.8930856746536522, + "grad_norm": 3.528592348098755, + "learning_rate": 0.004086564586418112, + "loss": 7.892, + "step": 464700 + }, + { + "epoch": 1.8934930526770337, + "grad_norm": 4.220452308654785, + "learning_rate": 0.004086180866577856, + "loss": 7.8771, + "step": 464800 + }, + { + "epoch": 1.8939004307004152, + "grad_norm": 4.932054042816162, + "learning_rate": 0.004085797084225067, + "loss": 7.8865, + "step": 464900 + }, + { + "epoch": 1.8943078087237968, + "grad_norm": 4.68433141708374, + "learning_rate": 0.00408541323937491, + "loss": 7.9124, + "step": 465000 + }, + { + "epoch": 1.8943078087237968, + "eval_MaskedAccuracy": 0.496761901124384, + "eval_loss": 1.6511331796646118, + "eval_runtime": 163.5458, + "eval_samples_per_second": 388.124, + "eval_steps_per_second": 1.516, + "step": 465000 + }, + { + "epoch": 1.894715186747178, + "grad_norm": 2.8748650550842285, + "learning_rate": 0.004085029332042569, + "loss": 7.8964, + "step": 465100 + }, + { + "epoch": 1.8951225647705596, + "grad_norm": 1.8150627613067627, + "learning_rate": 0.00408464536224322, + "loss": 7.9157, + "step": 465200 + }, + { + "epoch": 1.895529942793941, + "grad_norm": 2.498502492904663, + "learning_rate": 0.0040842613299920385, + "loss": 7.882, + "step": 465300 + }, + { + "epoch": 1.8959373208173225, + "grad_norm": 10.029559135437012, + "learning_rate": 0.004083877235304211, + "loss": 7.8841, + "step": 465400 + }, + { + "epoch": 1.896344698840704, + "grad_norm": 1.2890334129333496, + "learning_rate": 0.004083493078194922, + "loss": 7.8709, + "step": 465500 + }, + { + "epoch": 1.8967520768640855, + "grad_norm": 4.392586708068848, + "learning_rate": 0.004083108858679358, + "loss": 7.9104, + "step": 465600 + }, + { + "epoch": 1.897159454887467, + "grad_norm": 2.6691832542419434, + "learning_rate": 0.004082724576772705, + "loss": 7.8731, + "step": 465700 + }, + { + "epoch": 1.8975668329108484, + "grad_norm": 8.658385276794434, + "learning_rate": 0.004082340232490165, + "loss": 7.8833, + "step": 465800 + }, + { + "epoch": 1.8979742109342297, + "grad_norm": 3.7266769409179688, + "learning_rate": 0.004081955825846923, + "loss": 7.9004, + "step": 465900 + }, + { + "epoch": 1.8983815889576112, + "grad_norm": 2.238558292388916, + "learning_rate": 0.004081571356858191, + "loss": 7.881, + "step": 466000 + }, + { + "epoch": 1.8983815889576112, + "eval_MaskedAccuracy": 0.4971370541862614, + "eval_loss": 1.659732699394226, + "eval_runtime": 170.1739, + "eval_samples_per_second": 373.007, + "eval_steps_per_second": 1.457, + "step": 466000 + }, + { + "epoch": 1.8987889669809928, + "grad_norm": 5.282289505004883, + "learning_rate": 0.00408118682553916, + "loss": 7.8756, + "step": 466100 + }, + { + "epoch": 1.8991963450043743, + "grad_norm": 6.814397811889648, + "learning_rate": 0.004080802231905035, + "loss": 7.8595, + "step": 466200 + }, + { + "epoch": 1.8996037230277558, + "grad_norm": 6.039074420928955, + "learning_rate": 0.004080417575971024, + "loss": 7.8788, + "step": 466300 + }, + { + "epoch": 1.9000111010511371, + "grad_norm": 5.3705058097839355, + "learning_rate": 0.004080032857752321, + "loss": 7.8888, + "step": 466400 + }, + { + "epoch": 1.9004184790745184, + "grad_norm": 1.4663264751434326, + "learning_rate": 0.00407964807726415, + "loss": 7.888, + "step": 466500 + }, + { + "epoch": 1.9008258570979, + "grad_norm": 4.302461624145508, + "learning_rate": 0.004079263234521717, + "loss": 7.9108, + "step": 466600 + }, + { + "epoch": 1.9012332351212815, + "grad_norm": 8.045517921447754, + "learning_rate": 0.004078878329540241, + "loss": 7.921, + "step": 466700 + }, + { + "epoch": 1.901640613144663, + "grad_norm": 3.6992404460906982, + "learning_rate": 0.004078493362334935, + "loss": 7.8639, + "step": 466800 + }, + { + "epoch": 1.9020479911680446, + "grad_norm": 1.4270908832550049, + "learning_rate": 0.004078108332921021, + "loss": 7.864, + "step": 466900 + }, + { + "epoch": 1.9024553691914259, + "grad_norm": 3.777878999710083, + "learning_rate": 0.004077723241313717, + "loss": 7.8879, + "step": 467000 + }, + { + "epoch": 1.9024553691914259, + "eval_MaskedAccuracy": 0.49775283973197887, + "eval_loss": 1.6489890813827515, + "eval_runtime": 209.5076, + "eval_samples_per_second": 302.977, + "eval_steps_per_second": 1.184, + "step": 467000 + }, + { + "epoch": 1.9028627472148074, + "grad_norm": 3.6529839038848877, + "learning_rate": 0.004077338087528257, + "loss": 7.9005, + "step": 467100 + }, + { + "epoch": 1.9032701252381887, + "grad_norm": 4.476478099822998, + "learning_rate": 0.004076952871579866, + "loss": 7.931, + "step": 467200 + }, + { + "epoch": 1.9036775032615703, + "grad_norm": 1.4068933725357056, + "learning_rate": 0.004076567593483765, + "loss": 7.8746, + "step": 467300 + }, + { + "epoch": 1.9040848812849518, + "grad_norm": 2.029770612716675, + "learning_rate": 0.004076182253255195, + "loss": 7.8543, + "step": 467400 + }, + { + "epoch": 1.9044922593083333, + "grad_norm": 2.378837823867798, + "learning_rate": 0.004075796850909382, + "loss": 7.8565, + "step": 467500 + }, + { + "epoch": 1.9048996373317146, + "grad_norm": 2.1673130989074707, + "learning_rate": 0.00407541138646157, + "loss": 7.8698, + "step": 467600 + }, + { + "epoch": 1.9053070153550962, + "grad_norm": 1.8733528852462769, + "learning_rate": 0.004075025859926995, + "loss": 7.9058, + "step": 467700 + }, + { + "epoch": 1.9057143933784775, + "grad_norm": 5.9949951171875, + "learning_rate": 0.004074640271320902, + "loss": 7.9013, + "step": 467800 + }, + { + "epoch": 1.906121771401859, + "grad_norm": 1.5321779251098633, + "learning_rate": 0.004074254620658537, + "loss": 7.8523, + "step": 467900 + }, + { + "epoch": 1.9065291494252405, + "grad_norm": 7.5324835777282715, + "learning_rate": 0.004073868907955139, + "loss": 7.8647, + "step": 468000 + }, + { + "epoch": 1.9065291494252405, + "eval_MaskedAccuracy": 0.4969127485025389, + "eval_loss": 1.6676855087280273, + "eval_runtime": 192.8688, + "eval_samples_per_second": 329.115, + "eval_steps_per_second": 1.286, + "step": 468000 + }, + { + "epoch": 1.906936527448622, + "grad_norm": 1.861028790473938, + "learning_rate": 0.004073483133225972, + "loss": 7.8922, + "step": 468100 + }, + { + "epoch": 1.9073439054720036, + "grad_norm": 4.3910908699035645, + "learning_rate": 0.004073097296486274, + "loss": 7.899, + "step": 468200 + }, + { + "epoch": 1.907751283495385, + "grad_norm": 1.6898629665374756, + "learning_rate": 0.004072711397751302, + "loss": 7.8955, + "step": 468300 + }, + { + "epoch": 1.9081586615187662, + "grad_norm": 2.9406893253326416, + "learning_rate": 0.004072325437036316, + "loss": 7.889, + "step": 468400 + }, + { + "epoch": 1.9085660395421478, + "grad_norm": 1.2531869411468506, + "learning_rate": 0.00407193941435657, + "loss": 7.9048, + "step": 468500 + }, + { + "epoch": 1.9089734175655293, + "grad_norm": 3.3710033893585205, + "learning_rate": 0.004071553329727329, + "loss": 7.8771, + "step": 468600 + }, + { + "epoch": 1.9093807955889108, + "grad_norm": 3.323848247528076, + "learning_rate": 0.004071167183163855, + "loss": 7.8752, + "step": 468700 + }, + { + "epoch": 1.9097881736122924, + "grad_norm": 2.5802295207977295, + "learning_rate": 0.004070780974681417, + "loss": 7.8805, + "step": 468800 + }, + { + "epoch": 1.9101955516356737, + "grad_norm": 3.3420770168304443, + "learning_rate": 0.004070394704295283, + "loss": 7.8891, + "step": 468900 + }, + { + "epoch": 1.910602929659055, + "grad_norm": 2.558593273162842, + "learning_rate": 0.004070008372020724, + "loss": 7.9017, + "step": 469000 + }, + { + "epoch": 1.910602929659055, + "eval_MaskedAccuracy": 0.4963021289787646, + "eval_loss": 1.6615022420883179, + "eval_runtime": 204.791, + "eval_samples_per_second": 309.955, + "eval_steps_per_second": 1.211, + "step": 469000 + }, + { + "epoch": 1.9110103076824365, + "grad_norm": 3.3948159217834473, + "learning_rate": 0.004069621977873013, + "loss": 7.9062, + "step": 469100 + }, + { + "epoch": 1.911417685705818, + "grad_norm": 4.204961776733398, + "learning_rate": 0.004069235521867424, + "loss": 7.8834, + "step": 469200 + }, + { + "epoch": 1.9118250637291996, + "grad_norm": 1.9316223859786987, + "learning_rate": 0.004068849004019239, + "loss": 7.9044, + "step": 469300 + }, + { + "epoch": 1.9122324417525811, + "grad_norm": 5.4399094581604, + "learning_rate": 0.004068462424343744, + "loss": 7.8593, + "step": 469400 + }, + { + "epoch": 1.9126398197759624, + "grad_norm": 5.390496730804443, + "learning_rate": 0.0040680757828562165, + "loss": 7.8793, + "step": 469500 + }, + { + "epoch": 1.913047197799344, + "grad_norm": 4.0297722816467285, + "learning_rate": 0.004067689079571947, + "loss": 7.8563, + "step": 469600 + }, + { + "epoch": 1.9134545758227253, + "grad_norm": 5.022799968719482, + "learning_rate": 0.004067302314506215, + "loss": 7.8751, + "step": 469700 + }, + { + "epoch": 1.9138619538461068, + "grad_norm": 5.983433723449707, + "learning_rate": 0.004066915487674315, + "loss": 7.8482, + "step": 469800 + }, + { + "epoch": 1.9142693318694883, + "grad_norm": 1.6672743558883667, + "learning_rate": 0.004066528599091545, + "loss": 7.8626, + "step": 469900 + }, + { + "epoch": 1.9146767098928699, + "grad_norm": 3.781672954559326, + "learning_rate": 0.0040661416487731954, + "loss": 7.8722, + "step": 470000 + }, + { + "epoch": 1.9146767098928699, + "eval_MaskedAccuracy": 0.4977106534616291, + "eval_loss": 1.6562927961349487, + "eval_runtime": 291.0083, + "eval_samples_per_second": 218.124, + "eval_steps_per_second": 0.852, + "step": 470000 + }, + { + "epoch": 1.9150840879162512, + "grad_norm": 6.619106292724609, + "learning_rate": 0.00406575463673457, + "loss": 7.8557, + "step": 470100 + }, + { + "epoch": 1.9154914659396327, + "grad_norm": 4.2112884521484375, + "learning_rate": 0.004065367562990967, + "loss": 7.884, + "step": 470200 + }, + { + "epoch": 1.915898843963014, + "grad_norm": 2.700252056121826, + "learning_rate": 0.004064980427557689, + "loss": 7.926, + "step": 470300 + }, + { + "epoch": 1.9163062219863956, + "grad_norm": 1.110736608505249, + "learning_rate": 0.004064593230450038, + "loss": 7.8823, + "step": 470400 + }, + { + "epoch": 1.916713600009777, + "grad_norm": 5.880198955535889, + "learning_rate": 0.0040642059716833245, + "loss": 7.8785, + "step": 470500 + }, + { + "epoch": 1.9171209780331586, + "grad_norm": 1.161250114440918, + "learning_rate": 0.004063818651272862, + "loss": 7.8841, + "step": 470600 + }, + { + "epoch": 1.9175283560565402, + "grad_norm": 3.639497995376587, + "learning_rate": 0.004063431269233957, + "loss": 7.8756, + "step": 470700 + }, + { + "epoch": 1.9179357340799215, + "grad_norm": 2.357867479324341, + "learning_rate": 0.004063043825581932, + "loss": 7.8585, + "step": 470800 + }, + { + "epoch": 1.9183431121033028, + "grad_norm": 3.1401703357696533, + "learning_rate": 0.004062656320332102, + "loss": 7.8698, + "step": 470900 + }, + { + "epoch": 1.9187504901266843, + "grad_norm": 1.814631462097168, + "learning_rate": 0.004062268753499785, + "loss": 7.869, + "step": 471000 + }, + { + "epoch": 1.9187504901266843, + "eval_MaskedAccuracy": 0.4971491722059388, + "eval_loss": 1.6693766117095947, + "eval_runtime": 183.1856, + "eval_samples_per_second": 346.512, + "eval_steps_per_second": 1.354, + "step": 471000 + }, + { + "epoch": 1.9191578681500658, + "grad_norm": 1.5636416673660278, + "learning_rate": 0.004061881125100306, + "loss": 7.8423, + "step": 471100 + }, + { + "epoch": 1.9195652461734474, + "grad_norm": 6.844422817230225, + "learning_rate": 0.004061493435148992, + "loss": 7.8685, + "step": 471200 + }, + { + "epoch": 1.919972624196829, + "grad_norm": 1.8475629091262817, + "learning_rate": 0.00406110568366116, + "loss": 7.8783, + "step": 471300 + }, + { + "epoch": 1.9203800022202102, + "grad_norm": 5.86833381652832, + "learning_rate": 0.0040607178706521556, + "loss": 7.8639, + "step": 471400 + }, + { + "epoch": 1.9207873802435915, + "grad_norm": 1.7145296335220337, + "learning_rate": 0.004060329996137304, + "loss": 7.8992, + "step": 471500 + }, + { + "epoch": 1.921194758266973, + "grad_norm": 1.964928388595581, + "learning_rate": 0.004059942060131937, + "loss": 7.9292, + "step": 471600 + }, + { + "epoch": 1.9216021362903546, + "grad_norm": 5.93284797668457, + "learning_rate": 0.0040595540626514005, + "loss": 7.8975, + "step": 471700 + }, + { + "epoch": 1.9220095143137361, + "grad_norm": 1.2464560270309448, + "learning_rate": 0.004059166003711019, + "loss": 7.91, + "step": 471800 + }, + { + "epoch": 1.9224168923371177, + "grad_norm": 1.7859487533569336, + "learning_rate": 0.004058777883326144, + "loss": 7.9084, + "step": 471900 + }, + { + "epoch": 1.922824270360499, + "grad_norm": 6.065908432006836, + "learning_rate": 0.004058389701512122, + "loss": 7.8726, + "step": 472000 + }, + { + "epoch": 1.922824270360499, + "eval_MaskedAccuracy": 0.4974083189341227, + "eval_loss": 1.6643046140670776, + "eval_runtime": 198.8314, + "eval_samples_per_second": 319.245, + "eval_steps_per_second": 1.247, + "step": 472000 + }, + { + "epoch": 1.9232316483838805, + "grad_norm": 1.6700581312179565, + "learning_rate": 0.0040580014582842945, + "loss": 7.8676, + "step": 472100 + }, + { + "epoch": 1.9236390264072618, + "grad_norm": 2.486999988555908, + "learning_rate": 0.004057613153658009, + "loss": 7.8765, + "step": 472200 + }, + { + "epoch": 1.9240464044306433, + "grad_norm": 3.6034107208251953, + "learning_rate": 0.0040572247876486285, + "loss": 7.8548, + "step": 472300 + }, + { + "epoch": 1.9244537824540249, + "grad_norm": 3.9016072750091553, + "learning_rate": 0.0040568363602714996, + "loss": 7.8644, + "step": 472400 + }, + { + "epoch": 1.9248611604774064, + "grad_norm": 5.163150787353516, + "learning_rate": 0.004056447871541978, + "loss": 7.8592, + "step": 472500 + }, + { + "epoch": 1.9252685385007877, + "grad_norm": 1.991635799407959, + "learning_rate": 0.004056059321475423, + "loss": 7.8754, + "step": 472600 + }, + { + "epoch": 1.9256759165241693, + "grad_norm": 2.20558762550354, + "learning_rate": 0.004055670710087199, + "loss": 7.9016, + "step": 472700 + }, + { + "epoch": 1.9260832945475506, + "grad_norm": 7.398116588592529, + "learning_rate": 0.004055282037392666, + "loss": 7.8612, + "step": 472800 + }, + { + "epoch": 1.926490672570932, + "grad_norm": 5.232546806335449, + "learning_rate": 0.004054893303407193, + "loss": 7.8753, + "step": 472900 + }, + { + "epoch": 1.9268980505943136, + "grad_norm": 3.3898725509643555, + "learning_rate": 0.0040545045081461515, + "loss": 7.906, + "step": 473000 + }, + { + "epoch": 1.9268980505943136, + "eval_MaskedAccuracy": 0.497005546678431, + "eval_loss": 1.658578872680664, + "eval_runtime": 168.2489, + "eval_samples_per_second": 377.274, + "eval_steps_per_second": 1.474, + "step": 473000 + }, + { + "epoch": 1.9273054286176952, + "grad_norm": 1.957472324371338, + "learning_rate": 0.004054115651624905, + "loss": 7.8652, + "step": 473100 + }, + { + "epoch": 1.9277128066410767, + "grad_norm": 5.501332759857178, + "learning_rate": 0.004053726733858837, + "loss": 7.8913, + "step": 473200 + }, + { + "epoch": 1.928120184664458, + "grad_norm": 3.65632963180542, + "learning_rate": 0.0040533377548633166, + "loss": 7.8604, + "step": 473300 + }, + { + "epoch": 1.9285275626878393, + "grad_norm": 1.9907680749893188, + "learning_rate": 0.004052948714653717, + "loss": 7.8895, + "step": 473400 + }, + { + "epoch": 1.9289349407112208, + "grad_norm": 3.3675484657287598, + "learning_rate": 0.004052559613245433, + "loss": 7.9118, + "step": 473500 + }, + { + "epoch": 1.9293423187346024, + "grad_norm": 4.073822975158691, + "learning_rate": 0.004052170450653838, + "loss": 7.8552, + "step": 473600 + }, + { + "epoch": 1.929749696757984, + "grad_norm": 4.024270057678223, + "learning_rate": 0.0040517812268943175, + "loss": 7.897, + "step": 473700 + }, + { + "epoch": 1.9301570747813654, + "grad_norm": 2.1195008754730225, + "learning_rate": 0.004051391941982263, + "loss": 7.8737, + "step": 473800 + }, + { + "epoch": 1.9305644528047468, + "grad_norm": 6.140163898468018, + "learning_rate": 0.004051002595933068, + "loss": 7.8937, + "step": 473900 + }, + { + "epoch": 1.930971830828128, + "grad_norm": 1.4223819971084595, + "learning_rate": 0.004050613188762124, + "loss": 7.8811, + "step": 474000 + }, + { + "epoch": 1.930971830828128, + "eval_MaskedAccuracy": 0.4983757766751915, + "eval_loss": 1.6546658277511597, + "eval_runtime": 180.9313, + "eval_samples_per_second": 350.829, + "eval_steps_per_second": 1.371, + "step": 474000 + }, + { + "epoch": 1.9313792088515096, + "grad_norm": 3.788630962371826, + "learning_rate": 0.00405022372048482, + "loss": 7.8691, + "step": 474100 + }, + { + "epoch": 1.9317865868748911, + "grad_norm": 1.8217180967330933, + "learning_rate": 0.004049834191116557, + "loss": 7.8523, + "step": 474200 + }, + { + "epoch": 1.9321939648982727, + "grad_norm": 5.193520545959473, + "learning_rate": 0.004049444600672735, + "loss": 7.8737, + "step": 474300 + }, + { + "epoch": 1.9326013429216542, + "grad_norm": 4.182032585144043, + "learning_rate": 0.004049054949168761, + "loss": 7.8781, + "step": 474400 + }, + { + "epoch": 1.9330087209450355, + "grad_norm": 6.593281269073486, + "learning_rate": 0.004048665236620033, + "loss": 7.8806, + "step": 474500 + }, + { + "epoch": 1.933416098968417, + "grad_norm": 4.737550735473633, + "learning_rate": 0.004048275463041967, + "loss": 7.8708, + "step": 474600 + }, + { + "epoch": 1.9338234769917984, + "grad_norm": 2.312268018722534, + "learning_rate": 0.004047885628449966, + "loss": 7.8792, + "step": 474700 + }, + { + "epoch": 1.9342308550151799, + "grad_norm": 2.0605013370513916, + "learning_rate": 0.004047495732859442, + "loss": 7.8817, + "step": 474800 + }, + { + "epoch": 1.9346382330385614, + "grad_norm": 10.729848861694336, + "learning_rate": 0.0040471057762858165, + "loss": 7.888, + "step": 474900 + }, + { + "epoch": 1.935045611061943, + "grad_norm": 2.2714881896972656, + "learning_rate": 0.004046715758744496, + "loss": 7.8554, + "step": 475000 + }, + { + "epoch": 1.935045611061943, + "eval_MaskedAccuracy": 0.49690260572009565, + "eval_loss": 1.654526710510254, + "eval_runtime": 161.1518, + "eval_samples_per_second": 393.89, + "eval_steps_per_second": 1.539, + "step": 475000 + }, + { + "epoch": 1.9354529890853243, + "grad_norm": 3.213550090789795, + "learning_rate": 0.004046325680250914, + "loss": 7.8573, + "step": 475100 + }, + { + "epoch": 1.9358603671087058, + "grad_norm": 4.958645820617676, + "learning_rate": 0.004045935540820482, + "loss": 7.9061, + "step": 475200 + }, + { + "epoch": 1.936267745132087, + "grad_norm": 1.992046594619751, + "learning_rate": 0.004045545340468625, + "loss": 7.8761, + "step": 475300 + }, + { + "epoch": 1.9366751231554686, + "grad_norm": 3.0850229263305664, + "learning_rate": 0.0040451550792107716, + "loss": 7.8495, + "step": 475400 + }, + { + "epoch": 1.9370825011788502, + "grad_norm": 2.6831929683685303, + "learning_rate": 0.004044764757062348, + "loss": 7.8992, + "step": 475500 + }, + { + "epoch": 1.9374898792022317, + "grad_norm": 5.275318622589111, + "learning_rate": 0.004044374374038785, + "loss": 7.8351, + "step": 475600 + }, + { + "epoch": 1.9378972572256132, + "grad_norm": 3.3698630332946777, + "learning_rate": 0.004043983930155527, + "loss": 7.8287, + "step": 475700 + }, + { + "epoch": 1.9383046352489945, + "grad_norm": 6.257713317871094, + "learning_rate": 0.0040435934254280026, + "loss": 7.8632, + "step": 475800 + }, + { + "epoch": 1.9387120132723759, + "grad_norm": 2.4408538341522217, + "learning_rate": 0.004043202859871646, + "loss": 7.9321, + "step": 475900 + }, + { + "epoch": 1.9391193912957574, + "grad_norm": 4.434767723083496, + "learning_rate": 0.004042812233501905, + "loss": 7.8824, + "step": 476000 + }, + { + "epoch": 1.9391193912957574, + "eval_MaskedAccuracy": 0.49757411130997103, + "eval_loss": 1.6568632125854492, + "eval_runtime": 222.6206, + "eval_samples_per_second": 285.131, + "eval_steps_per_second": 1.114, + "step": 476000 + }, + { + "epoch": 1.939526769319139, + "grad_norm": 5.1438517570495605, + "learning_rate": 0.004042421546334225, + "loss": 7.8831, + "step": 476100 + }, + { + "epoch": 1.9399341473425205, + "grad_norm": 3.293837547302246, + "learning_rate": 0.004042030798384047, + "loss": 7.8711, + "step": 476200 + }, + { + "epoch": 1.940341525365902, + "grad_norm": 6.498884201049805, + "learning_rate": 0.00404163998966682, + "loss": 7.8842, + "step": 476300 + }, + { + "epoch": 1.9407489033892833, + "grad_norm": 2.4626593589782715, + "learning_rate": 0.004041249120197994, + "loss": 7.8796, + "step": 476400 + }, + { + "epoch": 1.9411562814126646, + "grad_norm": 3.8586249351501465, + "learning_rate": 0.004040858189993024, + "loss": 7.8582, + "step": 476500 + }, + { + "epoch": 1.9415636594360461, + "grad_norm": 6.778619766235352, + "learning_rate": 0.0040404671990673675, + "loss": 7.8856, + "step": 476600 + }, + { + "epoch": 1.9419710374594277, + "grad_norm": 2.4657950401306152, + "learning_rate": 0.004040076147436481, + "loss": 7.8696, + "step": 476700 + }, + { + "epoch": 1.9423784154828092, + "grad_norm": 4.460400581359863, + "learning_rate": 0.004039685035115825, + "loss": 7.8893, + "step": 476800 + }, + { + "epoch": 1.9427857935061907, + "grad_norm": 4.682711601257324, + "learning_rate": 0.004039293862120863, + "loss": 7.8663, + "step": 476900 + }, + { + "epoch": 1.943193171529572, + "grad_norm": 1.674350380897522, + "learning_rate": 0.004038902628467053, + "loss": 7.8653, + "step": 477000 + }, + { + "epoch": 1.943193171529572, + "eval_MaskedAccuracy": 0.49762501700375905, + "eval_loss": 1.67079758644104, + "eval_runtime": 222.9317, + "eval_samples_per_second": 284.733, + "eval_steps_per_second": 1.112, + "step": 477000 + }, + { + "epoch": 1.9436005495529536, + "grad_norm": 3.078145980834961, + "learning_rate": 0.004038511334169871, + "loss": 7.8835, + "step": 477100 + }, + { + "epoch": 1.944007927576335, + "grad_norm": 4.532050609588623, + "learning_rate": 0.004038119979244781, + "loss": 7.8699, + "step": 477200 + }, + { + "epoch": 1.9444153055997164, + "grad_norm": 1.8098946809768677, + "learning_rate": 0.0040377285637072605, + "loss": 7.8955, + "step": 477300 + }, + { + "epoch": 1.944822683623098, + "grad_norm": 4.762369632720947, + "learning_rate": 0.004037337087572785, + "loss": 7.8899, + "step": 477400 + }, + { + "epoch": 1.9452300616464795, + "grad_norm": 6.186821937561035, + "learning_rate": 0.00403694555085683, + "loss": 7.8725, + "step": 477500 + }, + { + "epoch": 1.9456374396698608, + "grad_norm": 1.7627458572387695, + "learning_rate": 0.0040365539535748675, + "loss": 7.8508, + "step": 477600 + }, + { + "epoch": 1.9460448176932423, + "grad_norm": 4.719379901885986, + "learning_rate": 0.00403616229574239, + "loss": 7.8766, + "step": 477700 + }, + { + "epoch": 1.9464521957166236, + "grad_norm": 1.4635705947875977, + "learning_rate": 0.00403577057737488, + "loss": 7.8901, + "step": 477800 + }, + { + "epoch": 1.9468595737400052, + "grad_norm": 2.690516471862793, + "learning_rate": 0.00403537879848782, + "loss": 7.8986, + "step": 477900 + }, + { + "epoch": 1.9472669517633867, + "grad_norm": 2.8039186000823975, + "learning_rate": 0.0040349869590967065, + "loss": 7.8739, + "step": 478000 + }, + { + "epoch": 1.9472669517633867, + "eval_MaskedAccuracy": 0.49792528945519005, + "eval_loss": 1.654314398765564, + "eval_runtime": 172.2687, + "eval_samples_per_second": 368.471, + "eval_steps_per_second": 1.44, + "step": 478000 + }, + { + "epoch": 1.9476743297867682, + "grad_norm": 2.728649377822876, + "learning_rate": 0.004034595059217011, + "loss": 7.8701, + "step": 478100 + }, + { + "epoch": 1.9480817078101498, + "grad_norm": 5.797906875610352, + "learning_rate": 0.004034203098864252, + "loss": 7.8526, + "step": 478200 + }, + { + "epoch": 1.948489085833531, + "grad_norm": 8.51126480102539, + "learning_rate": 0.004033811078053917, + "loss": 7.8827, + "step": 478300 + }, + { + "epoch": 1.9488964638569124, + "grad_norm": 1.855424404144287, + "learning_rate": 0.0040334189968015, + "loss": 7.9023, + "step": 478400 + }, + { + "epoch": 1.949303841880294, + "grad_norm": 1.8557064533233643, + "learning_rate": 0.00403302685512251, + "loss": 7.8768, + "step": 478500 + }, + { + "epoch": 1.9497112199036755, + "grad_norm": 2.491926431655884, + "learning_rate": 0.0040326346530324434, + "loss": 7.877, + "step": 478600 + }, + { + "epoch": 1.950118597927057, + "grad_norm": 3.1537840366363525, + "learning_rate": 0.004032242390546807, + "loss": 7.842, + "step": 478700 + }, + { + "epoch": 1.9505259759504385, + "grad_norm": 6.29361629486084, + "learning_rate": 0.004031850067681113, + "loss": 7.8575, + "step": 478800 + }, + { + "epoch": 1.9509333539738198, + "grad_norm": 1.7191801071166992, + "learning_rate": 0.004031457684450868, + "loss": 7.8868, + "step": 478900 + }, + { + "epoch": 1.9513407319972011, + "grad_norm": 2.9524221420288086, + "learning_rate": 0.004031065240871586, + "loss": 7.8854, + "step": 479000 + }, + { + "epoch": 1.9513407319972011, + "eval_MaskedAccuracy": 0.4979104902478439, + "eval_loss": 1.6542531251907349, + "eval_runtime": 154.3286, + "eval_samples_per_second": 411.304, + "eval_steps_per_second": 1.607, + "step": 479000 + }, + { + "epoch": 1.9517481100205827, + "grad_norm": 2.500034809112549, + "learning_rate": 0.004030672736958779, + "loss": 7.8849, + "step": 479100 + }, + { + "epoch": 1.9521554880439642, + "grad_norm": 1.9168018102645874, + "learning_rate": 0.004030280172727973, + "loss": 7.8631, + "step": 479200 + }, + { + "epoch": 1.9525628660673457, + "grad_norm": 5.046385765075684, + "learning_rate": 0.004029887548194683, + "loss": 7.8896, + "step": 479300 + }, + { + "epoch": 1.9529702440907273, + "grad_norm": 2.453263521194458, + "learning_rate": 0.0040294948633744325, + "loss": 7.8544, + "step": 479400 + }, + { + "epoch": 1.9533776221141086, + "grad_norm": 6.771822452545166, + "learning_rate": 0.004029102118282749, + "loss": 7.8464, + "step": 479500 + }, + { + "epoch": 1.9537850001374901, + "grad_norm": 1.666183590888977, + "learning_rate": 0.00402870931293516, + "loss": 7.8639, + "step": 479600 + }, + { + "epoch": 1.9541923781608714, + "grad_norm": 3.315634250640869, + "learning_rate": 0.004028316447347187, + "loss": 7.8744, + "step": 479700 + }, + { + "epoch": 1.954599756184253, + "grad_norm": 4.202087879180908, + "learning_rate": 0.004027923521534369, + "loss": 7.8341, + "step": 479800 + }, + { + "epoch": 1.9550071342076345, + "grad_norm": 2.855398654937744, + "learning_rate": 0.004027530535512241, + "loss": 7.8417, + "step": 479900 + }, + { + "epoch": 1.955414512231016, + "grad_norm": 1.7981055974960327, + "learning_rate": 0.004027137489296336, + "loss": 7.8472, + "step": 480000 + }, + { + "epoch": 1.955414512231016, + "eval_MaskedAccuracy": 0.4980796443724053, + "eval_loss": 1.653088927268982, + "eval_runtime": 267.2063, + "eval_samples_per_second": 237.554, + "eval_steps_per_second": 0.928, + "step": 480000 + }, + { + "epoch": 1.9558218902543973, + "grad_norm": 7.516102313995361, + "learning_rate": 0.004026744382902195, + "loss": 7.8659, + "step": 480100 + }, + { + "epoch": 1.9562292682777789, + "grad_norm": 3.189872980117798, + "learning_rate": 0.0040263512163453636, + "loss": 7.8605, + "step": 480200 + }, + { + "epoch": 1.9566366463011602, + "grad_norm": 3.456697702407837, + "learning_rate": 0.00402595798964138, + "loss": 7.9078, + "step": 480300 + }, + { + "epoch": 1.9570440243245417, + "grad_norm": 1.9788827896118164, + "learning_rate": 0.004025564702805789, + "loss": 7.8707, + "step": 480400 + }, + { + "epoch": 1.9574514023479233, + "grad_norm": 4.001372814178467, + "learning_rate": 0.004025171355854143, + "loss": 7.873, + "step": 480500 + }, + { + "epoch": 1.9578587803713048, + "grad_norm": 1.7897107601165771, + "learning_rate": 0.004024777948801997, + "loss": 7.8635, + "step": 480600 + }, + { + "epoch": 1.9582661583946863, + "grad_norm": 2.17834734916687, + "learning_rate": 0.0040243844816649, + "loss": 7.8759, + "step": 480700 + }, + { + "epoch": 1.9586735364180676, + "grad_norm": 4.898231029510498, + "learning_rate": 0.004023990954458407, + "loss": 7.8657, + "step": 480800 + }, + { + "epoch": 1.959080914441449, + "grad_norm": 3.996277332305908, + "learning_rate": 0.004023597367198079, + "loss": 7.8389, + "step": 480900 + }, + { + "epoch": 1.9594882924648305, + "grad_norm": 2.099057197570801, + "learning_rate": 0.004023203719899471, + "loss": 7.8278, + "step": 481000 + }, + { + "epoch": 1.9594882924648305, + "eval_MaskedAccuracy": 0.49734578672849916, + "eval_loss": 1.6585326194763184, + "eval_runtime": 293.0499, + "eval_samples_per_second": 216.605, + "eval_steps_per_second": 0.846, + "step": 481000 + }, + { + "epoch": 1.959895670488212, + "grad_norm": 3.6001155376434326, + "learning_rate": 0.0040228100125781485, + "loss": 7.8929, + "step": 481100 + }, + { + "epoch": 1.9603030485115935, + "grad_norm": 4.5417256355285645, + "learning_rate": 0.0040224162452496765, + "loss": 7.8526, + "step": 481200 + }, + { + "epoch": 1.960710426534975, + "grad_norm": 3.4753739833831787, + "learning_rate": 0.00402202241792963, + "loss": 7.8555, + "step": 481300 + }, + { + "epoch": 1.9611178045583564, + "grad_norm": 3.7852957248687744, + "learning_rate": 0.0040216285306335755, + "loss": 7.8576, + "step": 481400 + }, + { + "epoch": 1.9615251825817377, + "grad_norm": 3.904681444168091, + "learning_rate": 0.004021234583377078, + "loss": 7.8712, + "step": 481500 + }, + { + "epoch": 1.9619325606051192, + "grad_norm": 7.926792621612549, + "learning_rate": 0.004020840576175721, + "loss": 7.8576, + "step": 481600 + }, + { + "epoch": 1.9623399386285008, + "grad_norm": 6.559882640838623, + "learning_rate": 0.004020446509045077, + "loss": 7.858, + "step": 481700 + }, + { + "epoch": 1.9627473166518823, + "grad_norm": 2.3641748428344727, + "learning_rate": 0.004020052382000732, + "loss": 7.8343, + "step": 481800 + }, + { + "epoch": 1.9631546946752638, + "grad_norm": 2.38405179977417, + "learning_rate": 0.004019658195058259, + "loss": 7.8967, + "step": 481900 + }, + { + "epoch": 1.9635620726986451, + "grad_norm": 2.101755142211914, + "learning_rate": 0.004019263948233245, + "loss": 7.8933, + "step": 482000 + }, + { + "epoch": 1.9635620726986451, + "eval_MaskedAccuracy": 0.49814752555018316, + "eval_loss": 1.6608792543411255, + "eval_runtime": 192.4729, + "eval_samples_per_second": 329.792, + "eval_steps_per_second": 1.288, + "step": 482000 + }, + { + "epoch": 1.9639694507220267, + "grad_norm": 4.64115571975708, + "learning_rate": 0.004018869641541278, + "loss": 7.8328, + "step": 482100 + }, + { + "epoch": 1.964376828745408, + "grad_norm": 1.7008798122406006, + "learning_rate": 0.00401847527499795, + "loss": 7.8733, + "step": 482200 + }, + { + "epoch": 1.9647842067687895, + "grad_norm": 4.0245442390441895, + "learning_rate": 0.004018080848618849, + "loss": 7.8603, + "step": 482300 + }, + { + "epoch": 1.965191584792171, + "grad_norm": 5.126151084899902, + "learning_rate": 0.004017686362419571, + "loss": 7.8705, + "step": 482400 + }, + { + "epoch": 1.9655989628155526, + "grad_norm": 1.8972097635269165, + "learning_rate": 0.004017291816415706, + "loss": 7.8862, + "step": 482500 + }, + { + "epoch": 1.9660063408389339, + "grad_norm": 6.24486780166626, + "learning_rate": 0.004016897210622862, + "loss": 7.9271, + "step": 482600 + }, + { + "epoch": 1.9664137188623154, + "grad_norm": 4.7259697914123535, + "learning_rate": 0.004016502545056628, + "loss": 7.871, + "step": 482700 + }, + { + "epoch": 1.9668210968856967, + "grad_norm": 1.8844248056411743, + "learning_rate": 0.004016107819732618, + "loss": 7.8451, + "step": 482800 + }, + { + "epoch": 1.9672284749090783, + "grad_norm": 1.8297553062438965, + "learning_rate": 0.004015713034666439, + "loss": 7.8553, + "step": 482900 + }, + { + "epoch": 1.9676358529324598, + "grad_norm": 4.883835792541504, + "learning_rate": 0.004015318189873694, + "loss": 7.8533, + "step": 483000 + }, + { + "epoch": 1.9676358529324598, + "eval_MaskedAccuracy": 0.49614832957562766, + "eval_loss": 1.6681147813796997, + "eval_runtime": 194.7594, + "eval_samples_per_second": 325.92, + "eval_steps_per_second": 1.273, + "step": 483000 + }, + { + "epoch": 1.9680432309558413, + "grad_norm": 2.663156509399414, + "learning_rate": 0.004014923285369992, + "loss": 7.8835, + "step": 483100 + }, + { + "epoch": 1.9684506089792229, + "grad_norm": 2.0312697887420654, + "learning_rate": 0.0040145283211709465, + "loss": 7.8721, + "step": 483200 + }, + { + "epoch": 1.9688579870026042, + "grad_norm": 2.1541903018951416, + "learning_rate": 0.004014133297292174, + "loss": 7.8663, + "step": 483300 + }, + { + "epoch": 1.9692653650259855, + "grad_norm": 1.7734742164611816, + "learning_rate": 0.0040137382137492884, + "loss": 7.8796, + "step": 483400 + }, + { + "epoch": 1.969672743049367, + "grad_norm": 7.295593738555908, + "learning_rate": 0.0040133430705579145, + "loss": 7.8988, + "step": 483500 + }, + { + "epoch": 1.9700801210727485, + "grad_norm": 5.96539831161499, + "learning_rate": 0.004012947867733674, + "loss": 7.8703, + "step": 483600 + }, + { + "epoch": 1.97048749909613, + "grad_norm": 4.10883903503418, + "learning_rate": 0.004012552605292188, + "loss": 7.863, + "step": 483700 + }, + { + "epoch": 1.9708948771195116, + "grad_norm": 4.31527853012085, + "learning_rate": 0.004012157283249081, + "loss": 7.8343, + "step": 483800 + }, + { + "epoch": 1.971302255142893, + "grad_norm": 7.688234329223633, + "learning_rate": 0.004011761901619987, + "loss": 7.857, + "step": 483900 + }, + { + "epoch": 1.9717096331662742, + "grad_norm": 5.177247047424316, + "learning_rate": 0.004011366460420539, + "loss": 7.8835, + "step": 484000 + }, + { + "epoch": 1.9717096331662742, + "eval_MaskedAccuracy": 0.498085815293933, + "eval_loss": 1.6607754230499268, + "eval_runtime": 187.5598, + "eval_samples_per_second": 338.431, + "eval_steps_per_second": 1.322, + "step": 484000 + }, + { + "epoch": 1.9721170111896558, + "grad_norm": 2.6931183338165283, + "learning_rate": 0.004010970959666362, + "loss": 7.868, + "step": 484100 + }, + { + "epoch": 1.9725243892130373, + "grad_norm": 2.1372954845428467, + "learning_rate": 0.004010575399373104, + "loss": 7.8838, + "step": 484200 + }, + { + "epoch": 1.9729317672364188, + "grad_norm": 4.982362270355225, + "learning_rate": 0.004010179779556398, + "loss": 7.8403, + "step": 484300 + }, + { + "epoch": 1.9733391452598004, + "grad_norm": 2.3827271461486816, + "learning_rate": 0.004009784100231887, + "loss": 7.8794, + "step": 484400 + }, + { + "epoch": 1.9737465232831817, + "grad_norm": 4.464802265167236, + "learning_rate": 0.004009388361415203, + "loss": 7.8898, + "step": 484500 + }, + { + "epoch": 1.9741539013065632, + "grad_norm": 1.570246934890747, + "learning_rate": 0.004008992563122011, + "loss": 7.8775, + "step": 484600 + }, + { + "epoch": 1.9745612793299445, + "grad_norm": 4.5733723640441895, + "learning_rate": 0.004008596705367944, + "loss": 7.8572, + "step": 484700 + }, + { + "epoch": 1.974968657353326, + "grad_norm": 2.002183675765991, + "learning_rate": 0.004008200788168656, + "loss": 7.8508, + "step": 484800 + }, + { + "epoch": 1.9753760353767076, + "grad_norm": 3.640878438949585, + "learning_rate": 0.004007804811539806, + "loss": 7.844, + "step": 484900 + }, + { + "epoch": 1.9757834134000891, + "grad_norm": 5.636162281036377, + "learning_rate": 0.004007408775497042, + "loss": 7.8452, + "step": 485000 + }, + { + "epoch": 1.9757834134000891, + "eval_MaskedAccuracy": 0.4974547461623308, + "eval_loss": 1.656631588935852, + "eval_runtime": 174.8193, + "eval_samples_per_second": 363.095, + "eval_steps_per_second": 1.419, + "step": 485000 + }, + { + "epoch": 1.9761907914234704, + "grad_norm": 7.8830037117004395, + "learning_rate": 0.004007012680056021, + "loss": 7.8794, + "step": 485100 + }, + { + "epoch": 1.976598169446852, + "grad_norm": 1.603452444076538, + "learning_rate": 0.004006616525232406, + "loss": 7.8654, + "step": 485200 + }, + { + "epoch": 1.9770055474702333, + "grad_norm": 2.641059160232544, + "learning_rate": 0.004006220311041858, + "loss": 7.8737, + "step": 485300 + }, + { + "epoch": 1.9774129254936148, + "grad_norm": 2.383936643600464, + "learning_rate": 0.004005824037500043, + "loss": 7.873, + "step": 485400 + }, + { + "epoch": 1.9778203035169963, + "grad_norm": 4.151749610900879, + "learning_rate": 0.004005427704622632, + "loss": 7.8806, + "step": 485500 + }, + { + "epoch": 1.9782276815403779, + "grad_norm": 3.9820024967193604, + "learning_rate": 0.004005031312425284, + "loss": 7.8726, + "step": 485600 + }, + { + "epoch": 1.9786350595637594, + "grad_norm": 1.2179168462753296, + "learning_rate": 0.004004634860923675, + "loss": 7.8509, + "step": 485700 + }, + { + "epoch": 1.9790424375871407, + "grad_norm": 3.316539764404297, + "learning_rate": 0.004004238350133474, + "loss": 7.9288, + "step": 485800 + }, + { + "epoch": 1.979449815610522, + "grad_norm": 1.5485013723373413, + "learning_rate": 0.004003841780070365, + "loss": 7.8605, + "step": 485900 + }, + { + "epoch": 1.9798571936339036, + "grad_norm": 7.027229309082031, + "learning_rate": 0.004003445150750029, + "loss": 7.8808, + "step": 486000 + }, + { + "epoch": 1.9798571936339036, + "eval_MaskedAccuracy": 0.497335183035008, + "eval_loss": 1.665848731994629, + "eval_runtime": 244.3163, + "eval_samples_per_second": 259.811, + "eval_steps_per_second": 1.015, + "step": 486000 + }, + { + "epoch": 1.980264571657285, + "grad_norm": 3.497182607650757, + "learning_rate": 0.004003048462188146, + "loss": 7.8623, + "step": 486100 + }, + { + "epoch": 1.9806719496806666, + "grad_norm": 5.535577774047852, + "learning_rate": 0.0040026517144003924, + "loss": 7.8717, + "step": 486200 + }, + { + "epoch": 1.9810793277040482, + "grad_norm": 4.201848983764648, + "learning_rate": 0.0040022549074024555, + "loss": 7.8874, + "step": 486300 + }, + { + "epoch": 1.9814867057274295, + "grad_norm": 3.704149007797241, + "learning_rate": 0.0040018580412100185, + "loss": 7.8675, + "step": 486400 + }, + { + "epoch": 1.9818940837508108, + "grad_norm": 4.645805358886719, + "learning_rate": 0.004001461115838779, + "loss": 7.8413, + "step": 486500 + }, + { + "epoch": 1.9823014617741923, + "grad_norm": 2.235441207885742, + "learning_rate": 0.0040010641313044295, + "loss": 7.8991, + "step": 486600 + }, + { + "epoch": 1.9827088397975738, + "grad_norm": 5.017754077911377, + "learning_rate": 0.004000667087622661, + "loss": 7.8697, + "step": 486700 + }, + { + "epoch": 1.9831162178209554, + "grad_norm": 2.454075336456299, + "learning_rate": 0.004000269984809177, + "loss": 7.8887, + "step": 486800 + }, + { + "epoch": 1.983523595844337, + "grad_norm": 1.3705803155899048, + "learning_rate": 0.0039998728228796765, + "loss": 7.8627, + "step": 486900 + }, + { + "epoch": 1.9839309738677182, + "grad_norm": 3.2323405742645264, + "learning_rate": 0.003999475601849856, + "loss": 7.8695, + "step": 487000 + }, + { + "epoch": 1.9839309738677182, + "eval_MaskedAccuracy": 0.49819125713116347, + "eval_loss": 1.6521326303482056, + "eval_runtime": 175.3211, + "eval_samples_per_second": 362.056, + "eval_steps_per_second": 1.415, + "step": 487000 + }, + { + "epoch": 1.9843383518910997, + "grad_norm": 2.3861207962036133, + "learning_rate": 0.003999078321735422, + "loss": 7.843, + "step": 487100 + }, + { + "epoch": 1.984745729914481, + "grad_norm": 5.130801200866699, + "learning_rate": 0.003998680982552084, + "loss": 7.8155, + "step": 487200 + }, + { + "epoch": 1.9851531079378626, + "grad_norm": 5.160671234130859, + "learning_rate": 0.0039982835843155465, + "loss": 7.892, + "step": 487300 + }, + { + "epoch": 1.9855604859612441, + "grad_norm": 4.723651885986328, + "learning_rate": 0.00399788612704152, + "loss": 7.8924, + "step": 487400 + }, + { + "epoch": 1.9859678639846257, + "grad_norm": 2.9012949466705322, + "learning_rate": 0.00399748861074572, + "loss": 7.8565, + "step": 487500 + }, + { + "epoch": 1.986375242008007, + "grad_norm": 4.409277439117432, + "learning_rate": 0.0039970910354438635, + "loss": 7.8184, + "step": 487600 + }, + { + "epoch": 1.9867826200313885, + "grad_norm": 4.830262660980225, + "learning_rate": 0.003996693401151668, + "loss": 7.849, + "step": 487700 + }, + { + "epoch": 1.9871899980547698, + "grad_norm": 7.154356002807617, + "learning_rate": 0.003996295707884856, + "loss": 7.8645, + "step": 487800 + }, + { + "epoch": 1.9875973760781513, + "grad_norm": 2.3467037677764893, + "learning_rate": 0.003995897955659146, + "loss": 7.8955, + "step": 487900 + }, + { + "epoch": 1.9880047541015329, + "grad_norm": 4.408042907714844, + "learning_rate": 0.003995500144490269, + "loss": 7.8682, + "step": 488000 + }, + { + "epoch": 1.9880047541015329, + "eval_MaskedAccuracy": 0.4977144905082705, + "eval_loss": 1.656249761581421, + "eval_runtime": 171.7931, + "eval_samples_per_second": 369.491, + "eval_steps_per_second": 1.444, + "step": 488000 + }, + { + "epoch": 1.9884121321249144, + "grad_norm": 6.825170516967773, + "learning_rate": 0.003995102274393941, + "loss": 7.8591, + "step": 488100 + }, + { + "epoch": 1.988819510148296, + "grad_norm": 4.52269172668457, + "learning_rate": 0.003994704345385907, + "loss": 7.8472, + "step": 488200 + }, + { + "epoch": 1.9892268881716773, + "grad_norm": 3.2726306915283203, + "learning_rate": 0.0039943063574818995, + "loss": 7.8211, + "step": 488300 + }, + { + "epoch": 1.9896342661950586, + "grad_norm": 2.61883544921875, + "learning_rate": 0.003993908310697633, + "loss": 7.8732, + "step": 488400 + }, + { + "epoch": 1.99004164421844, + "grad_norm": 4.642792224884033, + "learning_rate": 0.0039935102050488655, + "loss": 7.85, + "step": 488500 + }, + { + "epoch": 1.9904490222418216, + "grad_norm": 2.5082287788391113, + "learning_rate": 0.003993112040551326, + "loss": 7.8357, + "step": 488600 + }, + { + "epoch": 1.9908564002652032, + "grad_norm": 4.244477272033691, + "learning_rate": 0.003992713817220759, + "loss": 7.8668, + "step": 488700 + }, + { + "epoch": 1.9912637782885847, + "grad_norm": 3.799135208129883, + "learning_rate": 0.003992315535072908, + "loss": 7.8422, + "step": 488800 + }, + { + "epoch": 1.991671156311966, + "grad_norm": 2.4758245944976807, + "learning_rate": 0.003991917194123517, + "loss": 7.8798, + "step": 488900 + }, + { + "epoch": 1.9920785343353473, + "grad_norm": 3.006605386734009, + "learning_rate": 0.003991518794388338, + "loss": 7.8743, + "step": 489000 + }, + { + "epoch": 1.9920785343353473, + "eval_MaskedAccuracy": 0.49804167414093853, + "eval_loss": 1.6533012390136719, + "eval_runtime": 186.1393, + "eval_samples_per_second": 341.014, + "eval_steps_per_second": 1.332, + "step": 489000 + }, + { + "epoch": 1.9924859123587289, + "grad_norm": 5.0802459716796875, + "learning_rate": 0.003991120335883123, + "loss": 7.8781, + "step": 489100 + }, + { + "epoch": 1.9928932903821104, + "grad_norm": 3.296128988265991, + "learning_rate": 0.003990721818623624, + "loss": 7.8878, + "step": 489200 + }, + { + "epoch": 1.993300668405492, + "grad_norm": 4.713596820831299, + "learning_rate": 0.003990323242625591, + "loss": 7.8469, + "step": 489300 + }, + { + "epoch": 1.9937080464288734, + "grad_norm": 7.093138217926025, + "learning_rate": 0.003989924607904786, + "loss": 7.8772, + "step": 489400 + }, + { + "epoch": 1.9941154244522548, + "grad_norm": 3.2654430866241455, + "learning_rate": 0.0039895259144769706, + "loss": 7.869, + "step": 489500 + }, + { + "epoch": 1.9945228024756363, + "grad_norm": 4.690536975860596, + "learning_rate": 0.0039891271623579095, + "loss": 7.8711, + "step": 489600 + }, + { + "epoch": 1.9949301804990176, + "grad_norm": 4.180046081542969, + "learning_rate": 0.00398872835156336, + "loss": 7.861, + "step": 489700 + }, + { + "epoch": 1.9953375585223991, + "grad_norm": 1.7061024904251099, + "learning_rate": 0.003988329482109093, + "loss": 7.8457, + "step": 489800 + }, + { + "epoch": 1.9957449365457807, + "grad_norm": 8.832228660583496, + "learning_rate": 0.003987930554010878, + "loss": 7.825, + "step": 489900 + }, + { + "epoch": 1.9961523145691622, + "grad_norm": 5.911707878112793, + "learning_rate": 0.003987531567284494, + "loss": 7.8635, + "step": 490000 + }, + { + "epoch": 1.9961523145691622, + "eval_MaskedAccuracy": 0.4982665301124076, + "eval_loss": 1.6461238861083984, + "eval_runtime": 186.756, + "eval_samples_per_second": 339.887, + "eval_steps_per_second": 1.328, + "step": 490000 + }, + { + "epoch": 1.9965596925925435, + "grad_norm": 3.7599871158599854, + "learning_rate": 0.003987132521945702, + "loss": 7.8498, + "step": 490100 + }, + { + "epoch": 1.996967070615925, + "grad_norm": 2.7844014167785645, + "learning_rate": 0.003986733418010286, + "loss": 7.8577, + "step": 490200 + }, + { + "epoch": 1.9973744486393064, + "grad_norm": 2.239363193511963, + "learning_rate": 0.003986334255494022, + "loss": 7.8533, + "step": 490300 + }, + { + "epoch": 1.9977818266626879, + "grad_norm": 2.896563768386841, + "learning_rate": 0.003985935034412689, + "loss": 7.8719, + "step": 490400 + }, + { + "epoch": 1.9981892046860694, + "grad_norm": 3.142861843109131, + "learning_rate": 0.003985535754782079, + "loss": 7.846, + "step": 490500 + }, + { + "epoch": 1.998596582709451, + "grad_norm": 1.930145502090454, + "learning_rate": 0.003985136416617973, + "loss": 7.8546, + "step": 490600 + }, + { + "epoch": 1.9990039607328325, + "grad_norm": 2.043785572052002, + "learning_rate": 0.003984737019936155, + "loss": 7.9126, + "step": 490700 + }, + { + "epoch": 1.9994113387562138, + "grad_norm": 4.26570463180542, + "learning_rate": 0.003984337564752418, + "loss": 7.8457, + "step": 490800 + }, + { + "epoch": 1.999818716779595, + "grad_norm": 2.3818819522857666, + "learning_rate": 0.003983938051082552, + "loss": 7.8458, + "step": 490900 + }, + { + "epoch": 2.0002260948029766, + "grad_norm": 3.318009614944458, + "learning_rate": 0.003983538478942357, + "loss": 7.8652, + "step": 491000 + }, + { + "epoch": 2.0002260948029766, + "eval_MaskedAccuracy": 0.4978805718052601, + "eval_loss": 1.6651256084442139, + "eval_runtime": 148.2624, + "eval_samples_per_second": 428.133, + "eval_steps_per_second": 1.673, + "step": 491000 + }, + { + "epoch": 2.000633472826358, + "grad_norm": 2.78810977935791, + "learning_rate": 0.003983138848347631, + "loss": 7.8623, + "step": 491100 + }, + { + "epoch": 2.0010408508497397, + "grad_norm": 3.5467097759246826, + "learning_rate": 0.003982739159314167, + "loss": 7.8841, + "step": 491200 + }, + { + "epoch": 2.0014482288731212, + "grad_norm": 10.010056495666504, + "learning_rate": 0.003982339411857772, + "loss": 7.8765, + "step": 491300 + }, + { + "epoch": 2.0018556068965028, + "grad_norm": 2.5871756076812744, + "learning_rate": 0.003981939605994252, + "loss": 7.8858, + "step": 491400 + }, + { + "epoch": 2.002262984919884, + "grad_norm": 6.318182468414307, + "learning_rate": 0.003981539741739408, + "loss": 7.8725, + "step": 491500 + }, + { + "epoch": 2.0026703629432654, + "grad_norm": 2.4176394939422607, + "learning_rate": 0.003981139819109054, + "loss": 7.8824, + "step": 491600 + }, + { + "epoch": 2.003077740966647, + "grad_norm": 4.502716064453125, + "learning_rate": 0.003980739838118991, + "loss": 7.8732, + "step": 491700 + }, + { + "epoch": 2.0034851189900285, + "grad_norm": 2.8272433280944824, + "learning_rate": 0.003980339798785041, + "loss": 7.8563, + "step": 491800 + }, + { + "epoch": 2.00389249701341, + "grad_norm": 2.6620304584503174, + "learning_rate": 0.00397993970112302, + "loss": 7.8728, + "step": 491900 + }, + { + "epoch": 2.0042998750367915, + "grad_norm": 4.319948673248291, + "learning_rate": 0.003979539545148746, + "loss": 7.8687, + "step": 492000 + }, + { + "epoch": 2.0042998750367915, + "eval_MaskedAccuracy": 0.49793333894702246, + "eval_loss": 1.6577565670013428, + "eval_runtime": 149.5513, + "eval_samples_per_second": 424.443, + "eval_steps_per_second": 1.658, + "step": 492000 + }, + { + "epoch": 2.0047072530601726, + "grad_norm": 1.5957372188568115, + "learning_rate": 0.003979139330878037, + "loss": 7.8855, + "step": 492100 + }, + { + "epoch": 2.005114631083554, + "grad_norm": 2.106097936630249, + "learning_rate": 0.003978739058326709, + "loss": 7.8231, + "step": 492200 + }, + { + "epoch": 2.0055220091069357, + "grad_norm": 4.910973072052002, + "learning_rate": 0.0039783387275105974, + "loss": 7.8617, + "step": 492300 + }, + { + "epoch": 2.005929387130317, + "grad_norm": 13.576258659362793, + "learning_rate": 0.003977938338445527, + "loss": 7.8684, + "step": 492400 + }, + { + "epoch": 2.0063367651536987, + "grad_norm": 8.415559768676758, + "learning_rate": 0.003977537891147327, + "loss": 7.8494, + "step": 492500 + }, + { + "epoch": 2.0067441431770803, + "grad_norm": 3.934183120727539, + "learning_rate": 0.003977137385631824, + "loss": 7.867, + "step": 492600 + }, + { + "epoch": 2.0071515212004614, + "grad_norm": 4.725040435791016, + "learning_rate": 0.003976736821914864, + "loss": 7.857, + "step": 492700 + }, + { + "epoch": 2.007558899223843, + "grad_norm": 2.1035566329956055, + "learning_rate": 0.00397633620001227, + "loss": 7.8536, + "step": 492800 + }, + { + "epoch": 2.0079662772472244, + "grad_norm": 5.978846073150635, + "learning_rate": 0.003975935519939892, + "loss": 7.8704, + "step": 492900 + }, + { + "epoch": 2.008373655270606, + "grad_norm": 2.2010936737060547, + "learning_rate": 0.003975534781713568, + "loss": 7.8661, + "step": 493000 + }, + { + "epoch": 2.008373655270606, + "eval_MaskedAccuracy": 0.49859173485021163, + "eval_loss": 1.6547033786773682, + "eval_runtime": 149.7466, + "eval_samples_per_second": 423.89, + "eval_steps_per_second": 1.656, + "step": 493000 + }, + { + "epoch": 2.0087810332939875, + "grad_norm": 5.013829708099365, + "learning_rate": 0.003975133985349135, + "loss": 7.8827, + "step": 493100 + }, + { + "epoch": 2.009188411317369, + "grad_norm": 2.1907660961151123, + "learning_rate": 0.003974733130862448, + "loss": 7.9041, + "step": 493200 + }, + { + "epoch": 2.00959578934075, + "grad_norm": 1.990098476409912, + "learning_rate": 0.003974332218269345, + "loss": 7.8904, + "step": 493300 + }, + { + "epoch": 2.0100031673641316, + "grad_norm": 2.6972572803497314, + "learning_rate": 0.003973931247585682, + "loss": 7.872, + "step": 493400 + }, + { + "epoch": 2.010410545387513, + "grad_norm": 7.332125186920166, + "learning_rate": 0.003973530218827305, + "loss": 7.8676, + "step": 493500 + }, + { + "epoch": 2.0108179234108947, + "grad_norm": 7.7640299797058105, + "learning_rate": 0.00397312913201008, + "loss": 7.8457, + "step": 493600 + }, + { + "epoch": 2.0112253014342762, + "grad_norm": 3.2454802989959717, + "learning_rate": 0.003972727987149856, + "loss": 7.874, + "step": 493700 + }, + { + "epoch": 2.011632679457658, + "grad_norm": 4.539496898651123, + "learning_rate": 0.003972326784262497, + "loss": 7.8526, + "step": 493800 + }, + { + "epoch": 2.0120400574810393, + "grad_norm": 1.4194692373275757, + "learning_rate": 0.003971925523363862, + "loss": 7.8871, + "step": 493900 + }, + { + "epoch": 2.0124474355044204, + "grad_norm": 2.3241472244262695, + "learning_rate": 0.003971524204469813, + "loss": 7.8561, + "step": 494000 + }, + { + "epoch": 2.0124474355044204, + "eval_MaskedAccuracy": 0.4992459584295612, + "eval_loss": 1.6490083932876587, + "eval_runtime": 148.9955, + "eval_samples_per_second": 426.026, + "eval_steps_per_second": 1.664, + "step": 494000 + }, + { + "epoch": 2.012854813527802, + "grad_norm": 2.2037127017974854, + "learning_rate": 0.003971122827596219, + "loss": 7.8539, + "step": 494100 + }, + { + "epoch": 2.0132621915511835, + "grad_norm": 2.7499496936798096, + "learning_rate": 0.003970721392758945, + "loss": 7.8661, + "step": 494200 + }, + { + "epoch": 2.013669569574565, + "grad_norm": 2.684345006942749, + "learning_rate": 0.003970319899973867, + "loss": 7.8737, + "step": 494300 + }, + { + "epoch": 2.0140769475979465, + "grad_norm": 2.927569627761841, + "learning_rate": 0.003969918349256856, + "loss": 7.8905, + "step": 494400 + }, + { + "epoch": 2.014484325621328, + "grad_norm": 3.2021238803863525, + "learning_rate": 0.0039695167406237864, + "loss": 7.8662, + "step": 494500 + }, + { + "epoch": 2.014891703644709, + "grad_norm": 1.8485453128814697, + "learning_rate": 0.0039691150740905415, + "loss": 7.8778, + "step": 494600 + }, + { + "epoch": 2.0152990816680907, + "grad_norm": 8.958436965942383, + "learning_rate": 0.0039687133496729915, + "loss": 7.9229, + "step": 494700 + }, + { + "epoch": 2.015706459691472, + "grad_norm": 1.9357823133468628, + "learning_rate": 0.003968311567387029, + "loss": 7.8505, + "step": 494800 + }, + { + "epoch": 2.0161138377148538, + "grad_norm": 1.8658493757247925, + "learning_rate": 0.003967909727248532, + "loss": 7.8604, + "step": 494900 + }, + { + "epoch": 2.0165212157382353, + "grad_norm": 2.6488263607025146, + "learning_rate": 0.003967507829273389, + "loss": 7.8379, + "step": 495000 + }, + { + "epoch": 2.0165212157382353, + "eval_MaskedAccuracy": 0.4984190639202395, + "eval_loss": 1.6625776290893555, + "eval_runtime": 149.651, + "eval_samples_per_second": 424.16, + "eval_steps_per_second": 1.657, + "step": 495000 + }, + { + "epoch": 2.016928593761617, + "grad_norm": 3.5286457538604736, + "learning_rate": 0.003967105873477492, + "loss": 7.8431, + "step": 495100 + }, + { + "epoch": 2.017335971784998, + "grad_norm": 2.9637646675109863, + "learning_rate": 0.003966703859876732, + "loss": 7.8244, + "step": 495200 + }, + { + "epoch": 2.0177433498083794, + "grad_norm": 1.8317160606384277, + "learning_rate": 0.0039663017884870025, + "loss": 7.8177, + "step": 495300 + }, + { + "epoch": 2.018150727831761, + "grad_norm": 6.571487903594971, + "learning_rate": 0.003965899659324195, + "loss": 7.856, + "step": 495400 + }, + { + "epoch": 2.0185581058551425, + "grad_norm": 4.068114757537842, + "learning_rate": 0.003965497472404212, + "loss": 7.8958, + "step": 495500 + }, + { + "epoch": 2.018965483878524, + "grad_norm": 4.081752300262451, + "learning_rate": 0.0039650952277429504, + "loss": 7.8686, + "step": 495600 + }, + { + "epoch": 2.0193728619019056, + "grad_norm": 3.2896957397460938, + "learning_rate": 0.003964692925356319, + "loss": 7.8418, + "step": 495700 + }, + { + "epoch": 2.0197802399252867, + "grad_norm": 2.8759496212005615, + "learning_rate": 0.003964290565260214, + "loss": 7.8325, + "step": 495800 + }, + { + "epoch": 2.020187617948668, + "grad_norm": 6.6146416664123535, + "learning_rate": 0.00396388814747055, + "loss": 7.8462, + "step": 495900 + }, + { + "epoch": 2.0205949959720497, + "grad_norm": 3.111217975616455, + "learning_rate": 0.003963485672003234, + "loss": 7.8373, + "step": 496000 + }, + { + "epoch": 2.0205949959720497, + "eval_MaskedAccuracy": 0.49895371591193666, + "eval_loss": 1.6523817777633667, + "eval_runtime": 149.372, + "eval_samples_per_second": 424.952, + "eval_steps_per_second": 1.66, + "step": 496000 + }, + { + "epoch": 2.0210023739954313, + "grad_norm": 2.6997292041778564, + "learning_rate": 0.003963083138874182, + "loss": 7.8901, + "step": 496100 + }, + { + "epoch": 2.021409752018813, + "grad_norm": 1.9682706594467163, + "learning_rate": 0.0039626805480993, + "loss": 7.8991, + "step": 496200 + }, + { + "epoch": 2.0218171300421943, + "grad_norm": 8.418217658996582, + "learning_rate": 0.003962277899694509, + "loss": 7.8693, + "step": 496300 + }, + { + "epoch": 2.022224508065576, + "grad_norm": 1.703735113143921, + "learning_rate": 0.003961875193675732, + "loss": 7.8759, + "step": 496400 + }, + { + "epoch": 2.022631886088957, + "grad_norm": 3.291315793991089, + "learning_rate": 0.003961472430058886, + "loss": 7.8827, + "step": 496500 + }, + { + "epoch": 2.0230392641123385, + "grad_norm": 9.770030975341797, + "learning_rate": 0.003961069608859893, + "loss": 7.8737, + "step": 496600 + }, + { + "epoch": 2.02344664213572, + "grad_norm": 2.2468302249908447, + "learning_rate": 0.003960666730094681, + "loss": 7.8525, + "step": 496700 + }, + { + "epoch": 2.0238540201591015, + "grad_norm": 2.570030689239502, + "learning_rate": 0.003960263793779182, + "loss": 7.8559, + "step": 496800 + }, + { + "epoch": 2.024261398182483, + "grad_norm": 9.67927074432373, + "learning_rate": 0.003959860799929313, + "loss": 7.8569, + "step": 496900 + }, + { + "epoch": 2.0246687762058646, + "grad_norm": 5.694683074951172, + "learning_rate": 0.0039594577485610235, + "loss": 7.8907, + "step": 497000 + }, + { + "epoch": 2.0246687762058646, + "eval_MaskedAccuracy": 0.49784782853504506, + "eval_loss": 1.651322841644287, + "eval_runtime": 147.3449, + "eval_samples_per_second": 430.799, + "eval_steps_per_second": 1.683, + "step": 497000 + }, + { + "epoch": 2.0250761542292457, + "grad_norm": 5.3224334716796875, + "learning_rate": 0.0039590546396902375, + "loss": 7.883, + "step": 497100 + }, + { + "epoch": 2.0254835322526272, + "grad_norm": 1.8543610572814941, + "learning_rate": 0.0039586514733329, + "loss": 7.8683, + "step": 497200 + }, + { + "epoch": 2.0258909102760088, + "grad_norm": 2.410493850708008, + "learning_rate": 0.003958248249504936, + "loss": 7.8893, + "step": 497300 + }, + { + "epoch": 2.0262982882993903, + "grad_norm": 1.44971764087677, + "learning_rate": 0.0039578449682223, + "loss": 7.8402, + "step": 497400 + }, + { + "epoch": 2.026705666322772, + "grad_norm": 6.2107086181640625, + "learning_rate": 0.003957441629500934, + "loss": 7.861, + "step": 497500 + }, + { + "epoch": 2.0271130443461534, + "grad_norm": 1.6982815265655518, + "learning_rate": 0.00395703823335678, + "loss": 7.8832, + "step": 497600 + }, + { + "epoch": 2.0275204223695344, + "grad_norm": 8.036022186279297, + "learning_rate": 0.00395663477980579, + "loss": 7.8428, + "step": 497700 + }, + { + "epoch": 2.027927800392916, + "grad_norm": 4.244318962097168, + "learning_rate": 0.003956231268863912, + "loss": 7.848, + "step": 497800 + }, + { + "epoch": 2.0283351784162975, + "grad_norm": 3.505483388900757, + "learning_rate": 0.003955827700547101, + "loss": 7.8499, + "step": 497900 + }, + { + "epoch": 2.028742556439679, + "grad_norm": 1.8905874490737915, + "learning_rate": 0.003955424074871312, + "loss": 7.8887, + "step": 498000 + }, + { + "epoch": 2.028742556439679, + "eval_MaskedAccuracy": 0.4991002723353723, + "eval_loss": 1.649802803993225, + "eval_runtime": 154.2681, + "eval_samples_per_second": 411.465, + "eval_steps_per_second": 1.608, + "step": 498000 + }, + { + "epoch": 2.0291499344630606, + "grad_norm": 3.2744388580322266, + "learning_rate": 0.003955020391852497, + "loss": 7.8617, + "step": 498100 + }, + { + "epoch": 2.029557312486442, + "grad_norm": 2.8778340816497803, + "learning_rate": 0.003954616651506626, + "loss": 7.8796, + "step": 498200 + }, + { + "epoch": 2.029964690509823, + "grad_norm": 3.6095352172851562, + "learning_rate": 0.003954212853849658, + "loss": 7.8673, + "step": 498300 + }, + { + "epoch": 2.0303720685332047, + "grad_norm": 9.938542366027832, + "learning_rate": 0.00395380899889755, + "loss": 7.8706, + "step": 498400 + }, + { + "epoch": 2.0307794465565863, + "grad_norm": 7.641909599304199, + "learning_rate": 0.003953405086666274, + "loss": 7.853, + "step": 498500 + }, + { + "epoch": 2.031186824579968, + "grad_norm": 1.5943167209625244, + "learning_rate": 0.003953001117171798, + "loss": 7.8503, + "step": 498600 + }, + { + "epoch": 2.0315942026033493, + "grad_norm": 2.4938507080078125, + "learning_rate": 0.003952597090430098, + "loss": 7.845, + "step": 498700 + }, + { + "epoch": 2.032001580626731, + "grad_norm": 3.876598834991455, + "learning_rate": 0.003952193006457136, + "loss": 7.846, + "step": 498800 + }, + { + "epoch": 2.0324089586501124, + "grad_norm": 7.685597896575928, + "learning_rate": 0.003951788865268895, + "loss": 7.8387, + "step": 498900 + }, + { + "epoch": 2.0328163366734935, + "grad_norm": 2.6308696269989014, + "learning_rate": 0.003951384666881356, + "loss": 7.8479, + "step": 499000 + }, + { + "epoch": 2.0328163366734935, + "eval_MaskedAccuracy": 0.4985315018264538, + "eval_loss": 1.650436282157898, + "eval_runtime": 154.2302, + "eval_samples_per_second": 411.567, + "eval_steps_per_second": 1.608, + "step": 499000 + }, + { + "epoch": 2.033223714696875, + "grad_norm": 6.766794681549072, + "learning_rate": 0.0039509804113104985, + "loss": 7.886, + "step": 499100 + }, + { + "epoch": 2.0336310927202566, + "grad_norm": 3.8292505741119385, + "learning_rate": 0.003950576098572298, + "loss": 7.8787, + "step": 499200 + }, + { + "epoch": 2.034038470743638, + "grad_norm": 2.4238884449005127, + "learning_rate": 0.003950171728682741, + "loss": 7.8691, + "step": 499300 + }, + { + "epoch": 2.0344458487670196, + "grad_norm": 9.240036964416504, + "learning_rate": 0.003949767301657816, + "loss": 7.8608, + "step": 499400 + }, + { + "epoch": 2.034853226790401, + "grad_norm": 3.8402020931243896, + "learning_rate": 0.0039493628175135105, + "loss": 7.868, + "step": 499500 + }, + { + "epoch": 2.0352606048137822, + "grad_norm": 2.0422768592834473, + "learning_rate": 0.003948958276265822, + "loss": 7.8657, + "step": 499600 + }, + { + "epoch": 2.0356679828371638, + "grad_norm": 2.8022241592407227, + "learning_rate": 0.003948553677930741, + "loss": 7.8771, + "step": 499700 + }, + { + "epoch": 2.0360753608605453, + "grad_norm": 5.078003883361816, + "learning_rate": 0.003948149022524265, + "loss": 7.8755, + "step": 499800 + }, + { + "epoch": 2.036482738883927, + "grad_norm": 2.3374109268188477, + "learning_rate": 0.003947744310062384, + "loss": 7.8508, + "step": 499900 + }, + { + "epoch": 2.0368901169073084, + "grad_norm": 5.849517345428467, + "learning_rate": 0.003947339540561103, + "loss": 7.8546, + "step": 500000 + }, + { + "epoch": 2.0368901169073084, + "eval_MaskedAccuracy": 0.4980729870603918, + "eval_loss": 1.6496412754058838, + "eval_runtime": 150.4494, + "eval_samples_per_second": 421.909, + "eval_steps_per_second": 1.648, + "step": 500000 + }, + { + "epoch": 2.03729749493069, + "grad_norm": 1.8953150510787964, + "learning_rate": 0.003946934714036427, + "loss": 7.8451, + "step": 500100 + }, + { + "epoch": 2.037704872954071, + "grad_norm": 2.111159563064575, + "learning_rate": 0.003946529830504364, + "loss": 7.8453, + "step": 500200 + }, + { + "epoch": 2.0381122509774525, + "grad_norm": 2.262208938598633, + "learning_rate": 0.003946124889980916, + "loss": 7.8774, + "step": 500300 + }, + { + "epoch": 2.038519629000834, + "grad_norm": 4.010697841644287, + "learning_rate": 0.0039457198924820885, + "loss": 7.8669, + "step": 500400 + }, + { + "epoch": 2.0389270070242156, + "grad_norm": 5.616472244262695, + "learning_rate": 0.003945314838023902, + "loss": 7.8648, + "step": 500500 + }, + { + "epoch": 2.039334385047597, + "grad_norm": 5.5384111404418945, + "learning_rate": 0.003944909726622365, + "loss": 7.845, + "step": 500600 + }, + { + "epoch": 2.0397417630709787, + "grad_norm": 4.254812717437744, + "learning_rate": 0.0039445045582934945, + "loss": 7.8772, + "step": 500700 + }, + { + "epoch": 2.0401491410943597, + "grad_norm": 2.6096653938293457, + "learning_rate": 0.003944099333053311, + "loss": 7.856, + "step": 500800 + }, + { + "epoch": 2.0405565191177413, + "grad_norm": 2.5192205905914307, + "learning_rate": 0.003943694050917833, + "loss": 7.8865, + "step": 500900 + }, + { + "epoch": 2.040963897141123, + "grad_norm": 1.8438318967819214, + "learning_rate": 0.003943288711903082, + "loss": 7.8819, + "step": 501000 + }, + { + "epoch": 2.040963897141123, + "eval_MaskedAccuracy": 0.4982288280269719, + "eval_loss": 1.647727370262146, + "eval_runtime": 157.4726, + "eval_samples_per_second": 403.092, + "eval_steps_per_second": 1.575, + "step": 501000 + }, + { + "epoch": 2.0413712751645043, + "grad_norm": 5.1324076652526855, + "learning_rate": 0.003942883316025085, + "loss": 7.8547, + "step": 501100 + }, + { + "epoch": 2.041778653187886, + "grad_norm": 4.713312149047852, + "learning_rate": 0.003942477863299873, + "loss": 7.8933, + "step": 501200 + }, + { + "epoch": 2.0421860312112674, + "grad_norm": 3.2050485610961914, + "learning_rate": 0.003942072353743475, + "loss": 7.8831, + "step": 501300 + }, + { + "epoch": 2.042593409234649, + "grad_norm": 5.312648773193359, + "learning_rate": 0.00394166678737192, + "loss": 7.8576, + "step": 501400 + }, + { + "epoch": 2.04300078725803, + "grad_norm": 1.825149416923523, + "learning_rate": 0.003941261164201243, + "loss": 7.8245, + "step": 501500 + }, + { + "epoch": 2.0434081652814116, + "grad_norm": 6.237276554107666, + "learning_rate": 0.00394085548424749, + "loss": 7.8614, + "step": 501600 + }, + { + "epoch": 2.043815543304793, + "grad_norm": 3.27168345451355, + "learning_rate": 0.003940449747526681, + "loss": 7.8625, + "step": 501700 + }, + { + "epoch": 2.0442229213281746, + "grad_norm": 5.12900447845459, + "learning_rate": 0.003940043954054862, + "loss": 7.8834, + "step": 501800 + }, + { + "epoch": 2.044630299351556, + "grad_norm": 3.0178205966949463, + "learning_rate": 0.003939638103848086, + "loss": 7.9093, + "step": 501900 + }, + { + "epoch": 2.0450376773749377, + "grad_norm": 2.8164114952087402, + "learning_rate": 0.003939232196922395, + "loss": 7.8627, + "step": 502000 + }, + { + "epoch": 2.0450376773749377, + "eval_MaskedAccuracy": 0.49816018406318197, + "eval_loss": 1.6543567180633545, + "eval_runtime": 163.2898, + "eval_samples_per_second": 388.732, + "eval_steps_per_second": 1.519, + "step": 502000 + }, + { + "epoch": 2.045445055398319, + "grad_norm": 5.639309883117676, + "learning_rate": 0.003938826233293827, + "loss": 7.8648, + "step": 502100 + }, + { + "epoch": 2.0458524334217003, + "grad_norm": 3.8196420669555664, + "learning_rate": 0.003938420212978441, + "loss": 7.8399, + "step": 502200 + }, + { + "epoch": 2.046259811445082, + "grad_norm": 6.940499782562256, + "learning_rate": 0.003938014135992289, + "loss": 7.86, + "step": 502300 + }, + { + "epoch": 2.0466671894684634, + "grad_norm": 4.158481121063232, + "learning_rate": 0.0039376080023514224, + "loss": 7.8347, + "step": 502400 + }, + { + "epoch": 2.047074567491845, + "grad_norm": 6.34360408782959, + "learning_rate": 0.0039372018120719015, + "loss": 7.8619, + "step": 502500 + }, + { + "epoch": 2.0474819455152264, + "grad_norm": 1.8799878358840942, + "learning_rate": 0.003936795565169784, + "loss": 7.8507, + "step": 502600 + }, + { + "epoch": 2.0478893235386075, + "grad_norm": 6.622459411621094, + "learning_rate": 0.003936389261661131, + "loss": 7.8526, + "step": 502700 + }, + { + "epoch": 2.048296701561989, + "grad_norm": 3.0950870513916016, + "learning_rate": 0.003935982901562, + "loss": 7.8605, + "step": 502800 + }, + { + "epoch": 2.0487040795853706, + "grad_norm": 2.631889581680298, + "learning_rate": 0.0039355764848884615, + "loss": 7.8505, + "step": 502900 + }, + { + "epoch": 2.049111457608752, + "grad_norm": 5.816521167755127, + "learning_rate": 0.003935170011656585, + "loss": 7.8576, + "step": 503000 + }, + { + "epoch": 2.049111457608752, + "eval_MaskedAccuracy": 0.49909299324845074, + "eval_loss": 1.6481876373291016, + "eval_runtime": 208.8966, + "eval_samples_per_second": 303.863, + "eval_steps_per_second": 1.187, + "step": 503000 + }, + { + "epoch": 2.0495188356321337, + "grad_norm": 4.197556018829346, + "learning_rate": 0.003934763481882442, + "loss": 7.8734, + "step": 503100 + }, + { + "epoch": 2.049926213655515, + "grad_norm": 4.809361934661865, + "learning_rate": 0.003934356895582095, + "loss": 7.8417, + "step": 503200 + }, + { + "epoch": 2.0503335916788963, + "grad_norm": 4.074956893920898, + "learning_rate": 0.003933950252771629, + "loss": 7.8513, + "step": 503300 + }, + { + "epoch": 2.050740969702278, + "grad_norm": 6.168447971343994, + "learning_rate": 0.0039335435534671144, + "loss": 7.8305, + "step": 503400 + }, + { + "epoch": 2.0511483477256593, + "grad_norm": 2.2486164569854736, + "learning_rate": 0.0039331367976846365, + "loss": 7.8466, + "step": 503500 + }, + { + "epoch": 2.051555725749041, + "grad_norm": 2.659501791000366, + "learning_rate": 0.003932729985440271, + "loss": 7.8763, + "step": 503600 + }, + { + "epoch": 2.0519631037724224, + "grad_norm": 2.3019237518310547, + "learning_rate": 0.003932323116750102, + "loss": 7.8584, + "step": 503700 + }, + { + "epoch": 2.052370481795804, + "grad_norm": 3.082778215408325, + "learning_rate": 0.003931916191630222, + "loss": 7.8585, + "step": 503800 + }, + { + "epoch": 2.0527778598191855, + "grad_norm": 5.441107749938965, + "learning_rate": 0.003931509210096709, + "loss": 7.8785, + "step": 503900 + }, + { + "epoch": 2.0531852378425666, + "grad_norm": 3.09259295463562, + "learning_rate": 0.003931102172165656, + "loss": 7.8469, + "step": 504000 + }, + { + "epoch": 2.0531852378425666, + "eval_MaskedAccuracy": 0.498288683919097, + "eval_loss": 1.6587414741516113, + "eval_runtime": 165.1973, + "eval_samples_per_second": 384.244, + "eval_steps_per_second": 1.501, + "step": 504000 + }, + { + "epoch": 2.053592615865948, + "grad_norm": 2.1826281547546387, + "learning_rate": 0.003930695077853157, + "loss": 7.8734, + "step": 504100 + }, + { + "epoch": 2.0539999938893296, + "grad_norm": 4.1279296875, + "learning_rate": 0.003930287927175303, + "loss": 7.8529, + "step": 504200 + }, + { + "epoch": 2.054407371912711, + "grad_norm": 1.551027536392212, + "learning_rate": 0.003929880720148198, + "loss": 7.8445, + "step": 504300 + }, + { + "epoch": 2.0548147499360927, + "grad_norm": 2.608344793319702, + "learning_rate": 0.003929473456787942, + "loss": 7.8375, + "step": 504400 + }, + { + "epoch": 2.0552221279594742, + "grad_norm": 5.23867130279541, + "learning_rate": 0.00392906613711063, + "loss": 7.8857, + "step": 504500 + }, + { + "epoch": 2.0556295059828553, + "grad_norm": 4.096222400665283, + "learning_rate": 0.003928658761132365, + "loss": 7.8897, + "step": 504600 + }, + { + "epoch": 2.056036884006237, + "grad_norm": 6.972052097320557, + "learning_rate": 0.003928251328869257, + "loss": 7.8818, + "step": 504700 + }, + { + "epoch": 2.0564442620296184, + "grad_norm": 3.181206226348877, + "learning_rate": 0.003927843840337411, + "loss": 7.8621, + "step": 504800 + }, + { + "epoch": 2.056851640053, + "grad_norm": 1.3935866355895996, + "learning_rate": 0.00392743629555294, + "loss": 7.8976, + "step": 504900 + }, + { + "epoch": 2.0572590180763815, + "grad_norm": 4.864538669586182, + "learning_rate": 0.003927028694531953, + "loss": 7.8811, + "step": 505000 + }, + { + "epoch": 2.0572590180763815, + "eval_MaskedAccuracy": 0.497431730936525, + "eval_loss": 1.6568992137908936, + "eval_runtime": 178.3794, + "eval_samples_per_second": 355.848, + "eval_steps_per_second": 1.39, + "step": 505000 + }, + { + "epoch": 2.057666396099763, + "grad_norm": 2.557091236114502, + "learning_rate": 0.003926621037290565, + "loss": 7.8348, + "step": 505100 + }, + { + "epoch": 2.058073774123144, + "grad_norm": 3.1932973861694336, + "learning_rate": 0.003926213323844895, + "loss": 7.8379, + "step": 505200 + }, + { + "epoch": 2.0584811521465256, + "grad_norm": 3.5232841968536377, + "learning_rate": 0.003925805554211059, + "loss": 7.8917, + "step": 505300 + }, + { + "epoch": 2.058888530169907, + "grad_norm": 1.8708343505859375, + "learning_rate": 0.003925397728405181, + "loss": 7.8413, + "step": 505400 + }, + { + "epoch": 2.0592959081932887, + "grad_norm": 4.613933086395264, + "learning_rate": 0.003924989846443382, + "loss": 7.8797, + "step": 505500 + }, + { + "epoch": 2.05970328621667, + "grad_norm": 8.044772148132324, + "learning_rate": 0.003924581908341792, + "loss": 7.8726, + "step": 505600 + }, + { + "epoch": 2.0601106642400517, + "grad_norm": 4.191234588623047, + "learning_rate": 0.0039241739141165355, + "loss": 7.8826, + "step": 505700 + }, + { + "epoch": 2.060518042263433, + "grad_norm": 3.295250415802002, + "learning_rate": 0.003923765863783748, + "loss": 7.8648, + "step": 505800 + }, + { + "epoch": 2.0609254202868144, + "grad_norm": 1.919803500175476, + "learning_rate": 0.003923357757359557, + "loss": 7.8739, + "step": 505900 + }, + { + "epoch": 2.061332798310196, + "grad_norm": 3.5921905040740967, + "learning_rate": 0.003922949594860091, + "loss": 7.8661, + "step": 506000 + }, + { + "epoch": 2.061332798310196, + "eval_MaskedAccuracy": 0.4990109497480266, + "eval_loss": 1.6470351219177246, + "eval_runtime": 169.5359, + "eval_samples_per_second": 374.41, + "eval_steps_per_second": 1.463, + "step": 506000 + }, + { + "epoch": 2.0617401763335774, + "grad_norm": 4.288949489593506, + "learning_rate": 0.003922541376301499, + "loss": 7.8576, + "step": 506100 + }, + { + "epoch": 2.062147554356959, + "grad_norm": 3.8970656394958496, + "learning_rate": 0.003922133101699917, + "loss": 7.8358, + "step": 506200 + }, + { + "epoch": 2.0625549323803405, + "grad_norm": 5.335996627807617, + "learning_rate": 0.003921724771071478, + "loss": 7.8553, + "step": 506300 + }, + { + "epoch": 2.062962310403722, + "grad_norm": 2.890336036682129, + "learning_rate": 0.003921316384432334, + "loss": 7.8439, + "step": 506400 + }, + { + "epoch": 2.063369688427103, + "grad_norm": 2.4204955101013184, + "learning_rate": 0.0039209079417986325, + "loss": 7.8322, + "step": 506500 + }, + { + "epoch": 2.0637770664504846, + "grad_norm": 4.1408772468566895, + "learning_rate": 0.003920499443186511, + "loss": 7.8684, + "step": 506600 + }, + { + "epoch": 2.064184444473866, + "grad_norm": 5.479893207550049, + "learning_rate": 0.003920090888612128, + "loss": 7.8955, + "step": 506700 + }, + { + "epoch": 2.0645918224972477, + "grad_norm": 4.912505626678467, + "learning_rate": 0.003919682278091631, + "loss": 7.8907, + "step": 506800 + }, + { + "epoch": 2.0649992005206292, + "grad_norm": 4.852830410003662, + "learning_rate": 0.003919273611641179, + "loss": 7.8876, + "step": 506900 + }, + { + "epoch": 2.0654065785440108, + "grad_norm": 8.33307933807373, + "learning_rate": 0.003918864889276931, + "loss": 7.8647, + "step": 507000 + }, + { + "epoch": 2.0654065785440108, + "eval_MaskedAccuracy": 0.49786567208399146, + "eval_loss": 1.6547155380249023, + "eval_runtime": 156.5502, + "eval_samples_per_second": 405.467, + "eval_steps_per_second": 1.584, + "step": 507000 + }, + { + "epoch": 2.065813956567392, + "grad_norm": 7.5528483390808105, + "learning_rate": 0.003918456111015038, + "loss": 7.877, + "step": 507100 + }, + { + "epoch": 2.0662213345907734, + "grad_norm": 8.536968231201172, + "learning_rate": 0.003918047276871674, + "loss": 7.8847, + "step": 507200 + }, + { + "epoch": 2.066628712614155, + "grad_norm": 2.540980339050293, + "learning_rate": 0.003917638386862986, + "loss": 7.8876, + "step": 507300 + }, + { + "epoch": 2.0670360906375365, + "grad_norm": 1.9017066955566406, + "learning_rate": 0.003917229441005151, + "loss": 7.8696, + "step": 507400 + }, + { + "epoch": 2.067443468660918, + "grad_norm": 4.358914375305176, + "learning_rate": 0.003916820439314332, + "loss": 7.8921, + "step": 507500 + }, + { + "epoch": 2.0678508466842995, + "grad_norm": 8.98803997039795, + "learning_rate": 0.003916411381806701, + "loss": 7.8506, + "step": 507600 + }, + { + "epoch": 2.0682582247076806, + "grad_norm": 1.7998460531234741, + "learning_rate": 0.003916002268498411, + "loss": 7.8614, + "step": 507700 + }, + { + "epoch": 2.068665602731062, + "grad_norm": 4.80010986328125, + "learning_rate": 0.0039155930994056514, + "loss": 7.8716, + "step": 507800 + }, + { + "epoch": 2.0690729807544437, + "grad_norm": 4.340163707733154, + "learning_rate": 0.003915183874544606, + "loss": 7.8666, + "step": 507900 + }, + { + "epoch": 2.069480358777825, + "grad_norm": 5.179835796356201, + "learning_rate": 0.003914774593931448, + "loss": 7.8264, + "step": 508000 + }, + { + "epoch": 2.069480358777825, + "eval_MaskedAccuracy": 0.49846833664187323, + "eval_loss": 1.6485658884048462, + "eval_runtime": 159.1141, + "eval_samples_per_second": 398.934, + "eval_steps_per_second": 1.559, + "step": 508000 + }, + { + "epoch": 2.0698877368012067, + "grad_norm": 2.6034064292907715, + "learning_rate": 0.003914365257582357, + "loss": 7.8407, + "step": 508100 + }, + { + "epoch": 2.0702951148245883, + "grad_norm": 3.7942333221435547, + "learning_rate": 0.003913955865513524, + "loss": 7.8936, + "step": 508200 + }, + { + "epoch": 2.0707024928479694, + "grad_norm": 5.063714504241943, + "learning_rate": 0.003913546417741127, + "loss": 7.8372, + "step": 508300 + }, + { + "epoch": 2.071109870871351, + "grad_norm": 2.7904000282287598, + "learning_rate": 0.003913136914281354, + "loss": 7.8756, + "step": 508400 + }, + { + "epoch": 2.0715172488947324, + "grad_norm": 1.6965965032577515, + "learning_rate": 0.003912727355150397, + "loss": 7.8745, + "step": 508500 + }, + { + "epoch": 2.071924626918114, + "grad_norm": 2.195662021636963, + "learning_rate": 0.003912317740364454, + "loss": 7.8828, + "step": 508600 + }, + { + "epoch": 2.0723320049414955, + "grad_norm": 2.2256686687469482, + "learning_rate": 0.003911908069939709, + "loss": 7.8735, + "step": 508700 + }, + { + "epoch": 2.072739382964877, + "grad_norm": 4.389769554138184, + "learning_rate": 0.003911498343892368, + "loss": 7.8894, + "step": 508800 + }, + { + "epoch": 2.0731467609882586, + "grad_norm": 4.874904632568359, + "learning_rate": 0.003911088562238624, + "loss": 7.8908, + "step": 508900 + }, + { + "epoch": 2.0735541390116397, + "grad_norm": 4.446139812469482, + "learning_rate": 0.003910678724994674, + "loss": 7.8519, + "step": 509000 + }, + { + "epoch": 2.0735541390116397, + "eval_MaskedAccuracy": 0.4977517520413056, + "eval_loss": 1.6625655889511108, + "eval_runtime": 153.8827, + "eval_samples_per_second": 412.496, + "eval_steps_per_second": 1.612, + "step": 509000 + }, + { + "epoch": 2.073961517035021, + "grad_norm": 2.1489932537078857, + "learning_rate": 0.003910268832176721, + "loss": 7.8871, + "step": 509100 + }, + { + "epoch": 2.0743688950584027, + "grad_norm": 3.3913214206695557, + "learning_rate": 0.003909858883800986, + "loss": 7.8739, + "step": 509200 + }, + { + "epoch": 2.0747762730817843, + "grad_norm": 4.963801860809326, + "learning_rate": 0.00390944887988366, + "loss": 7.8621, + "step": 509300 + }, + { + "epoch": 2.075183651105166, + "grad_norm": 3.815263271331787, + "learning_rate": 0.003909038820440954, + "loss": 7.8576, + "step": 509400 + }, + { + "epoch": 2.0755910291285473, + "grad_norm": 4.6263532638549805, + "learning_rate": 0.003908628705489091, + "loss": 7.859, + "step": 509500 + }, + { + "epoch": 2.0759984071519284, + "grad_norm": 7.340849876403809, + "learning_rate": 0.0039082185350442725, + "loss": 7.8834, + "step": 509600 + }, + { + "epoch": 2.07640578517531, + "grad_norm": 3.5412895679473877, + "learning_rate": 0.003907808309122724, + "loss": 7.8686, + "step": 509700 + }, + { + "epoch": 2.0768131631986915, + "grad_norm": 2.75848650932312, + "learning_rate": 0.003907398027740666, + "loss": 7.8514, + "step": 509800 + }, + { + "epoch": 2.077220541222073, + "grad_norm": 2.8344566822052, + "learning_rate": 0.003906987690914311, + "loss": 7.8755, + "step": 509900 + }, + { + "epoch": 2.0776279192454545, + "grad_norm": 2.3449015617370605, + "learning_rate": 0.003906577298659882, + "loss": 7.8879, + "step": 510000 + }, + { + "epoch": 2.0776279192454545, + "eval_MaskedAccuracy": 0.4976222971445992, + "eval_loss": 1.6617436408996582, + "eval_runtime": 160.4301, + "eval_samples_per_second": 395.661, + "eval_steps_per_second": 1.546, + "step": 510000 + }, + { + "epoch": 2.078035297268836, + "grad_norm": 4.79791784286499, + "learning_rate": 0.0039061668509936093, + "loss": 7.8708, + "step": 510100 + }, + { + "epoch": 2.078442675292217, + "grad_norm": 4.853598117828369, + "learning_rate": 0.0039057563479317366, + "loss": 7.8542, + "step": 510200 + }, + { + "epoch": 2.0788500533155987, + "grad_norm": 3.1397154331207275, + "learning_rate": 0.0039053457894904684, + "loss": 7.8636, + "step": 510300 + }, + { + "epoch": 2.0792574313389802, + "grad_norm": 2.812790632247925, + "learning_rate": 0.003904935175686046, + "loss": 7.8622, + "step": 510400 + }, + { + "epoch": 2.0796648093623618, + "grad_norm": 5.004056930541992, + "learning_rate": 0.003904524506534699, + "loss": 7.8346, + "step": 510500 + }, + { + "epoch": 2.0800721873857433, + "grad_norm": 5.575089454650879, + "learning_rate": 0.003904113782052664, + "loss": 7.8355, + "step": 510600 + }, + { + "epoch": 2.080479565409125, + "grad_norm": 4.813459873199463, + "learning_rate": 0.0039037030022561784, + "loss": 7.8792, + "step": 510700 + }, + { + "epoch": 2.080886943432506, + "grad_norm": 2.780196189880371, + "learning_rate": 0.0039032921671614796, + "loss": 7.8541, + "step": 510800 + }, + { + "epoch": 2.0812943214558874, + "grad_norm": 2.5562331676483154, + "learning_rate": 0.0039028812767848125, + "loss": 7.8891, + "step": 510900 + }, + { + "epoch": 2.081701699479269, + "grad_norm": 2.959256410598755, + "learning_rate": 0.0039024703311424267, + "loss": 7.9042, + "step": 511000 + }, + { + "epoch": 2.081701699479269, + "eval_MaskedAccuracy": 0.49948171891703885, + "eval_loss": 1.6568866968154907, + "eval_runtime": 178.5216, + "eval_samples_per_second": 355.565, + "eval_steps_per_second": 1.389, + "step": 511000 + }, + { + "epoch": 2.0821090775026505, + "grad_norm": 2.2464606761932373, + "learning_rate": 0.0039020593302505576, + "loss": 7.8597, + "step": 511100 + }, + { + "epoch": 2.082516455526032, + "grad_norm": 7.631260871887207, + "learning_rate": 0.0039016482741254656, + "loss": 7.8887, + "step": 511200 + }, + { + "epoch": 2.0829238335494136, + "grad_norm": 5.0504655838012695, + "learning_rate": 0.003901237162783396, + "loss": 7.8744, + "step": 511300 + }, + { + "epoch": 2.083331211572795, + "grad_norm": 6.261029243469238, + "learning_rate": 0.003900825996240606, + "loss": 7.862, + "step": 511400 + }, + { + "epoch": 2.083738589596176, + "grad_norm": 2.310039520263672, + "learning_rate": 0.003900414774513348, + "loss": 7.8867, + "step": 511500 + }, + { + "epoch": 2.0841459676195577, + "grad_norm": 1.6989558935165405, + "learning_rate": 0.00390000349761788, + "loss": 7.8412, + "step": 511600 + }, + { + "epoch": 2.0845533456429393, + "grad_norm": 1.8398146629333496, + "learning_rate": 0.0038995921655704613, + "loss": 7.8892, + "step": 511700 + }, + { + "epoch": 2.084960723666321, + "grad_norm": 2.726363182067871, + "learning_rate": 0.0038991807783873584, + "loss": 7.8887, + "step": 511800 + }, + { + "epoch": 2.0853681016897023, + "grad_norm": 2.66855525970459, + "learning_rate": 0.0038987693360848334, + "loss": 7.8638, + "step": 511900 + }, + { + "epoch": 2.085775479713084, + "grad_norm": 2.9646921157836914, + "learning_rate": 0.0038983578386791486, + "loss": 7.8417, + "step": 512000 + }, + { + "epoch": 2.085775479713084, + "eval_MaskedAccuracy": 0.49866002346569377, + "eval_loss": 1.6616344451904297, + "eval_runtime": 162.631, + "eval_samples_per_second": 390.307, + "eval_steps_per_second": 1.525, + "step": 512000 + }, + { + "epoch": 2.086182857736465, + "grad_norm": 2.0324671268463135, + "learning_rate": 0.003897946286186581, + "loss": 7.855, + "step": 512100 + }, + { + "epoch": 2.0865902357598465, + "grad_norm": 3.845881223678589, + "learning_rate": 0.0038975346786233934, + "loss": 7.8509, + "step": 512200 + }, + { + "epoch": 2.086997613783228, + "grad_norm": 5.2466888427734375, + "learning_rate": 0.0038971230160058613, + "loss": 7.8224, + "step": 512300 + }, + { + "epoch": 2.0874049918066095, + "grad_norm": 1.7458776235580444, + "learning_rate": 0.0038967112983502626, + "loss": 7.8584, + "step": 512400 + }, + { + "epoch": 2.087812369829991, + "grad_norm": 2.2177734375, + "learning_rate": 0.0038962995256728726, + "loss": 7.8277, + "step": 512500 + }, + { + "epoch": 2.0882197478533726, + "grad_norm": 4.301079750061035, + "learning_rate": 0.0038958876979899707, + "loss": 7.8364, + "step": 512600 + }, + { + "epoch": 2.0886271258767537, + "grad_norm": 8.370576858520508, + "learning_rate": 0.0038954758153178404, + "loss": 7.8261, + "step": 512700 + }, + { + "epoch": 2.0890345039001352, + "grad_norm": 2.8205881118774414, + "learning_rate": 0.0038950638776727633, + "loss": 7.8063, + "step": 512800 + }, + { + "epoch": 2.0894418819235168, + "grad_norm": 1.931097149848938, + "learning_rate": 0.00389465188507103, + "loss": 7.8492, + "step": 512900 + }, + { + "epoch": 2.0898492599468983, + "grad_norm": 1.9785386323928833, + "learning_rate": 0.003894239837528921, + "loss": 7.8588, + "step": 513000 + }, + { + "epoch": 2.0898492599468983, + "eval_MaskedAccuracy": 0.49861447942393755, + "eval_loss": 1.642656683921814, + "eval_runtime": 187.0413, + "eval_samples_per_second": 339.369, + "eval_steps_per_second": 1.326, + "step": 513000 + }, + { + "epoch": 2.09025663797028, + "grad_norm": 2.9555182456970215, + "learning_rate": 0.0038938277350627315, + "loss": 7.8609, + "step": 513100 + }, + { + "epoch": 2.0906640159936614, + "grad_norm": 5.055253028869629, + "learning_rate": 0.003893415577688753, + "loss": 7.8673, + "step": 513200 + }, + { + "epoch": 2.0910713940170425, + "grad_norm": 4.722437858581543, + "learning_rate": 0.0038930033654232823, + "loss": 7.8351, + "step": 513300 + }, + { + "epoch": 2.091478772040424, + "grad_norm": 3.5397512912750244, + "learning_rate": 0.0038925910982826138, + "loss": 7.8484, + "step": 513400 + }, + { + "epoch": 2.0918861500638055, + "grad_norm": 1.4733760356903076, + "learning_rate": 0.0038921787762830483, + "loss": 7.8361, + "step": 513500 + }, + { + "epoch": 2.092293528087187, + "grad_norm": 5.55131721496582, + "learning_rate": 0.003891766399440887, + "loss": 7.8594, + "step": 513600 + }, + { + "epoch": 2.0927009061105686, + "grad_norm": 2.943793773651123, + "learning_rate": 0.0038913539677724344, + "loss": 7.8328, + "step": 513700 + }, + { + "epoch": 2.09310828413395, + "grad_norm": 2.3448822498321533, + "learning_rate": 0.0038909414812939923, + "loss": 7.8266, + "step": 513800 + }, + { + "epoch": 2.0935156621573316, + "grad_norm": 4.41709041595459, + "learning_rate": 0.00389052894002187, + "loss": 7.8796, + "step": 513900 + }, + { + "epoch": 2.0939230401807127, + "grad_norm": 6.548536777496338, + "learning_rate": 0.0038901163439723795, + "loss": 7.8454, + "step": 514000 + }, + { + "epoch": 2.0939230401807127, + "eval_MaskedAccuracy": 0.4990569694959531, + "eval_loss": 1.6530210971832275, + "eval_runtime": 173.0732, + "eval_samples_per_second": 366.758, + "eval_steps_per_second": 1.433, + "step": 514000 + }, + { + "epoch": 2.0943304182040943, + "grad_norm": 4.337338447570801, + "learning_rate": 0.0038897036931618317, + "loss": 7.8809, + "step": 514100 + }, + { + "epoch": 2.094737796227476, + "grad_norm": 2.3941030502319336, + "learning_rate": 0.003889290987606542, + "loss": 7.8403, + "step": 514200 + }, + { + "epoch": 2.0951451742508573, + "grad_norm": 5.223713397979736, + "learning_rate": 0.0038888782273228287, + "loss": 7.8693, + "step": 514300 + }, + { + "epoch": 2.095552552274239, + "grad_norm": 6.067909240722656, + "learning_rate": 0.0038884654123270065, + "loss": 7.8672, + "step": 514400 + }, + { + "epoch": 2.0959599302976204, + "grad_norm": 5.647371768951416, + "learning_rate": 0.003888052542635397, + "loss": 7.8555, + "step": 514500 + }, + { + "epoch": 2.0963673083210015, + "grad_norm": 3.459240198135376, + "learning_rate": 0.0038876396182643217, + "loss": 7.8589, + "step": 514600 + }, + { + "epoch": 2.096774686344383, + "grad_norm": 3.0185189247131348, + "learning_rate": 0.0038872266392301082, + "loss": 7.8543, + "step": 514700 + }, + { + "epoch": 2.0971820643677646, + "grad_norm": 2.327406883239746, + "learning_rate": 0.0038868136055490816, + "loss": 7.8275, + "step": 514800 + }, + { + "epoch": 2.097589442391146, + "grad_norm": 4.88460636138916, + "learning_rate": 0.0038864005172375734, + "loss": 7.8607, + "step": 514900 + }, + { + "epoch": 2.0979968204145276, + "grad_norm": 3.7140984535217285, + "learning_rate": 0.0038859873743119166, + "loss": 7.8526, + "step": 515000 + }, + { + "epoch": 2.0979968204145276, + "eval_MaskedAccuracy": 0.49905880600640645, + "eval_loss": 1.6508853435516357, + "eval_runtime": 157.7835, + "eval_samples_per_second": 402.298, + "eval_steps_per_second": 1.572, + "step": 515000 + }, + { + "epoch": 2.098404198437909, + "grad_norm": 3.6966841220855713, + "learning_rate": 0.0038855741767884434, + "loss": 7.8116, + "step": 515100 + }, + { + "epoch": 2.0988115764612902, + "grad_norm": 1.6206698417663574, + "learning_rate": 0.003885160924683486, + "loss": 7.8647, + "step": 515200 + }, + { + "epoch": 2.0992189544846718, + "grad_norm": 2.2783801555633545, + "learning_rate": 0.003884747618013387, + "loss": 7.8496, + "step": 515300 + }, + { + "epoch": 2.0996263325080533, + "grad_norm": 3.005263566970825, + "learning_rate": 0.0038843342567944903, + "loss": 7.8593, + "step": 515400 + }, + { + "epoch": 2.100033710531435, + "grad_norm": 2.094526767730713, + "learning_rate": 0.0038839208410431265, + "loss": 7.8606, + "step": 515500 + }, + { + "epoch": 2.1004410885548164, + "grad_norm": 8.773946762084961, + "learning_rate": 0.003883507370775649, + "loss": 7.8611, + "step": 515600 + }, + { + "epoch": 2.100848466578198, + "grad_norm": 2.737407684326172, + "learning_rate": 0.003883093846008401, + "loss": 7.8187, + "step": 515700 + }, + { + "epoch": 2.101255844601579, + "grad_norm": 2.1632235050201416, + "learning_rate": 0.0038826802667577356, + "loss": 7.8351, + "step": 515800 + }, + { + "epoch": 2.1016632226249605, + "grad_norm": 5.39825963973999, + "learning_rate": 0.003882266633039998, + "loss": 7.8346, + "step": 515900 + }, + { + "epoch": 2.102070600648342, + "grad_norm": 4.6561174392700195, + "learning_rate": 0.003881852944871543, + "loss": 7.8485, + "step": 516000 + }, + { + "epoch": 2.102070600648342, + "eval_MaskedAccuracy": 0.4992994920440185, + "eval_loss": 1.6478400230407715, + "eval_runtime": 170.1425, + "eval_samples_per_second": 373.076, + "eval_steps_per_second": 1.458, + "step": 516000 + }, + { + "epoch": 2.1024779786717236, + "grad_norm": 3.3429903984069824, + "learning_rate": 0.0038814392022687274, + "loss": 7.8529, + "step": 516100 + }, + { + "epoch": 2.102885356695105, + "grad_norm": 8.731779098510742, + "learning_rate": 0.0038810254052479127, + "loss": 7.8222, + "step": 516200 + }, + { + "epoch": 2.1032927347184867, + "grad_norm": 3.482006311416626, + "learning_rate": 0.0038806115538254516, + "loss": 7.8448, + "step": 516300 + }, + { + "epoch": 2.103700112741868, + "grad_norm": 5.820671558380127, + "learning_rate": 0.003880197648017711, + "loss": 7.8503, + "step": 516400 + }, + { + "epoch": 2.1041074907652493, + "grad_norm": 3.015876531600952, + "learning_rate": 0.003879783687841048, + "loss": 7.8595, + "step": 516500 + }, + { + "epoch": 2.104514868788631, + "grad_norm": 3.7448184490203857, + "learning_rate": 0.003879369673311833, + "loss": 7.8652, + "step": 516600 + }, + { + "epoch": 2.1049222468120123, + "grad_norm": 2.8957302570343018, + "learning_rate": 0.003878955604446434, + "loss": 7.8539, + "step": 516700 + }, + { + "epoch": 2.105329624835394, + "grad_norm": 3.009982109069824, + "learning_rate": 0.0038785414812612233, + "loss": 7.8172, + "step": 516800 + }, + { + "epoch": 2.1057370028587754, + "grad_norm": 7.062688827514648, + "learning_rate": 0.003878127303772568, + "loss": 7.8541, + "step": 516900 + }, + { + "epoch": 2.106144380882157, + "grad_norm": 4.3620924949646, + "learning_rate": 0.003877713071996851, + "loss": 7.8689, + "step": 517000 + }, + { + "epoch": 2.106144380882157, + "eval_MaskedAccuracy": 0.49807315141597197, + "eval_loss": 1.6623533964157104, + "eval_runtime": 155.7107, + "eval_samples_per_second": 407.653, + "eval_steps_per_second": 1.593, + "step": 517000 + }, + { + "epoch": 2.106551758905538, + "grad_norm": 3.7454354763031006, + "learning_rate": 0.0038772987859504423, + "loss": 7.8493, + "step": 517100 + }, + { + "epoch": 2.1069591369289196, + "grad_norm": 5.084829330444336, + "learning_rate": 0.0038768844456497247, + "loss": 7.8519, + "step": 517200 + }, + { + "epoch": 2.107366514952301, + "grad_norm": 5.766896724700928, + "learning_rate": 0.0038764700511110784, + "loss": 7.8476, + "step": 517300 + }, + { + "epoch": 2.1077738929756826, + "grad_norm": 2.518277883529663, + "learning_rate": 0.0038760556023508855, + "loss": 7.8506, + "step": 517400 + }, + { + "epoch": 2.108181270999064, + "grad_norm": 3.968675374984741, + "learning_rate": 0.003875641099385529, + "loss": 7.842, + "step": 517500 + }, + { + "epoch": 2.1085886490224457, + "grad_norm": 5.5741705894470215, + "learning_rate": 0.0038752265422314002, + "loss": 7.867, + "step": 517600 + }, + { + "epoch": 2.108996027045827, + "grad_norm": 4.208227634429932, + "learning_rate": 0.003874811930904893, + "loss": 7.8408, + "step": 517700 + }, + { + "epoch": 2.1094034050692083, + "grad_norm": 3.1576449871063232, + "learning_rate": 0.0038743972654223884, + "loss": 7.8388, + "step": 517800 + }, + { + "epoch": 2.10981078309259, + "grad_norm": 4.84845495223999, + "learning_rate": 0.003873982545800287, + "loss": 7.8128, + "step": 517900 + }, + { + "epoch": 2.1102181611159714, + "grad_norm": 2.3880467414855957, + "learning_rate": 0.0038735677720549842, + "loss": 7.8527, + "step": 518000 + }, + { + "epoch": 2.1102181611159714, + "eval_MaskedAccuracy": 0.5003301768561452, + "eval_loss": 1.6516457796096802, + "eval_runtime": 161.9699, + "eval_samples_per_second": 391.9, + "eval_steps_per_second": 1.531, + "step": 518000 + }, + { + "epoch": 2.110625539139353, + "grad_norm": 5.248900413513184, + "learning_rate": 0.003873152944202876, + "loss": 7.8289, + "step": 518100 + }, + { + "epoch": 2.1110329171627344, + "grad_norm": 3.9732167720794678, + "learning_rate": 0.003872738062260366, + "loss": 7.8636, + "step": 518200 + }, + { + "epoch": 2.1114402951861155, + "grad_norm": 3.315394639968872, + "learning_rate": 0.003872323126243854, + "loss": 7.8254, + "step": 518300 + }, + { + "epoch": 2.111847673209497, + "grad_norm": 7.163773059844971, + "learning_rate": 0.0038719081361697476, + "loss": 7.8377, + "step": 518400 + }, + { + "epoch": 2.1122550512328786, + "grad_norm": 3.326075315475464, + "learning_rate": 0.0038714930920544484, + "loss": 7.8421, + "step": 518500 + }, + { + "epoch": 2.11266242925626, + "grad_norm": 2.2585558891296387, + "learning_rate": 0.0038710779939143706, + "loss": 7.865, + "step": 518600 + }, + { + "epoch": 2.1130698072796417, + "grad_norm": 4.726303577423096, + "learning_rate": 0.0038706628417659213, + "loss": 7.8414, + "step": 518700 + }, + { + "epoch": 2.113477185303023, + "grad_norm": 2.197706699371338, + "learning_rate": 0.0038702476356255204, + "loss": 7.8443, + "step": 518800 + }, + { + "epoch": 2.1138845633264047, + "grad_norm": 4.723580360412598, + "learning_rate": 0.0038698323755095793, + "loss": 7.8326, + "step": 518900 + }, + { + "epoch": 2.114291941349786, + "grad_norm": 2.8791539669036865, + "learning_rate": 0.003869417061434514, + "loss": 7.8506, + "step": 519000 + }, + { + "epoch": 2.114291941349786, + "eval_MaskedAccuracy": 0.49916622843293673, + "eval_loss": 1.6447579860687256, + "eval_runtime": 157.7297, + "eval_samples_per_second": 402.435, + "eval_steps_per_second": 1.572, + "step": 519000 + }, + { + "epoch": 2.1146993193731674, + "grad_norm": 6.8712687492370605, + "learning_rate": 0.0038690016934167445, + "loss": 7.8443, + "step": 519100 + }, + { + "epoch": 2.115106697396549, + "grad_norm": 1.3840206861495972, + "learning_rate": 0.003868586271472687, + "loss": 7.839, + "step": 519200 + }, + { + "epoch": 2.1155140754199304, + "grad_norm": 3.4413862228393555, + "learning_rate": 0.003868170795618772, + "loss": 7.8617, + "step": 519300 + }, + { + "epoch": 2.115921453443312, + "grad_norm": 3.5457684993743896, + "learning_rate": 0.0038677552658714282, + "loss": 7.8538, + "step": 519400 + }, + { + "epoch": 2.1163288314666935, + "grad_norm": 3.5211522579193115, + "learning_rate": 0.003867339682247083, + "loss": 7.8487, + "step": 519500 + }, + { + "epoch": 2.1167362094900746, + "grad_norm": 2.000596046447754, + "learning_rate": 0.0038669240447621604, + "loss": 7.8483, + "step": 519600 + }, + { + "epoch": 2.117143587513456, + "grad_norm": 2.7407684326171875, + "learning_rate": 0.0038665083534330977, + "loss": 7.849, + "step": 519700 + }, + { + "epoch": 2.1175509655368376, + "grad_norm": 2.1641077995300293, + "learning_rate": 0.00386609260827633, + "loss": 7.8685, + "step": 519800 + }, + { + "epoch": 2.117958343560219, + "grad_norm": 4.879209041595459, + "learning_rate": 0.003865676809308291, + "loss": 7.841, + "step": 519900 + }, + { + "epoch": 2.1183657215836007, + "grad_norm": 4.267777442932129, + "learning_rate": 0.0038652609565454194, + "loss": 7.8681, + "step": 520000 + }, + { + "epoch": 2.1183657215836007, + "eval_MaskedAccuracy": 0.4989128060239453, + "eval_loss": 1.6525324583053589, + "eval_runtime": 170.4922, + "eval_samples_per_second": 372.31, + "eval_steps_per_second": 1.455, + "step": 520000 + }, + { + "epoch": 2.1187730996069822, + "grad_norm": 4.678677558898926, + "learning_rate": 0.0038648450500041584, + "loss": 7.8279, + "step": 520100 + }, + { + "epoch": 2.1191804776303633, + "grad_norm": 1.9841251373291016, + "learning_rate": 0.003864429089700947, + "loss": 7.8761, + "step": 520200 + }, + { + "epoch": 2.119587855653745, + "grad_norm": 3.339817523956299, + "learning_rate": 0.0038640130756522344, + "loss": 7.8515, + "step": 520300 + }, + { + "epoch": 2.1199952336771264, + "grad_norm": 3.953916072845459, + "learning_rate": 0.003863597007874463, + "loss": 7.8564, + "step": 520400 + }, + { + "epoch": 2.120402611700508, + "grad_norm": 2.9301578998565674, + "learning_rate": 0.003863180886384087, + "loss": 7.8552, + "step": 520500 + }, + { + "epoch": 2.1208099897238895, + "grad_norm": 4.771327495574951, + "learning_rate": 0.003862764711197554, + "loss": 7.8701, + "step": 520600 + }, + { + "epoch": 2.121217367747271, + "grad_norm": 9.336111068725586, + "learning_rate": 0.0038623484823313205, + "loss": 7.844, + "step": 520700 + }, + { + "epoch": 2.121624745770652, + "grad_norm": 2.9203686714172363, + "learning_rate": 0.003861932199801845, + "loss": 7.8717, + "step": 520800 + }, + { + "epoch": 2.1220321237940336, + "grad_norm": 6.97675895690918, + "learning_rate": 0.0038615158636255815, + "loss": 7.8497, + "step": 520900 + }, + { + "epoch": 2.122439501817415, + "grad_norm": 3.968886137008667, + "learning_rate": 0.0038610994738189942, + "loss": 7.8311, + "step": 521000 + }, + { + "epoch": 2.122439501817415, + "eval_MaskedAccuracy": 0.4993250706805788, + "eval_loss": 1.649429202079773, + "eval_runtime": 156.1808, + "eval_samples_per_second": 406.427, + "eval_steps_per_second": 1.588, + "step": 521000 + }, + { + "epoch": 2.1228468798407967, + "grad_norm": 4.034364700317383, + "learning_rate": 0.003860683030398538, + "loss": 7.857, + "step": 521100 + }, + { + "epoch": 2.123254257864178, + "grad_norm": 5.871217727661133, + "learning_rate": 0.003860266533380682, + "loss": 7.8487, + "step": 521200 + }, + { + "epoch": 2.1236616358875597, + "grad_norm": 1.6691020727157593, + "learning_rate": 0.003859849982781893, + "loss": 7.8432, + "step": 521300 + }, + { + "epoch": 2.1240690139109413, + "grad_norm": 3.405710220336914, + "learning_rate": 0.003859433378618635, + "loss": 7.8389, + "step": 521400 + }, + { + "epoch": 2.1244763919343224, + "grad_norm": 3.6802337169647217, + "learning_rate": 0.0038590167209073805, + "loss": 7.8668, + "step": 521500 + }, + { + "epoch": 2.124883769957704, + "grad_norm": 2.612476348876953, + "learning_rate": 0.0038586000096646057, + "loss": 7.8415, + "step": 521600 + }, + { + "epoch": 2.1252911479810854, + "grad_norm": 2.7740638256073, + "learning_rate": 0.0038581832449067805, + "loss": 7.8302, + "step": 521700 + }, + { + "epoch": 2.125698526004467, + "grad_norm": 6.684861183166504, + "learning_rate": 0.003857766426650386, + "loss": 7.8427, + "step": 521800 + }, + { + "epoch": 2.1261059040278485, + "grad_norm": 4.789067268371582, + "learning_rate": 0.0038573495549119, + "loss": 7.8429, + "step": 521900 + }, + { + "epoch": 2.12651328205123, + "grad_norm": 3.1102423667907715, + "learning_rate": 0.003856932629707804, + "loss": 7.8439, + "step": 522000 + }, + { + "epoch": 2.12651328205123, + "eval_MaskedAccuracy": 0.49906116697405856, + "eval_loss": 1.6518633365631104, + "eval_runtime": 153.0851, + "eval_samples_per_second": 414.645, + "eval_steps_per_second": 1.62, + "step": 522000 + }, + { + "epoch": 2.126920660074611, + "grad_norm": 5.182793140411377, + "learning_rate": 0.003856515651054577, + "loss": 7.8526, + "step": 522100 + }, + { + "epoch": 2.1273280380979926, + "grad_norm": 4.2344651222229, + "learning_rate": 0.0038560986189687075, + "loss": 7.8276, + "step": 522200 + }, + { + "epoch": 2.127735416121374, + "grad_norm": 4.652752876281738, + "learning_rate": 0.0038556815334666833, + "loss": 7.8345, + "step": 522300 + }, + { + "epoch": 2.1281427941447557, + "grad_norm": 2.306577444076538, + "learning_rate": 0.0038552643945649976, + "loss": 7.8185, + "step": 522400 + }, + { + "epoch": 2.1285501721681372, + "grad_norm": 2.9456610679626465, + "learning_rate": 0.003854847202280141, + "loss": 7.8668, + "step": 522500 + }, + { + "epoch": 2.1289575501915188, + "grad_norm": 3.9706289768218994, + "learning_rate": 0.0038544299566286053, + "loss": 7.8662, + "step": 522600 + }, + { + "epoch": 2.1293649282149, + "grad_norm": 2.4402737617492676, + "learning_rate": 0.00385401265762688, + "loss": 7.8397, + "step": 522700 + }, + { + "epoch": 2.1297723062382814, + "grad_norm": 4.685486316680908, + "learning_rate": 0.003853595305291472, + "loss": 7.8325, + "step": 522800 + }, + { + "epoch": 2.130179684261663, + "grad_norm": 3.8829379081726074, + "learning_rate": 0.003853177899638879, + "loss": 7.8706, + "step": 522900 + }, + { + "epoch": 2.1305870622850445, + "grad_norm": 7.463794708251953, + "learning_rate": 0.003852760440685605, + "loss": 7.8707, + "step": 523000 + }, + { + "epoch": 2.1305870622850445, + "eval_MaskedAccuracy": 0.4990442157171856, + "eval_loss": 1.6476261615753174, + "eval_runtime": 154.2854, + "eval_samples_per_second": 411.419, + "eval_steps_per_second": 1.607, + "step": 523000 + }, + { + "epoch": 2.130994440308426, + "grad_norm": 4.382684707641602, + "learning_rate": 0.0038523429284481525, + "loss": 7.8549, + "step": 523100 + }, + { + "epoch": 2.1314018183318075, + "grad_norm": 3.828449010848999, + "learning_rate": 0.0038519253629430254, + "loss": 7.8547, + "step": 523200 + }, + { + "epoch": 2.1318091963551886, + "grad_norm": 3.674806833267212, + "learning_rate": 0.0038515077441867353, + "loss": 7.8408, + "step": 523300 + }, + { + "epoch": 2.13221657437857, + "grad_norm": 2.4122889041900635, + "learning_rate": 0.0038510900721957933, + "loss": 7.8451, + "step": 523400 + }, + { + "epoch": 2.1326239524019517, + "grad_norm": 3.8041248321533203, + "learning_rate": 0.0038506723469867083, + "loss": 7.8486, + "step": 523500 + }, + { + "epoch": 2.133031330425333, + "grad_norm": 3.0059142112731934, + "learning_rate": 0.0038502545685759997, + "loss": 7.8389, + "step": 523600 + }, + { + "epoch": 2.1334387084487147, + "grad_norm": 5.589702606201172, + "learning_rate": 0.003849836736980186, + "loss": 7.8639, + "step": 523700 + }, + { + "epoch": 2.1338460864720963, + "grad_norm": 2.9695308208465576, + "learning_rate": 0.0038494188522157804, + "loss": 7.8568, + "step": 523800 + }, + { + "epoch": 2.134253464495478, + "grad_norm": 1.8465338945388794, + "learning_rate": 0.0038490009142993047, + "loss": 7.8472, + "step": 523900 + }, + { + "epoch": 2.134660842518859, + "grad_norm": 3.8933956623077393, + "learning_rate": 0.0038485829232472878, + "loss": 7.8666, + "step": 524000 + }, + { + "epoch": 2.134660842518859, + "eval_MaskedAccuracy": 0.498521111636213, + "eval_loss": 1.6424434185028076, + "eval_runtime": 196.697, + "eval_samples_per_second": 322.71, + "eval_steps_per_second": 1.261, + "step": 524000 + }, + { + "epoch": 2.1350682205422404, + "grad_norm": 6.770482540130615, + "learning_rate": 0.003848164879076251, + "loss": 7.8278, + "step": 524100 + }, + { + "epoch": 2.135475598565622, + "grad_norm": 3.3351528644561768, + "learning_rate": 0.003847746781802722, + "loss": 7.8047, + "step": 524200 + }, + { + "epoch": 2.1358829765890035, + "grad_norm": 1.5136771202087402, + "learning_rate": 0.0038473286314432316, + "loss": 7.8479, + "step": 524300 + }, + { + "epoch": 2.136290354612385, + "grad_norm": 4.587497711181641, + "learning_rate": 0.0038469104280143105, + "loss": 7.8619, + "step": 524400 + }, + { + "epoch": 2.1366977326357666, + "grad_norm": 3.4201977252960205, + "learning_rate": 0.0038464921715324948, + "loss": 7.8444, + "step": 524500 + }, + { + "epoch": 2.1371051106591477, + "grad_norm": 2.8858134746551514, + "learning_rate": 0.0038460738620143212, + "loss": 7.8444, + "step": 524600 + }, + { + "epoch": 2.137512488682529, + "grad_norm": 4.345208644866943, + "learning_rate": 0.0038456554994763235, + "loss": 7.8184, + "step": 524700 + }, + { + "epoch": 2.1379198667059107, + "grad_norm": 3.379286289215088, + "learning_rate": 0.0038452370839350426, + "loss": 7.8281, + "step": 524800 + }, + { + "epoch": 2.1383272447292923, + "grad_norm": 4.41142463684082, + "learning_rate": 0.003844818615407021, + "loss": 7.8362, + "step": 524900 + }, + { + "epoch": 2.138734622752674, + "grad_norm": 1.7225568294525146, + "learning_rate": 0.0038444000939088056, + "loss": 7.853, + "step": 525000 + }, + { + "epoch": 2.138734622752674, + "eval_MaskedAccuracy": 0.4993731993985832, + "eval_loss": 1.6521061658859253, + "eval_runtime": 154.7934, + "eval_samples_per_second": 410.069, + "eval_steps_per_second": 1.602, + "step": 525000 + }, + { + "epoch": 2.1391420007760553, + "grad_norm": 3.65887713432312, + "learning_rate": 0.003843981519456943, + "loss": 7.8451, + "step": 525100 + }, + { + "epoch": 2.1395493787994364, + "grad_norm": 1.7522298097610474, + "learning_rate": 0.0038435628920679794, + "loss": 7.8768, + "step": 525200 + }, + { + "epoch": 2.139956756822818, + "grad_norm": 3.946798086166382, + "learning_rate": 0.0038431442117584608, + "loss": 7.8728, + "step": 525300 + }, + { + "epoch": 2.1403641348461995, + "grad_norm": 2.1456899642944336, + "learning_rate": 0.003842725478544948, + "loss": 7.8832, + "step": 525400 + }, + { + "epoch": 2.140771512869581, + "grad_norm": 3.279982328414917, + "learning_rate": 0.0038423066924439912, + "loss": 7.8431, + "step": 525500 + }, + { + "epoch": 2.1411788908929625, + "grad_norm": 3.695875406265259, + "learning_rate": 0.0038418878534721462, + "loss": 7.8462, + "step": 525600 + }, + { + "epoch": 2.141586268916344, + "grad_norm": 2.8035428524017334, + "learning_rate": 0.003841468961645979, + "loss": 7.8557, + "step": 525700 + }, + { + "epoch": 2.141993646939725, + "grad_norm": 1.9271063804626465, + "learning_rate": 0.0038410500169820433, + "loss": 7.8382, + "step": 525800 + }, + { + "epoch": 2.1424010249631067, + "grad_norm": 3.3840794563293457, + "learning_rate": 0.003840631019496905, + "loss": 7.8448, + "step": 525900 + }, + { + "epoch": 2.1428084029864882, + "grad_norm": 2.669886350631714, + "learning_rate": 0.003840211969207129, + "loss": 7.8347, + "step": 526000 + }, + { + "epoch": 2.1428084029864882, + "eval_MaskedAccuracy": 0.4983857915834357, + "eval_loss": 1.6485259532928467, + "eval_runtime": 157.1212, + "eval_samples_per_second": 403.994, + "eval_steps_per_second": 1.578, + "step": 526000 + }, + { + "epoch": 2.1432157810098698, + "grad_norm": 2.8446619510650635, + "learning_rate": 0.0038397928661292827, + "loss": 7.8434, + "step": 526100 + }, + { + "epoch": 2.1436231590332513, + "grad_norm": 2.5263686180114746, + "learning_rate": 0.003839373710279936, + "loss": 7.8465, + "step": 526200 + }, + { + "epoch": 2.144030537056633, + "grad_norm": 2.31062650680542, + "learning_rate": 0.0038389545016756605, + "loss": 7.841, + "step": 526300 + }, + { + "epoch": 2.1444379150800144, + "grad_norm": 1.9868495464324951, + "learning_rate": 0.003838535240333024, + "loss": 7.7923, + "step": 526400 + }, + { + "epoch": 2.1448452931033954, + "grad_norm": 1.8735777139663696, + "learning_rate": 0.0038381159262686096, + "loss": 7.8812, + "step": 526500 + }, + { + "epoch": 2.145252671126777, + "grad_norm": 2.031118631362915, + "learning_rate": 0.0038376965594989977, + "loss": 7.8441, + "step": 526600 + }, + { + "epoch": 2.1456600491501585, + "grad_norm": 3.5911500453948975, + "learning_rate": 0.003837277140040757, + "loss": 7.8436, + "step": 526700 + }, + { + "epoch": 2.14606742717354, + "grad_norm": 2.9640281200408936, + "learning_rate": 0.0038368576679104777, + "loss": 7.8411, + "step": 526800 + }, + { + "epoch": 2.1464748051969216, + "grad_norm": 1.554478406906128, + "learning_rate": 0.003836438143124744, + "loss": 7.8642, + "step": 526900 + }, + { + "epoch": 2.146882183220303, + "grad_norm": 3.397552490234375, + "learning_rate": 0.003836018565700135, + "loss": 7.8349, + "step": 527000 + }, + { + "epoch": 2.146882183220303, + "eval_MaskedAccuracy": 0.4983316146545811, + "eval_loss": 1.6525747776031494, + "eval_runtime": 163.3248, + "eval_samples_per_second": 388.649, + "eval_steps_per_second": 1.518, + "step": 527000 + }, + { + "epoch": 2.147289561243684, + "grad_norm": 2.814387321472168, + "learning_rate": 0.003835598935653243, + "loss": 7.8687, + "step": 527100 + }, + { + "epoch": 2.1476969392670657, + "grad_norm": 4.296543121337891, + "learning_rate": 0.003835179253000662, + "loss": 7.8874, + "step": 527200 + }, + { + "epoch": 2.1481043172904473, + "grad_norm": 3.7360377311706543, + "learning_rate": 0.00383475951775898, + "loss": 7.8229, + "step": 527300 + }, + { + "epoch": 2.148511695313829, + "grad_norm": 1.4035084247589111, + "learning_rate": 0.003834339729944791, + "loss": 7.849, + "step": 527400 + }, + { + "epoch": 2.1489190733372103, + "grad_norm": 4.194318771362305, + "learning_rate": 0.0038339198895746937, + "loss": 7.8487, + "step": 527500 + }, + { + "epoch": 2.149326451360592, + "grad_norm": 3.1967825889587402, + "learning_rate": 0.0038334999966652857, + "loss": 7.8524, + "step": 527600 + }, + { + "epoch": 2.149733829383973, + "grad_norm": 10.551984786987305, + "learning_rate": 0.003833080051233164, + "loss": 7.8903, + "step": 527700 + }, + { + "epoch": 2.1501412074073545, + "grad_norm": 1.961940050125122, + "learning_rate": 0.0038326600532949382, + "loss": 7.8301, + "step": 527800 + }, + { + "epoch": 2.150548585430736, + "grad_norm": 2.9610583782196045, + "learning_rate": 0.003832240002867211, + "loss": 7.8362, + "step": 527900 + }, + { + "epoch": 2.1509559634541175, + "grad_norm": 2.6585211753845215, + "learning_rate": 0.0038318198999665863, + "loss": 7.8502, + "step": 528000 + }, + { + "epoch": 2.1509559634541175, + "eval_MaskedAccuracy": 0.49930813654902767, + "eval_loss": 1.6571317911148071, + "eval_runtime": 160.0369, + "eval_samples_per_second": 396.634, + "eval_steps_per_second": 1.55, + "step": 528000 + }, + { + "epoch": 2.151363341477499, + "grad_norm": 4.147009372711182, + "learning_rate": 0.003831399744609676, + "loss": 7.8445, + "step": 528100 + }, + { + "epoch": 2.1517707195008806, + "grad_norm": 3.613985776901245, + "learning_rate": 0.0038309795368130866, + "loss": 7.8648, + "step": 528200 + }, + { + "epoch": 2.1521780975242617, + "grad_norm": 5.673200607299805, + "learning_rate": 0.0038305592765934364, + "loss": 7.8482, + "step": 528300 + }, + { + "epoch": 2.1525854755476432, + "grad_norm": 5.594901084899902, + "learning_rate": 0.003830138963967339, + "loss": 7.8462, + "step": 528400 + }, + { + "epoch": 2.1529928535710248, + "grad_norm": 3.565707206726074, + "learning_rate": 0.0038297185989514104, + "loss": 7.8494, + "step": 528500 + }, + { + "epoch": 2.1534002315944063, + "grad_norm": 10.87060260772705, + "learning_rate": 0.003829298181562268, + "loss": 7.8588, + "step": 528600 + }, + { + "epoch": 2.153807609617788, + "grad_norm": 2.678865432739258, + "learning_rate": 0.0038288777118165415, + "loss": 7.8739, + "step": 528700 + }, + { + "epoch": 2.1542149876411694, + "grad_norm": 6.544186115264893, + "learning_rate": 0.0038284571897308464, + "loss": 7.832, + "step": 528800 + }, + { + "epoch": 2.154622365664551, + "grad_norm": 5.820631980895996, + "learning_rate": 0.0038280366153218112, + "loss": 7.84, + "step": 528900 + }, + { + "epoch": 2.155029743687932, + "grad_norm": 2.8094024658203125, + "learning_rate": 0.003827615988606059, + "loss": 7.8275, + "step": 529000 + }, + { + "epoch": 2.155029743687932, + "eval_MaskedAccuracy": 0.5000002667794017, + "eval_loss": 1.6423031091690063, + "eval_runtime": 157.6177, + "eval_samples_per_second": 402.721, + "eval_steps_per_second": 1.573, + "step": 529000 + }, + { + "epoch": 2.1554371217113135, + "grad_norm": 3.2447738647460938, + "learning_rate": 0.0038271953096002224, + "loss": 7.8538, + "step": 529100 + }, + { + "epoch": 2.155844499734695, + "grad_norm": 2.1335766315460205, + "learning_rate": 0.003826774578320937, + "loss": 7.8385, + "step": 529200 + }, + { + "epoch": 2.1562518777580766, + "grad_norm": 8.36882495880127, + "learning_rate": 0.0038263537947848324, + "loss": 7.8431, + "step": 529300 + }, + { + "epoch": 2.156659255781458, + "grad_norm": 3.091593027114868, + "learning_rate": 0.003825932959008545, + "loss": 7.8359, + "step": 529400 + }, + { + "epoch": 2.1570666338048397, + "grad_norm": 1.5677893161773682, + "learning_rate": 0.0038255120710087155, + "loss": 7.8607, + "step": 529500 + }, + { + "epoch": 2.1574740118282207, + "grad_norm": 2.9303829669952393, + "learning_rate": 0.0038250911308019757, + "loss": 7.8359, + "step": 529600 + }, + { + "epoch": 2.1578813898516023, + "grad_norm": 3.8750693798065186, + "learning_rate": 0.0038246701384049755, + "loss": 7.8578, + "step": 529700 + }, + { + "epoch": 2.158288767874984, + "grad_norm": 2.241936206817627, + "learning_rate": 0.0038242490938343553, + "loss": 7.8387, + "step": 529800 + }, + { + "epoch": 2.1586961458983653, + "grad_norm": 3.8586337566375732, + "learning_rate": 0.0038238279971067623, + "loss": 7.8409, + "step": 529900 + }, + { + "epoch": 2.159103523921747, + "grad_norm": 3.5696747303009033, + "learning_rate": 0.003823406848238848, + "loss": 7.8027, + "step": 530000 + }, + { + "epoch": 2.159103523921747, + "eval_MaskedAccuracy": 0.4998697965700189, + "eval_loss": 1.644965648651123, + "eval_runtime": 154.371, + "eval_samples_per_second": 411.191, + "eval_steps_per_second": 1.607, + "step": 530000 + }, + { + "epoch": 2.1595109019451284, + "grad_norm": 8.334405899047852, + "learning_rate": 0.003822985647247259, + "loss": 7.8247, + "step": 530100 + }, + { + "epoch": 2.1599182799685095, + "grad_norm": 4.211894989013672, + "learning_rate": 0.0038225643941486426, + "loss": 7.817, + "step": 530200 + }, + { + "epoch": 2.160325657991891, + "grad_norm": 2.0790696144104004, + "learning_rate": 0.0038221430889596635, + "loss": 7.839, + "step": 530300 + }, + { + "epoch": 2.1607330360152726, + "grad_norm": 5.07918643951416, + "learning_rate": 0.0038217217316969727, + "loss": 7.8126, + "step": 530400 + }, + { + "epoch": 2.161140414038654, + "grad_norm": 5.834768772125244, + "learning_rate": 0.00382130032237723, + "loss": 7.8014, + "step": 530500 + }, + { + "epoch": 2.1615477920620356, + "grad_norm": 2.6662633419036865, + "learning_rate": 0.003820878861017097, + "loss": 7.8394, + "step": 530600 + }, + { + "epoch": 2.161955170085417, + "grad_norm": 3.3728983402252197, + "learning_rate": 0.0038204573476332345, + "loss": 7.8006, + "step": 530700 + }, + { + "epoch": 2.1623625481087982, + "grad_norm": 5.740840911865234, + "learning_rate": 0.003820035782242308, + "loss": 7.8252, + "step": 530800 + }, + { + "epoch": 2.1627699261321798, + "grad_norm": 2.832451105117798, + "learning_rate": 0.0038196141648609806, + "loss": 7.8221, + "step": 530900 + }, + { + "epoch": 2.1631773041555613, + "grad_norm": 2.4910361766815186, + "learning_rate": 0.003819192495505927, + "loss": 7.8279, + "step": 531000 + }, + { + "epoch": 2.1631773041555613, + "eval_MaskedAccuracy": 0.4997051236722038, + "eval_loss": 1.6499806642532349, + "eval_runtime": 157.8028, + "eval_samples_per_second": 402.249, + "eval_steps_per_second": 1.572, + "step": 531000 + }, + { + "epoch": 2.163584682178943, + "grad_norm": 1.957298994064331, + "learning_rate": 0.0038187707741938145, + "loss": 7.8521, + "step": 531100 + }, + { + "epoch": 2.1639920602023244, + "grad_norm": 5.408090591430664, + "learning_rate": 0.003818349000941315, + "loss": 7.8317, + "step": 531200 + }, + { + "epoch": 2.164399438225706, + "grad_norm": 5.594817638397217, + "learning_rate": 0.0038179271757651056, + "loss": 7.8408, + "step": 531300 + }, + { + "epoch": 2.1648068162490874, + "grad_norm": 3.2920899391174316, + "learning_rate": 0.003817505298681865, + "loss": 7.8267, + "step": 531400 + }, + { + "epoch": 2.1652141942724685, + "grad_norm": 6.1781439781188965, + "learning_rate": 0.0038170833697082677, + "loss": 7.8509, + "step": 531500 + }, + { + "epoch": 2.16562157229585, + "grad_norm": 1.8539202213287354, + "learning_rate": 0.0038166613888610005, + "loss": 7.8519, + "step": 531600 + }, + { + "epoch": 2.1660289503192316, + "grad_norm": 3.158538818359375, + "learning_rate": 0.003816239356156741, + "loss": 7.8044, + "step": 531700 + }, + { + "epoch": 2.166436328342613, + "grad_norm": 5.459998607635498, + "learning_rate": 0.0038158172716121776, + "loss": 7.8652, + "step": 531800 + }, + { + "epoch": 2.1668437063659947, + "grad_norm": 3.2789273262023926, + "learning_rate": 0.003815395135243998, + "loss": 7.8071, + "step": 531900 + }, + { + "epoch": 2.167251084389376, + "grad_norm": 1.4929612874984741, + "learning_rate": 0.0038149729470688884, + "loss": 7.8441, + "step": 532000 + }, + { + "epoch": 2.167251084389376, + "eval_MaskedAccuracy": 0.49909206600661943, + "eval_loss": 1.6480473279953003, + "eval_runtime": 158.6008, + "eval_samples_per_second": 400.225, + "eval_steps_per_second": 1.564, + "step": 532000 + }, + { + "epoch": 2.1676584624127573, + "grad_norm": 2.944103479385376, + "learning_rate": 0.003814550707103539, + "loss": 7.8238, + "step": 532100 + }, + { + "epoch": 2.168065840436139, + "grad_norm": 5.1649651527404785, + "learning_rate": 0.0038141284153646498, + "loss": 7.8642, + "step": 532200 + }, + { + "epoch": 2.1684732184595203, + "grad_norm": 3.9113659858703613, + "learning_rate": 0.003813706071868912, + "loss": 7.843, + "step": 532300 + }, + { + "epoch": 2.168880596482902, + "grad_norm": 8.139547348022461, + "learning_rate": 0.0038132836766330233, + "loss": 7.8724, + "step": 532400 + }, + { + "epoch": 2.1692879745062834, + "grad_norm": 1.2801121473312378, + "learning_rate": 0.0038128612296736875, + "loss": 7.843, + "step": 532500 + }, + { + "epoch": 2.169695352529665, + "grad_norm": 2.4380743503570557, + "learning_rate": 0.0038124387310075956, + "loss": 7.8175, + "step": 532600 + }, + { + "epoch": 2.170102730553046, + "grad_norm": 8.219067573547363, + "learning_rate": 0.003812016180651463, + "loss": 7.8253, + "step": 532700 + }, + { + "epoch": 2.1705101085764276, + "grad_norm": 3.3391449451446533, + "learning_rate": 0.003811593578621991, + "loss": 7.8213, + "step": 532800 + }, + { + "epoch": 2.170917486599809, + "grad_norm": 3.765515089035034, + "learning_rate": 0.0038111709249358844, + "loss": 7.7826, + "step": 532900 + }, + { + "epoch": 2.1713248646231906, + "grad_norm": 7.724734306335449, + "learning_rate": 0.0038107482196098564, + "loss": 7.8197, + "step": 533000 + }, + { + "epoch": 2.1713248646231906, + "eval_MaskedAccuracy": 0.4996287348683913, + "eval_loss": 1.6528569459915161, + "eval_runtime": 157.3759, + "eval_samples_per_second": 403.34, + "eval_steps_per_second": 1.576, + "step": 533000 + }, + { + "epoch": 2.171732242646572, + "grad_norm": 2.16390323638916, + "learning_rate": 0.003810325462660616, + "loss": 7.8005, + "step": 533100 + }, + { + "epoch": 2.1721396206699537, + "grad_norm": 6.207178592681885, + "learning_rate": 0.0038099026541048783, + "loss": 7.8461, + "step": 533200 + }, + { + "epoch": 2.172546998693335, + "grad_norm": 5.238420009613037, + "learning_rate": 0.0038094797939593637, + "loss": 7.8158, + "step": 533300 + }, + { + "epoch": 2.1729543767167163, + "grad_norm": 6.278920650482178, + "learning_rate": 0.0038090568822407855, + "loss": 7.837, + "step": 533400 + }, + { + "epoch": 2.173361754740098, + "grad_norm": 7.702585220336914, + "learning_rate": 0.003808633918965862, + "loss": 7.8248, + "step": 533500 + }, + { + "epoch": 2.1737691327634794, + "grad_norm": 3.48207426071167, + "learning_rate": 0.0038082109041513183, + "loss": 7.8354, + "step": 533600 + }, + { + "epoch": 2.174176510786861, + "grad_norm": 4.544466972351074, + "learning_rate": 0.0038077878378138777, + "loss": 7.8698, + "step": 533700 + }, + { + "epoch": 2.1745838888102424, + "grad_norm": 4.2195611000061035, + "learning_rate": 0.003807364719970264, + "loss": 7.8203, + "step": 533800 + }, + { + "epoch": 2.174991266833624, + "grad_norm": 11.225996017456055, + "learning_rate": 0.0038069415506372085, + "loss": 7.8338, + "step": 533900 + }, + { + "epoch": 2.175398644857005, + "grad_norm": 10.38953971862793, + "learning_rate": 0.0038065183298314352, + "loss": 7.8005, + "step": 534000 + }, + { + "epoch": 2.175398644857005, + "eval_MaskedAccuracy": 0.49920180309420636, + "eval_loss": 1.6535975933074951, + "eval_runtime": 231.6333, + "eval_samples_per_second": 274.037, + "eval_steps_per_second": 1.071, + "step": 534000 + }, + { + "epoch": 2.1758060228803866, + "grad_norm": 8.93643569946289, + "learning_rate": 0.0038060950575696854, + "loss": 7.8441, + "step": 534100 + }, + { + "epoch": 2.176213400903768, + "grad_norm": 4.134316921234131, + "learning_rate": 0.0038056717338686872, + "loss": 7.824, + "step": 534200 + }, + { + "epoch": 2.1766207789271497, + "grad_norm": 3.2579381465911865, + "learning_rate": 0.003805248358745182, + "loss": 7.8401, + "step": 534300 + }, + { + "epoch": 2.177028156950531, + "grad_norm": 3.608017921447754, + "learning_rate": 0.003804824932215902, + "loss": 7.8208, + "step": 534400 + }, + { + "epoch": 2.1774355349739127, + "grad_norm": 5.360105514526367, + "learning_rate": 0.0038044014542975946, + "loss": 7.806, + "step": 534500 + }, + { + "epoch": 2.177842912997294, + "grad_norm": 5.1331586837768555, + "learning_rate": 0.003803977925006997, + "loss": 7.7993, + "step": 534600 + }, + { + "epoch": 2.1782502910206754, + "grad_norm": 2.071043014526367, + "learning_rate": 0.0038035543443608534, + "loss": 7.8678, + "step": 534700 + }, + { + "epoch": 2.178657669044057, + "grad_norm": 6.327230453491211, + "learning_rate": 0.003803130712375911, + "loss": 7.8327, + "step": 534800 + }, + { + "epoch": 2.1790650470674384, + "grad_norm": 6.99228572845459, + "learning_rate": 0.00380270702906892, + "loss": 7.8138, + "step": 534900 + }, + { + "epoch": 2.17947242509082, + "grad_norm": 3.339266300201416, + "learning_rate": 0.0038022832944566284, + "loss": 7.8173, + "step": 535000 + }, + { + "epoch": 2.17947242509082, + "eval_MaskedAccuracy": 0.49975830231760876, + "eval_loss": 1.6462153196334839, + "eval_runtime": 173.1718, + "eval_samples_per_second": 366.549, + "eval_steps_per_second": 1.432, + "step": 535000 + }, + { + "epoch": 2.1798798031142015, + "grad_norm": 2.3373918533325195, + "learning_rate": 0.003801859508555789, + "loss": 7.8105, + "step": 535100 + }, + { + "epoch": 2.1802871811375826, + "grad_norm": 2.4057939052581787, + "learning_rate": 0.003801435671383158, + "loss": 7.8449, + "step": 535200 + }, + { + "epoch": 2.180694559160964, + "grad_norm": 2.7518136501312256, + "learning_rate": 0.0038010117829554903, + "loss": 7.8294, + "step": 535300 + }, + { + "epoch": 2.1811019371843456, + "grad_norm": 2.5903055667877197, + "learning_rate": 0.0038005878432895465, + "loss": 7.7928, + "step": 535400 + }, + { + "epoch": 2.181509315207727, + "grad_norm": 3.3852591514587402, + "learning_rate": 0.0038001638524020856, + "loss": 7.8247, + "step": 535500 + }, + { + "epoch": 2.1819166932311087, + "grad_norm": 4.770111083984375, + "learning_rate": 0.003799739810309872, + "loss": 7.8157, + "step": 535600 + }, + { + "epoch": 2.1823240712544902, + "grad_norm": 3.0455150604248047, + "learning_rate": 0.0037993157170296645, + "loss": 7.8099, + "step": 535700 + }, + { + "epoch": 2.1827314492778713, + "grad_norm": 2.8539605140686035, + "learning_rate": 0.003798891572578236, + "loss": 7.8076, + "step": 535800 + }, + { + "epoch": 2.183138827301253, + "grad_norm": 7.808065891265869, + "learning_rate": 0.003798467376972358, + "loss": 7.8456, + "step": 535900 + }, + { + "epoch": 2.1835462053246344, + "grad_norm": 4.614112854003906, + "learning_rate": 0.0037980431302287974, + "loss": 7.8519, + "step": 536000 + }, + { + "epoch": 2.1835462053246344, + "eval_MaskedAccuracy": 0.4990841277760691, + "eval_loss": 1.6543712615966797, + "eval_runtime": 162.9235, + "eval_samples_per_second": 389.606, + "eval_steps_per_second": 1.522, + "step": 536000 + }, + { + "epoch": 2.183953583348016, + "grad_norm": 4.20052433013916, + "learning_rate": 0.003797618832364323, + "loss": 7.8395, + "step": 536100 + }, + { + "epoch": 2.1843609613713975, + "grad_norm": 5.1673784255981445, + "learning_rate": 0.0037971944833957074, + "loss": 7.8113, + "step": 536200 + }, + { + "epoch": 2.184768339394779, + "grad_norm": 1.4922406673431396, + "learning_rate": 0.0037967700833397375, + "loss": 7.8287, + "step": 536300 + }, + { + "epoch": 2.1851757174181605, + "grad_norm": 4.0081095695495605, + "learning_rate": 0.0037963456322131902, + "loss": 7.8482, + "step": 536400 + }, + { + "epoch": 2.1855830954415416, + "grad_norm": 6.309442520141602, + "learning_rate": 0.003795921130032841, + "loss": 7.8272, + "step": 536500 + }, + { + "epoch": 2.185990473464923, + "grad_norm": 8.077388763427734, + "learning_rate": 0.003795496576815472, + "loss": 7.8017, + "step": 536600 + }, + { + "epoch": 2.1863978514883047, + "grad_norm": 4.251585483551025, + "learning_rate": 0.003795071972577877, + "loss": 7.8262, + "step": 536700 + }, + { + "epoch": 2.186805229511686, + "grad_norm": 3.7119550704956055, + "learning_rate": 0.003794647317336835, + "loss": 7.8292, + "step": 536800 + }, + { + "epoch": 2.1872126075350677, + "grad_norm": 4.681642055511475, + "learning_rate": 0.003794222611109138, + "loss": 7.806, + "step": 536900 + }, + { + "epoch": 2.1876199855584493, + "grad_norm": 2.630150318145752, + "learning_rate": 0.0037937978539115736, + "loss": 7.8133, + "step": 537000 + }, + { + "epoch": 2.1876199855584493, + "eval_MaskedAccuracy": 0.500926470716596, + "eval_loss": 1.6362963914871216, + "eval_runtime": 158.0037, + "eval_samples_per_second": 401.737, + "eval_steps_per_second": 1.57, + "step": 537000 + }, + { + "epoch": 2.1880273635818304, + "grad_norm": 3.701378583908081, + "learning_rate": 0.0037933730457609376, + "loss": 7.8424, + "step": 537100 + }, + { + "epoch": 2.188434741605212, + "grad_norm": 4.026693344116211, + "learning_rate": 0.003792948186674025, + "loss": 7.7871, + "step": 537200 + }, + { + "epoch": 2.1888421196285934, + "grad_norm": 1.4569182395935059, + "learning_rate": 0.0037925232766676297, + "loss": 7.856, + "step": 537300 + }, + { + "epoch": 2.189249497651975, + "grad_norm": 2.1214449405670166, + "learning_rate": 0.003792098315758554, + "loss": 7.8341, + "step": 537400 + }, + { + "epoch": 2.1896568756753565, + "grad_norm": 4.9652814865112305, + "learning_rate": 0.0037916733039635993, + "loss": 7.8549, + "step": 537500 + }, + { + "epoch": 2.190064253698738, + "grad_norm": 1.9439303874969482, + "learning_rate": 0.003791248241299567, + "loss": 7.8433, + "step": 537600 + }, + { + "epoch": 2.190471631722119, + "grad_norm": 8.613899230957031, + "learning_rate": 0.003790823127783262, + "loss": 7.8526, + "step": 537700 + }, + { + "epoch": 2.1908790097455006, + "grad_norm": 8.970516204833984, + "learning_rate": 0.003790397963431496, + "loss": 7.8374, + "step": 537800 + }, + { + "epoch": 2.191286387768882, + "grad_norm": 5.993648529052734, + "learning_rate": 0.0037899727482610676, + "loss": 7.8458, + "step": 537900 + }, + { + "epoch": 2.1916937657922637, + "grad_norm": 3.2168002128601074, + "learning_rate": 0.003789547482288795, + "loss": 7.8049, + "step": 538000 + }, + { + "epoch": 2.1916937657922637, + "eval_MaskedAccuracy": 0.5003411745718921, + "eval_loss": 1.6429698467254639, + "eval_runtime": 168.3955, + "eval_samples_per_second": 376.946, + "eval_steps_per_second": 1.473, + "step": 538000 + }, + { + "epoch": 2.1921011438156452, + "grad_norm": 3.894731283187866, + "learning_rate": 0.003789122165531492, + "loss": 7.874, + "step": 538100 + }, + { + "epoch": 2.192508521839027, + "grad_norm": 2.1121866703033447, + "learning_rate": 0.0037886967980059728, + "loss": 7.8311, + "step": 538200 + }, + { + "epoch": 2.192915899862408, + "grad_norm": 4.866525650024414, + "learning_rate": 0.0037882713797290565, + "loss": 7.8234, + "step": 538300 + }, + { + "epoch": 2.1933232778857894, + "grad_norm": 3.906606435775757, + "learning_rate": 0.003787845910717556, + "loss": 7.8283, + "step": 538400 + }, + { + "epoch": 2.193730655909171, + "grad_norm": 4.288459777832031, + "learning_rate": 0.0037874203909882957, + "loss": 7.8652, + "step": 538500 + }, + { + "epoch": 2.1941380339325525, + "grad_norm": 2.739276885986328, + "learning_rate": 0.0037869948205580973, + "loss": 7.8477, + "step": 538600 + }, + { + "epoch": 2.194545411955934, + "grad_norm": 5.647121906280518, + "learning_rate": 0.003786569199443788, + "loss": 7.8288, + "step": 538700 + }, + { + "epoch": 2.1949527899793155, + "grad_norm": 7.2101826667785645, + "learning_rate": 0.0037861435276621926, + "loss": 7.8173, + "step": 538800 + }, + { + "epoch": 2.195360168002697, + "grad_norm": 2.7694876194000244, + "learning_rate": 0.0037857178052301405, + "loss": 7.8293, + "step": 538900 + }, + { + "epoch": 2.195767546026078, + "grad_norm": 3.975038766860962, + "learning_rate": 0.003785292032164464, + "loss": 7.8396, + "step": 539000 + }, + { + "epoch": 2.195767546026078, + "eval_MaskedAccuracy": 0.4981996990229896, + "eval_loss": 1.6436177492141724, + "eval_runtime": 166.5758, + "eval_samples_per_second": 381.064, + "eval_steps_per_second": 1.489, + "step": 539000 + }, + { + "epoch": 2.1961749240494597, + "grad_norm": 5.791339874267578, + "learning_rate": 0.0037848662084819947, + "loss": 7.8639, + "step": 539100 + }, + { + "epoch": 2.196582302072841, + "grad_norm": 2.7626590728759766, + "learning_rate": 0.0037844403341995717, + "loss": 7.8601, + "step": 539200 + }, + { + "epoch": 2.1969896800962228, + "grad_norm": 4.960364818572998, + "learning_rate": 0.003784014409334029, + "loss": 7.8235, + "step": 539300 + }, + { + "epoch": 2.1973970581196043, + "grad_norm": 6.897753715515137, + "learning_rate": 0.0037835884339022046, + "loss": 7.8117, + "step": 539400 + }, + { + "epoch": 2.197804436142986, + "grad_norm": 3.9038569927215576, + "learning_rate": 0.0037831624079209423, + "loss": 7.8342, + "step": 539500 + }, + { + "epoch": 2.198211814166367, + "grad_norm": 4.344732761383057, + "learning_rate": 0.003782736331407085, + "loss": 7.8416, + "step": 539600 + }, + { + "epoch": 2.1986191921897484, + "grad_norm": 4.075064659118652, + "learning_rate": 0.0037823102043774764, + "loss": 7.8247, + "step": 539700 + }, + { + "epoch": 2.19902657021313, + "grad_norm": 2.741217851638794, + "learning_rate": 0.0037818840268489654, + "loss": 7.8261, + "step": 539800 + }, + { + "epoch": 2.1994339482365115, + "grad_norm": 6.242165565490723, + "learning_rate": 0.0037814577988383967, + "loss": 7.8609, + "step": 539900 + }, + { + "epoch": 2.199841326259893, + "grad_norm": 2.72274112701416, + "learning_rate": 0.003781031520362623, + "loss": 7.829, + "step": 540000 + }, + { + "epoch": 2.199841326259893, + "eval_MaskedAccuracy": 0.49986804644871313, + "eval_loss": 1.6495589017868042, + "eval_runtime": 159.9924, + "eval_samples_per_second": 396.744, + "eval_steps_per_second": 1.55, + "step": 540000 + }, + { + "epoch": 2.2002487042832746, + "grad_norm": 3.6194722652435303, + "learning_rate": 0.0037806051914385007, + "loss": 7.8281, + "step": 540100 + }, + { + "epoch": 2.2006560823066557, + "grad_norm": 2.2882962226867676, + "learning_rate": 0.0037801788120828835, + "loss": 7.8213, + "step": 540200 + }, + { + "epoch": 2.201063460330037, + "grad_norm": 7.555246353149414, + "learning_rate": 0.003779752382312627, + "loss": 7.8318, + "step": 540300 + }, + { + "epoch": 2.2014708383534187, + "grad_norm": 2.5695087909698486, + "learning_rate": 0.003779325902144591, + "loss": 7.8081, + "step": 540400 + }, + { + "epoch": 2.2018782163768003, + "grad_norm": 4.27829122543335, + "learning_rate": 0.003778899371595635, + "loss": 7.821, + "step": 540500 + }, + { + "epoch": 2.202285594400182, + "grad_norm": 2.186814785003662, + "learning_rate": 0.0037784727906826233, + "loss": 7.8395, + "step": 540600 + }, + { + "epoch": 2.2026929724235633, + "grad_norm": 1.746558666229248, + "learning_rate": 0.003778046159422424, + "loss": 7.8174, + "step": 540700 + }, + { + "epoch": 2.2031003504469444, + "grad_norm": 4.74664831161499, + "learning_rate": 0.0037776194778318998, + "loss": 7.8418, + "step": 540800 + }, + { + "epoch": 2.203507728470326, + "grad_norm": 8.453991889953613, + "learning_rate": 0.0037771927459279214, + "loss": 7.8433, + "step": 540900 + }, + { + "epoch": 2.2039151064937075, + "grad_norm": 3.422455310821533, + "learning_rate": 0.003776765963727358, + "loss": 7.8282, + "step": 541000 + }, + { + "epoch": 2.2039151064937075, + "eval_MaskedAccuracy": 0.4993272886132466, + "eval_loss": 1.6447093486785889, + "eval_runtime": 157.4615, + "eval_samples_per_second": 403.121, + "eval_steps_per_second": 1.575, + "step": 541000 + }, + { + "epoch": 2.204322484517089, + "grad_norm": 7.143071174621582, + "learning_rate": 0.0037763391312470866, + "loss": 7.8194, + "step": 541100 + }, + { + "epoch": 2.2047298625404705, + "grad_norm": 3.948322057723999, + "learning_rate": 0.0037759122485039763, + "loss": 7.8531, + "step": 541200 + }, + { + "epoch": 2.205137240563852, + "grad_norm": 2.6446802616119385, + "learning_rate": 0.0037754853155149088, + "loss": 7.8152, + "step": 541300 + }, + { + "epoch": 2.2055446185872336, + "grad_norm": 2.490008592605591, + "learning_rate": 0.0037750583322967617, + "loss": 7.8329, + "step": 541400 + }, + { + "epoch": 2.2059519966106147, + "grad_norm": 3.8207499980926514, + "learning_rate": 0.0037746312988664187, + "loss": 7.8432, + "step": 541500 + }, + { + "epoch": 2.2063593746339962, + "grad_norm": 4.180668354034424, + "learning_rate": 0.0037742042152407564, + "loss": 7.8069, + "step": 541600 + }, + { + "epoch": 2.2067667526573778, + "grad_norm": 2.0634877681732178, + "learning_rate": 0.003773777081436664, + "loss": 7.8385, + "step": 541700 + }, + { + "epoch": 2.2071741306807593, + "grad_norm": 6.988525390625, + "learning_rate": 0.0037733498974710285, + "loss": 7.8492, + "step": 541800 + }, + { + "epoch": 2.207581508704141, + "grad_norm": 3.995124340057373, + "learning_rate": 0.003772922663360737, + "loss": 7.8394, + "step": 541900 + }, + { + "epoch": 2.2079888867275224, + "grad_norm": 5.8167195320129395, + "learning_rate": 0.003772495379122682, + "loss": 7.8169, + "step": 542000 + }, + { + "epoch": 2.2079888867275224, + "eval_MaskedAccuracy": 0.49963134657016, + "eval_loss": 1.6503068208694458, + "eval_runtime": 160.0787, + "eval_samples_per_second": 396.53, + "eval_steps_per_second": 1.549, + "step": 542000 + }, + { + "epoch": 2.2083962647509034, + "grad_norm": 3.727559804916382, + "learning_rate": 0.0037720680447737532, + "loss": 7.8193, + "step": 542100 + }, + { + "epoch": 2.208803642774285, + "grad_norm": 3.18107271194458, + "learning_rate": 0.0037716406603308518, + "loss": 7.8175, + "step": 542200 + }, + { + "epoch": 2.2092110207976665, + "grad_norm": 3.3818202018737793, + "learning_rate": 0.00377121322581087, + "loss": 7.8551, + "step": 542300 + }, + { + "epoch": 2.209618398821048, + "grad_norm": 1.877657175064087, + "learning_rate": 0.0037707857412307075, + "loss": 7.8386, + "step": 542400 + }, + { + "epoch": 2.2100257768444296, + "grad_norm": 3.3674402236938477, + "learning_rate": 0.003770358206607266, + "loss": 7.8581, + "step": 542500 + }, + { + "epoch": 2.210433154867811, + "grad_norm": 4.645495891571045, + "learning_rate": 0.003769930621957445, + "loss": 7.8186, + "step": 542600 + }, + { + "epoch": 2.210840532891192, + "grad_norm": 5.1082024574279785, + "learning_rate": 0.0037695029872981513, + "loss": 7.8224, + "step": 542700 + }, + { + "epoch": 2.2112479109145737, + "grad_norm": 2.660203456878662, + "learning_rate": 0.0037690753026462893, + "loss": 7.8291, + "step": 542800 + }, + { + "epoch": 2.2116552889379553, + "grad_norm": 7.779784679412842, + "learning_rate": 0.0037686475680187713, + "loss": 7.7967, + "step": 542900 + }, + { + "epoch": 2.212062666961337, + "grad_norm": 2.886215925216675, + "learning_rate": 0.0037682197834325087, + "loss": 7.8453, + "step": 543000 + }, + { + "epoch": 2.212062666961337, + "eval_MaskedAccuracy": 0.5005089513368042, + "eval_loss": 1.6450562477111816, + "eval_runtime": 206.8242, + "eval_samples_per_second": 306.908, + "eval_steps_per_second": 1.199, + "step": 543000 + }, + { + "epoch": 2.2124700449847183, + "grad_norm": 2.2003118991851807, + "learning_rate": 0.0037677919489044116, + "loss": 7.84, + "step": 543100 + }, + { + "epoch": 2.2128774230081, + "grad_norm": 3.7952935695648193, + "learning_rate": 0.0037673640644513926, + "loss": 7.8513, + "step": 543200 + }, + { + "epoch": 2.213284801031481, + "grad_norm": 3.5863821506500244, + "learning_rate": 0.003766936130090373, + "loss": 7.8173, + "step": 543300 + }, + { + "epoch": 2.2136921790548625, + "grad_norm": 7.687413215637207, + "learning_rate": 0.0037665081458382638, + "loss": 7.8407, + "step": 543400 + }, + { + "epoch": 2.214099557078244, + "grad_norm": 2.257976770401001, + "learning_rate": 0.0037660801117119945, + "loss": 7.8417, + "step": 543500 + }, + { + "epoch": 2.2145069351016256, + "grad_norm": 9.945231437683105, + "learning_rate": 0.003765652027728486, + "loss": 7.8569, + "step": 543600 + }, + { + "epoch": 2.214914313125007, + "grad_norm": 2.041649341583252, + "learning_rate": 0.003765223893904657, + "loss": 7.8409, + "step": 543700 + }, + { + "epoch": 2.2153216911483886, + "grad_norm": 4.65684175491333, + "learning_rate": 0.003764795710257438, + "loss": 7.8805, + "step": 543800 + }, + { + "epoch": 2.21572906917177, + "grad_norm": 3.468898296356201, + "learning_rate": 0.0037643674768037564, + "loss": 7.8164, + "step": 543900 + }, + { + "epoch": 2.2161364471951512, + "grad_norm": 5.240268230438232, + "learning_rate": 0.0037639391935605417, + "loss": 7.8377, + "step": 544000 + }, + { + "epoch": 2.2161364471951512, + "eval_MaskedAccuracy": 0.5007431908521538, + "eval_loss": 1.647658348083496, + "eval_runtime": 168.2637, + "eval_samples_per_second": 377.241, + "eval_steps_per_second": 1.474, + "step": 544000 + }, + { + "epoch": 2.2165438252185328, + "grad_norm": 3.825221538543701, + "learning_rate": 0.0037635108605447295, + "loss": 7.8212, + "step": 544100 + }, + { + "epoch": 2.2169512032419143, + "grad_norm": 2.2604832649230957, + "learning_rate": 0.00376308247777325, + "loss": 7.8093, + "step": 544200 + }, + { + "epoch": 2.217358581265296, + "grad_norm": 4.002956390380859, + "learning_rate": 0.003762654045263041, + "loss": 7.8089, + "step": 544300 + }, + { + "epoch": 2.2177659592886774, + "grad_norm": 2.3894317150115967, + "learning_rate": 0.003762225563031042, + "loss": 7.836, + "step": 544400 + }, + { + "epoch": 2.218173337312059, + "grad_norm": 3.4666402339935303, + "learning_rate": 0.0037617970310941935, + "loss": 7.8195, + "step": 544500 + }, + { + "epoch": 2.21858071533544, + "grad_norm": 10.492304801940918, + "learning_rate": 0.003761368449469437, + "loss": 7.8253, + "step": 544600 + }, + { + "epoch": 2.2189880933588215, + "grad_norm": 1.558408498764038, + "learning_rate": 0.0037609398181737167, + "loss": 7.7988, + "step": 544700 + }, + { + "epoch": 2.219395471382203, + "grad_norm": 5.022818565368652, + "learning_rate": 0.0037605111372239794, + "loss": 7.8072, + "step": 544800 + }, + { + "epoch": 2.2198028494055846, + "grad_norm": 6.653713226318359, + "learning_rate": 0.003760082406637168, + "loss": 7.8233, + "step": 544900 + }, + { + "epoch": 2.220210227428966, + "grad_norm": 5.106639862060547, + "learning_rate": 0.0037596536264302384, + "loss": 7.8141, + "step": 545000 + }, + { + "epoch": 2.220210227428966, + "eval_MaskedAccuracy": 0.5007562128437728, + "eval_loss": 1.6528507471084595, + "eval_runtime": 177.6296, + "eval_samples_per_second": 357.35, + "eval_steps_per_second": 1.396, + "step": 545000 + }, + { + "epoch": 2.2206176054523477, + "grad_norm": 2.980137586593628, + "learning_rate": 0.0037592247966201377, + "loss": 7.8296, + "step": 545100 + }, + { + "epoch": 2.2210249834757287, + "grad_norm": 4.1934332847595215, + "learning_rate": 0.0037587959172238243, + "loss": 7.8092, + "step": 545200 + }, + { + "epoch": 2.2214323614991103, + "grad_norm": 3.990290403366089, + "learning_rate": 0.0037583669882582525, + "loss": 7.8209, + "step": 545300 + }, + { + "epoch": 2.221839739522492, + "grad_norm": 3.0091090202331543, + "learning_rate": 0.0037579380097403775, + "loss": 7.825, + "step": 545400 + }, + { + "epoch": 2.2222471175458733, + "grad_norm": 1.3348726034164429, + "learning_rate": 0.0037575089816871614, + "loss": 7.803, + "step": 545500 + }, + { + "epoch": 2.222654495569255, + "grad_norm": 4.475834846496582, + "learning_rate": 0.0037570799041155693, + "loss": 7.8535, + "step": 545600 + }, + { + "epoch": 2.2230618735926364, + "grad_norm": 6.8989787101745605, + "learning_rate": 0.003756650777042563, + "loss": 7.8025, + "step": 545700 + }, + { + "epoch": 2.2234692516160175, + "grad_norm": 5.077826976776123, + "learning_rate": 0.003756221600485106, + "loss": 7.8013, + "step": 545800 + }, + { + "epoch": 2.223876629639399, + "grad_norm": 2.5384793281555176, + "learning_rate": 0.0037557923744601656, + "loss": 7.8181, + "step": 545900 + }, + { + "epoch": 2.2242840076627806, + "grad_norm": 4.500861644744873, + "learning_rate": 0.0037553630989847144, + "loss": 7.8304, + "step": 546000 + }, + { + "epoch": 2.2242840076627806, + "eval_MaskedAccuracy": 0.5001765546564289, + "eval_loss": 1.6489428281784058, + "eval_runtime": 157.7882, + "eval_samples_per_second": 402.286, + "eval_steps_per_second": 1.572, + "step": 546000 + }, + { + "epoch": 2.224691385686162, + "grad_norm": 3.0855696201324463, + "learning_rate": 0.0037549337740757196, + "loss": 7.8139, + "step": 546100 + }, + { + "epoch": 2.2250987637095436, + "grad_norm": 3.060678720474243, + "learning_rate": 0.0037545043997501606, + "loss": 7.7787, + "step": 546200 + }, + { + "epoch": 2.225506141732925, + "grad_norm": 3.354079008102417, + "learning_rate": 0.0037540749760250143, + "loss": 7.8235, + "step": 546300 + }, + { + "epoch": 2.2259135197563067, + "grad_norm": 2.1660995483398438, + "learning_rate": 0.003753645502917249, + "loss": 7.814, + "step": 546400 + }, + { + "epoch": 2.226320897779688, + "grad_norm": 2.482395887374878, + "learning_rate": 0.003753215980443851, + "loss": 7.7994, + "step": 546500 + }, + { + "epoch": 2.2267282758030693, + "grad_norm": 5.098010540008545, + "learning_rate": 0.0037527864086217954, + "loss": 7.8203, + "step": 546600 + }, + { + "epoch": 2.227135653826451, + "grad_norm": 4.216400623321533, + "learning_rate": 0.003752356787468074, + "loss": 7.7922, + "step": 546700 + }, + { + "epoch": 2.2275430318498324, + "grad_norm": 5.281837463378906, + "learning_rate": 0.0037519271169996625, + "loss": 7.8095, + "step": 546800 + }, + { + "epoch": 2.227950409873214, + "grad_norm": 8.910423278808594, + "learning_rate": 0.003751497397233558, + "loss": 7.796, + "step": 546900 + }, + { + "epoch": 2.2283577878965954, + "grad_norm": 3.28885817527771, + "learning_rate": 0.0037510676281867425, + "loss": 7.8271, + "step": 547000 + }, + { + "epoch": 2.2283577878965954, + "eval_MaskedAccuracy": 0.5011636216941383, + "eval_loss": 1.6514025926589966, + "eval_runtime": 170.1542, + "eval_samples_per_second": 373.05, + "eval_steps_per_second": 1.458, + "step": 547000 + }, + { + "epoch": 2.2287651659199765, + "grad_norm": 5.114754676818848, + "learning_rate": 0.003750637809876211, + "loss": 7.8091, + "step": 547100 + }, + { + "epoch": 2.229172543943358, + "grad_norm": 3.2427895069122314, + "learning_rate": 0.003750207942318954, + "loss": 7.8476, + "step": 547200 + }, + { + "epoch": 2.2295799219667396, + "grad_norm": 3.4154791831970215, + "learning_rate": 0.0037497780255319677, + "loss": 7.7966, + "step": 547300 + }, + { + "epoch": 2.229987299990121, + "grad_norm": 5.754019737243652, + "learning_rate": 0.003749348059532246, + "loss": 7.8493, + "step": 547400 + }, + { + "epoch": 2.2303946780135027, + "grad_norm": 3.7867672443389893, + "learning_rate": 0.0037489180443367877, + "loss": 7.793, + "step": 547500 + }, + { + "epoch": 2.230802056036884, + "grad_norm": 3.4546449184417725, + "learning_rate": 0.0037484879799626, + "loss": 7.8381, + "step": 547600 + }, + { + "epoch": 2.2312094340602653, + "grad_norm": 6.485270977020264, + "learning_rate": 0.003748057866426681, + "loss": 7.8148, + "step": 547700 + }, + { + "epoch": 2.231616812083647, + "grad_norm": 3.5456833839416504, + "learning_rate": 0.003747627703746035, + "loss": 7.8025, + "step": 547800 + }, + { + "epoch": 2.2320241901070283, + "grad_norm": 4.4459967613220215, + "learning_rate": 0.0037471974919376714, + "loss": 7.8222, + "step": 547900 + }, + { + "epoch": 2.23243156813041, + "grad_norm": 6.9822001457214355, + "learning_rate": 0.0037467672310185973, + "loss": 7.8157, + "step": 548000 + }, + { + "epoch": 2.23243156813041, + "eval_MaskedAccuracy": 0.5004119611941124, + "eval_loss": 1.6517724990844727, + "eval_runtime": 156.7003, + "eval_samples_per_second": 405.079, + "eval_steps_per_second": 1.583, + "step": 548000 + }, + { + "epoch": 2.2328389461537914, + "grad_norm": 3.1292076110839844, + "learning_rate": 0.003746336921005827, + "loss": 7.8315, + "step": 548100 + }, + { + "epoch": 2.233246324177173, + "grad_norm": 8.645283699035645, + "learning_rate": 0.0037459065619163683, + "loss": 7.8157, + "step": 548200 + }, + { + "epoch": 2.233653702200554, + "grad_norm": 3.8814592361450195, + "learning_rate": 0.0037454761537672374, + "loss": 7.8482, + "step": 548300 + }, + { + "epoch": 2.2340610802239356, + "grad_norm": 6.201331615447998, + "learning_rate": 0.0037450456965754456, + "loss": 7.8313, + "step": 548400 + }, + { + "epoch": 2.234468458247317, + "grad_norm": 3.4205143451690674, + "learning_rate": 0.0037446151903580164, + "loss": 7.8251, + "step": 548500 + }, + { + "epoch": 2.2348758362706986, + "grad_norm": 2.4528141021728516, + "learning_rate": 0.0037441846351319668, + "loss": 7.8233, + "step": 548600 + }, + { + "epoch": 2.23528321429408, + "grad_norm": 4.1301093101501465, + "learning_rate": 0.003743754030914326, + "loss": 7.8346, + "step": 548700 + }, + { + "epoch": 2.2356905923174617, + "grad_norm": 2.6611547470092773, + "learning_rate": 0.0037433233777221108, + "loss": 7.8525, + "step": 548800 + }, + { + "epoch": 2.2360979703408432, + "grad_norm": 3.2765557765960693, + "learning_rate": 0.0037428926755723488, + "loss": 7.8312, + "step": 548900 + }, + { + "epoch": 2.2365053483642243, + "grad_norm": 2.1360530853271484, + "learning_rate": 0.0037424619244820685, + "loss": 7.8125, + "step": 549000 + }, + { + "epoch": 2.2365053483642243, + "eval_MaskedAccuracy": 0.5003689653270004, + "eval_loss": 1.6474647521972656, + "eval_runtime": 201.9599, + "eval_samples_per_second": 314.3, + "eval_steps_per_second": 1.228, + "step": 549000 + }, + { + "epoch": 2.236912726387606, + "grad_norm": 5.588429927825928, + "learning_rate": 0.0037420311244682988, + "loss": 7.802, + "step": 549100 + }, + { + "epoch": 2.2373201044109874, + "grad_norm": 3.0774121284484863, + "learning_rate": 0.003741600275548074, + "loss": 7.8041, + "step": 549200 + }, + { + "epoch": 2.237727482434369, + "grad_norm": 4.307383060455322, + "learning_rate": 0.0037411693777384242, + "loss": 7.8031, + "step": 549300 + }, + { + "epoch": 2.2381348604577505, + "grad_norm": 4.235053539276123, + "learning_rate": 0.003740738431056393, + "loss": 7.7924, + "step": 549400 + }, + { + "epoch": 2.238542238481132, + "grad_norm": 3.976292371749878, + "learning_rate": 0.0037403074355190067, + "loss": 7.8167, + "step": 549500 + }, + { + "epoch": 2.238949616504513, + "grad_norm": 11.318830490112305, + "learning_rate": 0.003739876391143312, + "loss": 7.7853, + "step": 549600 + }, + { + "epoch": 2.2393569945278946, + "grad_norm": 5.350687026977539, + "learning_rate": 0.0037394452979463495, + "loss": 7.8366, + "step": 549700 + }, + { + "epoch": 2.239764372551276, + "grad_norm": 3.1678311824798584, + "learning_rate": 0.0037390141559451606, + "loss": 7.8202, + "step": 549800 + }, + { + "epoch": 2.2401717505746577, + "grad_norm": 4.260071277618408, + "learning_rate": 0.003738582965156799, + "loss": 7.8077, + "step": 549900 + }, + { + "epoch": 2.240579128598039, + "grad_norm": 2.2095463275909424, + "learning_rate": 0.0037381517255982975, + "loss": 7.8411, + "step": 550000 + }, + { + "epoch": 2.240579128598039, + "eval_MaskedAccuracy": 0.5005593729909046, + "eval_loss": 1.642432689666748, + "eval_runtime": 171.0837, + "eval_samples_per_second": 371.023, + "eval_steps_per_second": 1.45, + "step": 550000 + }, + { + "epoch": 2.2409865066214207, + "grad_norm": 6.515848636627197, + "learning_rate": 0.0037377204372867168, + "loss": 7.8074, + "step": 550100 + }, + { + "epoch": 2.241393884644802, + "grad_norm": 2.398937940597534, + "learning_rate": 0.003737289100239101, + "loss": 7.8119, + "step": 550200 + }, + { + "epoch": 2.2418012626681834, + "grad_norm": 2.9077181816101074, + "learning_rate": 0.0037368577144725073, + "loss": 7.8166, + "step": 550300 + }, + { + "epoch": 2.242208640691565, + "grad_norm": 4.874625205993652, + "learning_rate": 0.003736426280003991, + "loss": 7.8138, + "step": 550400 + }, + { + "epoch": 2.2426160187149464, + "grad_norm": 5.9073967933654785, + "learning_rate": 0.0037359947968506053, + "loss": 7.8335, + "step": 550500 + }, + { + "epoch": 2.243023396738328, + "grad_norm": 3.353095293045044, + "learning_rate": 0.0037355632650294137, + "loss": 7.792, + "step": 550600 + }, + { + "epoch": 2.2434307747617095, + "grad_norm": 1.4904179573059082, + "learning_rate": 0.0037351316845574715, + "loss": 7.8373, + "step": 550700 + }, + { + "epoch": 2.2438381527850906, + "grad_norm": 1.9586129188537598, + "learning_rate": 0.003734700055451846, + "loss": 7.8444, + "step": 550800 + }, + { + "epoch": 2.244245530808472, + "grad_norm": 3.9642250537872314, + "learning_rate": 0.0037342683777296013, + "loss": 7.8427, + "step": 550900 + }, + { + "epoch": 2.2446529088318536, + "grad_norm": 3.0593836307525635, + "learning_rate": 0.003733836651407803, + "loss": 7.7923, + "step": 551000 + }, + { + "epoch": 2.2446529088318536, + "eval_MaskedAccuracy": 0.501333218923484, + "eval_loss": 1.635178565979004, + "eval_runtime": 161.8285, + "eval_samples_per_second": 392.242, + "eval_steps_per_second": 1.532, + "step": 551000 + }, + { + "epoch": 2.245060286855235, + "grad_norm": 3.8172152042388916, + "learning_rate": 0.0037334048765035174, + "loss": 7.8236, + "step": 551100 + }, + { + "epoch": 2.2454676648786167, + "grad_norm": 3.798973321914673, + "learning_rate": 0.0037329730530338227, + "loss": 7.7827, + "step": 551200 + }, + { + "epoch": 2.2458750429019982, + "grad_norm": 1.6006982326507568, + "learning_rate": 0.003732541181015784, + "loss": 7.8251, + "step": 551300 + }, + { + "epoch": 2.2462824209253798, + "grad_norm": 4.4721198081970215, + "learning_rate": 0.0037321092604664773, + "loss": 7.8398, + "step": 551400 + }, + { + "epoch": 2.246689798948761, + "grad_norm": 2.283562421798706, + "learning_rate": 0.003731677291402978, + "loss": 7.7928, + "step": 551500 + }, + { + "epoch": 2.2470971769721424, + "grad_norm": 7.920275688171387, + "learning_rate": 0.003731245273842359, + "loss": 7.8573, + "step": 551600 + }, + { + "epoch": 2.247504554995524, + "grad_norm": 1.8555209636688232, + "learning_rate": 0.0037308132078017133, + "loss": 7.8155, + "step": 551700 + }, + { + "epoch": 2.2479119330189055, + "grad_norm": 2.872659921646118, + "learning_rate": 0.0037303810932981163, + "loss": 7.8305, + "step": 551800 + }, + { + "epoch": 2.248319311042287, + "grad_norm": 1.9988043308258057, + "learning_rate": 0.00372994893034865, + "loss": 7.8389, + "step": 551900 + }, + { + "epoch": 2.2487266890656685, + "grad_norm": 3.3895010948181152, + "learning_rate": 0.0037295167189703984, + "loss": 7.843, + "step": 552000 + }, + { + "epoch": 2.2487266890656685, + "eval_MaskedAccuracy": 0.5008253832687625, + "eval_loss": 1.643225908279419, + "eval_runtime": 185.1265, + "eval_samples_per_second": 342.879, + "eval_steps_per_second": 1.34, + "step": 552000 + }, + { + "epoch": 2.2491340670890496, + "grad_norm": 2.5584287643432617, + "learning_rate": 0.0037290844591804516, + "loss": 7.8113, + "step": 552100 + }, + { + "epoch": 2.249541445112431, + "grad_norm": 13.224796295166016, + "learning_rate": 0.0037286521509959014, + "loss": 7.8325, + "step": 552200 + }, + { + "epoch": 2.2499488231358127, + "grad_norm": 2.5514280796051025, + "learning_rate": 0.0037282197944338326, + "loss": 7.8443, + "step": 552300 + }, + { + "epoch": 2.250356201159194, + "grad_norm": 3.5032153129577637, + "learning_rate": 0.0037277873895113453, + "loss": 7.794, + "step": 552400 + }, + { + "epoch": 2.2507635791825757, + "grad_norm": 4.562185287475586, + "learning_rate": 0.003727354936245535, + "loss": 7.7988, + "step": 552500 + }, + { + "epoch": 2.2511709572059573, + "grad_norm": 5.177770614624023, + "learning_rate": 0.0037269224346534965, + "loss": 7.7994, + "step": 552600 + }, + { + "epoch": 2.2515783352293384, + "grad_norm": 2.4556784629821777, + "learning_rate": 0.003726489884752326, + "loss": 7.8352, + "step": 552700 + }, + { + "epoch": 2.25198571325272, + "grad_norm": 7.931498050689697, + "learning_rate": 0.0037260572865591276, + "loss": 7.8044, + "step": 552800 + }, + { + "epoch": 2.2523930912761014, + "grad_norm": 4.53995943069458, + "learning_rate": 0.003725624640091002, + "loss": 7.8239, + "step": 552900 + }, + { + "epoch": 2.252800469299483, + "grad_norm": 4.165882587432861, + "learning_rate": 0.003725191945365057, + "loss": 7.7913, + "step": 553000 + }, + { + "epoch": 2.252800469299483, + "eval_MaskedAccuracy": 0.5001803101334295, + "eval_loss": 1.644093632698059, + "eval_runtime": 194.6042, + "eval_samples_per_second": 326.18, + "eval_steps_per_second": 1.274, + "step": 553000 + }, + { + "epoch": 2.2532078473228645, + "grad_norm": 4.2804179191589355, + "learning_rate": 0.003724759202398399, + "loss": 7.8301, + "step": 553100 + }, + { + "epoch": 2.253615225346246, + "grad_norm": 5.424826622009277, + "learning_rate": 0.003724326411208135, + "loss": 7.8126, + "step": 553200 + }, + { + "epoch": 2.254022603369627, + "grad_norm": 2.1051621437072754, + "learning_rate": 0.0037238935718113776, + "loss": 7.8271, + "step": 553300 + }, + { + "epoch": 2.2544299813930087, + "grad_norm": 2.4027085304260254, + "learning_rate": 0.003723460684225241, + "loss": 7.8519, + "step": 553400 + }, + { + "epoch": 2.25483735941639, + "grad_norm": 9.889691352844238, + "learning_rate": 0.003723027748466836, + "loss": 7.8033, + "step": 553500 + }, + { + "epoch": 2.2552447374397717, + "grad_norm": 7.366210460662842, + "learning_rate": 0.003722594764553277, + "loss": 7.8267, + "step": 553600 + }, + { + "epoch": 2.2556521154631533, + "grad_norm": 6.189861297607422, + "learning_rate": 0.0037221617325016855, + "loss": 7.8356, + "step": 553700 + }, + { + "epoch": 2.256059493486535, + "grad_norm": 1.8813151121139526, + "learning_rate": 0.0037217286523291786, + "loss": 7.7938, + "step": 553800 + }, + { + "epoch": 2.2564668715099163, + "grad_norm": 1.6196503639221191, + "learning_rate": 0.0037212955240528856, + "loss": 7.8403, + "step": 553900 + }, + { + "epoch": 2.2568742495332974, + "grad_norm": 4.88037109375, + "learning_rate": 0.0037208623476899226, + "loss": 7.8406, + "step": 554000 + }, + { + "epoch": 2.2568742495332974, + "eval_MaskedAccuracy": 0.5005618339050674, + "eval_loss": 1.6498075723648071, + "eval_runtime": 159.7862, + "eval_samples_per_second": 397.256, + "eval_steps_per_second": 1.552, + "step": 554000 + }, + { + "epoch": 2.257281627556679, + "grad_norm": 3.5858309268951416, + "learning_rate": 0.003720429123257416, + "loss": 7.834, + "step": 554100 + }, + { + "epoch": 2.2576890055800605, + "grad_norm": 1.9539397954940796, + "learning_rate": 0.0037199958507724957, + "loss": 7.7847, + "step": 554200 + }, + { + "epoch": 2.258096383603442, + "grad_norm": 2.2557578086853027, + "learning_rate": 0.0037195625302522935, + "loss": 7.7873, + "step": 554300 + }, + { + "epoch": 2.2585037616268235, + "grad_norm": 3.007535457611084, + "learning_rate": 0.003719129161713936, + "loss": 7.8134, + "step": 554400 + }, + { + "epoch": 2.2589111396502046, + "grad_norm": 8.381376266479492, + "learning_rate": 0.0037186957451745602, + "loss": 7.8051, + "step": 554500 + }, + { + "epoch": 2.259318517673586, + "grad_norm": 4.962038516998291, + "learning_rate": 0.0037182622806513016, + "loss": 7.833, + "step": 554600 + }, + { + "epoch": 2.2597258956969677, + "grad_norm": 3.697455883026123, + "learning_rate": 0.0037178287681612944, + "loss": 7.7938, + "step": 554700 + }, + { + "epoch": 2.2601332737203492, + "grad_norm": 6.414445877075195, + "learning_rate": 0.003717395207721681, + "loss": 7.819, + "step": 554800 + }, + { + "epoch": 2.2605406517437308, + "grad_norm": 5.8277907371521, + "learning_rate": 0.0037169615993495964, + "loss": 7.8174, + "step": 554900 + }, + { + "epoch": 2.2609480297671123, + "grad_norm": 1.5174105167388916, + "learning_rate": 0.0037165279430621916, + "loss": 7.8382, + "step": 555000 + }, + { + "epoch": 2.2609480297671123, + "eval_MaskedAccuracy": 0.5009572925655903, + "eval_loss": 1.6404433250427246, + "eval_runtime": 156.6815, + "eval_samples_per_second": 405.128, + "eval_steps_per_second": 1.583, + "step": 555000 + }, + { + "epoch": 2.261355407790494, + "grad_norm": 3.088292121887207, + "learning_rate": 0.003716094238876608, + "loss": 7.7966, + "step": 555100 + }, + { + "epoch": 2.261762785813875, + "grad_norm": 3.5430777072906494, + "learning_rate": 0.0037156604868099916, + "loss": 7.8028, + "step": 555200 + }, + { + "epoch": 2.2621701638372564, + "grad_norm": 3.1515798568725586, + "learning_rate": 0.0037152266868794898, + "loss": 7.8386, + "step": 555300 + }, + { + "epoch": 2.262577541860638, + "grad_norm": 2.4247777462005615, + "learning_rate": 0.0037147928391022566, + "loss": 7.8397, + "step": 555400 + }, + { + "epoch": 2.2629849198840195, + "grad_norm": 4.718557834625244, + "learning_rate": 0.0037143589434954407, + "loss": 7.8009, + "step": 555500 + }, + { + "epoch": 2.263392297907401, + "grad_norm": 5.006442070007324, + "learning_rate": 0.0037139250000761985, + "loss": 7.8163, + "step": 555600 + }, + { + "epoch": 2.2637996759307826, + "grad_norm": 3.265850067138672, + "learning_rate": 0.003713491008861681, + "loss": 7.7798, + "step": 555700 + }, + { + "epoch": 2.2642070539541637, + "grad_norm": 7.734994411468506, + "learning_rate": 0.003713056969869053, + "loss": 7.8296, + "step": 555800 + }, + { + "epoch": 2.264614431977545, + "grad_norm": 10.472969055175781, + "learning_rate": 0.0037126228831154706, + "loss": 7.8263, + "step": 555900 + }, + { + "epoch": 2.2650218100009267, + "grad_norm": 2.4553678035736084, + "learning_rate": 0.0037121887486181015, + "loss": 7.8158, + "step": 556000 + }, + { + "epoch": 2.2650218100009267, + "eval_MaskedAccuracy": 0.5005522445873212, + "eval_loss": 1.646085500717163, + "eval_runtime": 155.4707, + "eval_samples_per_second": 408.283, + "eval_steps_per_second": 1.595, + "step": 556000 + }, + { + "epoch": 2.2654291880243083, + "grad_norm": 10.690045356750488, + "learning_rate": 0.003711754566394102, + "loss": 7.837, + "step": 556100 + }, + { + "epoch": 2.26583656604769, + "grad_norm": 4.021206855773926, + "learning_rate": 0.00371132033646064, + "loss": 7.81, + "step": 556200 + }, + { + "epoch": 2.2662439440710713, + "grad_norm": 8.149909019470215, + "learning_rate": 0.003710886058834882, + "loss": 7.7881, + "step": 556300 + }, + { + "epoch": 2.266651322094453, + "grad_norm": 3.954235553741455, + "learning_rate": 0.003710451733534001, + "loss": 7.8244, + "step": 556400 + }, + { + "epoch": 2.267058700117834, + "grad_norm": 7.280412673950195, + "learning_rate": 0.0037100173605751647, + "loss": 7.8122, + "step": 556500 + }, + { + "epoch": 2.2674660781412155, + "grad_norm": 3.6498842239379883, + "learning_rate": 0.003709582939975548, + "loss": 7.8038, + "step": 556600 + }, + { + "epoch": 2.267873456164597, + "grad_norm": 4.224459171295166, + "learning_rate": 0.003709148471752326, + "loss": 7.8348, + "step": 556700 + }, + { + "epoch": 2.2682808341879785, + "grad_norm": 5.864927768707275, + "learning_rate": 0.003708713955922673, + "loss": 7.7983, + "step": 556800 + }, + { + "epoch": 2.26868821221136, + "grad_norm": 5.801333427429199, + "learning_rate": 0.003708279392503769, + "loss": 7.8092, + "step": 556900 + }, + { + "epoch": 2.269095590234741, + "grad_norm": 3.181314468383789, + "learning_rate": 0.003707844781512797, + "loss": 7.8201, + "step": 557000 + }, + { + "epoch": 2.269095590234741, + "eval_MaskedAccuracy": 0.50079975482266, + "eval_loss": 1.6471331119537354, + "eval_runtime": 155.9278, + "eval_samples_per_second": 407.086, + "eval_steps_per_second": 1.59, + "step": 557000 + }, + { + "epoch": 2.2695029682581227, + "grad_norm": 5.057437419891357, + "learning_rate": 0.0037074101229669356, + "loss": 7.7945, + "step": 557100 + }, + { + "epoch": 2.2699103462815042, + "grad_norm": 3.7329092025756836, + "learning_rate": 0.0037069754168833715, + "loss": 7.7986, + "step": 557200 + }, + { + "epoch": 2.2703177243048858, + "grad_norm": 4.425315856933594, + "learning_rate": 0.0037065406632792917, + "loss": 7.8158, + "step": 557300 + }, + { + "epoch": 2.2707251023282673, + "grad_norm": 2.3986151218414307, + "learning_rate": 0.0037061058621718837, + "loss": 7.8124, + "step": 557400 + }, + { + "epoch": 2.271132480351649, + "grad_norm": 3.9646878242492676, + "learning_rate": 0.003705671013578333, + "loss": 7.8108, + "step": 557500 + }, + { + "epoch": 2.2715398583750304, + "grad_norm": 4.580368518829346, + "learning_rate": 0.0037052361175158414, + "loss": 7.8063, + "step": 557600 + }, + { + "epoch": 2.2719472363984115, + "grad_norm": 7.307309150695801, + "learning_rate": 0.0037048011740015954, + "loss": 7.812, + "step": 557700 + }, + { + "epoch": 2.272354614421793, + "grad_norm": 5.580484867095947, + "learning_rate": 0.00370436618305279, + "loss": 7.8363, + "step": 557800 + }, + { + "epoch": 2.2727619924451745, + "grad_norm": 6.419212818145752, + "learning_rate": 0.003703931144686628, + "loss": 7.8152, + "step": 557900 + }, + { + "epoch": 2.273169370468556, + "grad_norm": 3.574993133544922, + "learning_rate": 0.003703496058920303, + "loss": 7.7717, + "step": 558000 + }, + { + "epoch": 2.273169370468556, + "eval_MaskedAccuracy": 0.5011912879313032, + "eval_loss": 1.6435333490371704, + "eval_runtime": 160.4227, + "eval_samples_per_second": 395.68, + "eval_steps_per_second": 1.546, + "step": 558000 + }, + { + "epoch": 2.2735767484919376, + "grad_norm": 2.877426862716675, + "learning_rate": 0.0037030609257710186, + "loss": 7.8217, + "step": 558100 + }, + { + "epoch": 2.273984126515319, + "grad_norm": 4.252274990081787, + "learning_rate": 0.003702625745255979, + "loss": 7.8042, + "step": 558200 + }, + { + "epoch": 2.2743915045387, + "grad_norm": 7.433331489562988, + "learning_rate": 0.0037021905173923847, + "loss": 7.8158, + "step": 558300 + }, + { + "epoch": 2.2747988825620817, + "grad_norm": 4.4136786460876465, + "learning_rate": 0.0037017552421974493, + "loss": 7.8343, + "step": 558400 + }, + { + "epoch": 2.2752062605854633, + "grad_norm": 1.7757295370101929, + "learning_rate": 0.0037013199196883778, + "loss": 7.8028, + "step": 558500 + }, + { + "epoch": 2.275613638608845, + "grad_norm": 2.7505078315734863, + "learning_rate": 0.00370088454988238, + "loss": 7.8028, + "step": 558600 + }, + { + "epoch": 2.2760210166322263, + "grad_norm": 3.4686386585235596, + "learning_rate": 0.0037004491327966697, + "loss": 7.7894, + "step": 558700 + }, + { + "epoch": 2.276428394655608, + "grad_norm": 3.3943593502044678, + "learning_rate": 0.003700013668448465, + "loss": 7.8202, + "step": 558800 + }, + { + "epoch": 2.2768357726789894, + "grad_norm": 4.104429244995117, + "learning_rate": 0.0036995781568549776, + "loss": 7.8157, + "step": 558900 + }, + { + "epoch": 2.2772431507023705, + "grad_norm": 5.528374195098877, + "learning_rate": 0.003699142598033427, + "loss": 7.8183, + "step": 559000 + }, + { + "epoch": 2.2772431507023705, + "eval_MaskedAccuracy": 0.500252366596004, + "eval_loss": 1.6462630033493042, + "eval_runtime": 181.7661, + "eval_samples_per_second": 349.218, + "eval_steps_per_second": 1.364, + "step": 559000 + }, + { + "epoch": 2.277650528725752, + "grad_norm": 3.605768918991089, + "learning_rate": 0.0036987069920010316, + "loss": 7.7941, + "step": 559100 + }, + { + "epoch": 2.2780579067491336, + "grad_norm": 4.0323286056518555, + "learning_rate": 0.003698271338775016, + "loss": 7.8123, + "step": 559200 + }, + { + "epoch": 2.278465284772515, + "grad_norm": 2.5782217979431152, + "learning_rate": 0.003697835638372599, + "loss": 7.7876, + "step": 559300 + }, + { + "epoch": 2.2788726627958966, + "grad_norm": 3.2254178524017334, + "learning_rate": 0.0036973998908110114, + "loss": 7.8112, + "step": 559400 + }, + { + "epoch": 2.2792800408192777, + "grad_norm": 6.719834804534912, + "learning_rate": 0.003696964096107479, + "loss": 7.7808, + "step": 559500 + }, + { + "epoch": 2.2796874188426592, + "grad_norm": 7.524669647216797, + "learning_rate": 0.003696528254279231, + "loss": 7.8251, + "step": 559600 + }, + { + "epoch": 2.2800947968660408, + "grad_norm": 7.806594371795654, + "learning_rate": 0.0036960923653435005, + "loss": 7.8273, + "step": 559700 + }, + { + "epoch": 2.2805021748894223, + "grad_norm": 10.59316635131836, + "learning_rate": 0.003695656429317515, + "loss": 7.7829, + "step": 559800 + }, + { + "epoch": 2.280909552912804, + "grad_norm": 2.955613374710083, + "learning_rate": 0.0036952204462185126, + "loss": 7.791, + "step": 559900 + }, + { + "epoch": 2.2813169309361854, + "grad_norm": 5.315120697021484, + "learning_rate": 0.003694784416063736, + "loss": 7.81, + "step": 560000 + }, + { + "epoch": 2.2813169309361854, + "eval_MaskedAccuracy": 0.5001056928718093, + "eval_loss": 1.6514586210250854, + "eval_runtime": 154.8769, + "eval_samples_per_second": 409.848, + "eval_steps_per_second": 1.601, + "step": 560000 + }, + { + "epoch": 2.281724308959567, + "grad_norm": 2.851351737976074, + "learning_rate": 0.0036943483388704123, + "loss": 7.7882, + "step": 560100 + }, + { + "epoch": 2.282131686982948, + "grad_norm": 6.286218643188477, + "learning_rate": 0.003693912214655789, + "loss": 7.8126, + "step": 560200 + }, + { + "epoch": 2.2825390650063295, + "grad_norm": 7.340025424957275, + "learning_rate": 0.003693476043437107, + "loss": 7.795, + "step": 560300 + }, + { + "epoch": 2.282946443029711, + "grad_norm": 5.198561668395996, + "learning_rate": 0.0036930398252316113, + "loss": 7.8086, + "step": 560400 + }, + { + "epoch": 2.2833538210530926, + "grad_norm": 2.6386916637420654, + "learning_rate": 0.003692603560056541, + "loss": 7.8095, + "step": 560500 + }, + { + "epoch": 2.283761199076474, + "grad_norm": 5.484129428863525, + "learning_rate": 0.0036921672479291543, + "loss": 7.8065, + "step": 560600 + }, + { + "epoch": 2.2841685770998557, + "grad_norm": 4.82759952545166, + "learning_rate": 0.003691730888866699, + "loss": 7.8265, + "step": 560700 + }, + { + "epoch": 2.2845759551232367, + "grad_norm": 2.8393630981445312, + "learning_rate": 0.0036912944828864256, + "loss": 7.7741, + "step": 560800 + }, + { + "epoch": 2.2849833331466183, + "grad_norm": 3.239849090576172, + "learning_rate": 0.003690858030005584, + "loss": 7.8024, + "step": 560900 + }, + { + "epoch": 2.28539071117, + "grad_norm": 4.779313087463379, + "learning_rate": 0.0036904215302414316, + "loss": 7.825, + "step": 561000 + }, + { + "epoch": 2.28539071117, + "eval_MaskedAccuracy": 0.5005410275150758, + "eval_loss": 1.6495428085327148, + "eval_runtime": 162.8685, + "eval_samples_per_second": 389.738, + "eval_steps_per_second": 1.523, + "step": 561000 + }, + { + "epoch": 2.2857980891933813, + "grad_norm": 9.7829008102417, + "learning_rate": 0.003689984983611227, + "loss": 7.7901, + "step": 561100 + }, + { + "epoch": 2.286205467216763, + "grad_norm": 4.617170333862305, + "learning_rate": 0.0036895483901322294, + "loss": 7.8269, + "step": 561200 + }, + { + "epoch": 2.2866128452401444, + "grad_norm": 2.352316379547119, + "learning_rate": 0.003689111749821697, + "loss": 7.842, + "step": 561300 + }, + { + "epoch": 2.287020223263526, + "grad_norm": 4.032354354858398, + "learning_rate": 0.0036886750626968943, + "loss": 7.801, + "step": 561400 + }, + { + "epoch": 2.287427601286907, + "grad_norm": 5.562901973724365, + "learning_rate": 0.003688238328775084, + "loss": 7.7932, + "step": 561500 + }, + { + "epoch": 2.2878349793102886, + "grad_norm": 3.7815651893615723, + "learning_rate": 0.003687801548073535, + "loss": 7.8262, + "step": 561600 + }, + { + "epoch": 2.28824235733367, + "grad_norm": 6.96209192276001, + "learning_rate": 0.0036873647206095155, + "loss": 7.8029, + "step": 561700 + }, + { + "epoch": 2.2886497353570516, + "grad_norm": 2.10092830657959, + "learning_rate": 0.0036869278464002927, + "loss": 7.8132, + "step": 561800 + }, + { + "epoch": 2.289057113380433, + "grad_norm": 3.034524917602539, + "learning_rate": 0.0036864909254631407, + "loss": 7.8082, + "step": 561900 + }, + { + "epoch": 2.2894644914038143, + "grad_norm": 7.174429416656494, + "learning_rate": 0.003686053957815332, + "loss": 7.8039, + "step": 562000 + }, + { + "epoch": 2.2894644914038143, + "eval_MaskedAccuracy": 0.5008579440761833, + "eval_loss": 1.64951491355896, + "eval_runtime": 169.9335, + "eval_samples_per_second": 373.534, + "eval_steps_per_second": 1.459, + "step": 562000 + }, + { + "epoch": 2.289871869427196, + "grad_norm": 2.877821922302246, + "learning_rate": 0.0036856169434741444, + "loss": 7.7858, + "step": 562100 + }, + { + "epoch": 2.2902792474505773, + "grad_norm": 3.2900638580322266, + "learning_rate": 0.003685179882456856, + "loss": 7.8389, + "step": 562200 + }, + { + "epoch": 2.290686625473959, + "grad_norm": 2.1930456161499023, + "learning_rate": 0.0036847427747807456, + "loss": 7.774, + "step": 562300 + }, + { + "epoch": 2.2910940034973404, + "grad_norm": 1.7845197916030884, + "learning_rate": 0.0036843056204630973, + "loss": 7.7848, + "step": 562400 + }, + { + "epoch": 2.291501381520722, + "grad_norm": 3.4839022159576416, + "learning_rate": 0.0036838684195211868, + "loss": 7.8392, + "step": 562500 + }, + { + "epoch": 2.2919087595441034, + "grad_norm": 3.169654369354248, + "learning_rate": 0.0036834311719723045, + "loss": 7.8162, + "step": 562600 + }, + { + "epoch": 2.2923161375674845, + "grad_norm": 5.8147382736206055, + "learning_rate": 0.0036829938778337344, + "loss": 7.8284, + "step": 562700 + }, + { + "epoch": 2.292723515590866, + "grad_norm": 3.330246686935425, + "learning_rate": 0.003682556537122767, + "loss": 7.825, + "step": 562800 + }, + { + "epoch": 2.2931308936142476, + "grad_norm": 4.501898288726807, + "learning_rate": 0.003682119149856691, + "loss": 7.8016, + "step": 562900 + }, + { + "epoch": 2.293538271637629, + "grad_norm": 6.830868721008301, + "learning_rate": 0.0036816817160528004, + "loss": 7.8366, + "step": 563000 + }, + { + "epoch": 2.293538271637629, + "eval_MaskedAccuracy": 0.5006073604882039, + "eval_loss": 1.6413272619247437, + "eval_runtime": 165.8437, + "eval_samples_per_second": 382.746, + "eval_steps_per_second": 1.495, + "step": 563000 + }, + { + "epoch": 2.2939456496610107, + "grad_norm": 3.261218786239624, + "learning_rate": 0.0036812442357283866, + "loss": 7.8113, + "step": 563100 + }, + { + "epoch": 2.294353027684392, + "grad_norm": 5.727258682250977, + "learning_rate": 0.0036808067089007493, + "loss": 7.8122, + "step": 563200 + }, + { + "epoch": 2.2947604057077733, + "grad_norm": 3.5103304386138916, + "learning_rate": 0.0036803691355871792, + "loss": 7.8079, + "step": 563300 + }, + { + "epoch": 2.295167783731155, + "grad_norm": 6.098264217376709, + "learning_rate": 0.003679931515804987, + "loss": 7.7835, + "step": 563400 + }, + { + "epoch": 2.2955751617545364, + "grad_norm": 7.136855125427246, + "learning_rate": 0.0036794938495714642, + "loss": 7.7704, + "step": 563500 + }, + { + "epoch": 2.295982539777918, + "grad_norm": 6.015887260437012, + "learning_rate": 0.0036790561369039144, + "loss": 7.8326, + "step": 563600 + }, + { + "epoch": 2.2963899178012994, + "grad_norm": 2.1826488971710205, + "learning_rate": 0.0036786183778196473, + "loss": 7.7864, + "step": 563700 + }, + { + "epoch": 2.296797295824681, + "grad_norm": 7.495033264160156, + "learning_rate": 0.00367818057233597, + "loss": 7.8182, + "step": 563800 + }, + { + "epoch": 2.2972046738480625, + "grad_norm": 7.047767162322998, + "learning_rate": 0.003677742720470191, + "loss": 7.8145, + "step": 563900 + }, + { + "epoch": 2.2976120518714436, + "grad_norm": 6.113947868347168, + "learning_rate": 0.003677304822239619, + "loss": 7.8133, + "step": 564000 + }, + { + "epoch": 2.2976120518714436, + "eval_MaskedAccuracy": 0.500017308723155, + "eval_loss": 1.6545196771621704, + "eval_runtime": 161.5075, + "eval_samples_per_second": 393.022, + "eval_steps_per_second": 1.536, + "step": 564000 + }, + { + "epoch": 2.298019429894825, + "grad_norm": 6.169644355773926, + "learning_rate": 0.0036768668776615706, + "loss": 7.8446, + "step": 564100 + }, + { + "epoch": 2.2984268079182066, + "grad_norm": 2.7540698051452637, + "learning_rate": 0.0036764288867533533, + "loss": 7.8195, + "step": 564200 + }, + { + "epoch": 2.298834185941588, + "grad_norm": 2.0158894062042236, + "learning_rate": 0.0036759908495322883, + "loss": 7.7954, + "step": 564300 + }, + { + "epoch": 2.2992415639649697, + "grad_norm": 3.291651964187622, + "learning_rate": 0.003675552766015695, + "loss": 7.8378, + "step": 564400 + }, + { + "epoch": 2.299648941988351, + "grad_norm": 6.923490524291992, + "learning_rate": 0.003675114636220887, + "loss": 7.813, + "step": 564500 + }, + { + "epoch": 2.3000563200117323, + "grad_norm": 2.207007646560669, + "learning_rate": 0.0036746764601651885, + "loss": 7.7775, + "step": 564600 + }, + { + "epoch": 2.300463698035114, + "grad_norm": 6.48522424697876, + "learning_rate": 0.003674238237865923, + "loss": 7.8255, + "step": 564700 + }, + { + "epoch": 2.3008710760584954, + "grad_norm": 4.100906848907471, + "learning_rate": 0.0036737999693404197, + "loss": 7.771, + "step": 564800 + }, + { + "epoch": 2.301278454081877, + "grad_norm": 6.507815361022949, + "learning_rate": 0.0036733616546060005, + "loss": 7.8102, + "step": 564900 + }, + { + "epoch": 2.3016858321052585, + "grad_norm": 3.452202796936035, + "learning_rate": 0.003672923293679999, + "loss": 7.8209, + "step": 565000 + }, + { + "epoch": 2.3016858321052585, + "eval_MaskedAccuracy": 0.5015413176382573, + "eval_loss": 1.6353548765182495, + "eval_runtime": 199.7571, + "eval_samples_per_second": 317.766, + "eval_steps_per_second": 1.242, + "step": 565000 + }, + { + "epoch": 2.30209321012864, + "grad_norm": 4.378198623657227, + "learning_rate": 0.003672484886579738, + "loss": 7.7998, + "step": 565100 + }, + { + "epoch": 2.302500588152021, + "grad_norm": 2.597689628601074, + "learning_rate": 0.003672046433322559, + "loss": 7.8292, + "step": 565200 + }, + { + "epoch": 2.3029079661754026, + "grad_norm": 3.111417055130005, + "learning_rate": 0.0036716079339257908, + "loss": 7.8239, + "step": 565300 + }, + { + "epoch": 2.303315344198784, + "grad_norm": 4.451837062835693, + "learning_rate": 0.0036711693884067735, + "loss": 7.8201, + "step": 565400 + }, + { + "epoch": 2.3037227222221657, + "grad_norm": 10.4205904006958, + "learning_rate": 0.0036707307967828433, + "loss": 7.8191, + "step": 565500 + }, + { + "epoch": 2.304130100245547, + "grad_norm": 1.5484758615493774, + "learning_rate": 0.0036702921590713367, + "loss": 7.8065, + "step": 565600 + }, + { + "epoch": 2.3045374782689287, + "grad_norm": 4.571197986602783, + "learning_rate": 0.003669853475289599, + "loss": 7.8109, + "step": 565700 + }, + { + "epoch": 2.30494485629231, + "grad_norm": 6.3760247230529785, + "learning_rate": 0.003669414745454978, + "loss": 7.7971, + "step": 565800 + }, + { + "epoch": 2.3053522343156914, + "grad_norm": 4.224003791809082, + "learning_rate": 0.0036689759695848105, + "loss": 7.8078, + "step": 565900 + }, + { + "epoch": 2.305759612339073, + "grad_norm": 7.778561115264893, + "learning_rate": 0.003668537147696447, + "loss": 7.8465, + "step": 566000 + }, + { + "epoch": 2.305759612339073, + "eval_MaskedAccuracy": 0.5002568947516487, + "eval_loss": 1.639491319656372, + "eval_runtime": 156.5749, + "eval_samples_per_second": 405.403, + "eval_steps_per_second": 1.584, + "step": 566000 + }, + { + "epoch": 2.3061669903624544, + "grad_norm": 4.958521842956543, + "learning_rate": 0.00366809827980724, + "loss": 7.8103, + "step": 566100 + }, + { + "epoch": 2.306574368385836, + "grad_norm": 10.79844856262207, + "learning_rate": 0.0036676593659345346, + "loss": 7.818, + "step": 566200 + }, + { + "epoch": 2.3069817464092175, + "grad_norm": 2.0336101055145264, + "learning_rate": 0.003667220406095688, + "loss": 7.819, + "step": 566300 + }, + { + "epoch": 2.307389124432599, + "grad_norm": 1.9391813278198242, + "learning_rate": 0.0036667814003080537, + "loss": 7.7723, + "step": 566400 + }, + { + "epoch": 2.30779650245598, + "grad_norm": 5.151214122772217, + "learning_rate": 0.0036663423485889897, + "loss": 7.8015, + "step": 566500 + }, + { + "epoch": 2.3082038804793616, + "grad_norm": 7.156179904937744, + "learning_rate": 0.003665903250955848, + "loss": 7.792, + "step": 566600 + }, + { + "epoch": 2.308611258502743, + "grad_norm": 7.693875789642334, + "learning_rate": 0.003665464107425993, + "loss": 7.8119, + "step": 566700 + }, + { + "epoch": 2.3090186365261247, + "grad_norm": 3.309619665145874, + "learning_rate": 0.0036650249180167835, + "loss": 7.8418, + "step": 566800 + }, + { + "epoch": 2.3094260145495062, + "grad_norm": 2.2675909996032715, + "learning_rate": 0.003664585682745583, + "loss": 7.8168, + "step": 566900 + }, + { + "epoch": 2.3098333925728873, + "grad_norm": 9.942997932434082, + "learning_rate": 0.0036641464016297586, + "loss": 7.8399, + "step": 567000 + }, + { + "epoch": 2.3098333925728873, + "eval_MaskedAccuracy": 0.5007904791842843, + "eval_loss": 1.6460387706756592, + "eval_runtime": 163.1776, + "eval_samples_per_second": 388.999, + "eval_steps_per_second": 1.52, + "step": 567000 + }, + { + "epoch": 2.310240770596269, + "grad_norm": 2.691694974899292, + "learning_rate": 0.0036637070746866813, + "loss": 7.8004, + "step": 567100 + }, + { + "epoch": 2.3106481486196504, + "grad_norm": 7.271874904632568, + "learning_rate": 0.003663267701933716, + "loss": 7.8369, + "step": 567200 + }, + { + "epoch": 2.311055526643032, + "grad_norm": 6.275662422180176, + "learning_rate": 0.003662828283388233, + "loss": 7.8187, + "step": 567300 + }, + { + "epoch": 2.3114629046664135, + "grad_norm": 5.2271881103515625, + "learning_rate": 0.0036623888190675984, + "loss": 7.8217, + "step": 567400 + }, + { + "epoch": 2.311870282689795, + "grad_norm": 8.13151741027832, + "learning_rate": 0.0036619493089891995, + "loss": 7.7958, + "step": 567500 + }, + { + "epoch": 2.3122776607131765, + "grad_norm": 6.556377410888672, + "learning_rate": 0.003661509753170407, + "loss": 7.8084, + "step": 567600 + }, + { + "epoch": 2.3126850387365576, + "grad_norm": 11.249375343322754, + "learning_rate": 0.0036610701516285983, + "loss": 7.8146, + "step": 567700 + }, + { + "epoch": 2.313092416759939, + "grad_norm": 4.4853129386901855, + "learning_rate": 0.0036606305043811495, + "loss": 7.8035, + "step": 567800 + }, + { + "epoch": 2.3134997947833207, + "grad_norm": 7.194879531860352, + "learning_rate": 0.0036601908114454506, + "loss": 7.8378, + "step": 567900 + }, + { + "epoch": 2.313907172806702, + "grad_norm": 4.056886196136475, + "learning_rate": 0.0036597510728388766, + "loss": 7.7952, + "step": 568000 + }, + { + "epoch": 2.313907172806702, + "eval_MaskedAccuracy": 0.5013411376071463, + "eval_loss": 1.6393439769744873, + "eval_runtime": 162.3513, + "eval_samples_per_second": 390.979, + "eval_steps_per_second": 1.528, + "step": 568000 + }, + { + "epoch": 2.3143145508300837, + "grad_norm": 1.8405582904815674, + "learning_rate": 0.003659311288578821, + "loss": 7.8059, + "step": 568100 + }, + { + "epoch": 2.3147219288534653, + "grad_norm": 7.408936977386475, + "learning_rate": 0.0036588714586826636, + "loss": 7.811, + "step": 568200 + }, + { + "epoch": 2.3151293068768464, + "grad_norm": 3.2693755626678467, + "learning_rate": 0.0036584315831677977, + "loss": 7.8182, + "step": 568300 + }, + { + "epoch": 2.315536684900228, + "grad_norm": 2.8912346363067627, + "learning_rate": 0.0036579916620516116, + "loss": 7.754, + "step": 568400 + }, + { + "epoch": 2.3159440629236094, + "grad_norm": 2.818791151046753, + "learning_rate": 0.003657551695351498, + "loss": 7.7531, + "step": 568500 + }, + { + "epoch": 2.316351440946991, + "grad_norm": 5.777654647827148, + "learning_rate": 0.0036571116830848537, + "loss": 7.8041, + "step": 568600 + }, + { + "epoch": 2.3167588189703725, + "grad_norm": 2.4717233180999756, + "learning_rate": 0.0036566716252690694, + "loss": 7.7759, + "step": 568700 + }, + { + "epoch": 2.317166196993754, + "grad_norm": 4.8509697914123535, + "learning_rate": 0.0036562315219215522, + "loss": 7.797, + "step": 568800 + }, + { + "epoch": 2.3175735750171356, + "grad_norm": 2.7276742458343506, + "learning_rate": 0.0036557913730596923, + "loss": 7.7912, + "step": 568900 + }, + { + "epoch": 2.3179809530405167, + "grad_norm": 7.866903781890869, + "learning_rate": 0.0036553511787008955, + "loss": 7.7853, + "step": 569000 + }, + { + "epoch": 2.3179809530405167, + "eval_MaskedAccuracy": 0.5008868233028845, + "eval_loss": 1.6494930982589722, + "eval_runtime": 154.7865, + "eval_samples_per_second": 410.087, + "eval_steps_per_second": 1.602, + "step": 569000 + }, + { + "epoch": 2.318388331063898, + "grad_norm": 2.6144144535064697, + "learning_rate": 0.003654910938862568, + "loss": 7.775, + "step": 569100 + }, + { + "epoch": 2.3187957090872797, + "grad_norm": 3.0788028240203857, + "learning_rate": 0.0036544706535621085, + "loss": 7.7952, + "step": 569200 + }, + { + "epoch": 2.3192030871106613, + "grad_norm": 6.774467945098877, + "learning_rate": 0.003654030322816921, + "loss": 7.7876, + "step": 569300 + }, + { + "epoch": 2.319610465134043, + "grad_norm": 7.350885391235352, + "learning_rate": 0.0036535899466444213, + "loss": 7.8123, + "step": 569400 + }, + { + "epoch": 2.320017843157424, + "grad_norm": 2.695387601852417, + "learning_rate": 0.0036531495250620187, + "loss": 7.8159, + "step": 569500 + }, + { + "epoch": 2.3204252211808054, + "grad_norm": 2.6932992935180664, + "learning_rate": 0.0036527090580871263, + "loss": 7.8055, + "step": 569600 + }, + { + "epoch": 2.320832599204187, + "grad_norm": 2.527388572692871, + "learning_rate": 0.0036522685457371557, + "loss": 7.7804, + "step": 569700 + }, + { + "epoch": 2.3212399772275685, + "grad_norm": 4.48877477645874, + "learning_rate": 0.0036518279880295218, + "loss": 7.8505, + "step": 569800 + }, + { + "epoch": 2.32164735525095, + "grad_norm": 5.784459590911865, + "learning_rate": 0.0036513873849816428, + "loss": 7.8049, + "step": 569900 + }, + { + "epoch": 2.3220547332743315, + "grad_norm": 7.657225608825684, + "learning_rate": 0.0036509467366109393, + "loss": 7.8017, + "step": 570000 + }, + { + "epoch": 2.3220547332743315, + "eval_MaskedAccuracy": 0.5004812969165234, + "eval_loss": 1.6518429517745972, + "eval_runtime": 183.4467, + "eval_samples_per_second": 346.019, + "eval_steps_per_second": 1.352, + "step": 570000 + }, + { + "epoch": 2.322462111297713, + "grad_norm": 5.676888465881348, + "learning_rate": 0.0036505060429348345, + "loss": 7.787, + "step": 570100 + }, + { + "epoch": 2.322869489321094, + "grad_norm": 3.2929296493530273, + "learning_rate": 0.0036500653039707467, + "loss": 7.791, + "step": 570200 + }, + { + "epoch": 2.3232768673444757, + "grad_norm": 3.6335253715515137, + "learning_rate": 0.003649624519736106, + "loss": 7.7994, + "step": 570300 + }, + { + "epoch": 2.3236842453678572, + "grad_norm": 4.016894340515137, + "learning_rate": 0.0036491836902483366, + "loss": 7.8071, + "step": 570400 + }, + { + "epoch": 2.3240916233912388, + "grad_norm": 3.255833864212036, + "learning_rate": 0.003648742815524863, + "loss": 7.8296, + "step": 570500 + }, + { + "epoch": 2.3244990014146203, + "grad_norm": 3.7515814304351807, + "learning_rate": 0.0036483018955831216, + "loss": 7.7897, + "step": 570600 + }, + { + "epoch": 2.324906379438002, + "grad_norm": 3.1917057037353516, + "learning_rate": 0.00364786093044054, + "loss": 7.7937, + "step": 570700 + }, + { + "epoch": 2.325313757461383, + "grad_norm": 4.040051460266113, + "learning_rate": 0.0036474199201145564, + "loss": 7.7956, + "step": 570800 + }, + { + "epoch": 2.3257211354847644, + "grad_norm": 8.876458168029785, + "learning_rate": 0.003646978864622602, + "loss": 7.8038, + "step": 570900 + }, + { + "epoch": 2.326128513508146, + "grad_norm": 6.235292911529541, + "learning_rate": 0.0036465377639821137, + "loss": 7.8209, + "step": 571000 + }, + { + "epoch": 2.326128513508146, + "eval_MaskedAccuracy": 0.5000286903679956, + "eval_loss": 1.6561836004257202, + "eval_runtime": 207.5661, + "eval_samples_per_second": 305.811, + "eval_steps_per_second": 1.195, + "step": 571000 + }, + { + "epoch": 2.3265358915315275, + "grad_norm": 4.052027225494385, + "learning_rate": 0.0036460966182105324, + "loss": 7.8094, + "step": 571100 + }, + { + "epoch": 2.326943269554909, + "grad_norm": 2.9445784091949463, + "learning_rate": 0.0036456554273253004, + "loss": 7.7966, + "step": 571200 + }, + { + "epoch": 2.3273506475782906, + "grad_norm": 5.168513774871826, + "learning_rate": 0.00364521419134386, + "loss": 7.8173, + "step": 571300 + }, + { + "epoch": 2.327758025601672, + "grad_norm": 4.562358379364014, + "learning_rate": 0.0036447729102836533, + "loss": 7.7718, + "step": 571400 + }, + { + "epoch": 2.328165403625053, + "grad_norm": 5.8594651222229, + "learning_rate": 0.0036443315841621256, + "loss": 7.8094, + "step": 571500 + }, + { + "epoch": 2.3285727816484347, + "grad_norm": 5.5165019035339355, + "learning_rate": 0.0036438902129967304, + "loss": 7.7973, + "step": 571600 + }, + { + "epoch": 2.3289801596718163, + "grad_norm": 3.9439899921417236, + "learning_rate": 0.003643448796804908, + "loss": 7.7941, + "step": 571700 + }, + { + "epoch": 2.329387537695198, + "grad_norm": 3.9785115718841553, + "learning_rate": 0.003643007335604119, + "loss": 7.7978, + "step": 571800 + }, + { + "epoch": 2.3297949157185793, + "grad_norm": 9.531577110290527, + "learning_rate": 0.003642565829411813, + "loss": 7.7614, + "step": 571900 + }, + { + "epoch": 2.3302022937419604, + "grad_norm": 1.9221528768539429, + "learning_rate": 0.003642124278245446, + "loss": 7.8186, + "step": 572000 + }, + { + "epoch": 2.3302022937419604, + "eval_MaskedAccuracy": 0.5009346109963853, + "eval_loss": 1.6393803358078003, + "eval_runtime": 168.0914, + "eval_samples_per_second": 377.628, + "eval_steps_per_second": 1.475, + "step": 572000 + }, + { + "epoch": 2.330609671765342, + "grad_norm": 5.259122848510742, + "learning_rate": 0.003641682682122474, + "loss": 7.7697, + "step": 572100 + }, + { + "epoch": 2.3310170497887235, + "grad_norm": 9.222817420959473, + "learning_rate": 0.003641241041060359, + "loss": 7.8259, + "step": 572200 + }, + { + "epoch": 2.331424427812105, + "grad_norm": 2.1270833015441895, + "learning_rate": 0.003640799355076559, + "loss": 7.7948, + "step": 572300 + }, + { + "epoch": 2.3318318058354865, + "grad_norm": 5.878179550170898, + "learning_rate": 0.0036403576241885336, + "loss": 7.82, + "step": 572400 + }, + { + "epoch": 2.332239183858868, + "grad_norm": 5.052548408508301, + "learning_rate": 0.003639915848413748, + "loss": 7.7924, + "step": 572500 + }, + { + "epoch": 2.3326465618822496, + "grad_norm": 5.933427333831787, + "learning_rate": 0.003639474027769669, + "loss": 7.7903, + "step": 572600 + }, + { + "epoch": 2.3330539399056307, + "grad_norm": 2.792175054550171, + "learning_rate": 0.0036390321622737616, + "loss": 7.803, + "step": 572700 + }, + { + "epoch": 2.3334613179290122, + "grad_norm": 3.595899820327759, + "learning_rate": 0.003638590251943498, + "loss": 7.7795, + "step": 572800 + }, + { + "epoch": 2.3338686959523938, + "grad_norm": 3.459865093231201, + "learning_rate": 0.0036381482967963517, + "loss": 7.8132, + "step": 572900 + }, + { + "epoch": 2.3342760739757753, + "grad_norm": 3.5949485301971436, + "learning_rate": 0.0036377062968497883, + "loss": 7.8101, + "step": 573000 + }, + { + "epoch": 2.3342760739757753, + "eval_MaskedAccuracy": 0.5011215973574518, + "eval_loss": 1.6427503824234009, + "eval_runtime": 152.7683, + "eval_samples_per_second": 415.505, + "eval_steps_per_second": 1.623, + "step": 573000 + }, + { + "epoch": 2.334683451999157, + "grad_norm": 3.4251813888549805, + "learning_rate": 0.003637264252121289, + "loss": 7.7793, + "step": 573100 + }, + { + "epoch": 2.3350908300225384, + "grad_norm": 3.6462955474853516, + "learning_rate": 0.003636822162628328, + "loss": 7.8069, + "step": 573200 + }, + { + "epoch": 2.3354982080459195, + "grad_norm": 2.881012201309204, + "learning_rate": 0.0036363800283883843, + "loss": 7.7885, + "step": 573300 + }, + { + "epoch": 2.335905586069301, + "grad_norm": 9.830028533935547, + "learning_rate": 0.003635937849418939, + "loss": 7.8005, + "step": 573400 + }, + { + "epoch": 2.3363129640926825, + "grad_norm": 5.260698318481445, + "learning_rate": 0.003635495625737469, + "loss": 7.7855, + "step": 573500 + }, + { + "epoch": 2.336720342116064, + "grad_norm": 3.468350410461426, + "learning_rate": 0.0036350533573614617, + "loss": 7.824, + "step": 573600 + }, + { + "epoch": 2.3371277201394456, + "grad_norm": 4.699432849884033, + "learning_rate": 0.0036346110443083992, + "loss": 7.7941, + "step": 573700 + }, + { + "epoch": 2.337535098162827, + "grad_norm": 2.475425958633423, + "learning_rate": 0.0036341686865957756, + "loss": 7.7852, + "step": 573800 + }, + { + "epoch": 2.3379424761862087, + "grad_norm": 3.1460745334625244, + "learning_rate": 0.003633726284241068, + "loss": 7.7886, + "step": 573900 + }, + { + "epoch": 2.3383498542095897, + "grad_norm": 2.798072338104248, + "learning_rate": 0.0036332838372617774, + "loss": 7.7765, + "step": 574000 + }, + { + "epoch": 2.3383498542095897, + "eval_MaskedAccuracy": 0.5010669093212721, + "eval_loss": 1.6389412879943848, + "eval_runtime": 152.9499, + "eval_samples_per_second": 415.012, + "eval_steps_per_second": 1.621, + "step": 574000 + }, + { + "epoch": 2.3387572322329713, + "grad_norm": 3.818546772003174, + "learning_rate": 0.00363284134567539, + "loss": 7.7845, + "step": 574100 + }, + { + "epoch": 2.339164610256353, + "grad_norm": 3.3038523197174072, + "learning_rate": 0.003632398809499405, + "loss": 7.8184, + "step": 574200 + }, + { + "epoch": 2.3395719882797343, + "grad_norm": 4.6410908699035645, + "learning_rate": 0.0036319562287513156, + "loss": 7.788, + "step": 574300 + }, + { + "epoch": 2.339979366303116, + "grad_norm": 1.9949098825454712, + "learning_rate": 0.003631513603448619, + "loss": 7.7812, + "step": 574400 + }, + { + "epoch": 2.340386744326497, + "grad_norm": 2.74983549118042, + "learning_rate": 0.0036310709336088132, + "loss": 7.787, + "step": 574500 + }, + { + "epoch": 2.3407941223498785, + "grad_norm": 5.470245361328125, + "learning_rate": 0.003630628219249402, + "loss": 7.8137, + "step": 574600 + }, + { + "epoch": 2.34120150037326, + "grad_norm": 2.465470314025879, + "learning_rate": 0.0036301854603878864, + "loss": 7.8113, + "step": 574700 + }, + { + "epoch": 2.3416088783966416, + "grad_norm": 4.013788223266602, + "learning_rate": 0.003629742657041771, + "loss": 7.8333, + "step": 574800 + }, + { + "epoch": 2.342016256420023, + "grad_norm": 3.817545175552368, + "learning_rate": 0.0036292998092285643, + "loss": 7.8007, + "step": 574900 + }, + { + "epoch": 2.3424236344434046, + "grad_norm": 4.242929458618164, + "learning_rate": 0.003628856916965773, + "loss": 7.7877, + "step": 575000 + }, + { + "epoch": 2.3424236344434046, + "eval_MaskedAccuracy": 0.5012850973635277, + "eval_loss": 1.6421722173690796, + "eval_runtime": 158.7684, + "eval_samples_per_second": 399.802, + "eval_steps_per_second": 1.562, + "step": 575000 + }, + { + "epoch": 2.342831012466786, + "grad_norm": 3.80789852142334, + "learning_rate": 0.0036284139802709053, + "loss": 7.7879, + "step": 575100 + }, + { + "epoch": 2.3432383904901672, + "grad_norm": 4.318171977996826, + "learning_rate": 0.0036279709991614735, + "loss": 7.8149, + "step": 575200 + }, + { + "epoch": 2.3436457685135488, + "grad_norm": 2.998563766479492, + "learning_rate": 0.003627527973654996, + "loss": 7.7942, + "step": 575300 + }, + { + "epoch": 2.3440531465369303, + "grad_norm": 7.166109561920166, + "learning_rate": 0.00362708490376898, + "loss": 7.7971, + "step": 575400 + }, + { + "epoch": 2.344460524560312, + "grad_norm": 3.0089938640594482, + "learning_rate": 0.0036266417895209463, + "loss": 7.775, + "step": 575500 + }, + { + "epoch": 2.3448679025836934, + "grad_norm": 4.467271327972412, + "learning_rate": 0.0036261986309284186, + "loss": 7.801, + "step": 575600 + }, + { + "epoch": 2.345275280607075, + "grad_norm": 1.9695075750350952, + "learning_rate": 0.003625755428008911, + "loss": 7.8133, + "step": 575700 + }, + { + "epoch": 2.345682658630456, + "grad_norm": 2.8095858097076416, + "learning_rate": 0.0036253121807799456, + "loss": 7.81, + "step": 575800 + }, + { + "epoch": 2.3460900366538375, + "grad_norm": 7.263975620269775, + "learning_rate": 0.0036248688892590464, + "loss": 7.805, + "step": 575900 + }, + { + "epoch": 2.346497414677219, + "grad_norm": 2.4742026329040527, + "learning_rate": 0.003624425553463742, + "loss": 7.7794, + "step": 576000 + }, + { + "epoch": 2.346497414677219, + "eval_MaskedAccuracy": 0.5012928785636943, + "eval_loss": 1.6390610933303833, + "eval_runtime": 157.8489, + "eval_samples_per_second": 402.132, + "eval_steps_per_second": 1.571, + "step": 576000 + }, + { + "epoch": 2.3469047927006006, + "grad_norm": 5.670180320739746, + "learning_rate": 0.0036239821734115594, + "loss": 7.8105, + "step": 576100 + }, + { + "epoch": 2.347312170723982, + "grad_norm": 3.840780258178711, + "learning_rate": 0.0036235387491200246, + "loss": 7.8036, + "step": 576200 + }, + { + "epoch": 2.3477195487473637, + "grad_norm": 1.773376703262329, + "learning_rate": 0.0036230952806066757, + "loss": 7.7732, + "step": 576300 + }, + { + "epoch": 2.348126926770745, + "grad_norm": 5.712237358093262, + "learning_rate": 0.0036226517678890346, + "loss": 7.7694, + "step": 576400 + }, + { + "epoch": 2.3485343047941263, + "grad_norm": 3.7197134494781494, + "learning_rate": 0.003622208210984643, + "loss": 7.8039, + "step": 576500 + }, + { + "epoch": 2.348941682817508, + "grad_norm": 3.3471412658691406, + "learning_rate": 0.0036217646099110387, + "loss": 7.805, + "step": 576600 + }, + { + "epoch": 2.3493490608408893, + "grad_norm": 4.894758701324463, + "learning_rate": 0.0036213209646857513, + "loss": 7.8104, + "step": 576700 + }, + { + "epoch": 2.349756438864271, + "grad_norm": 3.1351892948150635, + "learning_rate": 0.00362087727532633, + "loss": 7.8164, + "step": 576800 + }, + { + "epoch": 2.3501638168876524, + "grad_norm": 1.4317957162857056, + "learning_rate": 0.0036204335418503135, + "loss": 7.8159, + "step": 576900 + }, + { + "epoch": 2.3505711949110335, + "grad_norm": 2.3741769790649414, + "learning_rate": 0.0036199897642752413, + "loss": 7.8004, + "step": 577000 + }, + { + "epoch": 2.3505711949110335, + "eval_MaskedAccuracy": 0.501224630950296, + "eval_loss": 1.6415728330612183, + "eval_runtime": 171.3502, + "eval_samples_per_second": 370.446, + "eval_steps_per_second": 1.447, + "step": 577000 + }, + { + "epoch": 2.350978572934415, + "grad_norm": 6.531877040863037, + "learning_rate": 0.0036195459426186607, + "loss": 7.7916, + "step": 577100 + }, + { + "epoch": 2.3513859509577966, + "grad_norm": 6.963188648223877, + "learning_rate": 0.0036191020768981176, + "loss": 7.7925, + "step": 577200 + }, + { + "epoch": 2.351793328981178, + "grad_norm": 3.9958503246307373, + "learning_rate": 0.003618658167131158, + "loss": 7.7877, + "step": 577300 + }, + { + "epoch": 2.3522007070045596, + "grad_norm": 3.9696502685546875, + "learning_rate": 0.0036182142133353353, + "loss": 7.806, + "step": 577400 + }, + { + "epoch": 2.352608085027941, + "grad_norm": 6.785215377807617, + "learning_rate": 0.003617770215528201, + "loss": 7.8257, + "step": 577500 + }, + { + "epoch": 2.3530154630513227, + "grad_norm": 3.5732803344726562, + "learning_rate": 0.0036173261737273087, + "loss": 7.7939, + "step": 577600 + }, + { + "epoch": 2.353422841074704, + "grad_norm": 4.2556962966918945, + "learning_rate": 0.003616882087950216, + "loss": 7.7948, + "step": 577700 + }, + { + "epoch": 2.3538302190980853, + "grad_norm": 3.27388334274292, + "learning_rate": 0.0036164379582144758, + "loss": 7.8147, + "step": 577800 + }, + { + "epoch": 2.354237597121467, + "grad_norm": 3.56278395652771, + "learning_rate": 0.003615993784537652, + "loss": 7.8115, + "step": 577900 + }, + { + "epoch": 2.3546449751448484, + "grad_norm": 2.025696039199829, + "learning_rate": 0.0036155495669372983, + "loss": 7.7848, + "step": 578000 + }, + { + "epoch": 2.3546449751448484, + "eval_MaskedAccuracy": 0.5011612738226733, + "eval_loss": 1.6435589790344238, + "eval_runtime": 164.2816, + "eval_samples_per_second": 386.385, + "eval_steps_per_second": 1.51, + "step": 578000 + }, + { + "epoch": 2.35505235316823, + "grad_norm": 2.4971704483032227, + "learning_rate": 0.003615105305430979, + "loss": 7.7912, + "step": 578100 + }, + { + "epoch": 2.3554597311916114, + "grad_norm": 2.6171789169311523, + "learning_rate": 0.0036146610000362617, + "loss": 7.7789, + "step": 578200 + }, + { + "epoch": 2.3558671092149925, + "grad_norm": 2.426166534423828, + "learning_rate": 0.003614216650770709, + "loss": 7.7846, + "step": 578300 + }, + { + "epoch": 2.356274487238374, + "grad_norm": 3.547307014465332, + "learning_rate": 0.0036137722576518923, + "loss": 7.8254, + "step": 578400 + }, + { + "epoch": 2.3566818652617556, + "grad_norm": 2.7063729763031006, + "learning_rate": 0.003613327820697375, + "loss": 7.7681, + "step": 578500 + }, + { + "epoch": 2.357089243285137, + "grad_norm": 3.314094066619873, + "learning_rate": 0.0036128833399247324, + "loss": 7.8256, + "step": 578600 + }, + { + "epoch": 2.3574966213085187, + "grad_norm": 2.50994873046875, + "learning_rate": 0.0036124388153515354, + "loss": 7.8004, + "step": 578700 + }, + { + "epoch": 2.3579039993319, + "grad_norm": 2.1616404056549072, + "learning_rate": 0.003611994246995362, + "loss": 7.7966, + "step": 578800 + }, + { + "epoch": 2.3583113773552817, + "grad_norm": 5.168814659118652, + "learning_rate": 0.0036115496348737815, + "loss": 7.7673, + "step": 578900 + }, + { + "epoch": 2.358718755378663, + "grad_norm": 4.807029724121094, + "learning_rate": 0.0036111049790043797, + "loss": 7.787, + "step": 579000 + }, + { + "epoch": 2.358718755378663, + "eval_MaskedAccuracy": 0.5011903029437689, + "eval_loss": 1.6426823139190674, + "eval_runtime": 156.4449, + "eval_samples_per_second": 405.74, + "eval_steps_per_second": 1.585, + "step": 579000 + }, + { + "epoch": 2.3591261334020444, + "grad_norm": 4.3301215171813965, + "learning_rate": 0.0036106602794047284, + "loss": 7.817, + "step": 579100 + }, + { + "epoch": 2.359533511425426, + "grad_norm": 3.070080518722534, + "learning_rate": 0.0036102155360924165, + "loss": 7.7913, + "step": 579200 + }, + { + "epoch": 2.3599408894488074, + "grad_norm": 3.654529094696045, + "learning_rate": 0.003609770749085021, + "loss": 7.7843, + "step": 579300 + }, + { + "epoch": 2.360348267472189, + "grad_norm": 2.2287397384643555, + "learning_rate": 0.0036093259184001272, + "loss": 7.7742, + "step": 579400 + }, + { + "epoch": 2.36075564549557, + "grad_norm": 7.408300876617432, + "learning_rate": 0.003608881044055323, + "loss": 7.7914, + "step": 579500 + }, + { + "epoch": 2.3611630235189516, + "grad_norm": 3.6855971813201904, + "learning_rate": 0.003608436126068201, + "loss": 7.7836, + "step": 579600 + }, + { + "epoch": 2.361570401542333, + "grad_norm": 2.677215337753296, + "learning_rate": 0.0036079911644563457, + "loss": 7.803, + "step": 579700 + }, + { + "epoch": 2.3619777795657146, + "grad_norm": 2.59047794342041, + "learning_rate": 0.003607546159237352, + "loss": 7.7641, + "step": 579800 + }, + { + "epoch": 2.362385157589096, + "grad_norm": 3.900541305541992, + "learning_rate": 0.00360710111042881, + "loss": 7.7961, + "step": 579900 + }, + { + "epoch": 2.3627925356124777, + "grad_norm": 3.410102367401123, + "learning_rate": 0.003606656018048321, + "loss": 7.8124, + "step": 580000 + }, + { + "epoch": 2.3627925356124777, + "eval_MaskedAccuracy": 0.5012227258391272, + "eval_loss": 1.6263668537139893, + "eval_runtime": 197.6188, + "eval_samples_per_second": 321.204, + "eval_steps_per_second": 1.255, + "step": 580000 + }, + { + "epoch": 2.3631999136358592, + "grad_norm": 4.543544292449951, + "learning_rate": 0.003606210882113474, + "loss": 7.8132, + "step": 580100 + }, + { + "epoch": 2.3636072916592403, + "grad_norm": 5.982573986053467, + "learning_rate": 0.0036057657026418734, + "loss": 7.7747, + "step": 580200 + }, + { + "epoch": 2.364014669682622, + "grad_norm": 5.714713096618652, + "learning_rate": 0.003605320479651115, + "loss": 7.8041, + "step": 580300 + }, + { + "epoch": 2.3644220477060034, + "grad_norm": 3.8331897258758545, + "learning_rate": 0.0036048752131588022, + "loss": 7.7848, + "step": 580400 + }, + { + "epoch": 2.364829425729385, + "grad_norm": 2.8905255794525146, + "learning_rate": 0.003604429903182543, + "loss": 7.8034, + "step": 580500 + }, + { + "epoch": 2.3652368037527665, + "grad_norm": 7.409701347351074, + "learning_rate": 0.0036039845497399365, + "loss": 7.8113, + "step": 580600 + }, + { + "epoch": 2.365644181776148, + "grad_norm": 2.486720323562622, + "learning_rate": 0.0036035391528485955, + "loss": 7.7664, + "step": 580700 + }, + { + "epoch": 2.366051559799529, + "grad_norm": 4.313427925109863, + "learning_rate": 0.0036030937125261235, + "loss": 7.8212, + "step": 580800 + }, + { + "epoch": 2.3664589378229106, + "grad_norm": 3.203368902206421, + "learning_rate": 0.003602648228790135, + "loss": 7.7793, + "step": 580900 + }, + { + "epoch": 2.366866315846292, + "grad_norm": 5.898493766784668, + "learning_rate": 0.0036022027016582407, + "loss": 7.7956, + "step": 581000 + }, + { + "epoch": 2.366866315846292, + "eval_MaskedAccuracy": 0.5013717417484989, + "eval_loss": 1.6449079513549805, + "eval_runtime": 163.9042, + "eval_samples_per_second": 387.275, + "eval_steps_per_second": 1.513, + "step": 581000 + }, + { + "epoch": 2.3672736938696737, + "grad_norm": 3.306659460067749, + "learning_rate": 0.003601757131148055, + "loss": 7.8113, + "step": 581100 + }, + { + "epoch": 2.367681071893055, + "grad_norm": 7.1116814613342285, + "learning_rate": 0.0036013115172771936, + "loss": 7.7716, + "step": 581200 + }, + { + "epoch": 2.3680884499164367, + "grad_norm": 7.003377437591553, + "learning_rate": 0.0036008658600632784, + "loss": 7.7785, + "step": 581300 + }, + { + "epoch": 2.3684958279398183, + "grad_norm": 1.9321496486663818, + "learning_rate": 0.0036004201595239227, + "loss": 7.794, + "step": 581400 + }, + { + "epoch": 2.3689032059631994, + "grad_norm": 2.224198818206787, + "learning_rate": 0.003599974415676748, + "loss": 7.8046, + "step": 581500 + }, + { + "epoch": 2.369310583986581, + "grad_norm": 4.905538082122803, + "learning_rate": 0.0035995286285393736, + "loss": 7.7696, + "step": 581600 + }, + { + "epoch": 2.3697179620099624, + "grad_norm": 2.958261013031006, + "learning_rate": 0.003599082798129433, + "loss": 7.7959, + "step": 581700 + }, + { + "epoch": 2.370125340033344, + "grad_norm": 4.866700172424316, + "learning_rate": 0.0035986369244645497, + "loss": 7.8012, + "step": 581800 + }, + { + "epoch": 2.3705327180567255, + "grad_norm": 6.702052593231201, + "learning_rate": 0.003598191007562351, + "loss": 7.7836, + "step": 581900 + }, + { + "epoch": 2.3709400960801066, + "grad_norm": 3.02662992477417, + "learning_rate": 0.003597745047440462, + "loss": 7.8078, + "step": 582000 + }, + { + "epoch": 2.3709400960801066, + "eval_MaskedAccuracy": 0.5012330720607717, + "eval_loss": 1.6523900032043457, + "eval_runtime": 156.2154, + "eval_samples_per_second": 406.336, + "eval_steps_per_second": 1.588, + "step": 582000 + }, + { + "epoch": 2.371347474103488, + "grad_norm": 3.6374354362487793, + "learning_rate": 0.003597299044116517, + "loss": 7.7908, + "step": 582100 + }, + { + "epoch": 2.3717548521268697, + "grad_norm": 4.827856540679932, + "learning_rate": 0.0035968529976081517, + "loss": 7.8104, + "step": 582200 + }, + { + "epoch": 2.372162230150251, + "grad_norm": 6.088109493255615, + "learning_rate": 0.0035964069079329943, + "loss": 7.7937, + "step": 582300 + }, + { + "epoch": 2.3725696081736327, + "grad_norm": 2.390738010406494, + "learning_rate": 0.0035959607751086843, + "loss": 7.7718, + "step": 582400 + }, + { + "epoch": 2.3729769861970142, + "grad_norm": 1.5880234241485596, + "learning_rate": 0.0035955145991528616, + "loss": 7.8001, + "step": 582500 + }, + { + "epoch": 2.373384364220396, + "grad_norm": 5.216249465942383, + "learning_rate": 0.003595068380083166, + "loss": 7.7796, + "step": 582600 + }, + { + "epoch": 2.373791742243777, + "grad_norm": 3.8343546390533447, + "learning_rate": 0.0035946221179172363, + "loss": 7.8048, + "step": 582700 + }, + { + "epoch": 2.3741991202671584, + "grad_norm": 2.743173360824585, + "learning_rate": 0.0035941758126727167, + "loss": 7.7865, + "step": 582800 + }, + { + "epoch": 2.37460649829054, + "grad_norm": 5.641646862030029, + "learning_rate": 0.003593729464367252, + "loss": 7.7988, + "step": 582900 + }, + { + "epoch": 2.3750138763139215, + "grad_norm": 4.868067741394043, + "learning_rate": 0.0035932830730184867, + "loss": 7.8011, + "step": 583000 + }, + { + "epoch": 2.3750138763139215, + "eval_MaskedAccuracy": 0.5022030759828252, + "eval_loss": 1.63442063331604, + "eval_runtime": 156.3059, + "eval_samples_per_second": 406.101, + "eval_steps_per_second": 1.587, + "step": 583000 + }, + { + "epoch": 2.375421254337303, + "grad_norm": 6.6119608879089355, + "learning_rate": 0.003592836638644071, + "loss": 7.8056, + "step": 583100 + }, + { + "epoch": 2.3758286323606845, + "grad_norm": 2.2543911933898926, + "learning_rate": 0.0035923901612616538, + "loss": 7.7988, + "step": 583200 + }, + { + "epoch": 2.3762360103840656, + "grad_norm": 2.987464427947998, + "learning_rate": 0.0035919436408888886, + "loss": 7.8328, + "step": 583300 + }, + { + "epoch": 2.376643388407447, + "grad_norm": 6.303028106689453, + "learning_rate": 0.0035914970775434317, + "loss": 7.8182, + "step": 583400 + }, + { + "epoch": 2.3770507664308287, + "grad_norm": 3.9799399375915527, + "learning_rate": 0.0035910504712429324, + "loss": 7.7881, + "step": 583500 + }, + { + "epoch": 2.37745814445421, + "grad_norm": 2.1123669147491455, + "learning_rate": 0.0035906038220050503, + "loss": 7.7745, + "step": 583600 + }, + { + "epoch": 2.3778655224775918, + "grad_norm": 4.1016459465026855, + "learning_rate": 0.0035901571298474406, + "loss": 7.796, + "step": 583700 + }, + { + "epoch": 2.3782729005009733, + "grad_norm": 5.921614646911621, + "learning_rate": 0.0035897103947877674, + "loss": 7.8014, + "step": 583800 + }, + { + "epoch": 2.378680278524355, + "grad_norm": 6.695907115936279, + "learning_rate": 0.00358926361684369, + "loss": 7.7801, + "step": 583900 + }, + { + "epoch": 2.379087656547736, + "grad_norm": 4.198784351348877, + "learning_rate": 0.003588816796032871, + "loss": 7.778, + "step": 584000 + }, + { + "epoch": 2.379087656547736, + "eval_MaskedAccuracy": 0.501532274833887, + "eval_loss": 1.641954779624939, + "eval_runtime": 156.1541, + "eval_samples_per_second": 406.496, + "eval_steps_per_second": 1.588, + "step": 584000 + }, + { + "epoch": 2.3794950345711174, + "grad_norm": 6.060896396636963, + "learning_rate": 0.0035883699323729775, + "loss": 7.7869, + "step": 584100 + }, + { + "epoch": 2.379902412594499, + "grad_norm": 5.873023986816406, + "learning_rate": 0.003587923025881678, + "loss": 7.8, + "step": 584200 + }, + { + "epoch": 2.3803097906178805, + "grad_norm": 4.626306533813477, + "learning_rate": 0.0035874760765766387, + "loss": 7.7987, + "step": 584300 + }, + { + "epoch": 2.380717168641262, + "grad_norm": 5.819188594818115, + "learning_rate": 0.003587029084475532, + "loss": 7.8126, + "step": 584400 + }, + { + "epoch": 2.381124546664643, + "grad_norm": 6.103301525115967, + "learning_rate": 0.0035865820495960287, + "loss": 7.8009, + "step": 584500 + }, + { + "epoch": 2.3815319246880247, + "grad_norm": 2.5972049236297607, + "learning_rate": 0.003586134971955799, + "loss": 7.7842, + "step": 584600 + }, + { + "epoch": 2.381939302711406, + "grad_norm": 3.71113920211792, + "learning_rate": 0.003585687851572524, + "loss": 7.8145, + "step": 584700 + }, + { + "epoch": 2.3823466807347877, + "grad_norm": 2.0780303478240967, + "learning_rate": 0.0035852406884638774, + "loss": 7.7889, + "step": 584800 + }, + { + "epoch": 2.3827540587581693, + "grad_norm": 8.88117790222168, + "learning_rate": 0.0035847934826475384, + "loss": 7.7979, + "step": 584900 + }, + { + "epoch": 2.383161436781551, + "grad_norm": 2.3754899501800537, + "learning_rate": 0.003584346234141191, + "loss": 7.7864, + "step": 585000 + }, + { + "epoch": 2.383161436781551, + "eval_MaskedAccuracy": 0.5022698361572302, + "eval_loss": 1.6313894987106323, + "eval_runtime": 175.7067, + "eval_samples_per_second": 361.261, + "eval_steps_per_second": 1.411, + "step": 585000 + }, + { + "epoch": 2.3835688148049323, + "grad_norm": 3.1766247749328613, + "learning_rate": 0.003583898942962508, + "loss": 7.829, + "step": 585100 + }, + { + "epoch": 2.3839761928283134, + "grad_norm": 2.9906697273254395, + "learning_rate": 0.0035834516091291855, + "loss": 7.8102, + "step": 585200 + }, + { + "epoch": 2.384383570851695, + "grad_norm": 4.052495956420898, + "learning_rate": 0.003583004232658902, + "loss": 7.7815, + "step": 585300 + }, + { + "epoch": 2.3847909488750765, + "grad_norm": 5.7117509841918945, + "learning_rate": 0.0035825568135693445, + "loss": 7.7914, + "step": 585400 + }, + { + "epoch": 2.385198326898458, + "grad_norm": 3.1434757709503174, + "learning_rate": 0.003582109351878203, + "loss": 7.7874, + "step": 585500 + }, + { + "epoch": 2.3856057049218395, + "grad_norm": 4.065485000610352, + "learning_rate": 0.003581661847603165, + "loss": 7.7814, + "step": 585600 + }, + { + "epoch": 2.386013082945221, + "grad_norm": 3.9136483669281006, + "learning_rate": 0.0035812143007619285, + "loss": 7.7996, + "step": 585700 + }, + { + "epoch": 2.386420460968602, + "grad_norm": 3.7829060554504395, + "learning_rate": 0.0035807667113721842, + "loss": 7.7671, + "step": 585800 + }, + { + "epoch": 2.3868278389919837, + "grad_norm": 8.514617919921875, + "learning_rate": 0.00358031907945163, + "loss": 7.779, + "step": 585900 + }, + { + "epoch": 2.3872352170153652, + "grad_norm": 9.994239807128906, + "learning_rate": 0.0035798714050179563, + "loss": 7.8028, + "step": 586000 + }, + { + "epoch": 2.3872352170153652, + "eval_MaskedAccuracy": 0.5006644734791491, + "eval_loss": 1.6454873085021973, + "eval_runtime": 157.0633, + "eval_samples_per_second": 404.143, + "eval_steps_per_second": 1.579, + "step": 586000 + }, + { + "epoch": 2.3876425950387468, + "grad_norm": 3.2327418327331543, + "learning_rate": 0.0035794236880888684, + "loss": 7.7905, + "step": 586100 + }, + { + "epoch": 2.3880499730621283, + "grad_norm": 4.871578693389893, + "learning_rate": 0.0035789759286820708, + "loss": 7.8118, + "step": 586200 + }, + { + "epoch": 2.38845735108551, + "grad_norm": 9.962044715881348, + "learning_rate": 0.0035785281268152593, + "loss": 7.7867, + "step": 586300 + }, + { + "epoch": 2.3888647291088914, + "grad_norm": 7.3953118324279785, + "learning_rate": 0.0035780802825061396, + "loss": 7.7939, + "step": 586400 + }, + { + "epoch": 2.3892721071322724, + "grad_norm": 2.738851308822632, + "learning_rate": 0.0035776323957724177, + "loss": 7.7937, + "step": 586500 + }, + { + "epoch": 2.389679485155654, + "grad_norm": 2.027208089828491, + "learning_rate": 0.0035771844666317963, + "loss": 7.815, + "step": 586600 + }, + { + "epoch": 2.3900868631790355, + "grad_norm": 4.308239459991455, + "learning_rate": 0.003576736495101991, + "loss": 7.799, + "step": 586700 + }, + { + "epoch": 2.390494241202417, + "grad_norm": 5.500651836395264, + "learning_rate": 0.0035762884812007107, + "loss": 7.792, + "step": 586800 + }, + { + "epoch": 2.3909016192257986, + "grad_norm": 4.689356803894043, + "learning_rate": 0.0035758404249456644, + "loss": 7.7815, + "step": 586900 + }, + { + "epoch": 2.3913089972491797, + "grad_norm": 9.67606258392334, + "learning_rate": 0.0035753923263545694, + "loss": 7.7685, + "step": 587000 + }, + { + "epoch": 2.3913089972491797, + "eval_MaskedAccuracy": 0.5009284097804791, + "eval_loss": 1.6510730981826782, + "eval_runtime": 155.2204, + "eval_samples_per_second": 408.941, + "eval_steps_per_second": 1.598, + "step": 587000 + }, + { + "epoch": 2.391716375272561, + "grad_norm": 6.0829243659973145, + "learning_rate": 0.003574944185445143, + "loss": 7.7713, + "step": 587100 + }, + { + "epoch": 2.3921237532959427, + "grad_norm": 7.072274684906006, + "learning_rate": 0.003574496002235102, + "loss": 7.776, + "step": 587200 + }, + { + "epoch": 2.3925311313193243, + "grad_norm": 2.3124594688415527, + "learning_rate": 0.0035740477767421605, + "loss": 7.7851, + "step": 587300 + }, + { + "epoch": 2.392938509342706, + "grad_norm": 5.689125061035156, + "learning_rate": 0.0035735995089840426, + "loss": 7.8035, + "step": 587400 + }, + { + "epoch": 2.3933458873660873, + "grad_norm": 3.5439541339874268, + "learning_rate": 0.0035731511989784727, + "loss": 7.7895, + "step": 587500 + }, + { + "epoch": 2.393753265389469, + "grad_norm": 9.96220874786377, + "learning_rate": 0.0035727028467431705, + "loss": 7.7954, + "step": 587600 + }, + { + "epoch": 2.39416064341285, + "grad_norm": 3.481572151184082, + "learning_rate": 0.0035722544522958696, + "loss": 7.731, + "step": 587700 + }, + { + "epoch": 2.3945680214362315, + "grad_norm": 5.532615661621094, + "learning_rate": 0.0035718060156542867, + "loss": 7.7983, + "step": 587800 + }, + { + "epoch": 2.394975399459613, + "grad_norm": 3.2027766704559326, + "learning_rate": 0.003571357536836159, + "loss": 7.7861, + "step": 587900 + }, + { + "epoch": 2.3953827774829946, + "grad_norm": 8.927424430847168, + "learning_rate": 0.0035709090158592156, + "loss": 7.7831, + "step": 588000 + }, + { + "epoch": 2.3953827774829946, + "eval_MaskedAccuracy": 0.5007482981894489, + "eval_loss": 1.6479322910308838, + "eval_runtime": 155.9414, + "eval_samples_per_second": 407.05, + "eval_steps_per_second": 1.59, + "step": 588000 + }, + { + "epoch": 2.395790155506376, + "grad_norm": 3.5374948978424072, + "learning_rate": 0.003570460452741187, + "loss": 7.7869, + "step": 588100 + }, + { + "epoch": 2.3961975335297576, + "grad_norm": 5.209863662719727, + "learning_rate": 0.003570011847499804, + "loss": 7.7911, + "step": 588200 + }, + { + "epoch": 2.3966049115531387, + "grad_norm": 4.747452259063721, + "learning_rate": 0.0035695632001528075, + "loss": 7.7698, + "step": 588300 + }, + { + "epoch": 2.3970122895765202, + "grad_norm": 6.2473464012146, + "learning_rate": 0.003569114510717937, + "loss": 7.7797, + "step": 588400 + }, + { + "epoch": 2.3974196675999018, + "grad_norm": 5.453723907470703, + "learning_rate": 0.003568665779212927, + "loss": 7.7949, + "step": 588500 + }, + { + "epoch": 2.3978270456232833, + "grad_norm": 2.2264554500579834, + "learning_rate": 0.0035682170056555228, + "loss": 7.785, + "step": 588600 + }, + { + "epoch": 2.398234423646665, + "grad_norm": 2.996682643890381, + "learning_rate": 0.0035677681900634613, + "loss": 7.7861, + "step": 588700 + }, + { + "epoch": 2.3986418016700464, + "grad_norm": 5.559460163116455, + "learning_rate": 0.0035673193324544912, + "loss": 7.7806, + "step": 588800 + }, + { + "epoch": 2.399049179693428, + "grad_norm": 4.804593086242676, + "learning_rate": 0.003566870432846356, + "loss": 7.7879, + "step": 588900 + }, + { + "epoch": 2.399456557716809, + "grad_norm": 4.812346935272217, + "learning_rate": 0.0035664214912568015, + "loss": 7.7717, + "step": 589000 + }, + { + "epoch": 2.399456557716809, + "eval_MaskedAccuracy": 0.5014329700923861, + "eval_loss": 1.6416527032852173, + "eval_runtime": 245.9242, + "eval_samples_per_second": 258.112, + "eval_steps_per_second": 1.008, + "step": 589000 + }, + { + "epoch": 2.3998639357401905, + "grad_norm": 10.421682357788086, + "learning_rate": 0.0035659725077035794, + "loss": 7.7638, + "step": 589100 + }, + { + "epoch": 2.400271313763572, + "grad_norm": 5.037592887878418, + "learning_rate": 0.003565523482204439, + "loss": 7.8033, + "step": 589200 + }, + { + "epoch": 2.4006786917869536, + "grad_norm": 4.288503170013428, + "learning_rate": 0.0035650744147771346, + "loss": 7.7643, + "step": 589300 + }, + { + "epoch": 2.401086069810335, + "grad_norm": 4.179448127746582, + "learning_rate": 0.00356462530543942, + "loss": 7.7669, + "step": 589400 + }, + { + "epoch": 2.401493447833716, + "grad_norm": 5.028454303741455, + "learning_rate": 0.003564176154209048, + "loss": 7.8345, + "step": 589500 + }, + { + "epoch": 2.4019008258570977, + "grad_norm": 7.250356197357178, + "learning_rate": 0.0035637269611037744, + "loss": 7.7684, + "step": 589600 + }, + { + "epoch": 2.4023082038804793, + "grad_norm": 6.912049770355225, + "learning_rate": 0.0035632777261413633, + "loss": 7.7781, + "step": 589700 + }, + { + "epoch": 2.402715581903861, + "grad_norm": 4.628445148468018, + "learning_rate": 0.003562828449339571, + "loss": 7.7719, + "step": 589800 + }, + { + "epoch": 2.4031229599272423, + "grad_norm": 5.262722015380859, + "learning_rate": 0.0035623791307161654, + "loss": 7.7818, + "step": 589900 + }, + { + "epoch": 2.403530337950624, + "grad_norm": 2.250422239303589, + "learning_rate": 0.0035619297702889083, + "loss": 7.7787, + "step": 590000 + }, + { + "epoch": 2.403530337950624, + "eval_MaskedAccuracy": 0.5023515761858598, + "eval_loss": 1.637993335723877, + "eval_runtime": 165.8963, + "eval_samples_per_second": 382.624, + "eval_steps_per_second": 1.495, + "step": 590000 + }, + { + "epoch": 2.4039377159740054, + "grad_norm": 2.7703800201416016, + "learning_rate": 0.003561480368075565, + "loss": 7.7831, + "step": 590100 + }, + { + "epoch": 2.4043450939973865, + "grad_norm": 3.4823758602142334, + "learning_rate": 0.0035610309240939005, + "loss": 7.79, + "step": 590200 + }, + { + "epoch": 2.404752472020768, + "grad_norm": 2.857991933822632, + "learning_rate": 0.0035605814383616844, + "loss": 7.7733, + "step": 590300 + }, + { + "epoch": 2.4051598500441496, + "grad_norm": 2.695787191390991, + "learning_rate": 0.0035601319108966916, + "loss": 7.8119, + "step": 590400 + }, + { + "epoch": 2.405567228067531, + "grad_norm": 5.409249305725098, + "learning_rate": 0.0035596823417166886, + "loss": 7.8127, + "step": 590500 + }, + { + "epoch": 2.4059746060909126, + "grad_norm": 9.495888710021973, + "learning_rate": 0.0035592327308394548, + "loss": 7.756, + "step": 590600 + }, + { + "epoch": 2.406381984114294, + "grad_norm": 15.420244216918945, + "learning_rate": 0.003558783078282761, + "loss": 7.82, + "step": 590700 + }, + { + "epoch": 2.4067893621376752, + "grad_norm": 4.739377498626709, + "learning_rate": 0.0035583333840643883, + "loss": 7.804, + "step": 590800 + }, + { + "epoch": 2.407196740161057, + "grad_norm": 4.556971073150635, + "learning_rate": 0.003557883648202112, + "loss": 7.7701, + "step": 590900 + }, + { + "epoch": 2.4076041181844383, + "grad_norm": 3.1656908988952637, + "learning_rate": 0.003557433870713714, + "loss": 7.775, + "step": 591000 + }, + { + "epoch": 2.4076041181844383, + "eval_MaskedAccuracy": 0.502121768862807, + "eval_loss": 1.6379399299621582, + "eval_runtime": 160.9565, + "eval_samples_per_second": 394.367, + "eval_steps_per_second": 1.541, + "step": 591000 + }, + { + "epoch": 2.40801149620782, + "grad_norm": 3.7673425674438477, + "learning_rate": 0.003556984051616979, + "loss": 7.7967, + "step": 591100 + }, + { + "epoch": 2.4084188742312014, + "grad_norm": 3.8414700031280518, + "learning_rate": 0.003556534190929685, + "loss": 7.7876, + "step": 591200 + }, + { + "epoch": 2.408826252254583, + "grad_norm": 3.038132429122925, + "learning_rate": 0.0035560842886696174, + "loss": 7.7915, + "step": 591300 + }, + { + "epoch": 2.4092336302779644, + "grad_norm": 3.504563570022583, + "learning_rate": 0.0035556343448545696, + "loss": 7.7801, + "step": 591400 + }, + { + "epoch": 2.4096410083013455, + "grad_norm": 3.4391419887542725, + "learning_rate": 0.003555184359502326, + "loss": 7.7885, + "step": 591500 + }, + { + "epoch": 2.410048386324727, + "grad_norm": 5.2733306884765625, + "learning_rate": 0.0035547343326306786, + "loss": 7.7761, + "step": 591600 + }, + { + "epoch": 2.4104557643481086, + "grad_norm": 4.036264419555664, + "learning_rate": 0.0035542842642574198, + "loss": 7.7725, + "step": 591700 + }, + { + "epoch": 2.41086314237149, + "grad_norm": 2.197981357574463, + "learning_rate": 0.0035538341544003437, + "loss": 7.7995, + "step": 591800 + }, + { + "epoch": 2.4112705203948717, + "grad_norm": 1.739732265472412, + "learning_rate": 0.003553384003077243, + "loss": 7.7964, + "step": 591900 + }, + { + "epoch": 2.4116778984182528, + "grad_norm": 8.783902168273926, + "learning_rate": 0.003552933810305913, + "loss": 7.7825, + "step": 592000 + }, + { + "epoch": 2.4116778984182528, + "eval_MaskedAccuracy": 0.5010911627700765, + "eval_loss": 1.634800910949707, + "eval_runtime": 156.512, + "eval_samples_per_second": 405.566, + "eval_steps_per_second": 1.585, + "step": 592000 + }, + { + "epoch": 2.4120852764416343, + "grad_norm": 2.94250750541687, + "learning_rate": 0.003552483576104157, + "loss": 7.8049, + "step": 592100 + }, + { + "epoch": 2.412492654465016, + "grad_norm": 8.064268112182617, + "learning_rate": 0.0035520333004897733, + "loss": 7.7787, + "step": 592200 + }, + { + "epoch": 2.4129000324883974, + "grad_norm": 2.6121816635131836, + "learning_rate": 0.003551582983480563, + "loss": 7.794, + "step": 592300 + }, + { + "epoch": 2.413307410511779, + "grad_norm": 3.7331557273864746, + "learning_rate": 0.003551132625094326, + "loss": 7.7651, + "step": 592400 + }, + { + "epoch": 2.4137147885351604, + "grad_norm": 3.1750247478485107, + "learning_rate": 0.0035506822253488727, + "loss": 7.8189, + "step": 592500 + }, + { + "epoch": 2.414122166558542, + "grad_norm": 3.3395748138427734, + "learning_rate": 0.00355023178426201, + "loss": 7.8012, + "step": 592600 + }, + { + "epoch": 2.414529544581923, + "grad_norm": 2.3670284748077393, + "learning_rate": 0.003549781301851543, + "loss": 7.7763, + "step": 592700 + }, + { + "epoch": 2.4149369226053046, + "grad_norm": 3.7103707790374756, + "learning_rate": 0.003549330778135285, + "loss": 7.7952, + "step": 592800 + }, + { + "epoch": 2.415344300628686, + "grad_norm": 2.7309718132019043, + "learning_rate": 0.0035488802131310444, + "loss": 7.7881, + "step": 592900 + }, + { + "epoch": 2.4157516786520676, + "grad_norm": 3.794459104537964, + "learning_rate": 0.0035484296068566336, + "loss": 7.7456, + "step": 593000 + }, + { + "epoch": 2.4157516786520676, + "eval_MaskedAccuracy": 0.5026838307102172, + "eval_loss": 1.6401876211166382, + "eval_runtime": 214.8496, + "eval_samples_per_second": 295.444, + "eval_steps_per_second": 1.154, + "step": 593000 + }, + { + "epoch": 2.416159056675449, + "grad_norm": 3.176884889602661, + "learning_rate": 0.003547978959329874, + "loss": 7.7424, + "step": 593100 + }, + { + "epoch": 2.4165664346988307, + "grad_norm": 10.780375480651855, + "learning_rate": 0.0035475282705685757, + "loss": 7.7992, + "step": 593200 + }, + { + "epoch": 2.416973812722212, + "grad_norm": 7.064286231994629, + "learning_rate": 0.003547077540590559, + "loss": 7.8173, + "step": 593300 + }, + { + "epoch": 2.4173811907455933, + "grad_norm": 9.125280380249023, + "learning_rate": 0.003546626769413643, + "loss": 7.7634, + "step": 593400 + }, + { + "epoch": 2.417788568768975, + "grad_norm": 2.350525379180908, + "learning_rate": 0.0035461759570556504, + "loss": 7.8059, + "step": 593500 + }, + { + "epoch": 2.4181959467923564, + "grad_norm": 2.1609630584716797, + "learning_rate": 0.0035457251035344037, + "loss": 7.7845, + "step": 593600 + }, + { + "epoch": 2.418603324815738, + "grad_norm": 4.145672798156738, + "learning_rate": 0.003545274208867726, + "loss": 7.813, + "step": 593700 + }, + { + "epoch": 2.4190107028391195, + "grad_norm": 1.9367409944534302, + "learning_rate": 0.0035448232730734457, + "loss": 7.7629, + "step": 593800 + }, + { + "epoch": 2.419418080862501, + "grad_norm": 4.147180080413818, + "learning_rate": 0.003544372296169388, + "loss": 7.7736, + "step": 593900 + }, + { + "epoch": 2.419825458885882, + "grad_norm": 4.469532489776611, + "learning_rate": 0.003543921278173388, + "loss": 7.808, + "step": 594000 + }, + { + "epoch": 2.419825458885882, + "eval_MaskedAccuracy": 0.5020240601705928, + "eval_loss": 1.6414042711257935, + "eval_runtime": 154.9602, + "eval_samples_per_second": 409.628, + "eval_steps_per_second": 1.6, + "step": 594000 + }, + { + "epoch": 2.4202328369092636, + "grad_norm": 5.342765808105469, + "learning_rate": 0.003543470219103273, + "loss": 7.8297, + "step": 594100 + }, + { + "epoch": 2.420640214932645, + "grad_norm": 3.777756929397583, + "learning_rate": 0.0035430191189768753, + "loss": 7.7988, + "step": 594200 + }, + { + "epoch": 2.4210475929560267, + "grad_norm": 5.757164478302002, + "learning_rate": 0.003542567977812031, + "loss": 7.8026, + "step": 594300 + }, + { + "epoch": 2.421454970979408, + "grad_norm": 2.53171706199646, + "learning_rate": 0.003542116795626575, + "loss": 7.7985, + "step": 594400 + }, + { + "epoch": 2.4218623490027893, + "grad_norm": 3.1694672107696533, + "learning_rate": 0.003541665572438345, + "loss": 7.793, + "step": 594500 + }, + { + "epoch": 2.422269727026171, + "grad_norm": 5.521478176116943, + "learning_rate": 0.003541214308265178, + "loss": 7.7971, + "step": 594600 + }, + { + "epoch": 2.4226771050495524, + "grad_norm": 4.49680757522583, + "learning_rate": 0.0035407630031249215, + "loss": 7.8049, + "step": 594700 + }, + { + "epoch": 2.423084483072934, + "grad_norm": 3.1610946655273438, + "learning_rate": 0.0035403116570354135, + "loss": 7.772, + "step": 594800 + }, + { + "epoch": 2.4234918610963154, + "grad_norm": 2.836824893951416, + "learning_rate": 0.003539860270014498, + "loss": 7.8031, + "step": 594900 + }, + { + "epoch": 2.423899239119697, + "grad_norm": 8.358927726745605, + "learning_rate": 0.0035394088420800194, + "loss": 7.7444, + "step": 595000 + }, + { + "epoch": 2.423899239119697, + "eval_MaskedAccuracy": 0.501722756180028, + "eval_loss": 1.6437461376190186, + "eval_runtime": 161.375, + "eval_samples_per_second": 393.345, + "eval_steps_per_second": 1.537, + "step": 595000 + }, + { + "epoch": 2.4243066171430785, + "grad_norm": 5.089084148406982, + "learning_rate": 0.003538957373249825, + "loss": 7.7766, + "step": 595100 + }, + { + "epoch": 2.4247139951664596, + "grad_norm": 4.644046783447266, + "learning_rate": 0.0035385058635417715, + "loss": 7.7713, + "step": 595200 + }, + { + "epoch": 2.425121373189841, + "grad_norm": 3.160179376602173, + "learning_rate": 0.0035380543129736996, + "loss": 7.8105, + "step": 595300 + }, + { + "epoch": 2.4255287512132226, + "grad_norm": 4.421177387237549, + "learning_rate": 0.003537602721563468, + "loss": 7.7514, + "step": 595400 + }, + { + "epoch": 2.425936129236604, + "grad_norm": 10.34675407409668, + "learning_rate": 0.0035371510893289276, + "loss": 7.7565, + "step": 595500 + }, + { + "epoch": 2.4263435072599857, + "grad_norm": 4.81442403793335, + "learning_rate": 0.0035366994162879355, + "loss": 7.7428, + "step": 595600 + }, + { + "epoch": 2.4267508852833672, + "grad_norm": 6.543236255645752, + "learning_rate": 0.003536247702458345, + "loss": 7.8133, + "step": 595700 + }, + { + "epoch": 2.4271582633067483, + "grad_norm": 2.53407883644104, + "learning_rate": 0.003535795947858019, + "loss": 7.7965, + "step": 595800 + }, + { + "epoch": 2.42756564133013, + "grad_norm": 6.635446548461914, + "learning_rate": 0.003535344152504815, + "loss": 7.7983, + "step": 595900 + }, + { + "epoch": 2.4279730193535114, + "grad_norm": 4.083710670471191, + "learning_rate": 0.0035348923164165956, + "loss": 7.7524, + "step": 596000 + }, + { + "epoch": 2.4279730193535114, + "eval_MaskedAccuracy": 0.5013060614278118, + "eval_loss": 1.6439677476882935, + "eval_runtime": 157.3899, + "eval_samples_per_second": 403.304, + "eval_steps_per_second": 1.576, + "step": 596000 + }, + { + "epoch": 2.428380397376893, + "grad_norm": 6.001681327819824, + "learning_rate": 0.003534440439611226, + "loss": 7.7901, + "step": 596100 + }, + { + "epoch": 2.4287877754002745, + "grad_norm": 3.3321585655212402, + "learning_rate": 0.0035339885221065694, + "loss": 7.7784, + "step": 596200 + }, + { + "epoch": 2.429195153423656, + "grad_norm": 10.198421478271484, + "learning_rate": 0.003533536563920495, + "loss": 7.7841, + "step": 596300 + }, + { + "epoch": 2.4296025314470375, + "grad_norm": 4.937011241912842, + "learning_rate": 0.0035330845650708674, + "loss": 7.8041, + "step": 596400 + }, + { + "epoch": 2.4300099094704186, + "grad_norm": 3.839660406112671, + "learning_rate": 0.003532632525575559, + "loss": 7.7741, + "step": 596500 + }, + { + "epoch": 2.4304172874938, + "grad_norm": 4.8967061042785645, + "learning_rate": 0.003532180445452442, + "loss": 7.7967, + "step": 596600 + }, + { + "epoch": 2.4308246655171817, + "grad_norm": 3.739058256149292, + "learning_rate": 0.003531728324719386, + "loss": 7.7678, + "step": 596700 + }, + { + "epoch": 2.431232043540563, + "grad_norm": 3.3789188861846924, + "learning_rate": 0.0035312761633942667, + "loss": 7.8127, + "step": 596800 + }, + { + "epoch": 2.4316394215639447, + "grad_norm": 3.7414557933807373, + "learning_rate": 0.0035308239614949646, + "loss": 7.7633, + "step": 596900 + }, + { + "epoch": 2.432046799587326, + "grad_norm": 2.8671152591705322, + "learning_rate": 0.003530371719039353, + "loss": 7.7577, + "step": 597000 + }, + { + "epoch": 2.432046799587326, + "eval_MaskedAccuracy": 0.5013253448657601, + "eval_loss": 1.6389906406402588, + "eval_runtime": 157.3848, + "eval_samples_per_second": 403.317, + "eval_steps_per_second": 1.576, + "step": 597000 + }, + { + "epoch": 2.4324541776107074, + "grad_norm": 4.212960243225098, + "learning_rate": 0.003529919436045315, + "loss": 7.8131, + "step": 597100 + }, + { + "epoch": 2.432861555634089, + "grad_norm": 3.658583402633667, + "learning_rate": 0.0035294671125307266, + "loss": 7.7826, + "step": 597200 + }, + { + "epoch": 2.4332689336574704, + "grad_norm": 3.706495761871338, + "learning_rate": 0.003529014748513475, + "loss": 7.782, + "step": 597300 + }, + { + "epoch": 2.433676311680852, + "grad_norm": 2.5128815174102783, + "learning_rate": 0.003528562344011446, + "loss": 7.8014, + "step": 597400 + }, + { + "epoch": 2.4340836897042335, + "grad_norm": 1.9968478679656982, + "learning_rate": 0.0035281098990425186, + "loss": 7.7877, + "step": 597500 + }, + { + "epoch": 2.434491067727615, + "grad_norm": 7.746984958648682, + "learning_rate": 0.0035276574136245873, + "loss": 7.7932, + "step": 597600 + }, + { + "epoch": 2.434898445750996, + "grad_norm": 2.634483814239502, + "learning_rate": 0.003527204887775543, + "loss": 7.7713, + "step": 597700 + }, + { + "epoch": 2.4353058237743777, + "grad_norm": 9.012521743774414, + "learning_rate": 0.00352675232151327, + "loss": 7.7765, + "step": 597800 + }, + { + "epoch": 2.435713201797759, + "grad_norm": 3.6214818954467773, + "learning_rate": 0.003526299714855664, + "loss": 7.7699, + "step": 597900 + }, + { + "epoch": 2.4361205798211407, + "grad_norm": 7.51224946975708, + "learning_rate": 0.0035258470678206154, + "loss": 7.8023, + "step": 598000 + }, + { + "epoch": 2.4361205798211407, + "eval_MaskedAccuracy": 0.5020065258373666, + "eval_loss": 1.6350057125091553, + "eval_runtime": 152.2062, + "eval_samples_per_second": 417.039, + "eval_steps_per_second": 1.629, + "step": 598000 + }, + { + "epoch": 2.4365279578445223, + "grad_norm": 4.046254634857178, + "learning_rate": 0.0035253943804260213, + "loss": 7.7911, + "step": 598100 + }, + { + "epoch": 2.436935335867904, + "grad_norm": 5.190164089202881, + "learning_rate": 0.0035249416526897804, + "loss": 7.7615, + "step": 598200 + }, + { + "epoch": 2.437342713891285, + "grad_norm": 12.994916915893555, + "learning_rate": 0.0035244888846297953, + "loss": 7.7658, + "step": 598300 + }, + { + "epoch": 2.4377500919146664, + "grad_norm": 4.974771499633789, + "learning_rate": 0.003524036076263957, + "loss": 7.7725, + "step": 598400 + }, + { + "epoch": 2.438157469938048, + "grad_norm": 2.5542399883270264, + "learning_rate": 0.0035235832276101766, + "loss": 7.783, + "step": 598500 + }, + { + "epoch": 2.4385648479614295, + "grad_norm": 4.832586765289307, + "learning_rate": 0.0035231303386863526, + "loss": 7.7698, + "step": 598600 + }, + { + "epoch": 2.438972225984811, + "grad_norm": 5.31456995010376, + "learning_rate": 0.003522677409510389, + "loss": 7.7903, + "step": 598700 + }, + { + "epoch": 2.4393796040081925, + "grad_norm": 3.226726770401001, + "learning_rate": 0.0035222244401001963, + "loss": 7.7941, + "step": 598800 + }, + { + "epoch": 2.439786982031574, + "grad_norm": 6.057314872741699, + "learning_rate": 0.0035217714304736775, + "loss": 7.7717, + "step": 598900 + }, + { + "epoch": 2.440194360054955, + "grad_norm": 5.601971626281738, + "learning_rate": 0.0035213183806487463, + "loss": 7.7683, + "step": 599000 + }, + { + "epoch": 2.440194360054955, + "eval_MaskedAccuracy": 0.5013334530629393, + "eval_loss": 1.6420518159866333, + "eval_runtime": 160.0676, + "eval_samples_per_second": 396.557, + "eval_steps_per_second": 1.549, + "step": 599000 + }, + { + "epoch": 2.4406017380783367, + "grad_norm": 11.546951293945312, + "learning_rate": 0.0035208652906433186, + "loss": 7.7542, + "step": 599100 + }, + { + "epoch": 2.4410091161017182, + "grad_norm": 2.4373414516448975, + "learning_rate": 0.0035204121604753003, + "loss": 7.7695, + "step": 599200 + }, + { + "epoch": 2.4414164941250998, + "grad_norm": 2.8253660202026367, + "learning_rate": 0.00351995899016261, + "loss": 7.7759, + "step": 599300 + }, + { + "epoch": 2.4418238721484813, + "grad_norm": 3.2685346603393555, + "learning_rate": 0.0035195057797231615, + "loss": 7.7881, + "step": 599400 + }, + { + "epoch": 2.4422312501718624, + "grad_norm": 3.1104533672332764, + "learning_rate": 0.0035190525291748705, + "loss": 7.7807, + "step": 599500 + }, + { + "epoch": 2.442638628195244, + "grad_norm": 9.3849515914917, + "learning_rate": 0.0035185992385356585, + "loss": 7.7639, + "step": 599600 + }, + { + "epoch": 2.4430460062186254, + "grad_norm": 3.7798829078674316, + "learning_rate": 0.0035181459078234504, + "loss": 7.785, + "step": 599700 + }, + { + "epoch": 2.443453384242007, + "grad_norm": 2.5248496532440186, + "learning_rate": 0.0035176925370561667, + "loss": 7.7743, + "step": 599800 + }, + { + "epoch": 2.4438607622653885, + "grad_norm": 2.904284954071045, + "learning_rate": 0.00351723912625173, + "loss": 7.7652, + "step": 599900 + }, + { + "epoch": 2.44426814028877, + "grad_norm": 6.10533332824707, + "learning_rate": 0.0035167856754280678, + "loss": 7.7785, + "step": 600000 + }, + { + "epoch": 2.44426814028877, + "eval_MaskedAccuracy": 0.5014401785964906, + "eval_loss": 1.6428192853927612, + "eval_runtime": 161.7854, + "eval_samples_per_second": 392.347, + "eval_steps_per_second": 1.533, + "step": 600000 + }, + { + "epoch": 2.4446755183121516, + "grad_norm": 3.563767194747925, + "learning_rate": 0.0035163321846031046, + "loss": 7.7794, + "step": 600100 + }, + { + "epoch": 2.4450828963355327, + "grad_norm": 3.370847225189209, + "learning_rate": 0.0035158786537947723, + "loss": 7.7974, + "step": 600200 + }, + { + "epoch": 2.445490274358914, + "grad_norm": 4.457855224609375, + "learning_rate": 0.003515425083021001, + "loss": 7.7743, + "step": 600300 + }, + { + "epoch": 2.4458976523822957, + "grad_norm": 4.694652557373047, + "learning_rate": 0.003514971472299721, + "loss": 7.7955, + "step": 600400 + }, + { + "epoch": 2.4463050304056773, + "grad_norm": 3.1714799404144287, + "learning_rate": 0.003514517821648867, + "loss": 7.7949, + "step": 600500 + }, + { + "epoch": 2.446712408429059, + "grad_norm": 3.201887607574463, + "learning_rate": 0.0035140641310863756, + "loss": 7.7856, + "step": 600600 + }, + { + "epoch": 2.4471197864524403, + "grad_norm": 6.544091701507568, + "learning_rate": 0.0035136104006301826, + "loss": 7.799, + "step": 600700 + }, + { + "epoch": 2.4475271644758214, + "grad_norm": 3.627216100692749, + "learning_rate": 0.003513156630298223, + "loss": 7.8096, + "step": 600800 + }, + { + "epoch": 2.447934542499203, + "grad_norm": 3.186779499053955, + "learning_rate": 0.0035127028201084394, + "loss": 7.7801, + "step": 600900 + }, + { + "epoch": 2.4483419205225845, + "grad_norm": 9.663773536682129, + "learning_rate": 0.003512248970078778, + "loss": 7.7877, + "step": 601000 + }, + { + "epoch": 2.4483419205225845, + "eval_MaskedAccuracy": 0.5014117117366939, + "eval_loss": 1.6410915851593018, + "eval_runtime": 205.3947, + "eval_samples_per_second": 309.044, + "eval_steps_per_second": 1.207, + "step": 601000 + }, + { + "epoch": 2.448749298545966, + "grad_norm": 6.464639663696289, + "learning_rate": 0.0035117950802271733, + "loss": 7.82, + "step": 601100 + }, + { + "epoch": 2.4491566765693475, + "grad_norm": 3.021066904067993, + "learning_rate": 0.0035113411505715746, + "loss": 7.7605, + "step": 601200 + }, + { + "epoch": 2.449564054592729, + "grad_norm": 1.7831275463104248, + "learning_rate": 0.0035108871811299286, + "loss": 7.7613, + "step": 601300 + }, + { + "epoch": 2.4499714326161106, + "grad_norm": 1.8714038133621216, + "learning_rate": 0.00351043317192018, + "loss": 7.7519, + "step": 601400 + }, + { + "epoch": 2.4503788106394917, + "grad_norm": 2.989990472793579, + "learning_rate": 0.003509979122960283, + "loss": 7.7641, + "step": 601500 + }, + { + "epoch": 2.4507861886628732, + "grad_norm": 2.905515670776367, + "learning_rate": 0.003509525034268185, + "loss": 7.7724, + "step": 601600 + }, + { + "epoch": 2.4511935666862548, + "grad_norm": 1.7551355361938477, + "learning_rate": 0.003509070905861842, + "loss": 7.7868, + "step": 601700 + }, + { + "epoch": 2.4516009447096363, + "grad_norm": 2.576457977294922, + "learning_rate": 0.003508616737759204, + "loss": 7.7956, + "step": 601800 + }, + { + "epoch": 2.452008322733018, + "grad_norm": 3.6266062259674072, + "learning_rate": 0.0035081625299782284, + "loss": 7.7604, + "step": 601900 + }, + { + "epoch": 2.452415700756399, + "grad_norm": 3.1121485233306885, + "learning_rate": 0.0035077082825368724, + "loss": 7.7806, + "step": 602000 + }, + { + "epoch": 2.452415700756399, + "eval_MaskedAccuracy": 0.502414683474927, + "eval_loss": 1.6362465620040894, + "eval_runtime": 162.692, + "eval_samples_per_second": 390.16, + "eval_steps_per_second": 1.524, + "step": 602000 + }, + { + "epoch": 2.4528230787797805, + "grad_norm": 11.392309188842773, + "learning_rate": 0.0035072539954530977, + "loss": 7.7646, + "step": 602100 + }, + { + "epoch": 2.453230456803162, + "grad_norm": 5.102611541748047, + "learning_rate": 0.003506799668744863, + "loss": 7.8046, + "step": 602200 + }, + { + "epoch": 2.4536378348265435, + "grad_norm": 8.4649076461792, + "learning_rate": 0.0035063453024301297, + "loss": 7.763, + "step": 602300 + }, + { + "epoch": 2.454045212849925, + "grad_norm": 7.183914661407471, + "learning_rate": 0.0035058908965268625, + "loss": 7.7853, + "step": 602400 + }, + { + "epoch": 2.4544525908733066, + "grad_norm": 9.594482421875, + "learning_rate": 0.0035054364510530207, + "loss": 7.7438, + "step": 602500 + }, + { + "epoch": 2.454859968896688, + "grad_norm": 3.214622974395752, + "learning_rate": 0.0035049819660265768, + "loss": 7.7968, + "step": 602600 + }, + { + "epoch": 2.455267346920069, + "grad_norm": 3.7919723987579346, + "learning_rate": 0.003504527441465497, + "loss": 7.7411, + "step": 602700 + }, + { + "epoch": 2.4556747249434507, + "grad_norm": 4.053675651550293, + "learning_rate": 0.0035040728773877475, + "loss": 7.7886, + "step": 602800 + }, + { + "epoch": 2.4560821029668323, + "grad_norm": 3.9221904277801514, + "learning_rate": 0.0035036182738113083, + "loss": 7.7377, + "step": 602900 + }, + { + "epoch": 2.456489480990214, + "grad_norm": 2.673964500427246, + "learning_rate": 0.0035031636307541487, + "loss": 7.7565, + "step": 603000 + }, + { + "epoch": 2.456489480990214, + "eval_MaskedAccuracy": 0.5020966775888541, + "eval_loss": 1.6324012279510498, + "eval_runtime": 234.3925, + "eval_samples_per_second": 270.811, + "eval_steps_per_second": 1.058, + "step": 603000 + }, + { + "epoch": 2.4568968590135953, + "grad_norm": 2.3325867652893066, + "learning_rate": 0.0035027089482342413, + "loss": 7.7874, + "step": 603100 + }, + { + "epoch": 2.457304237036977, + "grad_norm": 4.30885648727417, + "learning_rate": 0.003502254226269564, + "loss": 7.7754, + "step": 603200 + }, + { + "epoch": 2.457711615060358, + "grad_norm": 4.349258899688721, + "learning_rate": 0.003501799464878091, + "loss": 7.7988, + "step": 603300 + }, + { + "epoch": 2.4581189930837395, + "grad_norm": 11.04083251953125, + "learning_rate": 0.0035013446640778027, + "loss": 7.7695, + "step": 603400 + }, + { + "epoch": 2.458526371107121, + "grad_norm": 4.791848659515381, + "learning_rate": 0.0035008898238866864, + "loss": 7.7853, + "step": 603500 + }, + { + "epoch": 2.4589337491305026, + "grad_norm": 5.2664408683776855, + "learning_rate": 0.003500434944322717, + "loss": 7.7757, + "step": 603600 + }, + { + "epoch": 2.459341127153884, + "grad_norm": 6.127841472625732, + "learning_rate": 0.0034999800254038814, + "loss": 7.7905, + "step": 603700 + }, + { + "epoch": 2.4597485051772656, + "grad_norm": 4.613772869110107, + "learning_rate": 0.0034995250671481623, + "loss": 7.7382, + "step": 603800 + }, + { + "epoch": 2.460155883200647, + "grad_norm": 3.5152437686920166, + "learning_rate": 0.003499070069573547, + "loss": 7.7811, + "step": 603900 + }, + { + "epoch": 2.4605632612240282, + "grad_norm": 8.655340194702148, + "learning_rate": 0.0034986150326980275, + "loss": 7.7644, + "step": 604000 + }, + { + "epoch": 2.4605632612240282, + "eval_MaskedAccuracy": 0.5018208698885417, + "eval_loss": 1.6410335302352905, + "eval_runtime": 167.0622, + "eval_samples_per_second": 379.954, + "eval_steps_per_second": 1.484, + "step": 604000 + }, + { + "epoch": 2.4609706392474098, + "grad_norm": 2.4539482593536377, + "learning_rate": 0.0034981599565395904, + "loss": 7.7642, + "step": 604100 + }, + { + "epoch": 2.4613780172707913, + "grad_norm": 2.8700265884399414, + "learning_rate": 0.0034977048411162274, + "loss": 7.799, + "step": 604200 + }, + { + "epoch": 2.461785395294173, + "grad_norm": 7.467411994934082, + "learning_rate": 0.0034972496864459357, + "loss": 7.8224, + "step": 604300 + }, + { + "epoch": 2.4621927733175544, + "grad_norm": 2.57955002784729, + "learning_rate": 0.0034967944925467045, + "loss": 7.7487, + "step": 604400 + }, + { + "epoch": 2.4626001513409355, + "grad_norm": 5.648074626922607, + "learning_rate": 0.0034963392594365354, + "loss": 7.7588, + "step": 604500 + }, + { + "epoch": 2.463007529364317, + "grad_norm": 3.960665225982666, + "learning_rate": 0.0034958839871334236, + "loss": 7.7721, + "step": 604600 + }, + { + "epoch": 2.4634149073876985, + "grad_norm": 7.0501275062561035, + "learning_rate": 0.003495428675655364, + "loss": 7.7589, + "step": 604700 + }, + { + "epoch": 2.46382228541108, + "grad_norm": 6.553703784942627, + "learning_rate": 0.0034949733250203667, + "loss": 7.7747, + "step": 604800 + }, + { + "epoch": 2.4642296634344616, + "grad_norm": 5.630553722381592, + "learning_rate": 0.003494517935246427, + "loss": 7.77, + "step": 604900 + }, + { + "epoch": 2.464637041457843, + "grad_norm": 2.755204439163208, + "learning_rate": 0.0034940625063515523, + "loss": 7.7734, + "step": 605000 + }, + { + "epoch": 2.464637041457843, + "eval_MaskedAccuracy": 0.5022833296206384, + "eval_loss": 1.6339539289474487, + "eval_runtime": 194.94, + "eval_samples_per_second": 325.618, + "eval_steps_per_second": 1.272, + "step": 605000 + }, + { + "epoch": 2.4650444194812247, + "grad_norm": 2.969895601272583, + "learning_rate": 0.003493607038353743, + "loss": 7.783, + "step": 605100 + }, + { + "epoch": 2.4654517975046057, + "grad_norm": 2.7398605346679688, + "learning_rate": 0.0034931515312710123, + "loss": 7.7843, + "step": 605200 + }, + { + "epoch": 2.4658591755279873, + "grad_norm": 3.5312838554382324, + "learning_rate": 0.003492695985121364, + "loss": 7.7612, + "step": 605300 + }, + { + "epoch": 2.466266553551369, + "grad_norm": 8.26854133605957, + "learning_rate": 0.003492240399922813, + "loss": 7.7444, + "step": 605400 + }, + { + "epoch": 2.4666739315747503, + "grad_norm": 4.664064884185791, + "learning_rate": 0.0034917847756933697, + "loss": 7.7587, + "step": 605500 + }, + { + "epoch": 2.467081309598132, + "grad_norm": 3.2515950202941895, + "learning_rate": 0.003491329112451045, + "loss": 7.7686, + "step": 605600 + }, + { + "epoch": 2.4674886876215134, + "grad_norm": 4.394134998321533, + "learning_rate": 0.003490873410213856, + "loss": 7.7707, + "step": 605700 + }, + { + "epoch": 2.4678960656448945, + "grad_norm": 5.289269924163818, + "learning_rate": 0.003490417668999815, + "loss": 7.7599, + "step": 605800 + }, + { + "epoch": 2.468303443668276, + "grad_norm": 3.2438066005706787, + "learning_rate": 0.003489961888826945, + "loss": 7.756, + "step": 605900 + }, + { + "epoch": 2.4687108216916576, + "grad_norm": 3.645613193511963, + "learning_rate": 0.00348950606971326, + "loss": 7.7876, + "step": 606000 + }, + { + "epoch": 2.4687108216916576, + "eval_MaskedAccuracy": 0.5029190005682133, + "eval_loss": 1.6357187032699585, + "eval_runtime": 165.3427, + "eval_samples_per_second": 383.906, + "eval_steps_per_second": 1.5, + "step": 606000 + }, + { + "epoch": 2.469118199715039, + "grad_norm": 10.071039199829102, + "learning_rate": 0.003489050211676785, + "loss": 7.7758, + "step": 606100 + }, + { + "epoch": 2.4695255777384206, + "grad_norm": 3.530179738998413, + "learning_rate": 0.003488594314735545, + "loss": 7.802, + "step": 606200 + }, + { + "epoch": 2.469932955761802, + "grad_norm": 2.915921211242676, + "learning_rate": 0.003488138378907564, + "loss": 7.7942, + "step": 606300 + }, + { + "epoch": 2.4703403337851837, + "grad_norm": 5.038060188293457, + "learning_rate": 0.003487682404210863, + "loss": 7.7954, + "step": 606400 + }, + { + "epoch": 2.470747711808565, + "grad_norm": 7.178977012634277, + "learning_rate": 0.0034872263906634672, + "loss": 7.7791, + "step": 606500 + }, + { + "epoch": 2.4711550898319463, + "grad_norm": 3.2906036376953125, + "learning_rate": 0.0034867703382834157, + "loss": 7.7613, + "step": 606600 + }, + { + "epoch": 2.471562467855328, + "grad_norm": 4.418756008148193, + "learning_rate": 0.0034863142470887274, + "loss": 7.8086, + "step": 606700 + }, + { + "epoch": 2.4719698458787094, + "grad_norm": 2.7068023681640625, + "learning_rate": 0.00348585811709744, + "loss": 7.7814, + "step": 606800 + }, + { + "epoch": 2.472377223902091, + "grad_norm": 3.6483817100524902, + "learning_rate": 0.003485401948327583, + "loss": 7.7869, + "step": 606900 + }, + { + "epoch": 2.472784601925472, + "grad_norm": 2.069129705429077, + "learning_rate": 0.0034849457407971933, + "loss": 7.7796, + "step": 607000 + }, + { + "epoch": 2.472784601925472, + "eval_MaskedAccuracy": 0.5023630049503295, + "eval_loss": 1.6310961246490479, + "eval_runtime": 164.6139, + "eval_samples_per_second": 385.605, + "eval_steps_per_second": 1.507, + "step": 607000 + }, + { + "epoch": 2.4731919799488535, + "grad_norm": 2.366436004638672, + "learning_rate": 0.003484489494524309, + "loss": 7.7938, + "step": 607100 + }, + { + "epoch": 2.473599357972235, + "grad_norm": 5.079567909240723, + "learning_rate": 0.003484033209526966, + "loss": 7.7741, + "step": 607200 + }, + { + "epoch": 2.4740067359956166, + "grad_norm": 7.488637924194336, + "learning_rate": 0.003483576885823208, + "loss": 7.7635, + "step": 607300 + }, + { + "epoch": 2.474414114018998, + "grad_norm": 2.4173405170440674, + "learning_rate": 0.003483120523431071, + "loss": 7.8059, + "step": 607400 + }, + { + "epoch": 2.4748214920423797, + "grad_norm": 4.786422252655029, + "learning_rate": 0.0034826641223685982, + "loss": 7.762, + "step": 607500 + }, + { + "epoch": 2.475228870065761, + "grad_norm": 4.182760238647461, + "learning_rate": 0.003482207682653829, + "loss": 7.7919, + "step": 607600 + }, + { + "epoch": 2.4756362480891423, + "grad_norm": 4.433559417724609, + "learning_rate": 0.003481751204304817, + "loss": 7.8114, + "step": 607700 + }, + { + "epoch": 2.476043626112524, + "grad_norm": 2.670544147491455, + "learning_rate": 0.003481294687339606, + "loss": 7.8075, + "step": 607800 + }, + { + "epoch": 2.4764510041359054, + "grad_norm": 2.4868991374969482, + "learning_rate": 0.0034808381317762388, + "loss": 7.7655, + "step": 607900 + }, + { + "epoch": 2.476858382159287, + "grad_norm": 2.5362184047698975, + "learning_rate": 0.0034803815376327736, + "loss": 7.7779, + "step": 608000 + }, + { + "epoch": 2.476858382159287, + "eval_MaskedAccuracy": 0.502517082019846, + "eval_loss": 1.635817050933838, + "eval_runtime": 167.187, + "eval_samples_per_second": 379.671, + "eval_steps_per_second": 1.483, + "step": 608000 + }, + { + "epoch": 2.4772657601826684, + "grad_norm": 8.652792930603027, + "learning_rate": 0.003479924904927261, + "loss": 7.7412, + "step": 608100 + }, + { + "epoch": 2.47767313820605, + "grad_norm": 3.2159080505371094, + "learning_rate": 0.0034794682336777532, + "loss": 7.8121, + "step": 608200 + }, + { + "epoch": 2.478080516229431, + "grad_norm": 2.7276620864868164, + "learning_rate": 0.0034790115239023045, + "loss": 7.798, + "step": 608300 + }, + { + "epoch": 2.4784878942528126, + "grad_norm": 2.7427170276641846, + "learning_rate": 0.003478554775618969, + "loss": 7.759, + "step": 608400 + }, + { + "epoch": 2.478895272276194, + "grad_norm": 3.3964476585388184, + "learning_rate": 0.003478097988845807, + "loss": 7.7567, + "step": 608500 + }, + { + "epoch": 2.4793026502995756, + "grad_norm": 2.7253124713897705, + "learning_rate": 0.0034776411636008803, + "loss": 7.8012, + "step": 608600 + }, + { + "epoch": 2.479710028322957, + "grad_norm": 6.021406650543213, + "learning_rate": 0.0034771842999022444, + "loss": 7.7693, + "step": 608700 + }, + { + "epoch": 2.4801174063463387, + "grad_norm": 5.724459648132324, + "learning_rate": 0.0034767273977679616, + "loss": 7.7649, + "step": 608800 + }, + { + "epoch": 2.4805247843697202, + "grad_norm": 3.1598286628723145, + "learning_rate": 0.003476270457216099, + "loss": 7.7503, + "step": 608900 + }, + { + "epoch": 2.4809321623931013, + "grad_norm": 6.052318572998047, + "learning_rate": 0.003475813478264721, + "loss": 7.7651, + "step": 609000 + }, + { + "epoch": 2.4809321623931013, + "eval_MaskedAccuracy": 0.5026950562611281, + "eval_loss": 1.6452536582946777, + "eval_runtime": 157.0128, + "eval_samples_per_second": 404.273, + "eval_steps_per_second": 1.579, + "step": 609000 + }, + { + "epoch": 2.481339540416483, + "grad_norm": 2.323472499847412, + "learning_rate": 0.0034753564609318953, + "loss": 7.7643, + "step": 609100 + }, + { + "epoch": 2.4817469184398644, + "grad_norm": 3.980980157852173, + "learning_rate": 0.003474899405235685, + "loss": 7.766, + "step": 609200 + }, + { + "epoch": 2.482154296463246, + "grad_norm": 5.836856365203857, + "learning_rate": 0.0034744423111941615, + "loss": 7.7902, + "step": 609300 + }, + { + "epoch": 2.4825616744866275, + "grad_norm": 3.978548526763916, + "learning_rate": 0.0034739851788254056, + "loss": 7.7556, + "step": 609400 + }, + { + "epoch": 2.4829690525100085, + "grad_norm": 2.193514585494995, + "learning_rate": 0.003473528008147481, + "loss": 7.7771, + "step": 609500 + }, + { + "epoch": 2.48337643053339, + "grad_norm": 4.373281478881836, + "learning_rate": 0.0034730707991784604, + "loss": 7.7665, + "step": 609600 + }, + { + "epoch": 2.4837838085567716, + "grad_norm": 3.0255274772644043, + "learning_rate": 0.0034726135519364274, + "loss": 7.7561, + "step": 609700 + }, + { + "epoch": 2.484191186580153, + "grad_norm": 3.5589005947113037, + "learning_rate": 0.0034721562664394565, + "loss": 7.7837, + "step": 609800 + }, + { + "epoch": 2.4845985646035347, + "grad_norm": 3.6775569915771484, + "learning_rate": 0.0034716989427056237, + "loss": 7.7502, + "step": 609900 + }, + { + "epoch": 2.485005942626916, + "grad_norm": 10.020858764648438, + "learning_rate": 0.003471241580753008, + "loss": 7.805, + "step": 610000 + }, + { + "epoch": 2.485005942626916, + "eval_MaskedAccuracy": 0.5018309697365412, + "eval_loss": 1.6496104001998901, + "eval_runtime": 159.4095, + "eval_samples_per_second": 398.195, + "eval_steps_per_second": 1.556, + "step": 610000 + }, + { + "epoch": 2.4854133206502977, + "grad_norm": 6.619731426239014, + "learning_rate": 0.0034707841805996986, + "loss": 7.7667, + "step": 610100 + }, + { + "epoch": 2.485820698673679, + "grad_norm": 5.1112542152404785, + "learning_rate": 0.003470326742263774, + "loss": 7.7172, + "step": 610200 + }, + { + "epoch": 2.4862280766970604, + "grad_norm": 6.065150737762451, + "learning_rate": 0.0034698692657633245, + "loss": 7.7433, + "step": 610300 + }, + { + "epoch": 2.486635454720442, + "grad_norm": 4.601003646850586, + "learning_rate": 0.003469411751116425, + "loss": 7.7615, + "step": 610400 + }, + { + "epoch": 2.4870428327438234, + "grad_norm": 5.6820244789123535, + "learning_rate": 0.0034689541983411744, + "loss": 7.7956, + "step": 610500 + }, + { + "epoch": 2.487450210767205, + "grad_norm": 2.847165107727051, + "learning_rate": 0.003468496607455661, + "loss": 7.7725, + "step": 610600 + }, + { + "epoch": 2.4878575887905865, + "grad_norm": 4.603565216064453, + "learning_rate": 0.003468038978477969, + "loss": 7.788, + "step": 610700 + }, + { + "epoch": 2.4882649668139676, + "grad_norm": 4.942103862762451, + "learning_rate": 0.0034675813114261956, + "loss": 7.7807, + "step": 610800 + }, + { + "epoch": 2.488672344837349, + "grad_norm": 4.010811805725098, + "learning_rate": 0.0034671236063184355, + "loss": 7.7754, + "step": 610900 + }, + { + "epoch": 2.4890797228607306, + "grad_norm": 4.431800842285156, + "learning_rate": 0.00346666586317279, + "loss": 7.7784, + "step": 611000 + }, + { + "epoch": 2.4890797228607306, + "eval_MaskedAccuracy": 0.5031158369280649, + "eval_loss": 1.6375545263290405, + "eval_runtime": 154.7681, + "eval_samples_per_second": 410.136, + "eval_steps_per_second": 1.602, + "step": 611000 + }, + { + "epoch": 2.489487100884112, + "grad_norm": 3.4358301162719727, + "learning_rate": 0.0034662080820073472, + "loss": 7.7893, + "step": 611100 + }, + { + "epoch": 2.4898944789074937, + "grad_norm": 5.371962547302246, + "learning_rate": 0.0034657502628402067, + "loss": 7.7254, + "step": 611200 + }, + { + "epoch": 2.4903018569308752, + "grad_norm": 5.027492523193359, + "learning_rate": 0.003465292405689468, + "loss": 7.7705, + "step": 611300 + }, + { + "epoch": 2.490709234954257, + "grad_norm": 5.015530109405518, + "learning_rate": 0.0034648345105732373, + "loss": 7.7824, + "step": 611400 + }, + { + "epoch": 2.491116612977638, + "grad_norm": 4.747501850128174, + "learning_rate": 0.003464376577509613, + "loss": 7.7949, + "step": 611500 + }, + { + "epoch": 2.4915239910010194, + "grad_norm": 3.3777360916137695, + "learning_rate": 0.0034639186065166996, + "loss": 7.8029, + "step": 611600 + }, + { + "epoch": 2.491931369024401, + "grad_norm": 2.6882739067077637, + "learning_rate": 0.003463460597612606, + "loss": 7.7576, + "step": 611700 + }, + { + "epoch": 2.4923387470477825, + "grad_norm": 6.328719139099121, + "learning_rate": 0.003463002550815439, + "loss": 7.7791, + "step": 611800 + }, + { + "epoch": 2.492746125071164, + "grad_norm": 1.7057703733444214, + "learning_rate": 0.0034625444661433064, + "loss": 7.7636, + "step": 611900 + }, + { + "epoch": 2.493153503094545, + "grad_norm": 2.849590301513672, + "learning_rate": 0.003462086343614317, + "loss": 7.7619, + "step": 612000 + }, + { + "epoch": 2.493153503094545, + "eval_MaskedAccuracy": 0.5034365202158351, + "eval_loss": 1.6304341554641724, + "eval_runtime": 218.3404, + "eval_samples_per_second": 290.72, + "eval_steps_per_second": 1.136, + "step": 612000 + }, + { + "epoch": 2.4935608811179266, + "grad_norm": 3.324779748916626, + "learning_rate": 0.0034616281832465937, + "loss": 7.7423, + "step": 612100 + }, + { + "epoch": 2.493968259141308, + "grad_norm": 8.73055362701416, + "learning_rate": 0.0034611699850582373, + "loss": 7.7453, + "step": 612200 + }, + { + "epoch": 2.4943756371646897, + "grad_norm": 8.771262168884277, + "learning_rate": 0.0034607117490673663, + "loss": 7.7936, + "step": 612300 + }, + { + "epoch": 2.494783015188071, + "grad_norm": 4.403766632080078, + "learning_rate": 0.003460253475292096, + "loss": 7.8016, + "step": 612400 + }, + { + "epoch": 2.4951903932114528, + "grad_norm": 8.34892463684082, + "learning_rate": 0.0034597951637505508, + "loss": 7.7638, + "step": 612500 + }, + { + "epoch": 2.4955977712348343, + "grad_norm": 5.161684036254883, + "learning_rate": 0.0034593368144608485, + "loss": 7.7557, + "step": 612600 + }, + { + "epoch": 2.4960051492582154, + "grad_norm": 5.6892876625061035, + "learning_rate": 0.0034588784274411062, + "loss": 7.8037, + "step": 612700 + }, + { + "epoch": 2.496412527281597, + "grad_norm": 3.6872031688690186, + "learning_rate": 0.003458420002709448, + "loss": 7.7857, + "step": 612800 + }, + { + "epoch": 2.4968199053049784, + "grad_norm": 3.777289628982544, + "learning_rate": 0.003457961540283996, + "loss": 7.7583, + "step": 612900 + }, + { + "epoch": 2.49722728332836, + "grad_norm": 2.7119300365448, + "learning_rate": 0.00345750304018288, + "loss": 7.7738, + "step": 613000 + }, + { + "epoch": 2.49722728332836, + "eval_MaskedAccuracy": 0.5022991251156033, + "eval_loss": 1.636875867843628, + "eval_runtime": 162.3094, + "eval_samples_per_second": 391.08, + "eval_steps_per_second": 1.528, + "step": 613000 + }, + { + "epoch": 2.4976346613517415, + "grad_norm": 3.9631919860839844, + "learning_rate": 0.0034570445024242214, + "loss": 7.7678, + "step": 613100 + }, + { + "epoch": 2.498042039375123, + "grad_norm": 3.6911187171936035, + "learning_rate": 0.003456585927026153, + "loss": 7.7789, + "step": 613200 + }, + { + "epoch": 2.498449417398504, + "grad_norm": 3.555579900741577, + "learning_rate": 0.0034561273140068013, + "loss": 7.795, + "step": 613300 + }, + { + "epoch": 2.4988567954218857, + "grad_norm": 5.924427509307861, + "learning_rate": 0.0034556686633843026, + "loss": 7.7343, + "step": 613400 + }, + { + "epoch": 2.499264173445267, + "grad_norm": 2.3663249015808105, + "learning_rate": 0.003455209975176787, + "loss": 7.7641, + "step": 613500 + }, + { + "epoch": 2.4996715514686487, + "grad_norm": 2.5663599967956543, + "learning_rate": 0.00345475124940239, + "loss": 7.775, + "step": 613600 + }, + { + "epoch": 2.5000789294920303, + "grad_norm": 4.334702014923096, + "learning_rate": 0.003454292486079246, + "loss": 7.7715, + "step": 613700 + }, + { + "epoch": 2.500486307515412, + "grad_norm": 3.6804959774017334, + "learning_rate": 0.0034538336852254913, + "loss": 7.8049, + "step": 613800 + }, + { + "epoch": 2.5008936855387933, + "grad_norm": 14.971857070922852, + "learning_rate": 0.003453374846859268, + "loss": 7.79, + "step": 613900 + }, + { + "epoch": 2.5013010635621744, + "grad_norm": 5.183899402618408, + "learning_rate": 0.0034529159709987154, + "loss": 7.7578, + "step": 614000 + }, + { + "epoch": 2.5013010635621744, + "eval_MaskedAccuracy": 0.5023197603573429, + "eval_loss": 1.6355232000350952, + "eval_runtime": 187.9687, + "eval_samples_per_second": 337.694, + "eval_steps_per_second": 1.319, + "step": 614000 + }, + { + "epoch": 2.501708441585556, + "grad_norm": 2.179237127304077, + "learning_rate": 0.003452457057661972, + "loss": 7.7526, + "step": 614100 + }, + { + "epoch": 2.5021158196089375, + "grad_norm": 5.896249294281006, + "learning_rate": 0.0034519981068671848, + "loss": 7.781, + "step": 614200 + }, + { + "epoch": 2.502523197632319, + "grad_norm": 3.0926969051361084, + "learning_rate": 0.0034515391186324975, + "loss": 7.7922, + "step": 614300 + }, + { + "epoch": 2.5029305756557005, + "grad_norm": 5.69252872467041, + "learning_rate": 0.003451080092976058, + "loss": 7.7848, + "step": 614400 + }, + { + "epoch": 2.5033379536790816, + "grad_norm": 5.180838584899902, + "learning_rate": 0.0034506210299160116, + "loss": 7.7828, + "step": 614500 + }, + { + "epoch": 2.503745331702463, + "grad_norm": 4.583305835723877, + "learning_rate": 0.0034501619294705094, + "loss": 7.7746, + "step": 614600 + }, + { + "epoch": 2.5041527097258447, + "grad_norm": 5.0666279792785645, + "learning_rate": 0.003449702791657699, + "loss": 7.777, + "step": 614700 + }, + { + "epoch": 2.5045600877492262, + "grad_norm": 5.836339950561523, + "learning_rate": 0.0034492436164957367, + "loss": 7.7332, + "step": 614800 + }, + { + "epoch": 2.5049674657726078, + "grad_norm": 2.6494479179382324, + "learning_rate": 0.0034487844040027695, + "loss": 7.7607, + "step": 614900 + }, + { + "epoch": 2.5053748437959893, + "grad_norm": 2.540177583694458, + "learning_rate": 0.003448325154196961, + "loss": 7.7808, + "step": 615000 + }, + { + "epoch": 2.5053748437959893, + "eval_MaskedAccuracy": 0.5018629976135593, + "eval_loss": 1.6425836086273193, + "eval_runtime": 162.095, + "eval_samples_per_second": 391.597, + "eval_steps_per_second": 1.53, + "step": 615000 + }, + { + "epoch": 2.505782221819371, + "grad_norm": 8.705368041992188, + "learning_rate": 0.0034478658670964676, + "loss": 7.7685, + "step": 615100 + }, + { + "epoch": 2.5061895998427524, + "grad_norm": 7.735439300537109, + "learning_rate": 0.0034474065427194424, + "loss": 7.7816, + "step": 615200 + }, + { + "epoch": 2.5065969778661334, + "grad_norm": 3.698718547821045, + "learning_rate": 0.0034469471810840446, + "loss": 7.7528, + "step": 615300 + }, + { + "epoch": 2.507004355889515, + "grad_norm": 2.3210785388946533, + "learning_rate": 0.0034464877822084354, + "loss": 7.7778, + "step": 615400 + }, + { + "epoch": 2.5074117339128965, + "grad_norm": 4.6606125831604, + "learning_rate": 0.0034460283461107796, + "loss": 7.7883, + "step": 615500 + }, + { + "epoch": 2.507819111936278, + "grad_norm": 1.963275671005249, + "learning_rate": 0.0034455688728092402, + "loss": 7.7587, + "step": 615600 + }, + { + "epoch": 2.508226489959659, + "grad_norm": 5.2963361740112305, + "learning_rate": 0.003445109362321983, + "loss": 7.7516, + "step": 615700 + }, + { + "epoch": 2.5086338679830407, + "grad_norm": 4.963475704193115, + "learning_rate": 0.0034446498146671754, + "loss": 7.7896, + "step": 615800 + }, + { + "epoch": 2.509041246006422, + "grad_norm": 3.43817400932312, + "learning_rate": 0.0034441902298629826, + "loss": 7.7537, + "step": 615900 + }, + { + "epoch": 2.5094486240298037, + "grad_norm": 5.425559997558594, + "learning_rate": 0.0034437306079275753, + "loss": 7.7537, + "step": 616000 + }, + { + "epoch": 2.5094486240298037, + "eval_MaskedAccuracy": 0.5025626337645198, + "eval_loss": 1.6352336406707764, + "eval_runtime": 241.6423, + "eval_samples_per_second": 262.686, + "eval_steps_per_second": 1.026, + "step": 616000 + }, + { + "epoch": 2.5098560020531853, + "grad_norm": 9.041913032531738, + "learning_rate": 0.003443270948879127, + "loss": 7.763, + "step": 616100 + }, + { + "epoch": 2.510263380076567, + "grad_norm": 3.324021339416504, + "learning_rate": 0.003442811252735811, + "loss": 7.7677, + "step": 616200 + }, + { + "epoch": 2.5106707580999483, + "grad_norm": 7.944117069244385, + "learning_rate": 0.0034423515195157986, + "loss": 7.7356, + "step": 616300 + }, + { + "epoch": 2.51107813612333, + "grad_norm": 7.065430641174316, + "learning_rate": 0.003441891749237269, + "loss": 7.7934, + "step": 616400 + }, + { + "epoch": 2.511485514146711, + "grad_norm": 3.643815040588379, + "learning_rate": 0.0034414319419183983, + "loss": 7.7855, + "step": 616500 + }, + { + "epoch": 2.5118928921700925, + "grad_norm": 3.610201597213745, + "learning_rate": 0.0034409720975773664, + "loss": 7.7476, + "step": 616600 + }, + { + "epoch": 2.512300270193474, + "grad_norm": 5.183040142059326, + "learning_rate": 0.0034405122162323477, + "loss": 7.7557, + "step": 616700 + }, + { + "epoch": 2.5127076482168555, + "grad_norm": 6.824285507202148, + "learning_rate": 0.0034400522979015297, + "loss": 7.7853, + "step": 616800 + }, + { + "epoch": 2.513115026240237, + "grad_norm": 10.065016746520996, + "learning_rate": 0.0034395923426030917, + "loss": 7.773, + "step": 616900 + }, + { + "epoch": 2.513522404263618, + "grad_norm": 4.384364604949951, + "learning_rate": 0.0034391323503552207, + "loss": 7.7912, + "step": 617000 + }, + { + "epoch": 2.513522404263618, + "eval_MaskedAccuracy": 0.5024734477734989, + "eval_loss": 1.639122486114502, + "eval_runtime": 161.8896, + "eval_samples_per_second": 392.094, + "eval_steps_per_second": 1.532, + "step": 617000 + }, + { + "epoch": 2.5139297822869997, + "grad_norm": 2.951051712036133, + "learning_rate": 0.003438672321176104, + "loss": 7.7722, + "step": 617100 + }, + { + "epoch": 2.5143371603103812, + "grad_norm": 11.248997688293457, + "learning_rate": 0.0034382122550839253, + "loss": 7.7819, + "step": 617200 + }, + { + "epoch": 2.5147445383337628, + "grad_norm": 4.652326583862305, + "learning_rate": 0.003437752152096874, + "loss": 7.7744, + "step": 617300 + }, + { + "epoch": 2.5151519163571443, + "grad_norm": 2.3746697902679443, + "learning_rate": 0.0034372920122331435, + "loss": 7.7547, + "step": 617400 + }, + { + "epoch": 2.515559294380526, + "grad_norm": 7.469295978546143, + "learning_rate": 0.003436831835510922, + "loss": 7.7722, + "step": 617500 + }, + { + "epoch": 2.5159666724039074, + "grad_norm": 5.900223731994629, + "learning_rate": 0.0034363716219484065, + "loss": 7.7933, + "step": 617600 + }, + { + "epoch": 2.516374050427289, + "grad_norm": 6.131950855255127, + "learning_rate": 0.0034359113715637856, + "loss": 7.7672, + "step": 617700 + }, + { + "epoch": 2.51678142845067, + "grad_norm": 6.427236557006836, + "learning_rate": 0.003435451084375261, + "loss": 7.7507, + "step": 617800 + }, + { + "epoch": 2.5171888064740515, + "grad_norm": 7.460343360900879, + "learning_rate": 0.003434990760401027, + "loss": 7.7422, + "step": 617900 + }, + { + "epoch": 2.517596184497433, + "grad_norm": 3.753558397293091, + "learning_rate": 0.0034345303996592834, + "loss": 7.7799, + "step": 618000 + }, + { + "epoch": 2.517596184497433, + "eval_MaskedAccuracy": 0.5025722323480888, + "eval_loss": 1.6341897249221802, + "eval_runtime": 185.6726, + "eval_samples_per_second": 341.871, + "eval_steps_per_second": 1.336, + "step": 618000 + }, + { + "epoch": 2.5180035625208146, + "grad_norm": 3.9119603633880615, + "learning_rate": 0.003434070002168237, + "loss": 7.7853, + "step": 618100 + }, + { + "epoch": 2.5184109405441957, + "grad_norm": 5.590705394744873, + "learning_rate": 0.0034336095679460815, + "loss": 7.7477, + "step": 618200 + }, + { + "epoch": 2.518818318567577, + "grad_norm": 4.970400810241699, + "learning_rate": 0.003433149097011025, + "loss": 7.7607, + "step": 618300 + }, + { + "epoch": 2.5192256965909587, + "grad_norm": 5.685876846313477, + "learning_rate": 0.0034326885893812696, + "loss": 7.7934, + "step": 618400 + }, + { + "epoch": 2.5196330746143403, + "grad_norm": 7.493686199188232, + "learning_rate": 0.0034322280450750234, + "loss": 7.7888, + "step": 618500 + }, + { + "epoch": 2.520040452637722, + "grad_norm": 3.7989518642425537, + "learning_rate": 0.003431767464110492, + "loss": 7.7608, + "step": 618600 + }, + { + "epoch": 2.5204478306611033, + "grad_norm": 2.843581199645996, + "learning_rate": 0.0034313068465058843, + "loss": 7.7442, + "step": 618700 + }, + { + "epoch": 2.520855208684485, + "grad_norm": 3.6405067443847656, + "learning_rate": 0.0034308461922794168, + "loss": 7.7532, + "step": 618800 + }, + { + "epoch": 2.5212625867078664, + "grad_norm": 6.370888710021973, + "learning_rate": 0.0034303855014492964, + "loss": 7.7689, + "step": 618900 + }, + { + "epoch": 2.5216699647312475, + "grad_norm": 4.907922744750977, + "learning_rate": 0.0034299247740337407, + "loss": 7.7687, + "step": 619000 + }, + { + "epoch": 2.5216699647312475, + "eval_MaskedAccuracy": 0.5023274693518835, + "eval_loss": 1.6362217664718628, + "eval_runtime": 174.3653, + "eval_samples_per_second": 364.04, + "eval_steps_per_second": 1.422, + "step": 619000 + }, + { + "epoch": 2.522077342754629, + "grad_norm": 3.8803508281707764, + "learning_rate": 0.003429464010050957, + "loss": 7.7692, + "step": 619100 + }, + { + "epoch": 2.5224847207780106, + "grad_norm": 3.016896963119507, + "learning_rate": 0.00342900320951917, + "loss": 7.7835, + "step": 619200 + }, + { + "epoch": 2.522892098801392, + "grad_norm": 7.485684394836426, + "learning_rate": 0.003428542372456591, + "loss": 7.7638, + "step": 619300 + }, + { + "epoch": 2.5232994768247736, + "grad_norm": 3.4870240688323975, + "learning_rate": 0.0034280814988814435, + "loss": 7.7549, + "step": 619400 + }, + { + "epoch": 2.5237068548481547, + "grad_norm": 2.594203233718872, + "learning_rate": 0.003427620588811948, + "loss": 7.7738, + "step": 619500 + }, + { + "epoch": 2.5241142328715362, + "grad_norm": 4.025074481964111, + "learning_rate": 0.0034271596422663286, + "loss": 7.766, + "step": 619600 + }, + { + "epoch": 2.524521610894918, + "grad_norm": 15.031338691711426, + "learning_rate": 0.0034266986592628034, + "loss": 7.7444, + "step": 619700 + }, + { + "epoch": 2.5249289889182993, + "grad_norm": 3.5820491313934326, + "learning_rate": 0.0034262376398196013, + "loss": 7.7875, + "step": 619800 + }, + { + "epoch": 2.525336366941681, + "grad_norm": 3.851367473602295, + "learning_rate": 0.0034257765839549475, + "loss": 7.7547, + "step": 619900 + }, + { + "epoch": 2.5257437449650624, + "grad_norm": 4.407445907592773, + "learning_rate": 0.0034253154916870695, + "loss": 7.7818, + "step": 620000 + }, + { + "epoch": 2.5257437449650624, + "eval_MaskedAccuracy": 0.5017868783965054, + "eval_loss": 1.6382781267166138, + "eval_runtime": 158.9167, + "eval_samples_per_second": 399.429, + "eval_steps_per_second": 1.561, + "step": 620000 + }, + { + "epoch": 2.526151122988444, + "grad_norm": 5.073345184326172, + "learning_rate": 0.0034248543630341992, + "loss": 7.7495, + "step": 620100 + }, + { + "epoch": 2.5265585010118254, + "grad_norm": 4.661971569061279, + "learning_rate": 0.0034243931980145687, + "loss": 7.751, + "step": 620200 + }, + { + "epoch": 2.5269658790352065, + "grad_norm": 3.7760164737701416, + "learning_rate": 0.0034239319966464037, + "loss": 7.7607, + "step": 620300 + }, + { + "epoch": 2.527373257058588, + "grad_norm": 5.625454425811768, + "learning_rate": 0.003423470758947944, + "loss": 7.7581, + "step": 620400 + }, + { + "epoch": 2.5277806350819696, + "grad_norm": 3.6919190883636475, + "learning_rate": 0.003423009484937423, + "loss": 7.7946, + "step": 620500 + }, + { + "epoch": 2.528188013105351, + "grad_norm": 9.471922874450684, + "learning_rate": 0.003422548174633077, + "loss": 7.7613, + "step": 620600 + }, + { + "epoch": 2.528595391128732, + "grad_norm": 2.955620288848877, + "learning_rate": 0.003422086828053144, + "loss": 7.7491, + "step": 620700 + }, + { + "epoch": 2.5290027691521137, + "grad_norm": 3.1450116634368896, + "learning_rate": 0.0034216254452158652, + "loss": 7.7672, + "step": 620800 + }, + { + "epoch": 2.5294101471754953, + "grad_norm": 2.3749918937683105, + "learning_rate": 0.0034211640261394777, + "loss": 7.7367, + "step": 620900 + }, + { + "epoch": 2.529817525198877, + "grad_norm": 3.8808748722076416, + "learning_rate": 0.0034207025708422258, + "loss": 7.781, + "step": 621000 + }, + { + "epoch": 2.529817525198877, + "eval_MaskedAccuracy": 0.503201223599316, + "eval_loss": 1.63099205493927, + "eval_runtime": 160.6549, + "eval_samples_per_second": 395.108, + "eval_steps_per_second": 1.544, + "step": 621000 + }, + { + "epoch": 2.5302249032222583, + "grad_norm": 4.164322376251221, + "learning_rate": 0.0034202410793423544, + "loss": 7.7878, + "step": 621100 + }, + { + "epoch": 2.53063228124564, + "grad_norm": 5.7266645431518555, + "learning_rate": 0.003419779551658109, + "loss": 7.7607, + "step": 621200 + }, + { + "epoch": 2.5310396592690214, + "grad_norm": 2.7845304012298584, + "learning_rate": 0.003419317987807737, + "loss": 7.7895, + "step": 621300 + }, + { + "epoch": 2.531447037292403, + "grad_norm": 4.079392910003662, + "learning_rate": 0.003418856387809479, + "loss": 7.7829, + "step": 621400 + }, + { + "epoch": 2.531854415315784, + "grad_norm": 6.282017707824707, + "learning_rate": 0.00341839475168159, + "loss": 7.7442, + "step": 621500 + }, + { + "epoch": 2.5322617933391656, + "grad_norm": 3.3576087951660156, + "learning_rate": 0.0034179330794423257, + "loss": 7.7694, + "step": 621600 + }, + { + "epoch": 2.532669171362547, + "grad_norm": 3.2473766803741455, + "learning_rate": 0.003417471371109934, + "loss": 7.7467, + "step": 621700 + }, + { + "epoch": 2.5330765493859286, + "grad_norm": 3.8091328144073486, + "learning_rate": 0.003417009626702671, + "loss": 7.7745, + "step": 621800 + }, + { + "epoch": 2.53348392740931, + "grad_norm": 1.907678484916687, + "learning_rate": 0.0034165478462387894, + "loss": 7.7477, + "step": 621900 + }, + { + "epoch": 2.5338913054326913, + "grad_norm": 2.0353424549102783, + "learning_rate": 0.0034160860297365414, + "loss": 7.7626, + "step": 622000 + }, + { + "epoch": 2.5338913054326913, + "eval_MaskedAccuracy": 0.5024660503472291, + "eval_loss": 1.6323477029800415, + "eval_runtime": 158.4253, + "eval_samples_per_second": 400.668, + "eval_steps_per_second": 1.565, + "step": 622000 + }, + { + "epoch": 2.534298683456073, + "grad_norm": 8.999128341674805, + "learning_rate": 0.0034156241772141933, + "loss": 7.7751, + "step": 622100 + }, + { + "epoch": 2.5347060614794543, + "grad_norm": 5.298986434936523, + "learning_rate": 0.00341516228869, + "loss": 7.7877, + "step": 622200 + }, + { + "epoch": 2.535113439502836, + "grad_norm": 6.519100189208984, + "learning_rate": 0.0034147003641822236, + "loss": 7.7697, + "step": 622300 + }, + { + "epoch": 2.5355208175262174, + "grad_norm": 5.478178024291992, + "learning_rate": 0.0034142384037091256, + "loss": 7.745, + "step": 622400 + }, + { + "epoch": 2.535928195549599, + "grad_norm": 3.7654266357421875, + "learning_rate": 0.0034137764072889724, + "loss": 7.7622, + "step": 622500 + }, + { + "epoch": 2.5363355735729805, + "grad_norm": 4.415789604187012, + "learning_rate": 0.0034133143749400254, + "loss": 7.7689, + "step": 622600 + }, + { + "epoch": 2.536742951596362, + "grad_norm": 2.717233180999756, + "learning_rate": 0.003412852306680553, + "loss": 7.7383, + "step": 622700 + }, + { + "epoch": 2.537150329619743, + "grad_norm": 10.035981178283691, + "learning_rate": 0.0034123902025288227, + "loss": 7.7406, + "step": 622800 + }, + { + "epoch": 2.5375577076431246, + "grad_norm": 7.083771705627441, + "learning_rate": 0.0034119280625031063, + "loss": 7.7589, + "step": 622900 + }, + { + "epoch": 2.537965085666506, + "grad_norm": 2.220515727996826, + "learning_rate": 0.0034114658866216743, + "loss": 7.7809, + "step": 623000 + }, + { + "epoch": 2.537965085666506, + "eval_MaskedAccuracy": 0.5033736089149096, + "eval_loss": 1.6255074739456177, + "eval_runtime": 162.0465, + "eval_samples_per_second": 391.715, + "eval_steps_per_second": 1.53, + "step": 623000 + }, + { + "epoch": 2.5383724636898877, + "grad_norm": 3.3882949352264404, + "learning_rate": 0.0034110036749027957, + "loss": 7.7483, + "step": 623100 + }, + { + "epoch": 2.5387798417132688, + "grad_norm": 4.393842697143555, + "learning_rate": 0.003410541427364745, + "loss": 7.7839, + "step": 623200 + }, + { + "epoch": 2.5391872197366503, + "grad_norm": 4.683650970458984, + "learning_rate": 0.0034100791440258007, + "loss": 7.7726, + "step": 623300 + }, + { + "epoch": 2.539594597760032, + "grad_norm": 3.0677711963653564, + "learning_rate": 0.0034096168249042377, + "loss": 7.752, + "step": 623400 + }, + { + "epoch": 2.5400019757834134, + "grad_norm": 7.665134906768799, + "learning_rate": 0.0034091544700183335, + "loss": 7.7489, + "step": 623500 + }, + { + "epoch": 2.540409353806795, + "grad_norm": 2.6495871543884277, + "learning_rate": 0.003408692079386368, + "loss": 7.7649, + "step": 623600 + }, + { + "epoch": 2.5408167318301764, + "grad_norm": 6.374454021453857, + "learning_rate": 0.0034082296530266185, + "loss": 7.7857, + "step": 623700 + }, + { + "epoch": 2.541224109853558, + "grad_norm": 4.619896411895752, + "learning_rate": 0.00340776719095737, + "loss": 7.7247, + "step": 623800 + }, + { + "epoch": 2.5416314878769395, + "grad_norm": 5.068841457366943, + "learning_rate": 0.0034073046931969064, + "loss": 7.7504, + "step": 623900 + }, + { + "epoch": 2.5420388659003206, + "grad_norm": 2.5633034706115723, + "learning_rate": 0.003406842159763513, + "loss": 7.7569, + "step": 624000 + }, + { + "epoch": 2.5420388659003206, + "eval_MaskedAccuracy": 0.5031453808360214, + "eval_loss": 1.6318475008010864, + "eval_runtime": 156.4185, + "eval_samples_per_second": 405.809, + "eval_steps_per_second": 1.585, + "step": 624000 + }, + { + "epoch": 2.542446243923702, + "grad_norm": 3.5641820430755615, + "learning_rate": 0.0034063795906754744, + "loss": 7.7768, + "step": 624100 + }, + { + "epoch": 2.5428536219470836, + "grad_norm": 4.2368879318237305, + "learning_rate": 0.00340591698595108, + "loss": 7.7521, + "step": 624200 + }, + { + "epoch": 2.543260999970465, + "grad_norm": 3.035111665725708, + "learning_rate": 0.0034054543456086207, + "loss": 7.7471, + "step": 624300 + }, + { + "epoch": 2.5436683779938467, + "grad_norm": 2.8946642875671387, + "learning_rate": 0.003404991669666385, + "loss": 7.7628, + "step": 624400 + }, + { + "epoch": 2.544075756017228, + "grad_norm": 3.1361265182495117, + "learning_rate": 0.0034045289581426615, + "loss": 7.7874, + "step": 624500 + }, + { + "epoch": 2.5444831340406093, + "grad_norm": 4.589052677154541, + "learning_rate": 0.003404066211055753, + "loss": 7.7339, + "step": 624600 + }, + { + "epoch": 2.544890512063991, + "grad_norm": 9.63264274597168, + "learning_rate": 0.0034036034284239446, + "loss": 7.7424, + "step": 624700 + }, + { + "epoch": 2.5452978900873724, + "grad_norm": 5.131928443908691, + "learning_rate": 0.003403140610265541, + "loss": 7.7656, + "step": 624800 + }, + { + "epoch": 2.545705268110754, + "grad_norm": 4.377277851104736, + "learning_rate": 0.0034026777565988302, + "loss": 7.7772, + "step": 624900 + }, + { + "epoch": 2.5461126461341355, + "grad_norm": 2.270489454269409, + "learning_rate": 0.003402214867442119, + "loss": 7.727, + "step": 625000 + }, + { + "epoch": 2.5461126461341355, + "eval_MaskedAccuracy": 0.5028148942312095, + "eval_loss": 1.6352275609970093, + "eval_runtime": 163.2975, + "eval_samples_per_second": 388.714, + "eval_steps_per_second": 1.519, + "step": 625000 + }, + { + "epoch": 2.546520024157517, + "grad_norm": 6.795170307159424, + "learning_rate": 0.0034017519428137075, + "loss": 7.76, + "step": 625100 + }, + { + "epoch": 2.5469274021808985, + "grad_norm": 4.163559913635254, + "learning_rate": 0.0034012889827318934, + "loss": 7.7531, + "step": 625200 + }, + { + "epoch": 2.5473347802042796, + "grad_norm": 4.35445499420166, + "learning_rate": 0.003400825987214981, + "loss": 7.775, + "step": 625300 + }, + { + "epoch": 2.547742158227661, + "grad_norm": 5.694300174713135, + "learning_rate": 0.003400362956281278, + "loss": 7.7588, + "step": 625400 + }, + { + "epoch": 2.5481495362510427, + "grad_norm": 11.355413436889648, + "learning_rate": 0.0033998998899490867, + "loss": 7.7714, + "step": 625500 + }, + { + "epoch": 2.548556914274424, + "grad_norm": 4.696236610412598, + "learning_rate": 0.0033994367882367177, + "loss": 7.736, + "step": 625600 + }, + { + "epoch": 2.5489642922978053, + "grad_norm": 3.539301872253418, + "learning_rate": 0.0033989736511624777, + "loss": 7.752, + "step": 625700 + }, + { + "epoch": 2.549371670321187, + "grad_norm": 4.2535223960876465, + "learning_rate": 0.0033985104787446767, + "loss": 7.7551, + "step": 625800 + }, + { + "epoch": 2.5497790483445684, + "grad_norm": 3.1228325366973877, + "learning_rate": 0.003398047271001628, + "loss": 7.7513, + "step": 625900 + }, + { + "epoch": 2.55018642636795, + "grad_norm": 5.040442943572998, + "learning_rate": 0.003397584027951647, + "loss": 7.7251, + "step": 626000 + }, + { + "epoch": 2.55018642636795, + "eval_MaskedAccuracy": 0.5020356942354235, + "eval_loss": 1.630746841430664, + "eval_runtime": 201.9807, + "eval_samples_per_second": 314.268, + "eval_steps_per_second": 1.228, + "step": 626000 + }, + { + "epoch": 2.5505938043913314, + "grad_norm": 3.274500608444214, + "learning_rate": 0.003397120749613042, + "loss": 7.764, + "step": 626100 + }, + { + "epoch": 2.551001182414713, + "grad_norm": 5.133733749389648, + "learning_rate": 0.0033966574360041347, + "loss": 7.7578, + "step": 626200 + }, + { + "epoch": 2.5514085604380945, + "grad_norm": 3.882274866104126, + "learning_rate": 0.00339619408714324, + "loss": 7.7649, + "step": 626300 + }, + { + "epoch": 2.551815938461476, + "grad_norm": 2.9392213821411133, + "learning_rate": 0.003395730703048678, + "loss": 7.7526, + "step": 626400 + }, + { + "epoch": 2.552223316484857, + "grad_norm": 5.912354946136475, + "learning_rate": 0.003395267283738765, + "loss": 7.7781, + "step": 626500 + }, + { + "epoch": 2.5526306945082387, + "grad_norm": 4.403803825378418, + "learning_rate": 0.0033948038292318243, + "loss": 7.7613, + "step": 626600 + }, + { + "epoch": 2.55303807253162, + "grad_norm": 4.239830017089844, + "learning_rate": 0.003394340339546181, + "loss": 7.7316, + "step": 626700 + }, + { + "epoch": 2.5534454505550017, + "grad_norm": 2.795454978942871, + "learning_rate": 0.003393876814700157, + "loss": 7.7912, + "step": 626800 + }, + { + "epoch": 2.5538528285783832, + "grad_norm": 4.8407793045043945, + "learning_rate": 0.0033934132547120796, + "loss": 7.7749, + "step": 626900 + }, + { + "epoch": 2.5542602066017643, + "grad_norm": 3.2152836322784424, + "learning_rate": 0.0033929496596002756, + "loss": 7.7589, + "step": 627000 + }, + { + "epoch": 2.5542602066017643, + "eval_MaskedAccuracy": 0.5028559796056031, + "eval_loss": 1.6388236284255981, + "eval_runtime": 199.451, + "eval_samples_per_second": 318.254, + "eval_steps_per_second": 1.243, + "step": 627000 + }, + { + "epoch": 2.554667584625146, + "grad_norm": 4.629742622375488, + "learning_rate": 0.003392486029383074, + "loss": 7.7668, + "step": 627100 + }, + { + "epoch": 2.5550749626485274, + "grad_norm": 4.139995574951172, + "learning_rate": 0.0033920223640788033, + "loss": 7.7629, + "step": 627200 + }, + { + "epoch": 2.555482340671909, + "grad_norm": 7.941045761108398, + "learning_rate": 0.0033915586637057937, + "loss": 7.745, + "step": 627300 + }, + { + "epoch": 2.5558897186952905, + "grad_norm": 3.0665557384490967, + "learning_rate": 0.003391094928282374, + "loss": 7.7789, + "step": 627400 + }, + { + "epoch": 2.556297096718672, + "grad_norm": 3.5421810150146484, + "learning_rate": 0.0033906311578268842, + "loss": 7.7454, + "step": 627500 + }, + { + "epoch": 2.5567044747420535, + "grad_norm": 2.8414247035980225, + "learning_rate": 0.0033901673523576573, + "loss": 7.7498, + "step": 627600 + }, + { + "epoch": 2.5571118527654346, + "grad_norm": 2.641085147857666, + "learning_rate": 0.0033897035118930306, + "loss": 7.7597, + "step": 627700 + }, + { + "epoch": 2.557519230788816, + "grad_norm": 4.267969131469727, + "learning_rate": 0.003389239636451345, + "loss": 7.7813, + "step": 627800 + }, + { + "epoch": 2.5579266088121977, + "grad_norm": 4.983828067779541, + "learning_rate": 0.0033887757260509326, + "loss": 7.7749, + "step": 627900 + }, + { + "epoch": 2.558333986835579, + "grad_norm": 2.1012320518493652, + "learning_rate": 0.003388311780710142, + "loss": 7.7959, + "step": 628000 + }, + { + "epoch": 2.558333986835579, + "eval_MaskedAccuracy": 0.5024192944190107, + "eval_loss": 1.6422089338302612, + "eval_runtime": 168.2914, + "eval_samples_per_second": 377.179, + "eval_steps_per_second": 1.474, + "step": 628000 + }, + { + "epoch": 2.5587413648589608, + "grad_norm": 7.516231536865234, + "learning_rate": 0.0033878478004473124, + "loss": 7.7633, + "step": 628100 + }, + { + "epoch": 2.559148742882342, + "grad_norm": 4.119853496551514, + "learning_rate": 0.0033873837852807865, + "loss": 7.7588, + "step": 628200 + }, + { + "epoch": 2.5595561209057234, + "grad_norm": 3.6812572479248047, + "learning_rate": 0.0033869197352289137, + "loss": 7.7777, + "step": 628300 + }, + { + "epoch": 2.559963498929105, + "grad_norm": 5.309595584869385, + "learning_rate": 0.003386455650310039, + "loss": 7.7878, + "step": 628400 + }, + { + "epoch": 2.5603708769524864, + "grad_norm": 6.48313570022583, + "learning_rate": 0.0033859915305425037, + "loss": 7.7696, + "step": 628500 + }, + { + "epoch": 2.560778254975868, + "grad_norm": 7.021803855895996, + "learning_rate": 0.00338552737594466, + "loss": 7.7929, + "step": 628600 + }, + { + "epoch": 2.5611856329992495, + "grad_norm": 4.681790351867676, + "learning_rate": 0.003385063186534861, + "loss": 7.7431, + "step": 628700 + }, + { + "epoch": 2.561593011022631, + "grad_norm": 7.020096302032471, + "learning_rate": 0.00338459896233146, + "loss": 7.7527, + "step": 628800 + }, + { + "epoch": 2.5620003890460126, + "grad_norm": 6.3975725173950195, + "learning_rate": 0.0033841347033528057, + "loss": 7.7605, + "step": 628900 + }, + { + "epoch": 2.5624077670693937, + "grad_norm": 8.613045692443848, + "learning_rate": 0.0033836704096172547, + "loss": 7.7299, + "step": 629000 + }, + { + "epoch": 2.5624077670693937, + "eval_MaskedAccuracy": 0.5020393620005411, + "eval_loss": 1.6370080709457397, + "eval_runtime": 200.737, + "eval_samples_per_second": 316.215, + "eval_steps_per_second": 1.235, + "step": 629000 + }, + { + "epoch": 2.562815145092775, + "grad_norm": 4.094930648803711, + "learning_rate": 0.003383206081143165, + "loss": 7.7384, + "step": 629100 + }, + { + "epoch": 2.5632225231161567, + "grad_norm": 3.1509199142456055, + "learning_rate": 0.0033827417179488907, + "loss": 7.7706, + "step": 629200 + }, + { + "epoch": 2.5636299011395383, + "grad_norm": 5.84613561630249, + "learning_rate": 0.0033822773200527898, + "loss": 7.7464, + "step": 629300 + }, + { + "epoch": 2.56403727916292, + "grad_norm": 2.2058050632476807, + "learning_rate": 0.003381812887473227, + "loss": 7.737, + "step": 629400 + }, + { + "epoch": 2.564444657186301, + "grad_norm": 3.3693466186523438, + "learning_rate": 0.0033813484202285596, + "loss": 7.7523, + "step": 629500 + }, + { + "epoch": 2.5648520352096824, + "grad_norm": 10.856343269348145, + "learning_rate": 0.0033808839183371527, + "loss": 7.7631, + "step": 629600 + }, + { + "epoch": 2.565259413233064, + "grad_norm": 3.346740484237671, + "learning_rate": 0.0033804193818173706, + "loss": 7.7421, + "step": 629700 + }, + { + "epoch": 2.5656667912564455, + "grad_norm": 5.470869541168213, + "learning_rate": 0.0033799548106875745, + "loss": 7.7382, + "step": 629800 + }, + { + "epoch": 2.566074169279827, + "grad_norm": 4.04727840423584, + "learning_rate": 0.003379490204966138, + "loss": 7.7826, + "step": 629900 + }, + { + "epoch": 2.5664815473032085, + "grad_norm": 3.49943208694458, + "learning_rate": 0.0033790255646714245, + "loss": 7.7395, + "step": 630000 + }, + { + "epoch": 2.5664815473032085, + "eval_MaskedAccuracy": 0.5026848141215671, + "eval_loss": 1.6341649293899536, + "eval_runtime": 218.4629, + "eval_samples_per_second": 290.557, + "eval_steps_per_second": 1.135, + "step": 630000 + }, + { + "epoch": 2.56688892532659, + "grad_norm": 4.030453205108643, + "learning_rate": 0.0033785608898218083, + "loss": 7.7338, + "step": 630100 + }, + { + "epoch": 2.567296303349971, + "grad_norm": 4.446766376495361, + "learning_rate": 0.0033780961804356596, + "loss": 7.7699, + "step": 630200 + }, + { + "epoch": 2.5677036813733527, + "grad_norm": 3.2295103073120117, + "learning_rate": 0.0033776314365313452, + "loss": 7.7671, + "step": 630300 + }, + { + "epoch": 2.5681110593967342, + "grad_norm": 3.9324440956115723, + "learning_rate": 0.003377166658127242, + "loss": 7.7481, + "step": 630400 + }, + { + "epoch": 2.5685184374201158, + "grad_norm": 3.631042003631592, + "learning_rate": 0.003376701845241731, + "loss": 7.7388, + "step": 630500 + }, + { + "epoch": 2.5689258154434973, + "grad_norm": 4.721069812774658, + "learning_rate": 0.0033762369978931795, + "loss": 7.7604, + "step": 630600 + }, + { + "epoch": 2.5693331934668784, + "grad_norm": 7.176989555358887, + "learning_rate": 0.003375772116099972, + "loss": 7.7796, + "step": 630700 + }, + { + "epoch": 2.56974057149026, + "grad_norm": 5.375190734863281, + "learning_rate": 0.003375307199880481, + "loss": 7.7791, + "step": 630800 + }, + { + "epoch": 2.5701479495136414, + "grad_norm": 5.220234394073486, + "learning_rate": 0.0033748422492530955, + "loss": 7.7767, + "step": 630900 + }, + { + "epoch": 2.570555327537023, + "grad_norm": 2.8575234413146973, + "learning_rate": 0.0033743772642361946, + "loss": 7.7608, + "step": 631000 + }, + { + "epoch": 2.570555327537023, + "eval_MaskedAccuracy": 0.5033651866433944, + "eval_loss": 1.6335687637329102, + "eval_runtime": 168.7379, + "eval_samples_per_second": 376.181, + "eval_steps_per_second": 1.47, + "step": 631000 + }, + { + "epoch": 2.5709627055604045, + "grad_norm": 3.3210229873657227, + "learning_rate": 0.003373912244848161, + "loss": 7.7667, + "step": 631100 + }, + { + "epoch": 2.571370083583786, + "grad_norm": 3.0266685485839844, + "learning_rate": 0.0033734471911073777, + "loss": 7.7601, + "step": 631200 + }, + { + "epoch": 2.5717774616071676, + "grad_norm": 1.9625940322875977, + "learning_rate": 0.00337298210303223, + "loss": 7.779, + "step": 631300 + }, + { + "epoch": 2.572184839630549, + "grad_norm": 2.838864326477051, + "learning_rate": 0.003372516980641106, + "loss": 7.7817, + "step": 631400 + }, + { + "epoch": 2.57259221765393, + "grad_norm": 4.446281433105469, + "learning_rate": 0.003372051823952398, + "loss": 7.7641, + "step": 631500 + }, + { + "epoch": 2.5729995956773117, + "grad_norm": 2.1418299674987793, + "learning_rate": 0.003371586632984494, + "loss": 7.7803, + "step": 631600 + }, + { + "epoch": 2.5734069737006933, + "grad_norm": 6.692812919616699, + "learning_rate": 0.0033711214077557865, + "loss": 7.7374, + "step": 631700 + }, + { + "epoch": 2.573814351724075, + "grad_norm": 3.6473629474639893, + "learning_rate": 0.00337065614828467, + "loss": 7.7772, + "step": 631800 + }, + { + "epoch": 2.5742217297474563, + "grad_norm": 5.349830627441406, + "learning_rate": 0.0033701908545895314, + "loss": 7.7521, + "step": 631900 + }, + { + "epoch": 2.5746291077708374, + "grad_norm": 3.0547399520874023, + "learning_rate": 0.0033697255266887743, + "loss": 7.7611, + "step": 632000 + }, + { + "epoch": 2.5746291077708374, + "eval_MaskedAccuracy": 0.5032770196930922, + "eval_loss": 1.635548710823059, + "eval_runtime": 201.8086, + "eval_samples_per_second": 314.536, + "eval_steps_per_second": 1.229, + "step": 632000 + }, + { + "epoch": 2.575036485794219, + "grad_norm": 2.5424275398254395, + "learning_rate": 0.003369260164600792, + "loss": 7.7414, + "step": 632100 + }, + { + "epoch": 2.5754438638176005, + "grad_norm": 7.3950395584106445, + "learning_rate": 0.0033687947683439805, + "loss": 7.7431, + "step": 632200 + }, + { + "epoch": 2.575851241840982, + "grad_norm": 2.375016689300537, + "learning_rate": 0.0033683293379367448, + "loss": 7.7477, + "step": 632300 + }, + { + "epoch": 2.5762586198643636, + "grad_norm": 3.9684011936187744, + "learning_rate": 0.0033678638733974824, + "loss": 7.7538, + "step": 632400 + }, + { + "epoch": 2.576665997887745, + "grad_norm": 10.618762016296387, + "learning_rate": 0.003367398374744596, + "loss": 7.7222, + "step": 632500 + }, + { + "epoch": 2.5770733759111266, + "grad_norm": 2.8266074657440186, + "learning_rate": 0.0033669328419964908, + "loss": 7.7424, + "step": 632600 + }, + { + "epoch": 2.5774807539345077, + "grad_norm": 4.158894062042236, + "learning_rate": 0.0033664672751715662, + "loss": 7.7364, + "step": 632700 + }, + { + "epoch": 2.5778881319578892, + "grad_norm": 3.104992628097534, + "learning_rate": 0.0033660016742882346, + "loss": 7.7622, + "step": 632800 + }, + { + "epoch": 2.5782955099812708, + "grad_norm": 8.103853225708008, + "learning_rate": 0.0033655360393649004, + "loss": 7.7499, + "step": 632900 + }, + { + "epoch": 2.5787028880046523, + "grad_norm": 7.338804721832275, + "learning_rate": 0.0033650703704199768, + "loss": 7.7608, + "step": 633000 + }, + { + "epoch": 2.5787028880046523, + "eval_MaskedAccuracy": 0.5028308156270933, + "eval_loss": 1.6374253034591675, + "eval_runtime": 155.7305, + "eval_samples_per_second": 407.602, + "eval_steps_per_second": 1.592, + "step": 633000 + }, + { + "epoch": 2.579110266028034, + "grad_norm": 4.880356788635254, + "learning_rate": 0.0033646046674718757, + "loss": 7.764, + "step": 633100 + }, + { + "epoch": 2.579517644051415, + "grad_norm": 7.644883632659912, + "learning_rate": 0.003364138930539002, + "loss": 7.7473, + "step": 633200 + }, + { + "epoch": 2.5799250220747965, + "grad_norm": 2.9873547554016113, + "learning_rate": 0.0033636731596397708, + "loss": 7.7226, + "step": 633300 + }, + { + "epoch": 2.580332400098178, + "grad_norm": 2.4233789443969727, + "learning_rate": 0.0033632073547925955, + "loss": 7.7255, + "step": 633400 + }, + { + "epoch": 2.5807397781215595, + "grad_norm": 5.47147274017334, + "learning_rate": 0.003362741516015892, + "loss": 7.7682, + "step": 633500 + }, + { + "epoch": 2.581147156144941, + "grad_norm": 7.26539421081543, + "learning_rate": 0.0033622756433280784, + "loss": 7.7674, + "step": 633600 + }, + { + "epoch": 2.5815545341683226, + "grad_norm": 3.6344358921051025, + "learning_rate": 0.0033618097367475756, + "loss": 7.8084, + "step": 633700 + }, + { + "epoch": 2.581961912191704, + "grad_norm": 5.889102935791016, + "learning_rate": 0.0033613437962927996, + "loss": 7.7693, + "step": 633800 + }, + { + "epoch": 2.5823692902150857, + "grad_norm": 2.740557909011841, + "learning_rate": 0.0033608778219821763, + "loss": 7.7431, + "step": 633900 + }, + { + "epoch": 2.5827766682384667, + "grad_norm": 5.892124176025391, + "learning_rate": 0.003360411813834124, + "loss": 7.7568, + "step": 634000 + }, + { + "epoch": 2.5827766682384667, + "eval_MaskedAccuracy": 0.5030534435764971, + "eval_loss": 1.6318116188049316, + "eval_runtime": 160.8996, + "eval_samples_per_second": 394.507, + "eval_steps_per_second": 1.541, + "step": 634000 + }, + { + "epoch": 2.5831840462618483, + "grad_norm": 7.4653778076171875, + "learning_rate": 0.0033599457718670673, + "loss": 7.7416, + "step": 634100 + }, + { + "epoch": 2.58359142428523, + "grad_norm": 2.5234272480010986, + "learning_rate": 0.003359479696099433, + "loss": 7.7468, + "step": 634200 + }, + { + "epoch": 2.5839988023086113, + "grad_norm": 3.5666096210479736, + "learning_rate": 0.0033590135865496484, + "loss": 7.7775, + "step": 634300 + }, + { + "epoch": 2.584406180331993, + "grad_norm": 4.852608680725098, + "learning_rate": 0.003358547443236134, + "loss": 7.7654, + "step": 634400 + }, + { + "epoch": 2.584813558355374, + "grad_norm": 3.440246105194092, + "learning_rate": 0.0033580812661773243, + "loss": 7.7464, + "step": 634500 + }, + { + "epoch": 2.5852209363787555, + "grad_norm": 4.503756999969482, + "learning_rate": 0.0033576150553916487, + "loss": 7.7556, + "step": 634600 + }, + { + "epoch": 2.585628314402137, + "grad_norm": 6.754344463348389, + "learning_rate": 0.003357148810897542, + "loss": 7.7561, + "step": 634700 + }, + { + "epoch": 2.5860356924255186, + "grad_norm": 3.8878188133239746, + "learning_rate": 0.0033566825327134404, + "loss": 7.7432, + "step": 634800 + }, + { + "epoch": 2.5864430704489, + "grad_norm": 3.3473639488220215, + "learning_rate": 0.00335621622085777, + "loss": 7.7797, + "step": 634900 + }, + { + "epoch": 2.5868504484722816, + "grad_norm": 3.869795799255371, + "learning_rate": 0.003355749875348968, + "loss": 7.7359, + "step": 635000 + }, + { + "epoch": 2.5868504484722816, + "eval_MaskedAccuracy": 0.5022740854793031, + "eval_loss": 1.6272011995315552, + "eval_runtime": 169.6625, + "eval_samples_per_second": 374.131, + "eval_steps_per_second": 1.462, + "step": 635000 + }, + { + "epoch": 2.587257826495663, + "grad_norm": 3.1247730255126953, + "learning_rate": 0.003355283496205473, + "loss": 7.7528, + "step": 635100 + }, + { + "epoch": 2.5876652045190442, + "grad_norm": 3.1590864658355713, + "learning_rate": 0.003354817083445727, + "loss": 7.7502, + "step": 635200 + }, + { + "epoch": 2.588072582542426, + "grad_norm": 11.75013542175293, + "learning_rate": 0.0033543506370881638, + "loss": 7.7724, + "step": 635300 + }, + { + "epoch": 2.5884799605658073, + "grad_norm": 10.730052947998047, + "learning_rate": 0.003353884157151228, + "loss": 7.7249, + "step": 635400 + }, + { + "epoch": 2.588887338589189, + "grad_norm": 5.237130641937256, + "learning_rate": 0.0033534176436533602, + "loss": 7.7701, + "step": 635500 + }, + { + "epoch": 2.5892947166125704, + "grad_norm": 5.012698650360107, + "learning_rate": 0.003352951096613007, + "loss": 7.7378, + "step": 635600 + }, + { + "epoch": 2.5897020946359515, + "grad_norm": 3.3394856452941895, + "learning_rate": 0.0033524845160486146, + "loss": 7.7727, + "step": 635700 + }, + { + "epoch": 2.590109472659333, + "grad_norm": 4.111293315887451, + "learning_rate": 0.0033520179019786243, + "loss": 7.7264, + "step": 635800 + }, + { + "epoch": 2.5905168506827145, + "grad_norm": 3.9242944717407227, + "learning_rate": 0.0033515512544214842, + "loss": 7.7592, + "step": 635900 + }, + { + "epoch": 2.590924228706096, + "grad_norm": 3.3268682956695557, + "learning_rate": 0.003351084573395647, + "loss": 7.7419, + "step": 636000 + }, + { + "epoch": 2.590924228706096, + "eval_MaskedAccuracy": 0.5030383245427981, + "eval_loss": 1.6388341188430786, + "eval_runtime": 155.5487, + "eval_samples_per_second": 408.078, + "eval_steps_per_second": 1.594, + "step": 636000 + }, + { + "epoch": 2.5913316067294776, + "grad_norm": 6.217716693878174, + "learning_rate": 0.0033506178589195615, + "loss": 7.7713, + "step": 636100 + }, + { + "epoch": 2.591738984752859, + "grad_norm": 3.2350013256073, + "learning_rate": 0.0033501511110116777, + "loss": 7.754, + "step": 636200 + }, + { + "epoch": 2.5921463627762407, + "grad_norm": 6.423516273498535, + "learning_rate": 0.0033496843296904506, + "loss": 7.7939, + "step": 636300 + }, + { + "epoch": 2.592553740799622, + "grad_norm": 5.533207893371582, + "learning_rate": 0.003349217514974333, + "loss": 7.768, + "step": 636400 + }, + { + "epoch": 2.5929611188230033, + "grad_norm": 3.31998610496521, + "learning_rate": 0.0033487506668817835, + "loss": 7.743, + "step": 636500 + }, + { + "epoch": 2.593368496846385, + "grad_norm": 3.193930149078369, + "learning_rate": 0.0033482837854312574, + "loss": 7.7569, + "step": 636600 + }, + { + "epoch": 2.5937758748697664, + "grad_norm": 7.648054599761963, + "learning_rate": 0.0033478168706412115, + "loss": 7.7484, + "step": 636700 + }, + { + "epoch": 2.594183252893148, + "grad_norm": 4.330843925476074, + "learning_rate": 0.0033473499225301073, + "loss": 7.7393, + "step": 636800 + }, + { + "epoch": 2.5945906309165294, + "grad_norm": 2.8680262565612793, + "learning_rate": 0.0033468829411164073, + "loss": 7.7703, + "step": 636900 + }, + { + "epoch": 2.5949980089399105, + "grad_norm": 3.8069069385528564, + "learning_rate": 0.0033464159264185705, + "loss": 7.736, + "step": 637000 + }, + { + "epoch": 2.5949980089399105, + "eval_MaskedAccuracy": 0.5022587825609062, + "eval_loss": 1.6388996839523315, + "eval_runtime": 185.8618, + "eval_samples_per_second": 341.523, + "eval_steps_per_second": 1.334, + "step": 637000 + }, + { + "epoch": 2.595405386963292, + "grad_norm": 5.330597877502441, + "learning_rate": 0.003345948878455062, + "loss": 7.7636, + "step": 637100 + }, + { + "epoch": 2.5958127649866736, + "grad_norm": 3.6598243713378906, + "learning_rate": 0.0033454817972443504, + "loss": 7.778, + "step": 637200 + }, + { + "epoch": 2.596220143010055, + "grad_norm": 2.821416139602661, + "learning_rate": 0.003345014682804897, + "loss": 7.7682, + "step": 637300 + }, + { + "epoch": 2.5966275210334366, + "grad_norm": 1.9124808311462402, + "learning_rate": 0.0033445475351551704, + "loss": 7.7222, + "step": 637400 + }, + { + "epoch": 2.597034899056818, + "grad_norm": 2.547819137573242, + "learning_rate": 0.0033440803543136385, + "loss": 7.7314, + "step": 637500 + }, + { + "epoch": 2.5974422770801997, + "grad_norm": 3.20186448097229, + "learning_rate": 0.00334361314029877, + "loss": 7.7568, + "step": 637600 + }, + { + "epoch": 2.597849655103581, + "grad_norm": 4.085344314575195, + "learning_rate": 0.0033431458931290426, + "loss": 7.7625, + "step": 637700 + }, + { + "epoch": 2.5982570331269623, + "grad_norm": 2.124180555343628, + "learning_rate": 0.003342678612822924, + "loss": 7.7627, + "step": 637800 + }, + { + "epoch": 2.598664411150344, + "grad_norm": 9.462471008300781, + "learning_rate": 0.0033422112993988884, + "loss": 7.7629, + "step": 637900 + }, + { + "epoch": 2.5990717891737254, + "grad_norm": 2.8271639347076416, + "learning_rate": 0.0033417439528754134, + "loss": 7.7276, + "step": 638000 + }, + { + "epoch": 2.5990717891737254, + "eval_MaskedAccuracy": 0.5034621723905387, + "eval_loss": 1.634547233581543, + "eval_runtime": 151.1659, + "eval_samples_per_second": 419.91, + "eval_steps_per_second": 1.641, + "step": 638000 + }, + { + "epoch": 2.599479167197107, + "grad_norm": 6.620189666748047, + "learning_rate": 0.0033412765732709765, + "loss": 7.7502, + "step": 638100 + }, + { + "epoch": 2.599886545220488, + "grad_norm": 2.910623788833618, + "learning_rate": 0.003340809160604051, + "loss": 7.7625, + "step": 638200 + }, + { + "epoch": 2.6002939232438695, + "grad_norm": 3.306079149246216, + "learning_rate": 0.0033403417148931197, + "loss": 7.7523, + "step": 638300 + }, + { + "epoch": 2.600701301267251, + "grad_norm": 4.400734901428223, + "learning_rate": 0.0033398742361566616, + "loss": 7.7739, + "step": 638400 + }, + { + "epoch": 2.6011086792906326, + "grad_norm": 3.394253969192505, + "learning_rate": 0.00333940672441316, + "loss": 7.742, + "step": 638500 + }, + { + "epoch": 2.601516057314014, + "grad_norm": 3.1092782020568848, + "learning_rate": 0.003338939179681099, + "loss": 7.7571, + "step": 638600 + }, + { + "epoch": 2.6019234353373957, + "grad_norm": 2.225964307785034, + "learning_rate": 0.00333847160197896, + "loss": 7.7552, + "step": 638700 + }, + { + "epoch": 2.602330813360777, + "grad_norm": 3.435316562652588, + "learning_rate": 0.00333800399132523, + "loss": 7.7401, + "step": 638800 + }, + { + "epoch": 2.6027381913841587, + "grad_norm": 10.924406051635742, + "learning_rate": 0.0033375363477384, + "loss": 7.7392, + "step": 638900 + }, + { + "epoch": 2.60314556940754, + "grad_norm": 4.353061199188232, + "learning_rate": 0.0033370686712369552, + "loss": 7.7537, + "step": 639000 + }, + { + "epoch": 2.60314556940754, + "eval_MaskedAccuracy": 0.5024711280498285, + "eval_loss": 1.6405030488967896, + "eval_runtime": 158.0755, + "eval_samples_per_second": 401.555, + "eval_steps_per_second": 1.569, + "step": 639000 + }, + { + "epoch": 2.6035529474309214, + "grad_norm": 5.042884826660156, + "learning_rate": 0.0033366009618393836, + "loss": 7.7651, + "step": 639100 + }, + { + "epoch": 2.603960325454303, + "grad_norm": 3.777416229248047, + "learning_rate": 0.0033361332195641826, + "loss": 7.7392, + "step": 639200 + }, + { + "epoch": 2.6043677034776844, + "grad_norm": 3.00793719291687, + "learning_rate": 0.0033356654444298356, + "loss": 7.7564, + "step": 639300 + }, + { + "epoch": 2.604775081501066, + "grad_norm": 5.785336017608643, + "learning_rate": 0.003335197636454842, + "loss": 7.7292, + "step": 639400 + }, + { + "epoch": 2.605182459524447, + "grad_norm": 1.8484314680099487, + "learning_rate": 0.0033347297956576975, + "loss": 7.7556, + "step": 639500 + }, + { + "epoch": 2.6055898375478286, + "grad_norm": 3.6654958724975586, + "learning_rate": 0.003334261922056897, + "loss": 7.7618, + "step": 639600 + }, + { + "epoch": 2.60599721557121, + "grad_norm": 2.9968793392181396, + "learning_rate": 0.003333794015670936, + "loss": 7.7455, + "step": 639700 + }, + { + "epoch": 2.6064045935945916, + "grad_norm": 2.7247536182403564, + "learning_rate": 0.003333326076518316, + "loss": 7.7662, + "step": 639800 + }, + { + "epoch": 2.606811971617973, + "grad_norm": 2.114069938659668, + "learning_rate": 0.003332858104617532, + "loss": 7.7438, + "step": 639900 + }, + { + "epoch": 2.6072193496413547, + "grad_norm": 3.1880838871002197, + "learning_rate": 0.003332390099987095, + "loss": 7.7387, + "step": 640000 + }, + { + "epoch": 2.6072193496413547, + "eval_MaskedAccuracy": 0.5025778422856783, + "eval_loss": 1.6369432210922241, + "eval_runtime": 156.6582, + "eval_samples_per_second": 405.188, + "eval_steps_per_second": 1.583, + "step": 640000 + }, + { + "epoch": 2.6076267276647362, + "grad_norm": 8.58343505859375, + "learning_rate": 0.0033319220626455014, + "loss": 7.775, + "step": 640100 + }, + { + "epoch": 2.6080341056881173, + "grad_norm": 4.06058406829834, + "learning_rate": 0.003331453992611252, + "loss": 7.7449, + "step": 640200 + }, + { + "epoch": 2.608441483711499, + "grad_norm": 4.162612438201904, + "learning_rate": 0.003330985889902857, + "loss": 7.7489, + "step": 640300 + }, + { + "epoch": 2.6088488617348804, + "grad_norm": 6.647921085357666, + "learning_rate": 0.0033305177545388223, + "loss": 7.7241, + "step": 640400 + }, + { + "epoch": 2.609256239758262, + "grad_norm": 2.6385583877563477, + "learning_rate": 0.003330049586537656, + "loss": 7.7574, + "step": 640500 + }, + { + "epoch": 2.6096636177816435, + "grad_norm": 3.011281967163086, + "learning_rate": 0.0033295813859178696, + "loss": 7.7616, + "step": 640600 + }, + { + "epoch": 2.6100709958050246, + "grad_norm": 6.106122970581055, + "learning_rate": 0.003329113152697967, + "loss": 7.7396, + "step": 640700 + }, + { + "epoch": 2.610478373828406, + "grad_norm": 3.221099853515625, + "learning_rate": 0.003328644886896465, + "loss": 7.7382, + "step": 640800 + }, + { + "epoch": 2.6108857518517876, + "grad_norm": 4.115914344787598, + "learning_rate": 0.003328176588531876, + "loss": 7.777, + "step": 640900 + }, + { + "epoch": 2.611293129875169, + "grad_norm": 10.452240943908691, + "learning_rate": 0.003327708257622711, + "loss": 7.7523, + "step": 641000 + }, + { + "epoch": 2.611293129875169, + "eval_MaskedAccuracy": 0.5026915635817152, + "eval_loss": 1.6318708658218384, + "eval_runtime": 153.341, + "eval_samples_per_second": 413.953, + "eval_steps_per_second": 1.617, + "step": 641000 + }, + { + "epoch": 2.6117005078985507, + "grad_norm": 3.1379988193511963, + "learning_rate": 0.003327239894187492, + "loss": 7.7272, + "step": 641100 + }, + { + "epoch": 2.612107885921932, + "grad_norm": 3.4544689655303955, + "learning_rate": 0.003326771498244731, + "loss": 7.749, + "step": 641200 + }, + { + "epoch": 2.6125152639453137, + "grad_norm": 3.4039297103881836, + "learning_rate": 0.0033263030698129485, + "loss": 7.7518, + "step": 641300 + }, + { + "epoch": 2.6129226419686953, + "grad_norm": 4.055280685424805, + "learning_rate": 0.003325834608910663, + "loss": 7.7408, + "step": 641400 + }, + { + "epoch": 2.6133300199920764, + "grad_norm": 3.9467811584472656, + "learning_rate": 0.0033253661155563936, + "loss": 7.7629, + "step": 641500 + }, + { + "epoch": 2.613737398015458, + "grad_norm": 4.436812877655029, + "learning_rate": 0.003324897589768664, + "loss": 7.7476, + "step": 641600 + }, + { + "epoch": 2.6141447760388394, + "grad_norm": 5.523942470550537, + "learning_rate": 0.003324429031566, + "loss": 7.763, + "step": 641700 + }, + { + "epoch": 2.614552154062221, + "grad_norm": 3.8268330097198486, + "learning_rate": 0.003323960440966925, + "loss": 7.7348, + "step": 641800 + }, + { + "epoch": 2.6149595320856025, + "grad_norm": 3.319409132003784, + "learning_rate": 0.0033234918179899623, + "loss": 7.7337, + "step": 641900 + }, + { + "epoch": 2.6153669101089836, + "grad_norm": 3.136472463607788, + "learning_rate": 0.0033230231626536428, + "loss": 7.7086, + "step": 642000 + }, + { + "epoch": 2.6153669101089836, + "eval_MaskedAccuracy": 0.5035448803106163, + "eval_loss": 1.6341197490692139, + "eval_runtime": 257.8807, + "eval_samples_per_second": 246.145, + "eval_steps_per_second": 0.962, + "step": 642000 + }, + { + "epoch": 2.615774288132365, + "grad_norm": 5.561611652374268, + "learning_rate": 0.0033225544749764914, + "loss": 7.7543, + "step": 642100 + }, + { + "epoch": 2.6161816661557467, + "grad_norm": 6.583291053771973, + "learning_rate": 0.0033220857549770377, + "loss": 7.7651, + "step": 642200 + }, + { + "epoch": 2.616589044179128, + "grad_norm": 4.528241157531738, + "learning_rate": 0.0033216170026738163, + "loss": 7.7199, + "step": 642300 + }, + { + "epoch": 2.6169964222025097, + "grad_norm": 10.77587890625, + "learning_rate": 0.0033211482180853535, + "loss": 7.7579, + "step": 642400 + }, + { + "epoch": 2.6174038002258913, + "grad_norm": 3.0642378330230713, + "learning_rate": 0.0033206794012301867, + "loss": 7.782, + "step": 642500 + }, + { + "epoch": 2.617811178249273, + "grad_norm": 7.352479934692383, + "learning_rate": 0.0033202105521268526, + "loss": 7.7674, + "step": 642600 + }, + { + "epoch": 2.618218556272654, + "grad_norm": 4.312407493591309, + "learning_rate": 0.003319741670793884, + "loss": 7.7521, + "step": 642700 + }, + { + "epoch": 2.6186259342960354, + "grad_norm": 4.083638668060303, + "learning_rate": 0.0033192727572498196, + "loss": 7.7394, + "step": 642800 + }, + { + "epoch": 2.619033312319417, + "grad_norm": 2.7038230895996094, + "learning_rate": 0.0033188038115131986, + "loss": 7.736, + "step": 642900 + }, + { + "epoch": 2.6194406903427985, + "grad_norm": 4.815116882324219, + "learning_rate": 0.0033183348336025603, + "loss": 7.7477, + "step": 643000 + }, + { + "epoch": 2.6194406903427985, + "eval_MaskedAccuracy": 0.5028943727183118, + "eval_loss": 1.6297352313995361, + "eval_runtime": 153.1634, + "eval_samples_per_second": 414.433, + "eval_steps_per_second": 1.619, + "step": 643000 + }, + { + "epoch": 2.61984806836618, + "grad_norm": 7.506451606750488, + "learning_rate": 0.0033178658235364466, + "loss": 7.7345, + "step": 643100 + }, + { + "epoch": 2.620255446389561, + "grad_norm": 3.9260177612304688, + "learning_rate": 0.003317396781333397, + "loss": 7.7586, + "step": 643200 + }, + { + "epoch": 2.6206628244129426, + "grad_norm": 3.6912059783935547, + "learning_rate": 0.0033169277070119576, + "loss": 7.7536, + "step": 643300 + }, + { + "epoch": 2.621070202436324, + "grad_norm": 3.7203218936920166, + "learning_rate": 0.0033164586005906744, + "loss": 7.7613, + "step": 643400 + }, + { + "epoch": 2.6214775804597057, + "grad_norm": 4.618086814880371, + "learning_rate": 0.0033159894620880923, + "loss": 7.7718, + "step": 643500 + }, + { + "epoch": 2.6218849584830872, + "grad_norm": 8.803803443908691, + "learning_rate": 0.0033155202915227584, + "loss": 7.7613, + "step": 643600 + }, + { + "epoch": 2.6222923365064688, + "grad_norm": 4.569681167602539, + "learning_rate": 0.0033150510889132243, + "loss": 7.7379, + "step": 643700 + }, + { + "epoch": 2.6226997145298503, + "grad_norm": 6.2448015213012695, + "learning_rate": 0.0033145818542780385, + "loss": 7.7458, + "step": 643800 + }, + { + "epoch": 2.623107092553232, + "grad_norm": 6.161468505859375, + "learning_rate": 0.0033141125876357514, + "loss": 7.7255, + "step": 643900 + }, + { + "epoch": 2.623514470576613, + "grad_norm": 4.657154560089111, + "learning_rate": 0.0033136432890049166, + "loss": 7.7212, + "step": 644000 + }, + { + "epoch": 2.623514470576613, + "eval_MaskedAccuracy": 0.5034150424478206, + "eval_loss": 1.629897117614746, + "eval_runtime": 150.5706, + "eval_samples_per_second": 421.57, + "eval_steps_per_second": 1.647, + "step": 644000 + }, + { + "epoch": 2.6239218485999944, + "grad_norm": 9.369250297546387, + "learning_rate": 0.003313173958404086, + "loss": 7.7378, + "step": 644100 + }, + { + "epoch": 2.624329226623376, + "grad_norm": 3.7428979873657227, + "learning_rate": 0.0033127045958518168, + "loss": 7.7266, + "step": 644200 + }, + { + "epoch": 2.6247366046467575, + "grad_norm": 3.641878604888916, + "learning_rate": 0.003312235201366667, + "loss": 7.727, + "step": 644300 + }, + { + "epoch": 2.625143982670139, + "grad_norm": 7.004256248474121, + "learning_rate": 0.003311765774967191, + "loss": 7.7331, + "step": 644400 + }, + { + "epoch": 2.62555136069352, + "grad_norm": 8.118968963623047, + "learning_rate": 0.003311296316671948, + "loss": 7.7447, + "step": 644500 + }, + { + "epoch": 2.6259587387169017, + "grad_norm": 5.203165054321289, + "learning_rate": 0.003310826826499498, + "loss": 7.7414, + "step": 644600 + }, + { + "epoch": 2.626366116740283, + "grad_norm": 4.296602725982666, + "learning_rate": 0.0033103573044684083, + "loss": 7.7464, + "step": 644700 + }, + { + "epoch": 2.6267734947636647, + "grad_norm": 2.9566855430603027, + "learning_rate": 0.003309887750597233, + "loss": 7.739, + "step": 644800 + }, + { + "epoch": 2.6271808727870463, + "grad_norm": 6.859632968902588, + "learning_rate": 0.0033094181649045424, + "loss": 7.7531, + "step": 644900 + }, + { + "epoch": 2.627588250810428, + "grad_norm": 5.904131889343262, + "learning_rate": 0.0033089485474088985, + "loss": 7.7462, + "step": 645000 + }, + { + "epoch": 2.627588250810428, + "eval_MaskedAccuracy": 0.5030494847748717, + "eval_loss": 1.6355324983596802, + "eval_runtime": 198.0358, + "eval_samples_per_second": 320.528, + "eval_steps_per_second": 1.252, + "step": 645000 + }, + { + "epoch": 2.6279956288338093, + "grad_norm": 6.6715593338012695, + "learning_rate": 0.003308478898128868, + "loss": 7.7265, + "step": 645100 + }, + { + "epoch": 2.6284030068571904, + "grad_norm": 9.638071060180664, + "learning_rate": 0.0033080092170830176, + "loss": 7.7136, + "step": 645200 + }, + { + "epoch": 2.628810384880572, + "grad_norm": 4.502671241760254, + "learning_rate": 0.0033075395042899223, + "loss": 7.7503, + "step": 645300 + }, + { + "epoch": 2.6292177629039535, + "grad_norm": 3.2229530811309814, + "learning_rate": 0.003307069759768146, + "loss": 7.7282, + "step": 645400 + }, + { + "epoch": 2.629625140927335, + "grad_norm": 4.425695896148682, + "learning_rate": 0.0033065999835362636, + "loss": 7.7401, + "step": 645500 + }, + { + "epoch": 2.6300325189507165, + "grad_norm": 4.474765300750732, + "learning_rate": 0.003306130175612846, + "loss": 7.7369, + "step": 645600 + }, + { + "epoch": 2.6304398969740976, + "grad_norm": 3.017754554748535, + "learning_rate": 0.00330566033601647, + "loss": 7.7343, + "step": 645700 + }, + { + "epoch": 2.630847274997479, + "grad_norm": 4.0706706047058105, + "learning_rate": 0.0033051904647657076, + "loss": 7.7278, + "step": 645800 + }, + { + "epoch": 2.6312546530208607, + "grad_norm": 3.418870449066162, + "learning_rate": 0.0033047205618791375, + "loss": 7.7664, + "step": 645900 + }, + { + "epoch": 2.6316620310442422, + "grad_norm": 4.286476135253906, + "learning_rate": 0.003304250627375334, + "loss": 7.7521, + "step": 646000 + }, + { + "epoch": 2.6316620310442422, + "eval_MaskedAccuracy": 0.5034081972867275, + "eval_loss": 1.6310018301010132, + "eval_runtime": 189.7994, + "eval_samples_per_second": 334.437, + "eval_steps_per_second": 1.307, + "step": 646000 + }, + { + "epoch": 2.6320694090676238, + "grad_norm": 3.2341248989105225, + "learning_rate": 0.0033037806612728816, + "loss": 7.7632, + "step": 646100 + }, + { + "epoch": 2.6324767870910053, + "grad_norm": 4.886885166168213, + "learning_rate": 0.0033033106635903535, + "loss": 7.7327, + "step": 646200 + }, + { + "epoch": 2.632884165114387, + "grad_norm": 5.385007381439209, + "learning_rate": 0.0033028406343463346, + "loss": 7.7425, + "step": 646300 + }, + { + "epoch": 2.6332915431377684, + "grad_norm": 4.208031177520752, + "learning_rate": 0.0033023705735594116, + "loss": 7.7504, + "step": 646400 + }, + { + "epoch": 2.6336989211611495, + "grad_norm": 5.101689338684082, + "learning_rate": 0.003301900481248163, + "loss": 7.7287, + "step": 646500 + }, + { + "epoch": 2.634106299184531, + "grad_norm": 4.291695594787598, + "learning_rate": 0.003301430357431178, + "loss": 7.7218, + "step": 646600 + }, + { + "epoch": 2.6345136772079125, + "grad_norm": 4.769300937652588, + "learning_rate": 0.003300960202127043, + "loss": 7.7584, + "step": 646700 + }, + { + "epoch": 2.634921055231294, + "grad_norm": 3.5957601070404053, + "learning_rate": 0.003300490015354342, + "loss": 7.741, + "step": 646800 + }, + { + "epoch": 2.6353284332546756, + "grad_norm": 2.1788899898529053, + "learning_rate": 0.0033000197971316636, + "loss": 7.7314, + "step": 646900 + }, + { + "epoch": 2.6357358112780567, + "grad_norm": 2.3784098625183105, + "learning_rate": 0.003299549547477601, + "loss": 7.766, + "step": 647000 + }, + { + "epoch": 2.6357358112780567, + "eval_MaskedAccuracy": 0.502821646416057, + "eval_loss": 1.6343514919281006, + "eval_runtime": 153.2288, + "eval_samples_per_second": 414.256, + "eval_steps_per_second": 1.618, + "step": 647000 + }, + { + "epoch": 2.636143189301438, + "grad_norm": 5.536612033843994, + "learning_rate": 0.003299079266410749, + "loss": 7.7294, + "step": 647100 + }, + { + "epoch": 2.6365505673248197, + "grad_norm": 3.7842025756835938, + "learning_rate": 0.0032986089539496946, + "loss": 7.7352, + "step": 647200 + }, + { + "epoch": 2.6369579453482013, + "grad_norm": 3.101235866546631, + "learning_rate": 0.0032981386101130347, + "loss": 7.7491, + "step": 647300 + }, + { + "epoch": 2.637365323371583, + "grad_norm": 4.833268165588379, + "learning_rate": 0.0032976682349193627, + "loss": 7.7601, + "step": 647400 + }, + { + "epoch": 2.6377727013949643, + "grad_norm": 3.6589972972869873, + "learning_rate": 0.0032971978283872776, + "loss": 7.7311, + "step": 647500 + }, + { + "epoch": 2.638180079418346, + "grad_norm": 3.490777015686035, + "learning_rate": 0.003296727390535375, + "loss": 7.7495, + "step": 647600 + }, + { + "epoch": 2.638587457441727, + "grad_norm": 2.925628900527954, + "learning_rate": 0.0032962569213822487, + "loss": 7.7641, + "step": 647700 + }, + { + "epoch": 2.6389948354651085, + "grad_norm": 2.6034014225006104, + "learning_rate": 0.0032957864209465088, + "loss": 7.7402, + "step": 647800 + }, + { + "epoch": 2.63940221348849, + "grad_norm": 3.1383039951324463, + "learning_rate": 0.003295315889246752, + "loss": 7.7217, + "step": 647900 + }, + { + "epoch": 2.6398095915118716, + "grad_norm": 2.6625351905822754, + "learning_rate": 0.0032948453263015784, + "loss": 7.7506, + "step": 648000 + }, + { + "epoch": 2.6398095915118716, + "eval_MaskedAccuracy": 0.5032627812556223, + "eval_loss": 1.633973240852356, + "eval_runtime": 157.4257, + "eval_samples_per_second": 403.212, + "eval_steps_per_second": 1.575, + "step": 648000 + }, + { + "epoch": 2.640216969535253, + "grad_norm": 4.2527031898498535, + "learning_rate": 0.0032943747321295984, + "loss": 7.7224, + "step": 648100 + }, + { + "epoch": 2.640624347558634, + "grad_norm": 4.225713729858398, + "learning_rate": 0.0032939041067494117, + "loss": 7.7487, + "step": 648200 + }, + { + "epoch": 2.6410317255820157, + "grad_norm": 4.850101947784424, + "learning_rate": 0.0032934334501796268, + "loss": 7.7383, + "step": 648300 + }, + { + "epoch": 2.6414391036053972, + "grad_norm": 3.3083293437957764, + "learning_rate": 0.003292962762438849, + "loss": 7.7351, + "step": 648400 + }, + { + "epoch": 2.6418464816287788, + "grad_norm": 5.9677042961120605, + "learning_rate": 0.0032924920435456894, + "loss": 7.7441, + "step": 648500 + }, + { + "epoch": 2.6422538596521603, + "grad_norm": 4.312749862670898, + "learning_rate": 0.0032920212935187584, + "loss": 7.7333, + "step": 648600 + }, + { + "epoch": 2.642661237675542, + "grad_norm": 4.137760162353516, + "learning_rate": 0.0032915505123766687, + "loss": 7.734, + "step": 648700 + }, + { + "epoch": 2.6430686156989234, + "grad_norm": 4.529685974121094, + "learning_rate": 0.003291079700138029, + "loss": 7.7682, + "step": 648800 + }, + { + "epoch": 2.643475993722305, + "grad_norm": 2.5717289447784424, + "learning_rate": 0.003290608856821454, + "loss": 7.7284, + "step": 648900 + }, + { + "epoch": 2.643883371745686, + "grad_norm": 7.729617595672607, + "learning_rate": 0.0032901379824455566, + "loss": 7.7786, + "step": 649000 + }, + { + "epoch": 2.643883371745686, + "eval_MaskedAccuracy": 0.5025726633766079, + "eval_loss": 1.6284892559051514, + "eval_runtime": 158.623, + "eval_samples_per_second": 400.169, + "eval_steps_per_second": 1.563, + "step": 649000 + }, + { + "epoch": 2.6442907497690675, + "grad_norm": 5.148902416229248, + "learning_rate": 0.0032896670770289588, + "loss": 7.7386, + "step": 649100 + }, + { + "epoch": 2.644698127792449, + "grad_norm": 3.575874090194702, + "learning_rate": 0.003289196140590273, + "loss": 7.7559, + "step": 649200 + }, + { + "epoch": 2.6451055058158306, + "grad_norm": 4.578037738800049, + "learning_rate": 0.003288725173148119, + "loss": 7.7321, + "step": 649300 + }, + { + "epoch": 2.645512883839212, + "grad_norm": 3.5895893573760986, + "learning_rate": 0.003288254174721118, + "loss": 7.7578, + "step": 649400 + }, + { + "epoch": 2.645920261862593, + "grad_norm": 2.84005069732666, + "learning_rate": 0.003287783145327894, + "loss": 7.7407, + "step": 649500 + }, + { + "epoch": 2.6463276398859747, + "grad_norm": 2.76251482963562, + "learning_rate": 0.003287312084987063, + "loss": 7.7272, + "step": 649600 + }, + { + "epoch": 2.6467350179093563, + "grad_norm": 5.468085289001465, + "learning_rate": 0.003286840993717252, + "loss": 7.7274, + "step": 649700 + }, + { + "epoch": 2.647142395932738, + "grad_norm": 4.382936954498291, + "learning_rate": 0.003286369871537083, + "loss": 7.7303, + "step": 649800 + }, + { + "epoch": 2.6475497739561193, + "grad_norm": 5.057107448577881, + "learning_rate": 0.0032858987184651816, + "loss": 7.7261, + "step": 649900 + }, + { + "epoch": 2.647957151979501, + "grad_norm": 3.6791536808013916, + "learning_rate": 0.0032854275345201777, + "loss": 7.7577, + "step": 650000 + }, + { + "epoch": 2.647957151979501, + "eval_MaskedAccuracy": 0.5033928448150526, + "eval_loss": 1.6409040689468384, + "eval_runtime": 168.8179, + "eval_samples_per_second": 376.003, + "eval_steps_per_second": 1.469, + "step": 650000 + }, + { + "epoch": 2.6483645300028824, + "grad_norm": 4.990233898162842, + "learning_rate": 0.0032849563197207016, + "loss": 7.7639, + "step": 650100 + }, + { + "epoch": 2.6487719080262635, + "grad_norm": 3.060655117034912, + "learning_rate": 0.003284485074085381, + "loss": 7.7528, + "step": 650200 + }, + { + "epoch": 2.649179286049645, + "grad_norm": 4.236000061035156, + "learning_rate": 0.0032840137976328453, + "loss": 7.7395, + "step": 650300 + }, + { + "epoch": 2.6495866640730266, + "grad_norm": 4.522440433502197, + "learning_rate": 0.0032835424903817283, + "loss": 7.7816, + "step": 650400 + }, + { + "epoch": 2.649994042096408, + "grad_norm": 3.9909865856170654, + "learning_rate": 0.0032830711523506666, + "loss": 7.7378, + "step": 650500 + }, + { + "epoch": 2.6504014201197896, + "grad_norm": 3.386911392211914, + "learning_rate": 0.003282599783558287, + "loss": 7.7474, + "step": 650600 + }, + { + "epoch": 2.6508087981431707, + "grad_norm": 2.55659818649292, + "learning_rate": 0.0032821283840232278, + "loss": 7.7695, + "step": 650700 + }, + { + "epoch": 2.6512161761665523, + "grad_norm": 5.9968061447143555, + "learning_rate": 0.0032816569537641284, + "loss": 7.7578, + "step": 650800 + }, + { + "epoch": 2.651623554189934, + "grad_norm": 6.817568302154541, + "learning_rate": 0.00328118549279963, + "loss": 7.7295, + "step": 650900 + }, + { + "epoch": 2.6520309322133153, + "grad_norm": 9.927818298339844, + "learning_rate": 0.0032807140011483628, + "loss": 7.7438, + "step": 651000 + }, + { + "epoch": 2.6520309322133153, + "eval_MaskedAccuracy": 0.5034037926576029, + "eval_loss": 1.6238354444503784, + "eval_runtime": 161.4741, + "eval_samples_per_second": 393.103, + "eval_steps_per_second": 1.536, + "step": 651000 + }, + { + "epoch": 2.652438310236697, + "grad_norm": 4.887489318847656, + "learning_rate": 0.003280242478828971, + "loss": 7.7562, + "step": 651100 + }, + { + "epoch": 2.6528456882600784, + "grad_norm": 6.656474590301514, + "learning_rate": 0.003279770925860098, + "loss": 7.7341, + "step": 651200 + }, + { + "epoch": 2.65325306628346, + "grad_norm": 4.37863302230835, + "learning_rate": 0.003279299342260389, + "loss": 7.7249, + "step": 651300 + }, + { + "epoch": 2.6536604443068414, + "grad_norm": 3.029027223587036, + "learning_rate": 0.003278827728048482, + "loss": 7.7519, + "step": 651400 + }, + { + "epoch": 2.6540678223302225, + "grad_norm": 5.147206783294678, + "learning_rate": 0.0032783560832430284, + "loss": 7.736, + "step": 651500 + }, + { + "epoch": 2.654475200353604, + "grad_norm": 4.042923450469971, + "learning_rate": 0.0032778844078626738, + "loss": 7.7345, + "step": 651600 + }, + { + "epoch": 2.6548825783769856, + "grad_norm": 5.513915061950684, + "learning_rate": 0.0032774127019260637, + "loss": 7.7615, + "step": 651700 + }, + { + "epoch": 2.655289956400367, + "grad_norm": 3.6763839721679688, + "learning_rate": 0.00327694096545185, + "loss": 7.7258, + "step": 651800 + }, + { + "epoch": 2.6556973344237487, + "grad_norm": 2.0926759243011475, + "learning_rate": 0.0032764691984586824, + "loss": 7.7838, + "step": 651900 + }, + { + "epoch": 2.6561047124471298, + "grad_norm": 8.465835571289062, + "learning_rate": 0.003275997400965211, + "loss": 7.7634, + "step": 652000 + }, + { + "epoch": 2.6561047124471298, + "eval_MaskedAccuracy": 0.5026738480989897, + "eval_loss": 1.6305789947509766, + "eval_runtime": 158.411, + "eval_samples_per_second": 400.705, + "eval_steps_per_second": 1.566, + "step": 652000 + }, + { + "epoch": 2.6565120904705113, + "grad_norm": 4.270816326141357, + "learning_rate": 0.003275525572990089, + "loss": 7.7325, + "step": 652100 + }, + { + "epoch": 2.656919468493893, + "grad_norm": 3.6876659393310547, + "learning_rate": 0.00327505371455197, + "loss": 7.7261, + "step": 652200 + }, + { + "epoch": 2.6573268465172744, + "grad_norm": 3.0237393379211426, + "learning_rate": 0.0032745818256695093, + "loss": 7.7233, + "step": 652300 + }, + { + "epoch": 2.657734224540656, + "grad_norm": 4.164248943328857, + "learning_rate": 0.0032741099063613587, + "loss": 7.7374, + "step": 652400 + }, + { + "epoch": 2.6581416025640374, + "grad_norm": 5.732575416564941, + "learning_rate": 0.003273637956646182, + "loss": 7.7519, + "step": 652500 + }, + { + "epoch": 2.658548980587419, + "grad_norm": 3.6276843547821045, + "learning_rate": 0.0032731659765426343, + "loss": 7.7491, + "step": 652600 + }, + { + "epoch": 2.6589563586108, + "grad_norm": 3.634322166442871, + "learning_rate": 0.0032726939660693787, + "loss": 7.744, + "step": 652700 + }, + { + "epoch": 2.6593637366341816, + "grad_norm": 6.3059563636779785, + "learning_rate": 0.003272221925245076, + "loss": 7.7371, + "step": 652800 + }, + { + "epoch": 2.659771114657563, + "grad_norm": 6.109255313873291, + "learning_rate": 0.00327174985408839, + "loss": 7.746, + "step": 652900 + }, + { + "epoch": 2.6601784926809446, + "grad_norm": 6.7755208015441895, + "learning_rate": 0.0032712777526179783, + "loss": 7.7462, + "step": 653000 + }, + { + "epoch": 2.6601784926809446, + "eval_MaskedAccuracy": 0.5039142799604575, + "eval_loss": 1.636847734451294, + "eval_runtime": 158.8607, + "eval_samples_per_second": 399.57, + "eval_steps_per_second": 1.561, + "step": 653000 + }, + { + "epoch": 2.660585870704326, + "grad_norm": 4.314255237579346, + "learning_rate": 0.003270805620852508, + "loss": 7.7527, + "step": 653100 + }, + { + "epoch": 2.6609932487277073, + "grad_norm": 3.098864793777466, + "learning_rate": 0.003270333458810645, + "loss": 7.7721, + "step": 653200 + }, + { + "epoch": 2.661400626751089, + "grad_norm": 3.9847843647003174, + "learning_rate": 0.003269861266511057, + "loss": 7.7402, + "step": 653300 + }, + { + "epoch": 2.6618080047744703, + "grad_norm": 12.476365089416504, + "learning_rate": 0.003269389043972412, + "loss": 7.7257, + "step": 653400 + }, + { + "epoch": 2.662215382797852, + "grad_norm": 2.452924966812134, + "learning_rate": 0.003268916791213377, + "loss": 7.7618, + "step": 653500 + }, + { + "epoch": 2.6626227608212334, + "grad_norm": 4.939070224761963, + "learning_rate": 0.00326844450825263, + "loss": 7.7161, + "step": 653600 + }, + { + "epoch": 2.663030138844615, + "grad_norm": 4.919296741485596, + "learning_rate": 0.003267972195108836, + "loss": 7.7501, + "step": 653700 + }, + { + "epoch": 2.6634375168679965, + "grad_norm": 2.7469310760498047, + "learning_rate": 0.003267499851800673, + "loss": 7.7293, + "step": 653800 + }, + { + "epoch": 2.663844894891378, + "grad_norm": 4.047046184539795, + "learning_rate": 0.003267027478346811, + "loss": 7.7394, + "step": 653900 + }, + { + "epoch": 2.664252272914759, + "grad_norm": 3.699885368347168, + "learning_rate": 0.0032665550747659282, + "loss": 7.7607, + "step": 654000 + }, + { + "epoch": 2.664252272914759, + "eval_MaskedAccuracy": 0.5038251701037562, + "eval_loss": 1.6308118104934692, + "eval_runtime": 156.6225, + "eval_samples_per_second": 405.28, + "eval_steps_per_second": 1.583, + "step": 654000 + }, + { + "epoch": 2.6646596509381406, + "grad_norm": 2.383355140686035, + "learning_rate": 0.003266082641076702, + "loss": 7.721, + "step": 654100 + }, + { + "epoch": 2.665067028961522, + "grad_norm": 5.599305152893066, + "learning_rate": 0.003265610177297807, + "loss": 7.7281, + "step": 654200 + }, + { + "epoch": 2.6654744069849037, + "grad_norm": 9.50638484954834, + "learning_rate": 0.003265137683447925, + "loss": 7.7655, + "step": 654300 + }, + { + "epoch": 2.665881785008285, + "grad_norm": 5.803840637207031, + "learning_rate": 0.0032646651595457307, + "loss": 7.7634, + "step": 654400 + }, + { + "epoch": 2.6662891630316663, + "grad_norm": 2.774467945098877, + "learning_rate": 0.003264192605609913, + "loss": 7.7026, + "step": 654500 + }, + { + "epoch": 2.666696541055048, + "grad_norm": 3.172819137573242, + "learning_rate": 0.003263720021659151, + "loss": 7.7291, + "step": 654600 + }, + { + "epoch": 2.6671039190784294, + "grad_norm": 6.5416975021362305, + "learning_rate": 0.003263247407712128, + "loss": 7.7583, + "step": 654700 + }, + { + "epoch": 2.667511297101811, + "grad_norm": 2.694756507873535, + "learning_rate": 0.003262774763787526, + "loss": 7.7231, + "step": 654800 + }, + { + "epoch": 2.6679186751251924, + "grad_norm": 7.714715480804443, + "learning_rate": 0.003262302089904039, + "loss": 7.7305, + "step": 654900 + }, + { + "epoch": 2.668326053148574, + "grad_norm": 3.5213520526885986, + "learning_rate": 0.0032618293860803473, + "loss": 7.7225, + "step": 655000 + }, + { + "epoch": 2.668326053148574, + "eval_MaskedAccuracy": 0.5033319548576939, + "eval_loss": 1.6350668668746948, + "eval_runtime": 157.4558, + "eval_samples_per_second": 403.135, + "eval_steps_per_second": 1.575, + "step": 655000 + }, + { + "epoch": 2.6687334311719555, + "grad_norm": 6.717520713806152, + "learning_rate": 0.0032613566523351446, + "loss": 7.7301, + "step": 655100 + }, + { + "epoch": 2.6691408091953366, + "grad_norm": 1.9514811038970947, + "learning_rate": 0.003260883888687118, + "loss": 7.7179, + "step": 655200 + }, + { + "epoch": 2.669548187218718, + "grad_norm": 6.654797077178955, + "learning_rate": 0.0032604110951549594, + "loss": 7.733, + "step": 655300 + }, + { + "epoch": 2.6699555652420996, + "grad_norm": 3.4241044521331787, + "learning_rate": 0.0032599382717573597, + "loss": 7.7225, + "step": 655400 + }, + { + "epoch": 2.670362943265481, + "grad_norm": 3.2960762977600098, + "learning_rate": 0.003259465418513011, + "loss": 7.7223, + "step": 655500 + }, + { + "epoch": 2.6707703212888627, + "grad_norm": 4.2361674308776855, + "learning_rate": 0.0032589925354406094, + "loss": 7.741, + "step": 655600 + }, + { + "epoch": 2.671177699312244, + "grad_norm": 4.492542266845703, + "learning_rate": 0.0032585196225588483, + "loss": 7.7582, + "step": 655700 + }, + { + "epoch": 2.6715850773356253, + "grad_norm": 3.8159728050231934, + "learning_rate": 0.003258046679886431, + "loss": 7.7834, + "step": 655800 + }, + { + "epoch": 2.671992455359007, + "grad_norm": 8.345585823059082, + "learning_rate": 0.0032575737074420495, + "loss": 7.7202, + "step": 655900 + }, + { + "epoch": 2.6723998333823884, + "grad_norm": 6.983959674835205, + "learning_rate": 0.0032571007052444047, + "loss": 7.7417, + "step": 656000 + }, + { + "epoch": 2.6723998333823884, + "eval_MaskedAccuracy": 0.5041212221981818, + "eval_loss": 1.6257683038711548, + "eval_runtime": 167.801, + "eval_samples_per_second": 378.281, + "eval_steps_per_second": 1.478, + "step": 656000 + }, + { + "epoch": 2.67280721140577, + "grad_norm": 3.0840566158294678, + "learning_rate": 0.0032566276733121966, + "loss": 7.7297, + "step": 656100 + }, + { + "epoch": 2.6732145894291515, + "grad_norm": 5.3380279541015625, + "learning_rate": 0.0032561546116641263, + "loss": 7.7449, + "step": 656200 + }, + { + "epoch": 2.673621967452533, + "grad_norm": 6.754693508148193, + "learning_rate": 0.003255681520318896, + "loss": 7.7088, + "step": 656300 + }, + { + "epoch": 2.6740293454759145, + "grad_norm": 5.879538059234619, + "learning_rate": 0.0032552083992952096, + "loss": 7.74, + "step": 656400 + }, + { + "epoch": 2.6744367234992956, + "grad_norm": 5.780153274536133, + "learning_rate": 0.0032547352486117747, + "loss": 7.7345, + "step": 656500 + }, + { + "epoch": 2.674844101522677, + "grad_norm": 4.357320785522461, + "learning_rate": 0.0032542620682872946, + "loss": 7.7305, + "step": 656600 + }, + { + "epoch": 2.6752514795460587, + "grad_norm": 3.5099992752075195, + "learning_rate": 0.003253788858340474, + "loss": 7.7745, + "step": 656700 + }, + { + "epoch": 2.67565885756944, + "grad_norm": 6.600289344787598, + "learning_rate": 0.003253315618790031, + "loss": 7.738, + "step": 656800 + }, + { + "epoch": 2.6760662355928218, + "grad_norm": 10.957653999328613, + "learning_rate": 0.00325284234965467, + "loss": 7.7517, + "step": 656900 + }, + { + "epoch": 2.676473613616203, + "grad_norm": 4.545962333679199, + "learning_rate": 0.0032523690509530974, + "loss": 7.7583, + "step": 657000 + }, + { + "epoch": 2.676473613616203, + "eval_MaskedAccuracy": 0.5034654201527211, + "eval_loss": 1.6379927396774292, + "eval_runtime": 226.1231, + "eval_samples_per_second": 280.714, + "eval_steps_per_second": 1.097, + "step": 657000 + }, + { + "epoch": 2.6768809916395844, + "grad_norm": 2.678373336791992, + "learning_rate": 0.003251895722704028, + "loss": 7.7635, + "step": 657100 + }, + { + "epoch": 2.677288369662966, + "grad_norm": 5.181834697723389, + "learning_rate": 0.0032514223649261777, + "loss": 7.7028, + "step": 657200 + }, + { + "epoch": 2.6776957476863474, + "grad_norm": 9.015517234802246, + "learning_rate": 0.0032509489776382564, + "loss": 7.7647, + "step": 657300 + }, + { + "epoch": 2.678103125709729, + "grad_norm": 4.9846720695495605, + "learning_rate": 0.003250475560858984, + "loss": 7.7519, + "step": 657400 + }, + { + "epoch": 2.6785105037331105, + "grad_norm": 4.39346981048584, + "learning_rate": 0.0032500021146070752, + "loss": 7.7089, + "step": 657500 + }, + { + "epoch": 2.678917881756492, + "grad_norm": 6.527012825012207, + "learning_rate": 0.003249528638901249, + "loss": 7.751, + "step": 657600 + }, + { + "epoch": 2.679325259779873, + "grad_norm": 5.306674003601074, + "learning_rate": 0.003249055133760224, + "loss": 7.7623, + "step": 657700 + }, + { + "epoch": 2.6797326378032547, + "grad_norm": 2.8503808975219727, + "learning_rate": 0.0032485815992027212, + "loss": 7.7665, + "step": 657800 + }, + { + "epoch": 2.680140015826636, + "grad_norm": 5.299534797668457, + "learning_rate": 0.003248108035247459, + "loss": 7.7183, + "step": 657900 + }, + { + "epoch": 2.6805473938500177, + "grad_norm": 5.834193706512451, + "learning_rate": 0.0032476344419131633, + "loss": 7.7479, + "step": 658000 + }, + { + "epoch": 2.6805473938500177, + "eval_MaskedAccuracy": 0.5036503069933103, + "eval_loss": 1.630007028579712, + "eval_runtime": 175.1958, + "eval_samples_per_second": 362.315, + "eval_steps_per_second": 1.416, + "step": 658000 + }, + { + "epoch": 2.6809547718733993, + "grad_norm": 3.148181438446045, + "learning_rate": 0.003247160819218553, + "loss": 7.7387, + "step": 658100 + }, + { + "epoch": 2.6813621498967803, + "grad_norm": 4.28448486328125, + "learning_rate": 0.0032466871671823585, + "loss": 7.7497, + "step": 658200 + }, + { + "epoch": 2.681769527920162, + "grad_norm": 6.779094696044922, + "learning_rate": 0.0032462134858233034, + "loss": 7.7273, + "step": 658300 + }, + { + "epoch": 2.6821769059435434, + "grad_norm": 3.783496856689453, + "learning_rate": 0.003245739775160116, + "loss": 7.6995, + "step": 658400 + }, + { + "epoch": 2.682584283966925, + "grad_norm": 2.3883776664733887, + "learning_rate": 0.003245266035211523, + "loss": 7.7321, + "step": 658500 + }, + { + "epoch": 2.6829916619903065, + "grad_norm": 9.007153511047363, + "learning_rate": 0.003244792265996254, + "loss": 7.7263, + "step": 658600 + }, + { + "epoch": 2.683399040013688, + "grad_norm": 4.153413772583008, + "learning_rate": 0.0032443184675330404, + "loss": 7.7335, + "step": 658700 + }, + { + "epoch": 2.6838064180370695, + "grad_norm": 5.07761287689209, + "learning_rate": 0.0032438446398406133, + "loss": 7.7478, + "step": 658800 + }, + { + "epoch": 2.684213796060451, + "grad_norm": 5.622042655944824, + "learning_rate": 0.0032433707829377044, + "loss": 7.7436, + "step": 658900 + }, + { + "epoch": 2.684621174083832, + "grad_norm": 2.3242077827453613, + "learning_rate": 0.0032428968968430507, + "loss": 7.7153, + "step": 659000 + }, + { + "epoch": 2.684621174083832, + "eval_MaskedAccuracy": 0.5030991623659978, + "eval_loss": 1.6299339532852173, + "eval_runtime": 160.5434, + "eval_samples_per_second": 395.382, + "eval_steps_per_second": 1.545, + "step": 659000 + }, + { + "epoch": 2.6850285521072137, + "grad_norm": 3.4865944385528564, + "learning_rate": 0.003242422981575384, + "loss": 7.7329, + "step": 659100 + }, + { + "epoch": 2.6854359301305952, + "grad_norm": 2.9826807975769043, + "learning_rate": 0.0032419490371534436, + "loss": 7.7384, + "step": 659200 + }, + { + "epoch": 2.6858433081539768, + "grad_norm": 6.728859901428223, + "learning_rate": 0.003241475063595969, + "loss": 7.739, + "step": 659300 + }, + { + "epoch": 2.6862506861773583, + "grad_norm": 3.132185697555542, + "learning_rate": 0.0032410010609216972, + "loss": 7.7441, + "step": 659400 + }, + { + "epoch": 2.6866580642007394, + "grad_norm": 4.959762096405029, + "learning_rate": 0.003240527029149367, + "loss": 7.7033, + "step": 659500 + }, + { + "epoch": 2.687065442224121, + "grad_norm": 3.4063496589660645, + "learning_rate": 0.003240052968297717, + "loss": 7.7615, + "step": 659600 + }, + { + "epoch": 2.6874728202475024, + "grad_norm": 10.797226905822754, + "learning_rate": 0.0032395788783854945, + "loss": 7.7321, + "step": 659700 + }, + { + "epoch": 2.687880198270884, + "grad_norm": 11.4490385055542, + "learning_rate": 0.003239104759431438, + "loss": 7.7274, + "step": 659800 + }, + { + "epoch": 2.6882875762942655, + "grad_norm": 3.827448844909668, + "learning_rate": 0.0032386306114542927, + "loss": 7.7157, + "step": 659900 + }, + { + "epoch": 2.688694954317647, + "grad_norm": 3.8741378784179688, + "learning_rate": 0.003238156434472808, + "loss": 7.7169, + "step": 660000 + }, + { + "epoch": 2.688694954317647, + "eval_MaskedAccuracy": 0.503351968900708, + "eval_loss": 1.6341768503189087, + "eval_runtime": 159.4772, + "eval_samples_per_second": 398.026, + "eval_steps_per_second": 1.555, + "step": 660000 + }, + { + "epoch": 2.6891023323410286, + "grad_norm": 106.6436538696289, + "learning_rate": 0.003237682228505726, + "loss": 7.7472, + "step": 660100 + }, + { + "epoch": 2.6895097103644097, + "grad_norm": 4.269049167633057, + "learning_rate": 0.003237207993571798, + "loss": 7.7843, + "step": 660200 + }, + { + "epoch": 2.689917088387791, + "grad_norm": 6.576282501220703, + "learning_rate": 0.0032367337296897707, + "loss": 7.7143, + "step": 660300 + }, + { + "epoch": 2.6903244664111727, + "grad_norm": 9.505963325500488, + "learning_rate": 0.003236259436878399, + "loss": 7.7229, + "step": 660400 + }, + { + "epoch": 2.6907318444345543, + "grad_norm": 6.678642749786377, + "learning_rate": 0.003235785115156429, + "loss": 7.7638, + "step": 660500 + }, + { + "epoch": 2.691139222457936, + "grad_norm": 3.550166606903076, + "learning_rate": 0.003235310764542615, + "loss": 7.7644, + "step": 660600 + }, + { + "epoch": 2.691546600481317, + "grad_norm": 10.332160949707031, + "learning_rate": 0.0032348363850557113, + "loss": 7.754, + "step": 660700 + }, + { + "epoch": 2.6919539785046984, + "grad_norm": 4.420124530792236, + "learning_rate": 0.00323436197671447, + "loss": 7.7474, + "step": 660800 + }, + { + "epoch": 2.69236135652808, + "grad_norm": 12.303844451904297, + "learning_rate": 0.0032338875395376467, + "loss": 7.7464, + "step": 660900 + }, + { + "epoch": 2.6927687345514615, + "grad_norm": 10.578495979309082, + "learning_rate": 0.003233413073544003, + "loss": 7.7469, + "step": 661000 + }, + { + "epoch": 2.6927687345514615, + "eval_MaskedAccuracy": 0.5034515531390799, + "eval_loss": 1.6318436861038208, + "eval_runtime": 148.7141, + "eval_samples_per_second": 426.832, + "eval_steps_per_second": 1.668, + "step": 661000 + }, + { + "epoch": 2.693176112574843, + "grad_norm": 3.2199487686157227, + "learning_rate": 0.0032329385787522934, + "loss": 7.7408, + "step": 661100 + }, + { + "epoch": 2.6935834905982246, + "grad_norm": 9.546009063720703, + "learning_rate": 0.0032324640551812827, + "loss": 7.7545, + "step": 661200 + }, + { + "epoch": 2.693990868621606, + "grad_norm": 2.9559309482574463, + "learning_rate": 0.0032319895028497226, + "loss": 7.7454, + "step": 661300 + }, + { + "epoch": 2.6943982466449876, + "grad_norm": 3.469130754470825, + "learning_rate": 0.0032315149217763822, + "loss": 7.7414, + "step": 661400 + }, + { + "epoch": 2.6948056246683687, + "grad_norm": 4.411319255828857, + "learning_rate": 0.003231040311980017, + "loss": 7.7061, + "step": 661500 + }, + { + "epoch": 2.6952130026917502, + "grad_norm": 3.295698881149292, + "learning_rate": 0.003230565673479398, + "loss": 7.7346, + "step": 661600 + }, + { + "epoch": 2.6956203807151318, + "grad_norm": 10.868988037109375, + "learning_rate": 0.0032300910062932893, + "loss": 7.7283, + "step": 661700 + }, + { + "epoch": 2.6960277587385133, + "grad_norm": 9.993134498596191, + "learning_rate": 0.0032296163104404514, + "loss": 7.7633, + "step": 661800 + }, + { + "epoch": 2.696435136761895, + "grad_norm": 8.998684883117676, + "learning_rate": 0.0032291415859396605, + "loss": 7.7485, + "step": 661900 + }, + { + "epoch": 2.696842514785276, + "grad_norm": 3.220615863800049, + "learning_rate": 0.003228666832809676, + "loss": 7.7697, + "step": 662000 + }, + { + "epoch": 2.696842514785276, + "eval_MaskedAccuracy": 0.5040962974919867, + "eval_loss": 1.6281524896621704, + "eval_runtime": 148.3752, + "eval_samples_per_second": 427.807, + "eval_steps_per_second": 1.671, + "step": 662000 + }, + { + "epoch": 2.6972498928086575, + "grad_norm": 3.631373643875122, + "learning_rate": 0.003228192051069267, + "loss": 7.7254, + "step": 662100 + }, + { + "epoch": 2.697657270832039, + "grad_norm": 5.53477144241333, + "learning_rate": 0.0032277172407372146, + "loss": 7.7484, + "step": 662200 + }, + { + "epoch": 2.6980646488554205, + "grad_norm": 5.845667362213135, + "learning_rate": 0.0032272424018322814, + "loss": 7.7308, + "step": 662300 + }, + { + "epoch": 2.698472026878802, + "grad_norm": 3.323057174682617, + "learning_rate": 0.003226767534373242, + "loss": 7.724, + "step": 662400 + }, + { + "epoch": 2.6988794049021836, + "grad_norm": 2.7929320335388184, + "learning_rate": 0.0032262926383788753, + "loss": 7.7006, + "step": 662500 + }, + { + "epoch": 2.699286782925565, + "grad_norm": 7.5049896240234375, + "learning_rate": 0.003225817713867954, + "loss": 7.73, + "step": 662600 + }, + { + "epoch": 2.699694160948946, + "grad_norm": 4.794573783874512, + "learning_rate": 0.0032253427608592514, + "loss": 7.7093, + "step": 662700 + }, + { + "epoch": 2.7001015389723277, + "grad_norm": 4.7356696128845215, + "learning_rate": 0.003224867779371547, + "loss": 7.7181, + "step": 662800 + }, + { + "epoch": 2.7005089169957093, + "grad_norm": 2.7196238040924072, + "learning_rate": 0.0032243927694236185, + "loss": 7.7176, + "step": 662900 + }, + { + "epoch": 2.700916295019091, + "grad_norm": 5.0205159187316895, + "learning_rate": 0.003223917731034248, + "loss": 7.7623, + "step": 663000 + }, + { + "epoch": 2.700916295019091, + "eval_MaskedAccuracy": 0.5038521830246014, + "eval_loss": 1.6336082220077515, + "eval_runtime": 149.6745, + "eval_samples_per_second": 424.093, + "eval_steps_per_second": 1.657, + "step": 663000 + }, + { + "epoch": 2.7013236730424723, + "grad_norm": 3.996946334838867, + "learning_rate": 0.0032234426642222137, + "loss": 7.7267, + "step": 663100 + }, + { + "epoch": 2.7017310510658534, + "grad_norm": 3.2087459564208984, + "learning_rate": 0.0032229675690062997, + "loss": 7.7654, + "step": 663200 + }, + { + "epoch": 2.702138429089235, + "grad_norm": 3.582521438598633, + "learning_rate": 0.0032224924454052855, + "loss": 7.7521, + "step": 663300 + }, + { + "epoch": 2.7025458071126165, + "grad_norm": 7.063210964202881, + "learning_rate": 0.0032220172934379556, + "loss": 7.7552, + "step": 663400 + }, + { + "epoch": 2.702953185135998, + "grad_norm": 7.65894079208374, + "learning_rate": 0.003221542113123099, + "loss": 7.738, + "step": 663500 + }, + { + "epoch": 2.7033605631593796, + "grad_norm": 5.349039077758789, + "learning_rate": 0.0032210669044795, + "loss": 7.7524, + "step": 663600 + }, + { + "epoch": 2.703767941182761, + "grad_norm": 4.374447822570801, + "learning_rate": 0.003220591667525946, + "loss": 7.7433, + "step": 663700 + }, + { + "epoch": 2.7041753192061426, + "grad_norm": 4.791806221008301, + "learning_rate": 0.0032201164022812273, + "loss": 7.7792, + "step": 663800 + }, + { + "epoch": 2.704582697229524, + "grad_norm": 8.21108627319336, + "learning_rate": 0.003219641108764132, + "loss": 7.7342, + "step": 663900 + }, + { + "epoch": 2.7049900752529052, + "grad_norm": 3.945631742477417, + "learning_rate": 0.003219165786993452, + "loss": 7.709, + "step": 664000 + }, + { + "epoch": 2.7049900752529052, + "eval_MaskedAccuracy": 0.5037033042070721, + "eval_loss": 1.6292518377304077, + "eval_runtime": 149.5575, + "eval_samples_per_second": 424.425, + "eval_steps_per_second": 1.658, + "step": 664000 + }, + { + "epoch": 2.705397453276287, + "grad_norm": 3.24385142326355, + "learning_rate": 0.003218690436987978, + "loss": 7.7348, + "step": 664100 + }, + { + "epoch": 2.7058048312996683, + "grad_norm": 7.810568332672119, + "learning_rate": 0.0032182150587665064, + "loss": 7.7509, + "step": 664200 + }, + { + "epoch": 2.70621220932305, + "grad_norm": 9.91736888885498, + "learning_rate": 0.0032177396523478292, + "loss": 7.7213, + "step": 664300 + }, + { + "epoch": 2.7066195873464314, + "grad_norm": 8.979564666748047, + "learning_rate": 0.0032172642177507385, + "loss": 7.7312, + "step": 664400 + }, + { + "epoch": 2.7070269653698125, + "grad_norm": 3.256312370300293, + "learning_rate": 0.0032167887549940355, + "loss": 7.7632, + "step": 664500 + }, + { + "epoch": 2.707434343393194, + "grad_norm": 6.67056131362915, + "learning_rate": 0.0032163132640965165, + "loss": 7.7438, + "step": 664600 + }, + { + "epoch": 2.7078417214165755, + "grad_norm": 3.1909127235412598, + "learning_rate": 0.003215837745076978, + "loss": 7.733, + "step": 664700 + }, + { + "epoch": 2.708249099439957, + "grad_norm": 4.2988715171813965, + "learning_rate": 0.0032153621979542223, + "loss": 7.7362, + "step": 664800 + }, + { + "epoch": 2.7086564774633386, + "grad_norm": 3.895310878753662, + "learning_rate": 0.00321488662274705, + "loss": 7.7339, + "step": 664900 + }, + { + "epoch": 2.70906385548672, + "grad_norm": 6.113119125366211, + "learning_rate": 0.0032144110194742624, + "loss": 7.7264, + "step": 665000 + }, + { + "epoch": 2.70906385548672, + "eval_MaskedAccuracy": 0.503768846450946, + "eval_loss": 1.6280908584594727, + "eval_runtime": 148.2173, + "eval_samples_per_second": 428.263, + "eval_steps_per_second": 1.673, + "step": 665000 + }, + { + "epoch": 2.7094712335101017, + "grad_norm": 4.171345233917236, + "learning_rate": 0.0032139353881546636, + "loss": 7.7229, + "step": 665100 + }, + { + "epoch": 2.7098786115334828, + "grad_norm": 6.491979598999023, + "learning_rate": 0.0032134597288070556, + "loss": 7.7482, + "step": 665200 + }, + { + "epoch": 2.7102859895568643, + "grad_norm": 10.674572944641113, + "learning_rate": 0.0032129840414502463, + "loss": 7.7533, + "step": 665300 + }, + { + "epoch": 2.710693367580246, + "grad_norm": 5.564167022705078, + "learning_rate": 0.00321250832610304, + "loss": 7.7534, + "step": 665400 + }, + { + "epoch": 2.7111007456036273, + "grad_norm": 7.038264274597168, + "learning_rate": 0.0032120325827842463, + "loss": 7.7142, + "step": 665500 + }, + { + "epoch": 2.711508123627009, + "grad_norm": 2.9987425804138184, + "learning_rate": 0.0032115568115126743, + "loss": 7.7232, + "step": 665600 + }, + { + "epoch": 2.71191550165039, + "grad_norm": 3.360598087310791, + "learning_rate": 0.0032110810123071308, + "loss": 7.7358, + "step": 665700 + }, + { + "epoch": 2.7123228796737715, + "grad_norm": 11.335540771484375, + "learning_rate": 0.0032106051851864313, + "loss": 7.7117, + "step": 665800 + }, + { + "epoch": 2.712730257697153, + "grad_norm": 3.6695189476013184, + "learning_rate": 0.0032101293301693866, + "loss": 7.758, + "step": 665900 + }, + { + "epoch": 2.7131376357205346, + "grad_norm": 5.335036277770996, + "learning_rate": 0.0032096534472748055, + "loss": 7.6919, + "step": 666000 + }, + { + "epoch": 2.7131376357205346, + "eval_MaskedAccuracy": 0.5040659751029255, + "eval_loss": 1.628699541091919, + "eval_runtime": 150.9461, + "eval_samples_per_second": 420.521, + "eval_steps_per_second": 1.643, + "step": 666000 + }, + { + "epoch": 2.713545013743916, + "grad_norm": 3.3608007431030273, + "learning_rate": 0.003209177536521505, + "loss": 7.7517, + "step": 666100 + }, + { + "epoch": 2.7139523917672976, + "grad_norm": 2.3203938007354736, + "learning_rate": 0.003208701597928301, + "loss": 7.7089, + "step": 666200 + }, + { + "epoch": 2.714359769790679, + "grad_norm": 2.199331283569336, + "learning_rate": 0.0032082256315140063, + "loss": 7.7258, + "step": 666300 + }, + { + "epoch": 2.7147671478140607, + "grad_norm": 5.572924613952637, + "learning_rate": 0.0032077496372974394, + "loss": 7.7371, + "step": 666400 + }, + { + "epoch": 2.715174525837442, + "grad_norm": 5.677365303039551, + "learning_rate": 0.003207273615297422, + "loss": 7.7777, + "step": 666500 + }, + { + "epoch": 2.7155819038608233, + "grad_norm": 3.157910108566284, + "learning_rate": 0.0032067975655327764, + "loss": 7.7324, + "step": 666600 + }, + { + "epoch": 2.715989281884205, + "grad_norm": 5.038809299468994, + "learning_rate": 0.0032063214880223173, + "loss": 7.7197, + "step": 666700 + }, + { + "epoch": 2.7163966599075864, + "grad_norm": 9.037930488586426, + "learning_rate": 0.003205845382784871, + "loss": 7.7173, + "step": 666800 + }, + { + "epoch": 2.716804037930968, + "grad_norm": 2.4714245796203613, + "learning_rate": 0.0032053692498392548, + "loss": 7.7198, + "step": 666900 + }, + { + "epoch": 2.717211415954349, + "grad_norm": 5.054344654083252, + "learning_rate": 0.0032048930892042975, + "loss": 7.7455, + "step": 667000 + }, + { + "epoch": 2.717211415954349, + "eval_MaskedAccuracy": 0.5031529451686492, + "eval_loss": 1.6368383169174194, + "eval_runtime": 148.3692, + "eval_samples_per_second": 427.825, + "eval_steps_per_second": 1.672, + "step": 667000 + }, + { + "epoch": 2.7176187939777305, + "grad_norm": 5.298316478729248, + "learning_rate": 0.003204416900898827, + "loss": 7.7285, + "step": 667100 + }, + { + "epoch": 2.718026172001112, + "grad_norm": 10.430736541748047, + "learning_rate": 0.0032039406849416643, + "loss": 7.775, + "step": 667200 + }, + { + "epoch": 2.7184335500244936, + "grad_norm": 8.048696517944336, + "learning_rate": 0.003203464441351639, + "loss": 7.7378, + "step": 667300 + }, + { + "epoch": 2.718840928047875, + "grad_norm": 4.4530253410339355, + "learning_rate": 0.0032029881701475743, + "loss": 7.7577, + "step": 667400 + }, + { + "epoch": 2.7192483060712567, + "grad_norm": 3.222097158432007, + "learning_rate": 0.0032025118713483063, + "loss": 7.7375, + "step": 667500 + }, + { + "epoch": 2.719655684094638, + "grad_norm": 3.4581751823425293, + "learning_rate": 0.003202035544972665, + "loss": 7.7477, + "step": 667600 + }, + { + "epoch": 2.7200630621180193, + "grad_norm": 4.972837448120117, + "learning_rate": 0.0032015591910394808, + "loss": 7.7218, + "step": 667700 + }, + { + "epoch": 2.720470440141401, + "grad_norm": 6.326439380645752, + "learning_rate": 0.0032010828095675858, + "loss": 7.7323, + "step": 667800 + }, + { + "epoch": 2.7208778181647824, + "grad_norm": 3.2040696144104004, + "learning_rate": 0.0032006064005758133, + "loss": 7.7626, + "step": 667900 + }, + { + "epoch": 2.721285196188164, + "grad_norm": 5.238594055175781, + "learning_rate": 0.003200129964082997, + "loss": 7.7036, + "step": 668000 + }, + { + "epoch": 2.721285196188164, + "eval_MaskedAccuracy": 0.5033877847279113, + "eval_loss": 1.6351256370544434, + "eval_runtime": 148.8788, + "eval_samples_per_second": 426.36, + "eval_steps_per_second": 1.666, + "step": 668000 + }, + { + "epoch": 2.7216925742115454, + "grad_norm": 3.157193183898926, + "learning_rate": 0.0031996535001079783, + "loss": 7.7385, + "step": 668100 + }, + { + "epoch": 2.7220999522349265, + "grad_norm": 4.541149139404297, + "learning_rate": 0.00319917700866959, + "loss": 7.7382, + "step": 668200 + }, + { + "epoch": 2.722507330258308, + "grad_norm": 3.2123706340789795, + "learning_rate": 0.0031987004897866727, + "loss": 7.7155, + "step": 668300 + }, + { + "epoch": 2.7229147082816896, + "grad_norm": 8.792269706726074, + "learning_rate": 0.0031982239434780652, + "loss": 7.7058, + "step": 668400 + }, + { + "epoch": 2.723322086305071, + "grad_norm": 3.3838703632354736, + "learning_rate": 0.0031977473697626054, + "loss": 7.7268, + "step": 668500 + }, + { + "epoch": 2.7237294643284526, + "grad_norm": 4.612702369689941, + "learning_rate": 0.0031972707686591375, + "loss": 7.7645, + "step": 668600 + }, + { + "epoch": 2.724136842351834, + "grad_norm": 6.421452045440674, + "learning_rate": 0.0031967941401865045, + "loss": 7.7529, + "step": 668700 + }, + { + "epoch": 2.7245442203752157, + "grad_norm": 6.113936424255371, + "learning_rate": 0.0031963174843635507, + "loss": 7.7221, + "step": 668800 + }, + { + "epoch": 2.7249515983985972, + "grad_norm": 7.661069393157959, + "learning_rate": 0.0031958408012091174, + "loss": 7.7416, + "step": 668900 + }, + { + "epoch": 2.7253589764219783, + "grad_norm": 3.8138558864593506, + "learning_rate": 0.003195364090742052, + "loss": 7.7242, + "step": 669000 + }, + { + "epoch": 2.7253589764219783, + "eval_MaskedAccuracy": 0.5034439907782373, + "eval_loss": 1.6119866371154785, + "eval_runtime": 149.2263, + "eval_samples_per_second": 425.367, + "eval_steps_per_second": 1.662, + "step": 669000 + }, + { + "epoch": 2.72576635444536, + "grad_norm": 5.3097734451293945, + "learning_rate": 0.0031948873529812015, + "loss": 7.7515, + "step": 669100 + }, + { + "epoch": 2.7261737324687414, + "grad_norm": 3.244429588317871, + "learning_rate": 0.0031944105879454137, + "loss": 7.7197, + "step": 669200 + }, + { + "epoch": 2.726581110492123, + "grad_norm": 4.060519218444824, + "learning_rate": 0.0031939337956535395, + "loss": 7.7415, + "step": 669300 + }, + { + "epoch": 2.7269884885155045, + "grad_norm": 10.333324432373047, + "learning_rate": 0.0031934569761244255, + "loss": 7.6973, + "step": 669400 + }, + { + "epoch": 2.7273958665388855, + "grad_norm": 4.821107387542725, + "learning_rate": 0.003192980129376926, + "loss": 7.7379, + "step": 669500 + }, + { + "epoch": 2.727803244562267, + "grad_norm": 4.046305179595947, + "learning_rate": 0.0031925032554298905, + "loss": 7.7052, + "step": 669600 + }, + { + "epoch": 2.7282106225856486, + "grad_norm": 3.4669225215911865, + "learning_rate": 0.003192026354302181, + "loss": 7.75, + "step": 669700 + }, + { + "epoch": 2.72861800060903, + "grad_norm": 9.231145858764648, + "learning_rate": 0.0031915494260126393, + "loss": 7.7366, + "step": 669800 + }, + { + "epoch": 2.7290253786324117, + "grad_norm": 2.925002098083496, + "learning_rate": 0.003191072470580129, + "loss": 7.7343, + "step": 669900 + }, + { + "epoch": 2.729432756655793, + "grad_norm": 11.176908493041992, + "learning_rate": 0.003190595488023502, + "loss": 7.7285, + "step": 670000 + }, + { + "epoch": 2.729432756655793, + "eval_MaskedAccuracy": 0.5037509228227985, + "eval_loss": 1.6253544092178345, + "eval_runtime": 148.6907, + "eval_samples_per_second": 426.9, + "eval_steps_per_second": 1.668, + "step": 670000 + }, + { + "epoch": 2.7298401346791747, + "grad_norm": 4.042506217956543, + "learning_rate": 0.003190118478361622, + "loss": 7.7288, + "step": 670100 + }, + { + "epoch": 2.730247512702556, + "grad_norm": 4.371574401855469, + "learning_rate": 0.003189641441613343, + "loss": 7.7238, + "step": 670200 + }, + { + "epoch": 2.7306548907259374, + "grad_norm": 3.266829252243042, + "learning_rate": 0.003189164377797526, + "loss": 7.7006, + "step": 670300 + }, + { + "epoch": 2.731062268749319, + "grad_norm": 9.816326141357422, + "learning_rate": 0.003188687286933032, + "loss": 7.7063, + "step": 670400 + }, + { + "epoch": 2.7314696467727004, + "grad_norm": 3.488961935043335, + "learning_rate": 0.0031882101690387248, + "loss": 7.74, + "step": 670500 + }, + { + "epoch": 2.731877024796082, + "grad_norm": 2.7548892498016357, + "learning_rate": 0.003187733024133465, + "loss": 7.7267, + "step": 670600 + }, + { + "epoch": 2.732284402819463, + "grad_norm": 5.342942714691162, + "learning_rate": 0.0031872558522361163, + "loss": 7.7698, + "step": 670700 + }, + { + "epoch": 2.7326917808428446, + "grad_norm": 2.250199556350708, + "learning_rate": 0.003186778653365544, + "loss": 7.7258, + "step": 670800 + }, + { + "epoch": 2.733099158866226, + "grad_norm": 2.5757293701171875, + "learning_rate": 0.003186301427540618, + "loss": 7.7061, + "step": 670900 + }, + { + "epoch": 2.7335065368896077, + "grad_norm": 3.861353874206543, + "learning_rate": 0.003185824174780201, + "loss": 7.7477, + "step": 671000 + }, + { + "epoch": 2.7335065368896077, + "eval_MaskedAccuracy": 0.5030542218199967, + "eval_loss": 1.6270012855529785, + "eval_runtime": 149.4202, + "eval_samples_per_second": 424.815, + "eval_steps_per_second": 1.66, + "step": 671000 + }, + { + "epoch": 2.733913914912989, + "grad_norm": 5.667209625244141, + "learning_rate": 0.0031853468951031617, + "loss": 7.7407, + "step": 671100 + }, + { + "epoch": 2.7343212929363707, + "grad_norm": 6.801764011383057, + "learning_rate": 0.0031848695885283713, + "loss": 7.7014, + "step": 671200 + }, + { + "epoch": 2.7347286709597523, + "grad_norm": 3.8669235706329346, + "learning_rate": 0.0031843922550747004, + "loss": 7.716, + "step": 671300 + }, + { + "epoch": 2.735136048983134, + "grad_norm": 4.433848857879639, + "learning_rate": 0.003183914894761019, + "loss": 7.7218, + "step": 671400 + }, + { + "epoch": 2.735543427006515, + "grad_norm": 3.301412582397461, + "learning_rate": 0.003183437507606204, + "loss": 7.7504, + "step": 671500 + }, + { + "epoch": 2.7359508050298964, + "grad_norm": 4.774487018585205, + "learning_rate": 0.0031829600936291286, + "loss": 7.7406, + "step": 671600 + }, + { + "epoch": 2.736358183053278, + "grad_norm": 4.506502628326416, + "learning_rate": 0.003182482652848658, + "loss": 7.7439, + "step": 671700 + }, + { + "epoch": 2.7367655610766595, + "grad_norm": 2.380566358566284, + "learning_rate": 0.0031820051852836776, + "loss": 7.7247, + "step": 671800 + }, + { + "epoch": 2.737172939100041, + "grad_norm": 2.402787685394287, + "learning_rate": 0.0031815276909530617, + "loss": 7.7408, + "step": 671900 + }, + { + "epoch": 2.737580317123422, + "grad_norm": 4.9035491943359375, + "learning_rate": 0.0031810501698756838, + "loss": 7.7508, + "step": 672000 + }, + { + "epoch": 2.737580317123422, + "eval_MaskedAccuracy": 0.504237015677938, + "eval_loss": 1.6274861097335815, + "eval_runtime": 148.0721, + "eval_samples_per_second": 428.683, + "eval_steps_per_second": 1.675, + "step": 672000 + }, + { + "epoch": 2.7379876951468036, + "grad_norm": 4.066720962524414, + "learning_rate": 0.0031805726220704295, + "loss": 7.7401, + "step": 672100 + }, + { + "epoch": 2.738395073170185, + "grad_norm": 7.2215142250061035, + "learning_rate": 0.003180095047556176, + "loss": 7.7269, + "step": 672200 + }, + { + "epoch": 2.7388024511935667, + "grad_norm": 6.007561206817627, + "learning_rate": 0.003179617446351804, + "loss": 7.7329, + "step": 672300 + }, + { + "epoch": 2.7392098292169482, + "grad_norm": 6.568578720092773, + "learning_rate": 0.003179139818476196, + "loss": 7.7144, + "step": 672400 + }, + { + "epoch": 2.7396172072403298, + "grad_norm": 7.652806758880615, + "learning_rate": 0.0031786621639482387, + "loss": 7.7502, + "step": 672500 + }, + { + "epoch": 2.7400245852637113, + "grad_norm": 3.3555850982666016, + "learning_rate": 0.00317818448278681, + "loss": 7.6896, + "step": 672600 + }, + { + "epoch": 2.7404319632870924, + "grad_norm": 5.025359630584717, + "learning_rate": 0.003177706775010799, + "loss": 7.7452, + "step": 672700 + }, + { + "epoch": 2.740839341310474, + "grad_norm": 5.02529239654541, + "learning_rate": 0.0031772290406390908, + "loss": 7.7531, + "step": 672800 + }, + { + "epoch": 2.7412467193338554, + "grad_norm": 5.380584239959717, + "learning_rate": 0.003176751279690576, + "loss": 7.7413, + "step": 672900 + }, + { + "epoch": 2.741654097357237, + "grad_norm": 6.14848518371582, + "learning_rate": 0.003176273492184139, + "loss": 7.7432, + "step": 673000 + }, + { + "epoch": 2.741654097357237, + "eval_MaskedAccuracy": 0.5031924432503198, + "eval_loss": 1.6348071098327637, + "eval_runtime": 149.5255, + "eval_samples_per_second": 424.516, + "eval_steps_per_second": 1.659, + "step": 673000 + }, + { + "epoch": 2.7420614753806185, + "grad_norm": 3.4233415126800537, + "learning_rate": 0.003175795678138673, + "loss": 7.7337, + "step": 673100 + }, + { + "epoch": 2.7424688534039996, + "grad_norm": 3.1220381259918213, + "learning_rate": 0.0031753178375730625, + "loss": 7.7318, + "step": 673200 + }, + { + "epoch": 2.742876231427381, + "grad_norm": 5.210653781890869, + "learning_rate": 0.0031748399705062032, + "loss": 7.7346, + "step": 673300 + }, + { + "epoch": 2.7432836094507627, + "grad_norm": 3.6905648708343506, + "learning_rate": 0.0031743620769569877, + "loss": 7.7322, + "step": 673400 + }, + { + "epoch": 2.743690987474144, + "grad_norm": 3.794119358062744, + "learning_rate": 0.0031738841569443096, + "loss": 7.7447, + "step": 673500 + }, + { + "epoch": 2.7440983654975257, + "grad_norm": 9.346611976623535, + "learning_rate": 0.0031734062104870654, + "loss": 7.7172, + "step": 673600 + }, + { + "epoch": 2.7445057435209073, + "grad_norm": 8.45644760131836, + "learning_rate": 0.003172928237604145, + "loss": 7.7499, + "step": 673700 + }, + { + "epoch": 2.744913121544289, + "grad_norm": 11.964926719665527, + "learning_rate": 0.0031724502383144483, + "loss": 7.7178, + "step": 673800 + }, + { + "epoch": 2.7453204995676703, + "grad_norm": 9.513945579528809, + "learning_rate": 0.0031719722126368776, + "loss": 7.726, + "step": 673900 + }, + { + "epoch": 2.7457278775910514, + "grad_norm": 3.0402848720550537, + "learning_rate": 0.003171494160590327, + "loss": 7.7057, + "step": 674000 + }, + { + "epoch": 2.7457278775910514, + "eval_MaskedAccuracy": 0.5034982886733228, + "eval_loss": 1.6308186054229736, + "eval_runtime": 148.9235, + "eval_samples_per_second": 426.232, + "eval_steps_per_second": 1.665, + "step": 674000 + }, + { + "epoch": 2.746135255614433, + "grad_norm": 2.512981414794922, + "learning_rate": 0.003171016082193699, + "loss": 7.7149, + "step": 674100 + }, + { + "epoch": 2.7465426336378145, + "grad_norm": 2.6818249225616455, + "learning_rate": 0.0031705379774658936, + "loss": 7.719, + "step": 674200 + }, + { + "epoch": 2.746950011661196, + "grad_norm": 3.6553287506103516, + "learning_rate": 0.0031700598464258103, + "loss": 7.7353, + "step": 674300 + }, + { + "epoch": 2.7473573896845775, + "grad_norm": 3.668513059616089, + "learning_rate": 0.003169581689092354, + "loss": 7.6893, + "step": 674400 + }, + { + "epoch": 2.7477647677079586, + "grad_norm": 2.452850341796875, + "learning_rate": 0.003169103505484428, + "loss": 7.729, + "step": 674500 + }, + { + "epoch": 2.74817214573134, + "grad_norm": 4.9898905754089355, + "learning_rate": 0.003168625295620942, + "loss": 7.7586, + "step": 674600 + }, + { + "epoch": 2.7485795237547217, + "grad_norm": 3.0301427841186523, + "learning_rate": 0.003168147059520797, + "loss": 7.6959, + "step": 674700 + }, + { + "epoch": 2.7489869017781032, + "grad_norm": 2.4823179244995117, + "learning_rate": 0.0031676687972029007, + "loss": 7.7184, + "step": 674800 + }, + { + "epoch": 2.7493942798014848, + "grad_norm": 4.111626625061035, + "learning_rate": 0.003167190508686164, + "loss": 7.7334, + "step": 674900 + }, + { + "epoch": 2.7498016578248663, + "grad_norm": 2.7121200561523438, + "learning_rate": 0.0031667121939894937, + "loss": 7.7417, + "step": 675000 + }, + { + "epoch": 2.7498016578248663, + "eval_MaskedAccuracy": 0.5036854018838389, + "eval_loss": 1.6386982202529907, + "eval_runtime": 148.7873, + "eval_samples_per_second": 426.622, + "eval_steps_per_second": 1.667, + "step": 675000 + }, + { + "epoch": 2.750209035848248, + "grad_norm": 6.808712959289551, + "learning_rate": 0.0031662338531318005, + "loss": 7.7388, + "step": 675100 + }, + { + "epoch": 2.750616413871629, + "grad_norm": 3.6436800956726074, + "learning_rate": 0.003165755486132, + "loss": 7.7042, + "step": 675200 + }, + { + "epoch": 2.7510237918950105, + "grad_norm": 3.008739709854126, + "learning_rate": 0.0031652770930089974, + "loss": 7.7218, + "step": 675300 + }, + { + "epoch": 2.751431169918392, + "grad_norm": 3.265902519226074, + "learning_rate": 0.0031647986737817135, + "loss": 7.719, + "step": 675400 + }, + { + "epoch": 2.7518385479417735, + "grad_norm": 12.43411922454834, + "learning_rate": 0.0031643202284690575, + "loss": 7.7321, + "step": 675500 + }, + { + "epoch": 2.752245925965155, + "grad_norm": 4.71318244934082, + "learning_rate": 0.003163841757089949, + "loss": 7.7354, + "step": 675600 + }, + { + "epoch": 2.752653303988536, + "grad_norm": 5.173762321472168, + "learning_rate": 0.0031633632596632997, + "loss": 7.7349, + "step": 675700 + }, + { + "epoch": 2.7530606820119177, + "grad_norm": 7.2542877197265625, + "learning_rate": 0.0031628847362080294, + "loss": 7.7312, + "step": 675800 + }, + { + "epoch": 2.753468060035299, + "grad_norm": 2.1073505878448486, + "learning_rate": 0.0031624061867430594, + "loss": 7.7359, + "step": 675900 + }, + { + "epoch": 2.7538754380586807, + "grad_norm": 4.097883701324463, + "learning_rate": 0.003161927611287303, + "loss": 7.7634, + "step": 676000 + }, + { + "epoch": 2.7538754380586807, + "eval_MaskedAccuracy": 0.5040784739991235, + "eval_loss": 1.6261121034622192, + "eval_runtime": 148.1031, + "eval_samples_per_second": 428.593, + "eval_steps_per_second": 1.675, + "step": 676000 + }, + { + "epoch": 2.7542828160820623, + "grad_norm": 9.5602388381958, + "learning_rate": 0.003161449009859684, + "loss": 7.7123, + "step": 676100 + }, + { + "epoch": 2.754690194105444, + "grad_norm": 4.921302318572998, + "learning_rate": 0.0031609703824791264, + "loss": 7.7375, + "step": 676200 + }, + { + "epoch": 2.7550975721288253, + "grad_norm": 7.506611347198486, + "learning_rate": 0.003160491729164552, + "loss": 7.7179, + "step": 676300 + }, + { + "epoch": 2.755504950152207, + "grad_norm": 5.155307769775391, + "learning_rate": 0.003160013049934881, + "loss": 7.74, + "step": 676400 + }, + { + "epoch": 2.755912328175588, + "grad_norm": 6.343349933624268, + "learning_rate": 0.0031595343448090436, + "loss": 7.7557, + "step": 676500 + }, + { + "epoch": 2.7563197061989695, + "grad_norm": 5.314796447753906, + "learning_rate": 0.0031590556138059605, + "loss": 7.7697, + "step": 676600 + }, + { + "epoch": 2.756727084222351, + "grad_norm": 9.024616241455078, + "learning_rate": 0.0031585768569445593, + "loss": 7.7188, + "step": 676700 + }, + { + "epoch": 2.7571344622457326, + "grad_norm": 4.857398509979248, + "learning_rate": 0.00315809807424377, + "loss": 7.6998, + "step": 676800 + }, + { + "epoch": 2.757541840269114, + "grad_norm": 3.59858775138855, + "learning_rate": 0.0031576192657225234, + "loss": 7.7389, + "step": 676900 + }, + { + "epoch": 2.757949218292495, + "grad_norm": 2.3827154636383057, + "learning_rate": 0.003157140431399746, + "loss": 7.6996, + "step": 677000 + }, + { + "epoch": 2.757949218292495, + "eval_MaskedAccuracy": 0.5043865721423844, + "eval_loss": 1.6320961713790894, + "eval_runtime": 148.277, + "eval_samples_per_second": 428.091, + "eval_steps_per_second": 1.673, + "step": 677000 + }, + { + "epoch": 2.7583565963158767, + "grad_norm": 3.564591646194458, + "learning_rate": 0.0031566615712943706, + "loss": 7.7032, + "step": 677100 + }, + { + "epoch": 2.7587639743392582, + "grad_norm": 4.032743453979492, + "learning_rate": 0.003156182685425328, + "loss": 7.7725, + "step": 677200 + }, + { + "epoch": 2.7591713523626398, + "grad_norm": 7.908647537231445, + "learning_rate": 0.0031557037738115536, + "loss": 7.6997, + "step": 677300 + }, + { + "epoch": 2.7595787303860213, + "grad_norm": 2.619904041290283, + "learning_rate": 0.0031552248364719766, + "loss": 7.7045, + "step": 677400 + }, + { + "epoch": 2.759986108409403, + "grad_norm": 6.5481061935424805, + "learning_rate": 0.0031547458734255338, + "loss": 7.7437, + "step": 677500 + }, + { + "epoch": 2.7603934864327844, + "grad_norm": 3.4915366172790527, + "learning_rate": 0.0031542668846911612, + "loss": 7.7467, + "step": 677600 + }, + { + "epoch": 2.7608008644561655, + "grad_norm": 4.60849142074585, + "learning_rate": 0.003153787870287798, + "loss": 7.7135, + "step": 677700 + }, + { + "epoch": 2.761208242479547, + "grad_norm": 5.239776134490967, + "learning_rate": 0.003153308830234378, + "loss": 7.7504, + "step": 677800 + }, + { + "epoch": 2.7616156205029285, + "grad_norm": 8.834480285644531, + "learning_rate": 0.0031528297645498425, + "loss": 7.722, + "step": 677900 + }, + { + "epoch": 2.76202299852631, + "grad_norm": 2.389150619506836, + "learning_rate": 0.003152350673253132, + "loss": 7.7123, + "step": 678000 + }, + { + "epoch": 2.76202299852631, + "eval_MaskedAccuracy": 0.5047657486651792, + "eval_loss": 1.6233168840408325, + "eval_runtime": 148.873, + "eval_samples_per_second": 426.377, + "eval_steps_per_second": 1.666, + "step": 678000 + }, + { + "epoch": 2.7624303765496916, + "grad_norm": 7.212696552276611, + "learning_rate": 0.003151871556363187, + "loss": 7.6981, + "step": 678100 + }, + { + "epoch": 2.7628377545730727, + "grad_norm": 5.194124698638916, + "learning_rate": 0.0031513924138989453, + "loss": 7.7247, + "step": 678200 + }, + { + "epoch": 2.763245132596454, + "grad_norm": 3.673970937728882, + "learning_rate": 0.0031509132458793594, + "loss": 7.7149, + "step": 678300 + }, + { + "epoch": 2.7636525106198357, + "grad_norm": 2.1644327640533447, + "learning_rate": 0.003150434052323366, + "loss": 7.7381, + "step": 678400 + }, + { + "epoch": 2.7640598886432173, + "grad_norm": 7.227392673492432, + "learning_rate": 0.0031499548332499156, + "loss": 7.7276, + "step": 678500 + }, + { + "epoch": 2.764467266666599, + "grad_norm": 6.203690528869629, + "learning_rate": 0.003149475588677948, + "loss": 7.7101, + "step": 678600 + }, + { + "epoch": 2.7648746446899803, + "grad_norm": 2.6559998989105225, + "learning_rate": 0.003148996318626418, + "loss": 7.7048, + "step": 678700 + }, + { + "epoch": 2.765282022713362, + "grad_norm": 2.9390740394592285, + "learning_rate": 0.0031485170231142687, + "loss": 7.719, + "step": 678800 + }, + { + "epoch": 2.7656894007367434, + "grad_norm": 2.8324954509735107, + "learning_rate": 0.0031480377021604502, + "loss": 7.717, + "step": 678900 + }, + { + "epoch": 2.7660967787601245, + "grad_norm": 5.291262149810791, + "learning_rate": 0.0031475583557839084, + "loss": 7.7237, + "step": 679000 + }, + { + "epoch": 2.7660967787601245, + "eval_MaskedAccuracy": 0.5034864006410433, + "eval_loss": 1.6280428171157837, + "eval_runtime": 149.0766, + "eval_samples_per_second": 425.795, + "eval_steps_per_second": 1.664, + "step": 679000 + }, + { + "epoch": 2.766504156783506, + "grad_norm": 3.3246068954467773, + "learning_rate": 0.003147078984003601, + "loss": 7.7369, + "step": 679100 + }, + { + "epoch": 2.7669115348068876, + "grad_norm": 7.084670543670654, + "learning_rate": 0.003146599586838473, + "loss": 7.7349, + "step": 679200 + }, + { + "epoch": 2.767318912830269, + "grad_norm": 7.184365272521973, + "learning_rate": 0.0031461201643074826, + "loss": 7.7123, + "step": 679300 + }, + { + "epoch": 2.7677262908536506, + "grad_norm": 2.9993152618408203, + "learning_rate": 0.003145640716429583, + "loss": 7.7133, + "step": 679400 + }, + { + "epoch": 2.7681336688770317, + "grad_norm": 4.63287878036499, + "learning_rate": 0.0031451612432237307, + "loss": 7.7412, + "step": 679500 + }, + { + "epoch": 2.7685410469004132, + "grad_norm": 4.111084461212158, + "learning_rate": 0.0031446817447088825, + "loss": 7.7259, + "step": 679600 + }, + { + "epoch": 2.768948424923795, + "grad_norm": 3.8003876209259033, + "learning_rate": 0.0031442022209039913, + "loss": 7.7123, + "step": 679700 + }, + { + "epoch": 2.7693558029471763, + "grad_norm": 5.517698764801025, + "learning_rate": 0.003143722671828017, + "loss": 7.7237, + "step": 679800 + }, + { + "epoch": 2.769763180970558, + "grad_norm": 7.103514671325684, + "learning_rate": 0.0031432430974999218, + "loss": 7.7492, + "step": 679900 + }, + { + "epoch": 2.7701705589939394, + "grad_norm": 2.8233470916748047, + "learning_rate": 0.00314276349793866, + "loss": 7.7166, + "step": 680000 + }, + { + "epoch": 2.7701705589939394, + "eval_MaskedAccuracy": 0.5041070418044662, + "eval_loss": 1.6354745626449585, + "eval_runtime": 148.7404, + "eval_samples_per_second": 426.757, + "eval_steps_per_second": 1.667, + "step": 680000 + }, + { + "epoch": 2.770577937017321, + "grad_norm": 3.1389245986938477, + "learning_rate": 0.0031422838731632002, + "loss": 7.728, + "step": 680100 + }, + { + "epoch": 2.770985315040702, + "grad_norm": 7.308162689208984, + "learning_rate": 0.0031418042231924977, + "loss": 7.7258, + "step": 680200 + }, + { + "epoch": 2.7713926930640835, + "grad_norm": 3.361658811569214, + "learning_rate": 0.003141324548045524, + "loss": 7.7189, + "step": 680300 + }, + { + "epoch": 2.771800071087465, + "grad_norm": 2.921330690383911, + "learning_rate": 0.0031408448477412325, + "loss": 7.7132, + "step": 680400 + }, + { + "epoch": 2.7722074491108466, + "grad_norm": 5.913390159606934, + "learning_rate": 0.0031403651222985966, + "loss": 7.712, + "step": 680500 + }, + { + "epoch": 2.772614827134228, + "grad_norm": 4.974047660827637, + "learning_rate": 0.0031398853717365745, + "loss": 7.721, + "step": 680600 + }, + { + "epoch": 2.773022205157609, + "grad_norm": 2.729994297027588, + "learning_rate": 0.0031394055960741398, + "loss": 7.7278, + "step": 680700 + }, + { + "epoch": 2.7734295831809908, + "grad_norm": 8.038491249084473, + "learning_rate": 0.0031389257953302603, + "loss": 7.7267, + "step": 680800 + }, + { + "epoch": 2.7738369612043723, + "grad_norm": 3.782606363296509, + "learning_rate": 0.0031384459695239027, + "loss": 7.7274, + "step": 680900 + }, + { + "epoch": 2.774244339227754, + "grad_norm": 3.078270196914673, + "learning_rate": 0.003137966118674038, + "loss": 7.7161, + "step": 681000 + }, + { + "epoch": 2.774244339227754, + "eval_MaskedAccuracy": 0.5041785264991338, + "eval_loss": 1.6250332593917847, + "eval_runtime": 148.5202, + "eval_samples_per_second": 427.39, + "eval_steps_per_second": 1.67, + "step": 681000 + }, + { + "epoch": 2.7746517172511354, + "grad_norm": 5.447459697723389, + "learning_rate": 0.003137486242799641, + "loss": 7.7379, + "step": 681100 + }, + { + "epoch": 2.775059095274517, + "grad_norm": 4.432352542877197, + "learning_rate": 0.003137006341919675, + "loss": 7.7326, + "step": 681200 + }, + { + "epoch": 2.7754664732978984, + "grad_norm": 2.1805579662323, + "learning_rate": 0.003136526416053121, + "loss": 7.7204, + "step": 681300 + }, + { + "epoch": 2.77587385132128, + "grad_norm": 3.008263111114502, + "learning_rate": 0.0031360464652189484, + "loss": 7.7275, + "step": 681400 + }, + { + "epoch": 2.776281229344661, + "grad_norm": 4.100875377655029, + "learning_rate": 0.0031355664894361307, + "loss": 7.7247, + "step": 681500 + }, + { + "epoch": 2.7766886073680426, + "grad_norm": 4.5805253982543945, + "learning_rate": 0.0031350864887236514, + "loss": 7.722, + "step": 681600 + }, + { + "epoch": 2.777095985391424, + "grad_norm": 4.6138763427734375, + "learning_rate": 0.0031346064631004827, + "loss": 7.7396, + "step": 681700 + }, + { + "epoch": 2.7775033634148056, + "grad_norm": 3.771960973739624, + "learning_rate": 0.0031341264125856025, + "loss": 7.7384, + "step": 681800 + }, + { + "epoch": 2.777910741438187, + "grad_norm": 6.917104244232178, + "learning_rate": 0.0031336463371979886, + "loss": 7.7311, + "step": 681900 + }, + { + "epoch": 2.7783181194615683, + "grad_norm": 4.290818214416504, + "learning_rate": 0.0031331662369566272, + "loss": 7.7459, + "step": 682000 + }, + { + "epoch": 2.7783181194615683, + "eval_MaskedAccuracy": 0.5034832038612592, + "eval_loss": 1.6305007934570312, + "eval_runtime": 149.1055, + "eval_samples_per_second": 425.712, + "eval_steps_per_second": 1.663, + "step": 682000 + }, + { + "epoch": 2.77872549748495, + "grad_norm": 3.2923989295959473, + "learning_rate": 0.0031326861118804914, + "loss": 7.717, + "step": 682100 + }, + { + "epoch": 2.7791328755083313, + "grad_norm": 3.507107973098755, + "learning_rate": 0.003132205961988569, + "loss": 7.7414, + "step": 682200 + }, + { + "epoch": 2.779540253531713, + "grad_norm": 8.689350128173828, + "learning_rate": 0.003131725787299842, + "loss": 7.7122, + "step": 682300 + }, + { + "epoch": 2.7799476315550944, + "grad_norm": 5.283635139465332, + "learning_rate": 0.0031312455878332924, + "loss": 7.7247, + "step": 682400 + }, + { + "epoch": 2.780355009578476, + "grad_norm": 4.201075553894043, + "learning_rate": 0.003130765363607904, + "loss": 7.7141, + "step": 682500 + }, + { + "epoch": 2.7807623876018575, + "grad_norm": 2.8721766471862793, + "learning_rate": 0.0031302851146426626, + "loss": 7.7778, + "step": 682600 + }, + { + "epoch": 2.7811697656252385, + "grad_norm": 3.2971625328063965, + "learning_rate": 0.0031298048409565556, + "loss": 7.7198, + "step": 682700 + }, + { + "epoch": 2.78157714364862, + "grad_norm": 4.196006774902344, + "learning_rate": 0.003129324542568573, + "loss": 7.7237, + "step": 682800 + }, + { + "epoch": 2.7819845216720016, + "grad_norm": 2.078606128692627, + "learning_rate": 0.0031288442194977043, + "loss": 7.7405, + "step": 682900 + }, + { + "epoch": 2.782391899695383, + "grad_norm": 9.211048126220703, + "learning_rate": 0.0031283638717629317, + "loss": 7.7066, + "step": 683000 + }, + { + "epoch": 2.782391899695383, + "eval_MaskedAccuracy": 0.5041814404864644, + "eval_loss": 1.627467155456543, + "eval_runtime": 148.6688, + "eval_samples_per_second": 426.962, + "eval_steps_per_second": 1.668, + "step": 683000 + }, + { + "epoch": 2.7827992777187647, + "grad_norm": 5.170778751373291, + "learning_rate": 0.003127883499383252, + "loss": 7.6953, + "step": 683100 + }, + { + "epoch": 2.7832066557421458, + "grad_norm": 3.7099454402923584, + "learning_rate": 0.0031274031023776606, + "loss": 7.743, + "step": 683200 + }, + { + "epoch": 2.7836140337655273, + "grad_norm": 4.783493518829346, + "learning_rate": 0.0031269226807651438, + "loss": 7.7147, + "step": 683300 + }, + { + "epoch": 2.784021411788909, + "grad_norm": 6.044512748718262, + "learning_rate": 0.0031264422345646976, + "loss": 7.7199, + "step": 683400 + }, + { + "epoch": 2.7844287898122904, + "grad_norm": 6.170932292938232, + "learning_rate": 0.003125961763795313, + "loss": 7.731, + "step": 683500 + }, + { + "epoch": 2.784836167835672, + "grad_norm": 4.316214084625244, + "learning_rate": 0.0031254812684759895, + "loss": 7.6988, + "step": 683600 + }, + { + "epoch": 2.7852435458590534, + "grad_norm": 6.521294593811035, + "learning_rate": 0.003125000748625722, + "loss": 7.7277, + "step": 683700 + }, + { + "epoch": 2.785650923882435, + "grad_norm": 5.507486820220947, + "learning_rate": 0.003124520204263514, + "loss": 7.7474, + "step": 683800 + }, + { + "epoch": 2.7860583019058165, + "grad_norm": 4.277698516845703, + "learning_rate": 0.0031240396354083578, + "loss": 7.711, + "step": 683900 + }, + { + "epoch": 2.7864656799291976, + "grad_norm": 8.897061347961426, + "learning_rate": 0.003123559042079254, + "loss": 7.7319, + "step": 684000 + }, + { + "epoch": 2.7864656799291976, + "eval_MaskedAccuracy": 0.5039483355498073, + "eval_loss": 1.6266154050827026, + "eval_runtime": 148.4396, + "eval_samples_per_second": 427.622, + "eval_steps_per_second": 1.671, + "step": 684000 + }, + { + "epoch": 2.786873057952579, + "grad_norm": 6.696630001068115, + "learning_rate": 0.003123078424295201, + "loss": 7.7096, + "step": 684100 + }, + { + "epoch": 2.7872804359759606, + "grad_norm": 5.411538124084473, + "learning_rate": 0.0031225977820752023, + "loss": 7.7234, + "step": 684200 + }, + { + "epoch": 2.787687813999342, + "grad_norm": 3.5579845905303955, + "learning_rate": 0.0031221171154382573, + "loss": 7.6876, + "step": 684300 + }, + { + "epoch": 2.7880951920227237, + "grad_norm": 2.8582041263580322, + "learning_rate": 0.0031216364244033737, + "loss": 7.7063, + "step": 684400 + }, + { + "epoch": 2.788502570046105, + "grad_norm": 6.498418807983398, + "learning_rate": 0.0031211557089895585, + "loss": 7.7511, + "step": 684500 + }, + { + "epoch": 2.7889099480694863, + "grad_norm": 4.134893417358398, + "learning_rate": 0.0031206749692158087, + "loss": 7.704, + "step": 684600 + }, + { + "epoch": 2.789317326092868, + "grad_norm": 2.3614683151245117, + "learning_rate": 0.0031201942051011355, + "loss": 7.756, + "step": 684700 + }, + { + "epoch": 2.7897247041162494, + "grad_norm": 2.780640125274658, + "learning_rate": 0.0031197134166645445, + "loss": 7.7161, + "step": 684800 + }, + { + "epoch": 2.790132082139631, + "grad_norm": 5.244045257568359, + "learning_rate": 0.0031192326039250445, + "loss": 7.7454, + "step": 684900 + }, + { + "epoch": 2.7905394601630125, + "grad_norm": 8.637303352355957, + "learning_rate": 0.0031187517669016454, + "loss": 7.7301, + "step": 685000 + }, + { + "epoch": 2.7905394601630125, + "eval_MaskedAccuracy": 0.5043351051933935, + "eval_loss": 1.6246445178985596, + "eval_runtime": 148.7432, + "eval_samples_per_second": 426.749, + "eval_steps_per_second": 1.667, + "step": 685000 + }, + { + "epoch": 2.790946838186394, + "grad_norm": 7.704771041870117, + "learning_rate": 0.0031182709056133554, + "loss": 7.7314, + "step": 685100 + }, + { + "epoch": 2.791354216209775, + "grad_norm": 3.6072418689727783, + "learning_rate": 0.0031177900200791845, + "loss": 7.728, + "step": 685200 + }, + { + "epoch": 2.7917615942331566, + "grad_norm": 3.866476535797119, + "learning_rate": 0.0031173091103181504, + "loss": 7.7149, + "step": 685300 + }, + { + "epoch": 2.792168972256538, + "grad_norm": 3.703122854232788, + "learning_rate": 0.0031168281763492606, + "loss": 7.7016, + "step": 685400 + }, + { + "epoch": 2.7925763502799197, + "grad_norm": 3.259774923324585, + "learning_rate": 0.003116347218191534, + "loss": 7.715, + "step": 685500 + }, + { + "epoch": 2.792983728303301, + "grad_norm": 4.0200324058532715, + "learning_rate": 0.00311586623586398, + "loss": 7.7021, + "step": 685600 + }, + { + "epoch": 2.7933911063266823, + "grad_norm": 3.2888174057006836, + "learning_rate": 0.0031153852293856177, + "loss": 7.7336, + "step": 685700 + }, + { + "epoch": 2.793798484350064, + "grad_norm": 4.4348368644714355, + "learning_rate": 0.0031149041987754617, + "loss": 7.7349, + "step": 685800 + }, + { + "epoch": 2.7942058623734454, + "grad_norm": 2.963186502456665, + "learning_rate": 0.0031144231440525315, + "loss": 7.6976, + "step": 685900 + }, + { + "epoch": 2.794613240396827, + "grad_norm": 5.670172214508057, + "learning_rate": 0.003113942065235843, + "loss": 7.7188, + "step": 686000 + }, + { + "epoch": 2.794613240396827, + "eval_MaskedAccuracy": 0.504245478068693, + "eval_loss": 1.6286091804504395, + "eval_runtime": 149.342, + "eval_samples_per_second": 425.038, + "eval_steps_per_second": 1.661, + "step": 686000 + }, + { + "epoch": 2.7950206184202084, + "grad_norm": 2.3668212890625, + "learning_rate": 0.003113460962344421, + "loss": 7.6912, + "step": 686100 + }, + { + "epoch": 2.79542799644359, + "grad_norm": 4.4726786613464355, + "learning_rate": 0.0031129798353972784, + "loss": 7.7345, + "step": 686200 + }, + { + "epoch": 2.7958353744669715, + "grad_norm": 4.388908386230469, + "learning_rate": 0.0031124986844134443, + "loss": 7.753, + "step": 686300 + }, + { + "epoch": 2.796242752490353, + "grad_norm": 7.050173282623291, + "learning_rate": 0.0031120175094119344, + "loss": 7.7281, + "step": 686400 + }, + { + "epoch": 2.796650130513734, + "grad_norm": 4.756292819976807, + "learning_rate": 0.0031115363104117773, + "loss": 7.72, + "step": 686500 + }, + { + "epoch": 2.7970575085371157, + "grad_norm": 5.236794948577881, + "learning_rate": 0.0031110550874319976, + "loss": 7.7203, + "step": 686600 + }, + { + "epoch": 2.797464886560497, + "grad_norm": 6.344773769378662, + "learning_rate": 0.0031105738404916166, + "loss": 7.7042, + "step": 686700 + }, + { + "epoch": 2.7978722645838787, + "grad_norm": 6.981508255004883, + "learning_rate": 0.003110092569609664, + "loss": 7.7147, + "step": 686800 + }, + { + "epoch": 2.7982796426072603, + "grad_norm": 4.591001987457275, + "learning_rate": 0.003109611274805169, + "loss": 7.721, + "step": 686900 + }, + { + "epoch": 2.7986870206306413, + "grad_norm": 5.013411045074463, + "learning_rate": 0.003109129956097154, + "loss": 7.7221, + "step": 687000 + }, + { + "epoch": 2.7986870206306413, + "eval_MaskedAccuracy": 0.5041233797290112, + "eval_loss": 1.6214803457260132, + "eval_runtime": 150.2376, + "eval_samples_per_second": 422.504, + "eval_steps_per_second": 1.651, + "step": 687000 + }, + { + "epoch": 2.799094398654023, + "grad_norm": 4.680105686187744, + "learning_rate": 0.003108648613504652, + "loss": 7.7321, + "step": 687100 + }, + { + "epoch": 2.7995017766774044, + "grad_norm": 4.8180389404296875, + "learning_rate": 0.0031081672470466906, + "loss": 7.7019, + "step": 687200 + }, + { + "epoch": 2.799909154700786, + "grad_norm": 3.550044536590576, + "learning_rate": 0.0031076858567423006, + "loss": 7.7062, + "step": 687300 + }, + { + "epoch": 2.8003165327241675, + "grad_norm": 2.391879081726074, + "learning_rate": 0.0031072044426105117, + "loss": 7.7312, + "step": 687400 + }, + { + "epoch": 2.800723910747549, + "grad_norm": 5.2197041511535645, + "learning_rate": 0.003106723004670364, + "loss": 7.7202, + "step": 687500 + }, + { + "epoch": 2.8011312887709305, + "grad_norm": 4.447170734405518, + "learning_rate": 0.0031062415429408898, + "loss": 7.7159, + "step": 687600 + }, + { + "epoch": 2.8015386667943116, + "grad_norm": 4.965562343597412, + "learning_rate": 0.0031057600574411217, + "loss": 7.7362, + "step": 687700 + }, + { + "epoch": 2.801946044817693, + "grad_norm": 5.058368682861328, + "learning_rate": 0.0031052785481900927, + "loss": 7.7302, + "step": 687800 + }, + { + "epoch": 2.8023534228410747, + "grad_norm": 3.9994418621063232, + "learning_rate": 0.003104797015206845, + "loss": 7.7124, + "step": 687900 + }, + { + "epoch": 2.8027608008644562, + "grad_norm": 3.2288706302642822, + "learning_rate": 0.0031043154585104105, + "loss": 7.6999, + "step": 688000 + }, + { + "epoch": 2.8027608008644562, + "eval_MaskedAccuracy": 0.5048591890666292, + "eval_loss": 1.6219958066940308, + "eval_runtime": 149.2094, + "eval_samples_per_second": 425.416, + "eval_steps_per_second": 1.662, + "step": 688000 + }, + { + "epoch": 2.8031681788878378, + "grad_norm": 3.697815418243408, + "learning_rate": 0.0031038338781198328, + "loss": 7.7462, + "step": 688100 + }, + { + "epoch": 2.803575556911219, + "grad_norm": 6.208542346954346, + "learning_rate": 0.003103352274054148, + "loss": 7.7115, + "step": 688200 + }, + { + "epoch": 2.8039829349346004, + "grad_norm": 4.559090614318848, + "learning_rate": 0.003102870646332399, + "loss": 7.7318, + "step": 688300 + }, + { + "epoch": 2.804390312957982, + "grad_norm": 3.0643157958984375, + "learning_rate": 0.0031023889949736245, + "loss": 7.7203, + "step": 688400 + }, + { + "epoch": 2.8047976909813634, + "grad_norm": 5.9461669921875, + "learning_rate": 0.0031019073199968676, + "loss": 7.7279, + "step": 688500 + }, + { + "epoch": 2.805205069004745, + "grad_norm": 8.562631607055664, + "learning_rate": 0.003101425621421172, + "loss": 7.7171, + "step": 688600 + }, + { + "epoch": 2.8056124470281265, + "grad_norm": 8.356283187866211, + "learning_rate": 0.0031009438992655844, + "loss": 7.6994, + "step": 688700 + }, + { + "epoch": 2.806019825051508, + "grad_norm": 10.357708930969238, + "learning_rate": 0.003100462153549147, + "loss": 7.7289, + "step": 688800 + }, + { + "epoch": 2.8064272030748896, + "grad_norm": 5.205289363861084, + "learning_rate": 0.0030999803842909034, + "loss": 7.7502, + "step": 688900 + }, + { + "epoch": 2.8068345810982707, + "grad_norm": 7.57562780380249, + "learning_rate": 0.0030994985915099036, + "loss": 7.7263, + "step": 689000 + }, + { + "epoch": 2.8068345810982707, + "eval_MaskedAccuracy": 0.504349269193066, + "eval_loss": 1.62308669090271, + "eval_runtime": 149.1934, + "eval_samples_per_second": 425.461, + "eval_steps_per_second": 1.662, + "step": 689000 + }, + { + "epoch": 2.807241959121652, + "grad_norm": 5.566107273101807, + "learning_rate": 0.003099016775225192, + "loss": 7.7312, + "step": 689100 + }, + { + "epoch": 2.8076493371450337, + "grad_norm": 5.995349407196045, + "learning_rate": 0.0030985349354558205, + "loss": 7.7518, + "step": 689200 + }, + { + "epoch": 2.8080567151684153, + "grad_norm": 3.4671058654785156, + "learning_rate": 0.0030980530722208442, + "loss": 7.7189, + "step": 689300 + }, + { + "epoch": 2.8084640931917964, + "grad_norm": 3.8707988262176514, + "learning_rate": 0.003097571185539305, + "loss": 7.6957, + "step": 689400 + }, + { + "epoch": 2.808871471215178, + "grad_norm": 3.558927059173584, + "learning_rate": 0.0030970892754302592, + "loss": 7.7169, + "step": 689500 + }, + { + "epoch": 2.8092788492385594, + "grad_norm": 2.6028566360473633, + "learning_rate": 0.0030966073419127584, + "loss": 7.7107, + "step": 689600 + }, + { + "epoch": 2.809686227261941, + "grad_norm": 4.1595940589904785, + "learning_rate": 0.0030961253850058526, + "loss": 7.7297, + "step": 689700 + }, + { + "epoch": 2.8100936052853225, + "grad_norm": 3.851271390914917, + "learning_rate": 0.003095643404728598, + "loss": 7.7341, + "step": 689800 + }, + { + "epoch": 2.810500983308704, + "grad_norm": 6.789198875427246, + "learning_rate": 0.0030951614011000507, + "loss": 7.7163, + "step": 689900 + }, + { + "epoch": 2.8109083613320855, + "grad_norm": 4.078215599060059, + "learning_rate": 0.00309467937413927, + "loss": 7.6922, + "step": 690000 + }, + { + "epoch": 2.8109083613320855, + "eval_MaskedAccuracy": 0.5041951639695755, + "eval_loss": 1.623473882675171, + "eval_runtime": 148.8146, + "eval_samples_per_second": 426.544, + "eval_steps_per_second": 1.667, + "step": 690000 + }, + { + "epoch": 2.811315739355467, + "grad_norm": 5.313997745513916, + "learning_rate": 0.00309419732386531, + "loss": 7.6959, + "step": 690100 + }, + { + "epoch": 2.811723117378848, + "grad_norm": 18.729393005371094, + "learning_rate": 0.0030937152502972243, + "loss": 7.7084, + "step": 690200 + }, + { + "epoch": 2.8121304954022297, + "grad_norm": 4.033646583557129, + "learning_rate": 0.0030932331534540753, + "loss": 7.7469, + "step": 690300 + }, + { + "epoch": 2.8125378734256112, + "grad_norm": 2.9967758655548096, + "learning_rate": 0.0030927510333549277, + "loss": 7.7383, + "step": 690400 + }, + { + "epoch": 2.8129452514489928, + "grad_norm": 5.653675079345703, + "learning_rate": 0.0030922688900188386, + "loss": 7.7156, + "step": 690500 + }, + { + "epoch": 2.8133526294723743, + "grad_norm": 5.748961925506592, + "learning_rate": 0.0030917867234648666, + "loss": 7.7241, + "step": 690600 + }, + { + "epoch": 2.8137600074957554, + "grad_norm": 4.450202465057373, + "learning_rate": 0.0030913045337120776, + "loss": 7.7102, + "step": 690700 + }, + { + "epoch": 2.814167385519137, + "grad_norm": 5.869491100311279, + "learning_rate": 0.0030908223207795366, + "loss": 7.7178, + "step": 690800 + }, + { + "epoch": 2.8145747635425185, + "grad_norm": 12.048808097839355, + "learning_rate": 0.0030903400846863053, + "loss": 7.7094, + "step": 690900 + }, + { + "epoch": 2.8149821415659, + "grad_norm": 6.04811429977417, + "learning_rate": 0.003089857825451448, + "loss": 7.7201, + "step": 691000 + }, + { + "epoch": 2.8149821415659, + "eval_MaskedAccuracy": 0.5040807661085703, + "eval_loss": 1.6305480003356934, + "eval_runtime": 149.2591, + "eval_samples_per_second": 425.274, + "eval_steps_per_second": 1.662, + "step": 691000 + }, + { + "epoch": 2.8153895195892815, + "grad_norm": 5.337472915649414, + "learning_rate": 0.0030893755430940334, + "loss": 7.7292, + "step": 691100 + }, + { + "epoch": 2.815796897612663, + "grad_norm": 5.841711521148682, + "learning_rate": 0.0030888932376331316, + "loss": 7.7428, + "step": 691200 + }, + { + "epoch": 2.8162042756360446, + "grad_norm": 6.443739891052246, + "learning_rate": 0.0030884109090878065, + "loss": 7.7095, + "step": 691300 + }, + { + "epoch": 2.816611653659426, + "grad_norm": 3.8162341117858887, + "learning_rate": 0.0030879285574771263, + "loss": 7.711, + "step": 691400 + }, + { + "epoch": 2.817019031682807, + "grad_norm": 3.2067131996154785, + "learning_rate": 0.003087446182820161, + "loss": 7.7339, + "step": 691500 + }, + { + "epoch": 2.8174264097061887, + "grad_norm": 3.846188545227051, + "learning_rate": 0.003086963785135982, + "loss": 7.7297, + "step": 691600 + }, + { + "epoch": 2.8178337877295703, + "grad_norm": 7.100821018218994, + "learning_rate": 0.0030864813644436613, + "loss": 7.7268, + "step": 691700 + }, + { + "epoch": 2.818241165752952, + "grad_norm": 8.114201545715332, + "learning_rate": 0.0030859989207622737, + "loss": 7.7188, + "step": 691800 + }, + { + "epoch": 2.818648543776333, + "grad_norm": 4.804603576660156, + "learning_rate": 0.0030855164541108916, + "loss": 7.6748, + "step": 691900 + }, + { + "epoch": 2.8190559217997144, + "grad_norm": 9.302783012390137, + "learning_rate": 0.003085033964508592, + "loss": 7.7527, + "step": 692000 + }, + { + "epoch": 2.8190559217997144, + "eval_MaskedAccuracy": 0.503542860675209, + "eval_loss": 1.6362484693527222, + "eval_runtime": 149.521, + "eval_samples_per_second": 424.529, + "eval_steps_per_second": 1.659, + "step": 692000 + }, + { + "epoch": 2.819463299823096, + "grad_norm": 11.042616844177246, + "learning_rate": 0.0030845514519744458, + "loss": 7.7349, + "step": 692100 + }, + { + "epoch": 2.8198706778464775, + "grad_norm": 7.054125785827637, + "learning_rate": 0.0030840689165275302, + "loss": 7.7114, + "step": 692200 + }, + { + "epoch": 2.820278055869859, + "grad_norm": 8.906975746154785, + "learning_rate": 0.0030835863581869247, + "loss": 7.7168, + "step": 692300 + }, + { + "epoch": 2.8206854338932406, + "grad_norm": 2.9098007678985596, + "learning_rate": 0.0030831037769717055, + "loss": 7.7402, + "step": 692400 + }, + { + "epoch": 2.821092811916622, + "grad_norm": 3.044633388519287, + "learning_rate": 0.003082621172900954, + "loss": 7.7195, + "step": 692500 + }, + { + "epoch": 2.8215001899400036, + "grad_norm": 2.5839884281158447, + "learning_rate": 0.0030821385459937436, + "loss": 7.7109, + "step": 692600 + }, + { + "epoch": 2.8219075679633847, + "grad_norm": 7.343106746673584, + "learning_rate": 0.0030816558962691604, + "loss": 7.7065, + "step": 692700 + }, + { + "epoch": 2.8223149459867662, + "grad_norm": 3.198737621307373, + "learning_rate": 0.003081173223746287, + "loss": 7.7233, + "step": 692800 + }, + { + "epoch": 2.8227223240101478, + "grad_norm": 2.686126708984375, + "learning_rate": 0.003080690528444206, + "loss": 7.7205, + "step": 692900 + }, + { + "epoch": 2.8231297020335293, + "grad_norm": 2.5314345359802246, + "learning_rate": 0.003080207810381999, + "loss": 7.7184, + "step": 693000 + }, + { + "epoch": 2.8231297020335293, + "eval_MaskedAccuracy": 0.504285800770228, + "eval_loss": 1.6302218437194824, + "eval_runtime": 149.4239, + "eval_samples_per_second": 424.805, + "eval_steps_per_second": 1.66, + "step": 693000 + }, + { + "epoch": 2.823537080056911, + "grad_norm": 7.544650077819824, + "learning_rate": 0.0030797250695787496, + "loss": 7.7302, + "step": 693100 + }, + { + "epoch": 2.823944458080292, + "grad_norm": 4.015535354614258, + "learning_rate": 0.0030792423060535426, + "loss": 7.7167, + "step": 693200 + }, + { + "epoch": 2.8243518361036735, + "grad_norm": 2.9468696117401123, + "learning_rate": 0.0030787595198254686, + "loss": 7.7445, + "step": 693300 + }, + { + "epoch": 2.824759214127055, + "grad_norm": 8.753650665283203, + "learning_rate": 0.003078276710913611, + "loss": 7.7316, + "step": 693400 + }, + { + "epoch": 2.8251665921504365, + "grad_norm": 5.600179672241211, + "learning_rate": 0.003077793879337059, + "loss": 7.7287, + "step": 693500 + }, + { + "epoch": 2.825573970173818, + "grad_norm": 5.177038669586182, + "learning_rate": 0.003077311025114899, + "loss": 7.6927, + "step": 693600 + }, + { + "epoch": 2.8259813481971996, + "grad_norm": 8.811504364013672, + "learning_rate": 0.0030768281482662263, + "loss": 7.679, + "step": 693700 + }, + { + "epoch": 2.826388726220581, + "grad_norm": 6.328119277954102, + "learning_rate": 0.003076345248810129, + "loss": 7.7264, + "step": 693800 + }, + { + "epoch": 2.8267961042439627, + "grad_norm": 4.487634658813477, + "learning_rate": 0.0030758623267656974, + "loss": 7.7199, + "step": 693900 + }, + { + "epoch": 2.8272034822673437, + "grad_norm": 4.589730739593506, + "learning_rate": 0.003075379382152023, + "loss": 7.7223, + "step": 694000 + }, + { + "epoch": 2.8272034822673437, + "eval_MaskedAccuracy": 0.5045122670012522, + "eval_loss": 1.6221905946731567, + "eval_runtime": 148.9319, + "eval_samples_per_second": 426.208, + "eval_steps_per_second": 1.665, + "step": 694000 + }, + { + "epoch": 2.8276108602907253, + "grad_norm": 3.289022207260132, + "learning_rate": 0.003074896414988207, + "loss": 7.7231, + "step": 694100 + }, + { + "epoch": 2.828018238314107, + "grad_norm": 6.647270202636719, + "learning_rate": 0.0030744134252933333, + "loss": 7.7364, + "step": 694200 + }, + { + "epoch": 2.8284256163374883, + "grad_norm": 6.761031150817871, + "learning_rate": 0.0030739304130865054, + "loss": 7.721, + "step": 694300 + }, + { + "epoch": 2.8288329943608694, + "grad_norm": 4.394124984741211, + "learning_rate": 0.0030734473783868123, + "loss": 7.7176, + "step": 694400 + }, + { + "epoch": 2.829240372384251, + "grad_norm": 6.165120601654053, + "learning_rate": 0.003072964321213357, + "loss": 7.7101, + "step": 694500 + }, + { + "epoch": 2.8296477504076325, + "grad_norm": 4.580467700958252, + "learning_rate": 0.0030724812415852325, + "loss": 7.6946, + "step": 694600 + }, + { + "epoch": 2.830055128431014, + "grad_norm": 3.2740745544433594, + "learning_rate": 0.003071998139521537, + "loss": 7.6949, + "step": 694700 + }, + { + "epoch": 2.8304625064543956, + "grad_norm": 8.765399932861328, + "learning_rate": 0.0030715150150413748, + "loss": 7.6705, + "step": 694800 + }, + { + "epoch": 2.830869884477777, + "grad_norm": 3.4015614986419678, + "learning_rate": 0.0030710318681638447, + "loss": 7.7404, + "step": 694900 + }, + { + "epoch": 2.8312772625011586, + "grad_norm": 3.995572805404663, + "learning_rate": 0.0030705486989080454, + "loss": 7.7388, + "step": 695000 + }, + { + "epoch": 2.8312772625011586, + "eval_MaskedAccuracy": 0.5046110943227539, + "eval_loss": 1.633867621421814, + "eval_runtime": 149.1334, + "eval_samples_per_second": 425.632, + "eval_steps_per_second": 1.663, + "step": 695000 + }, + { + "epoch": 2.83168464052454, + "grad_norm": 6.184289455413818, + "learning_rate": 0.0030700655072930824, + "loss": 7.7401, + "step": 695100 + }, + { + "epoch": 2.8320920185479213, + "grad_norm": 7.292242050170898, + "learning_rate": 0.0030695822933380574, + "loss": 7.747, + "step": 695200 + }, + { + "epoch": 2.832499396571303, + "grad_norm": 6.968306064605713, + "learning_rate": 0.0030690990570620713, + "loss": 7.6917, + "step": 695300 + }, + { + "epoch": 2.8329067745946843, + "grad_norm": 4.081552982330322, + "learning_rate": 0.0030686157984842356, + "loss": 7.7035, + "step": 695400 + }, + { + "epoch": 2.833314152618066, + "grad_norm": 6.458633899688721, + "learning_rate": 0.003068132517623655, + "loss": 7.6979, + "step": 695500 + }, + { + "epoch": 2.8337215306414474, + "grad_norm": 3.7281322479248047, + "learning_rate": 0.003067649214499431, + "loss": 7.7263, + "step": 695600 + }, + { + "epoch": 2.8341289086648285, + "grad_norm": 7.177165985107422, + "learning_rate": 0.0030671658891306755, + "loss": 7.6904, + "step": 695700 + }, + { + "epoch": 2.83453628668821, + "grad_norm": 7.323238849639893, + "learning_rate": 0.0030666825415364965, + "loss": 7.6749, + "step": 695800 + }, + { + "epoch": 2.8349436647115915, + "grad_norm": 7.35430908203125, + "learning_rate": 0.003066199171736003, + "loss": 7.719, + "step": 695900 + }, + { + "epoch": 2.835351042734973, + "grad_norm": 7.7270073890686035, + "learning_rate": 0.003065715779748306, + "loss": 7.708, + "step": 696000 + }, + { + "epoch": 2.835351042734973, + "eval_MaskedAccuracy": 0.5048210869445112, + "eval_loss": 1.623809576034546, + "eval_runtime": 148.8926, + "eval_samples_per_second": 426.321, + "eval_steps_per_second": 1.666, + "step": 696000 + }, + { + "epoch": 2.8357584207583546, + "grad_norm": 3.1805379390716553, + "learning_rate": 0.0030652323655925118, + "loss": 7.6836, + "step": 696100 + }, + { + "epoch": 2.836165798781736, + "grad_norm": 3.5719704627990723, + "learning_rate": 0.003064748929287736, + "loss": 7.7184, + "step": 696200 + }, + { + "epoch": 2.8365731768051177, + "grad_norm": 2.7994565963745117, + "learning_rate": 0.0030642654708530905, + "loss": 7.7149, + "step": 696300 + }, + { + "epoch": 2.836980554828499, + "grad_norm": 5.4765625, + "learning_rate": 0.003063781990307692, + "loss": 7.6906, + "step": 696400 + }, + { + "epoch": 2.8373879328518803, + "grad_norm": 3.522538423538208, + "learning_rate": 0.003063298487670654, + "loss": 7.6981, + "step": 696500 + }, + { + "epoch": 2.837795310875262, + "grad_norm": 2.3591034412384033, + "learning_rate": 0.003062814962961088, + "loss": 7.7021, + "step": 696600 + }, + { + "epoch": 2.8382026888986434, + "grad_norm": 5.188504219055176, + "learning_rate": 0.0030623314161981148, + "loss": 7.6768, + "step": 696700 + }, + { + "epoch": 2.838610066922025, + "grad_norm": 12.307287216186523, + "learning_rate": 0.0030618478474008465, + "loss": 7.6884, + "step": 696800 + }, + { + "epoch": 2.839017444945406, + "grad_norm": 4.71583890914917, + "learning_rate": 0.0030613642565884077, + "loss": 7.7258, + "step": 696900 + }, + { + "epoch": 2.8394248229687875, + "grad_norm": 7.0869832038879395, + "learning_rate": 0.003060880643779911, + "loss": 7.6911, + "step": 697000 + }, + { + "epoch": 2.8394248229687875, + "eval_MaskedAccuracy": 0.5041891362388816, + "eval_loss": 1.6216367483139038, + "eval_runtime": 149.287, + "eval_samples_per_second": 425.195, + "eval_steps_per_second": 1.661, + "step": 697000 + }, + { + "epoch": 2.839832200992169, + "grad_norm": 3.5789458751678467, + "learning_rate": 0.0030603970089944812, + "loss": 7.7058, + "step": 697100 + }, + { + "epoch": 2.8402395790155506, + "grad_norm": 7.403262615203857, + "learning_rate": 0.0030599133522512378, + "loss": 7.6868, + "step": 697200 + }, + { + "epoch": 2.840646957038932, + "grad_norm": 7.532144546508789, + "learning_rate": 0.0030594296735692956, + "loss": 7.7527, + "step": 697300 + }, + { + "epoch": 2.8410543350623136, + "grad_norm": 3.671082019805908, + "learning_rate": 0.003058945972967789, + "loss": 7.7012, + "step": 697400 + }, + { + "epoch": 2.841461713085695, + "grad_norm": 8.792901992797852, + "learning_rate": 0.003058462250465834, + "loss": 7.6974, + "step": 697500 + }, + { + "epoch": 2.8418690911090767, + "grad_norm": 6.278231620788574, + "learning_rate": 0.0030579785060825546, + "loss": 7.7238, + "step": 697600 + }, + { + "epoch": 2.842276469132458, + "grad_norm": 2.9636943340301514, + "learning_rate": 0.0030574947398370805, + "loss": 7.7156, + "step": 697700 + }, + { + "epoch": 2.8426838471558393, + "grad_norm": 5.388239860534668, + "learning_rate": 0.0030570109517485348, + "loss": 7.7217, + "step": 697800 + }, + { + "epoch": 2.843091225179221, + "grad_norm": 13.538797378540039, + "learning_rate": 0.003056527141836042, + "loss": 7.6908, + "step": 697900 + }, + { + "epoch": 2.8434986032026024, + "grad_norm": 7.257206439971924, + "learning_rate": 0.0030560433101187337, + "loss": 7.7442, + "step": 698000 + }, + { + "epoch": 2.8434986032026024, + "eval_MaskedAccuracy": 0.5049515498740003, + "eval_loss": 1.6278083324432373, + "eval_runtime": 149.3787, + "eval_samples_per_second": 424.933, + "eval_steps_per_second": 1.66, + "step": 698000 + }, + { + "epoch": 2.843905981225984, + "grad_norm": 3.598336935043335, + "learning_rate": 0.0030555594566157355, + "loss": 7.7023, + "step": 698100 + }, + { + "epoch": 2.844313359249365, + "grad_norm": 6.302800178527832, + "learning_rate": 0.003055075581346176, + "loss": 7.7309, + "step": 698200 + }, + { + "epoch": 2.8447207372727465, + "grad_norm": 3.172574043273926, + "learning_rate": 0.0030545916843291907, + "loss": 7.7402, + "step": 698300 + }, + { + "epoch": 2.845128115296128, + "grad_norm": 3.85713267326355, + "learning_rate": 0.0030541077655839035, + "loss": 7.7114, + "step": 698400 + }, + { + "epoch": 2.8455354933195096, + "grad_norm": 3.4408833980560303, + "learning_rate": 0.0030536238251294514, + "loss": 7.7179, + "step": 698500 + }, + { + "epoch": 2.845942871342891, + "grad_norm": 6.391783714294434, + "learning_rate": 0.0030531398629849655, + "loss": 7.7357, + "step": 698600 + }, + { + "epoch": 2.8463502493662727, + "grad_norm": 7.269733905792236, + "learning_rate": 0.0030526558791695775, + "loss": 7.7042, + "step": 698700 + }, + { + "epoch": 2.846757627389654, + "grad_norm": 7.072800636291504, + "learning_rate": 0.0030521718737024236, + "loss": 7.6934, + "step": 698800 + }, + { + "epoch": 2.8471650054130357, + "grad_norm": 6.533130168914795, + "learning_rate": 0.0030516878466026382, + "loss": 7.7217, + "step": 698900 + }, + { + "epoch": 2.847572383436417, + "grad_norm": 5.605027198791504, + "learning_rate": 0.0030512037978893567, + "loss": 7.7091, + "step": 699000 + }, + { + "epoch": 2.847572383436417, + "eval_MaskedAccuracy": 0.5045449432806524, + "eval_loss": 1.6316790580749512, + "eval_runtime": 149.5385, + "eval_samples_per_second": 424.479, + "eval_steps_per_second": 1.658, + "step": 699000 + }, + { + "epoch": 2.8479797614597984, + "grad_norm": 10.6749267578125, + "learning_rate": 0.003050719727581721, + "loss": 7.7156, + "step": 699100 + }, + { + "epoch": 2.84838713948318, + "grad_norm": 6.989471435546875, + "learning_rate": 0.003050235635698865, + "loss": 7.7154, + "step": 699200 + }, + { + "epoch": 2.8487945175065614, + "grad_norm": 7.544661045074463, + "learning_rate": 0.0030497515222599274, + "loss": 7.7293, + "step": 699300 + }, + { + "epoch": 2.8492018955299425, + "grad_norm": 4.551535129547119, + "learning_rate": 0.0030492673872840525, + "loss": 7.7206, + "step": 699400 + }, + { + "epoch": 2.849609273553324, + "grad_norm": 5.39074182510376, + "learning_rate": 0.0030487832307903727, + "loss": 7.7176, + "step": 699500 + }, + { + "epoch": 2.8500166515767056, + "grad_norm": 4.181148529052734, + "learning_rate": 0.003048299052798037, + "loss": 7.7149, + "step": 699600 + }, + { + "epoch": 2.850424029600087, + "grad_norm": 3.8611361980438232, + "learning_rate": 0.003047814853326184, + "loss": 7.7027, + "step": 699700 + }, + { + "epoch": 2.8508314076234686, + "grad_norm": 3.8873724937438965, + "learning_rate": 0.0030473306323939536, + "loss": 7.7517, + "step": 699800 + }, + { + "epoch": 2.85123878564685, + "grad_norm": 3.9484033584594727, + "learning_rate": 0.003046846390020493, + "loss": 7.7039, + "step": 699900 + }, + { + "epoch": 2.8516461636702317, + "grad_norm": 9.328553199768066, + "learning_rate": 0.0030463621262249454, + "loss": 7.7465, + "step": 700000 + }, + { + "epoch": 2.8516461636702317, + "eval_MaskedAccuracy": 0.503998148291027, + "eval_loss": 1.63697350025177, + "eval_runtime": 150.2187, + "eval_samples_per_second": 422.557, + "eval_steps_per_second": 1.651, + "step": 700000 + }, + { + "epoch": 2.8520535416936132, + "grad_norm": 3.6931982040405273, + "learning_rate": 0.003045877841026461, + "loss": 7.6942, + "step": 700100 + }, + { + "epoch": 2.8524609197169943, + "grad_norm": 5.414756774902344, + "learning_rate": 0.0030453935344441794, + "loss": 7.7076, + "step": 700200 + }, + { + "epoch": 2.852868297740376, + "grad_norm": 3.762303590774536, + "learning_rate": 0.0030449092064972517, + "loss": 7.6774, + "step": 700300 + }, + { + "epoch": 2.8532756757637574, + "grad_norm": 4.2372283935546875, + "learning_rate": 0.0030444248572048254, + "loss": 7.7182, + "step": 700400 + }, + { + "epoch": 2.853683053787139, + "grad_norm": 8.580768585205078, + "learning_rate": 0.0030439404865860525, + "loss": 7.7171, + "step": 700500 + }, + { + "epoch": 2.8540904318105205, + "grad_norm": 6.197897434234619, + "learning_rate": 0.0030434560946600766, + "loss": 7.7389, + "step": 700600 + }, + { + "epoch": 2.8544978098339016, + "grad_norm": 7.621669769287109, + "learning_rate": 0.0030429716814460547, + "loss": 7.6687, + "step": 700700 + }, + { + "epoch": 2.854905187857283, + "grad_norm": 3.5647974014282227, + "learning_rate": 0.0030424872469631318, + "loss": 7.6928, + "step": 700800 + }, + { + "epoch": 2.8553125658806646, + "grad_norm": 8.431017875671387, + "learning_rate": 0.003042002791230462, + "loss": 7.7091, + "step": 700900 + }, + { + "epoch": 2.855719943904046, + "grad_norm": 4.402780532836914, + "learning_rate": 0.003041518314267202, + "loss": 7.7249, + "step": 701000 + }, + { + "epoch": 2.855719943904046, + "eval_MaskedAccuracy": 0.5047495204685684, + "eval_loss": 1.6300948858261108, + "eval_runtime": 148.4825, + "eval_samples_per_second": 427.498, + "eval_steps_per_second": 1.67, + "step": 701000 + }, + { + "epoch": 2.8561273219274277, + "grad_norm": 3.092787504196167, + "learning_rate": 0.0030410338160925037, + "loss": 7.691, + "step": 701100 + }, + { + "epoch": 2.856534699950809, + "grad_norm": 6.276670932769775, + "learning_rate": 0.0030405492967255214, + "loss": 7.724, + "step": 701200 + }, + { + "epoch": 2.8569420779741908, + "grad_norm": 7.114150047302246, + "learning_rate": 0.0030400647561854084, + "loss": 7.6765, + "step": 701300 + }, + { + "epoch": 2.8573494559975723, + "grad_norm": 9.160008430480957, + "learning_rate": 0.0030395801944913267, + "loss": 7.698, + "step": 701400 + }, + { + "epoch": 2.8577568340209534, + "grad_norm": 3.681234359741211, + "learning_rate": 0.0030390956116624307, + "loss": 7.7127, + "step": 701500 + }, + { + "epoch": 2.858164212044335, + "grad_norm": 3.5404510498046875, + "learning_rate": 0.003038611007717878, + "loss": 7.6865, + "step": 701600 + }, + { + "epoch": 2.8585715900677164, + "grad_norm": 4.3910040855407715, + "learning_rate": 0.0030381263826768294, + "loss": 7.7255, + "step": 701700 + }, + { + "epoch": 2.858978968091098, + "grad_norm": 4.79788064956665, + "learning_rate": 0.003037641736558443, + "loss": 7.7295, + "step": 701800 + }, + { + "epoch": 2.859386346114479, + "grad_norm": 4.587947845458984, + "learning_rate": 0.003037157069381879, + "loss": 7.684, + "step": 701900 + }, + { + "epoch": 2.8597937241378606, + "grad_norm": 9.907379150390625, + "learning_rate": 0.003036672381166298, + "loss": 7.7106, + "step": 702000 + }, + { + "epoch": 2.8597937241378606, + "eval_MaskedAccuracy": 0.5044675331927309, + "eval_loss": 1.6207369565963745, + "eval_runtime": 148.9726, + "eval_samples_per_second": 426.092, + "eval_steps_per_second": 1.665, + "step": 702000 + }, + { + "epoch": 2.860201102161242, + "grad_norm": 4.197406768798828, + "learning_rate": 0.0030361876719308682, + "loss": 7.7149, + "step": 702100 + }, + { + "epoch": 2.8606084801846237, + "grad_norm": 3.471236228942871, + "learning_rate": 0.0030357029416947477, + "loss": 7.7045, + "step": 702200 + }, + { + "epoch": 2.861015858208005, + "grad_norm": 3.9093289375305176, + "learning_rate": 0.0030352181904771012, + "loss": 7.7128, + "step": 702300 + }, + { + "epoch": 2.8614232362313867, + "grad_norm": 8.871551513671875, + "learning_rate": 0.003034733418297096, + "loss": 7.7123, + "step": 702400 + }, + { + "epoch": 2.8618306142547683, + "grad_norm": 4.724961280822754, + "learning_rate": 0.0030342486251738942, + "loss": 7.7155, + "step": 702500 + }, + { + "epoch": 2.86223799227815, + "grad_norm": 5.354717254638672, + "learning_rate": 0.0030337638111266616, + "loss": 7.6495, + "step": 702600 + }, + { + "epoch": 2.862645370301531, + "grad_norm": 4.442861557006836, + "learning_rate": 0.003033278976174568, + "loss": 7.7297, + "step": 702700 + }, + { + "epoch": 2.8630527483249124, + "grad_norm": 5.711311340332031, + "learning_rate": 0.003032794120336781, + "loss": 7.723, + "step": 702800 + }, + { + "epoch": 2.863460126348294, + "grad_norm": 6.135890007019043, + "learning_rate": 0.003032309243632472, + "loss": 7.7171, + "step": 702900 + }, + { + "epoch": 2.8638675043716755, + "grad_norm": 5.509307384490967, + "learning_rate": 0.0030318243460808044, + "loss": 7.7359, + "step": 703000 + }, + { + "epoch": 2.8638675043716755, + "eval_MaskedAccuracy": 0.5043071706895894, + "eval_loss": 1.6213853359222412, + "eval_runtime": 149.5854, + "eval_samples_per_second": 424.346, + "eval_steps_per_second": 1.658, + "step": 703000 + }, + { + "epoch": 2.864274882395057, + "grad_norm": 3.526852607727051, + "learning_rate": 0.0030313394277009568, + "loss": 7.6897, + "step": 703100 + }, + { + "epoch": 2.864682260418438, + "grad_norm": 3.783799171447754, + "learning_rate": 0.003030854488512097, + "loss": 7.7135, + "step": 703200 + }, + { + "epoch": 2.8650896384418196, + "grad_norm": 3.1931519508361816, + "learning_rate": 0.0030303695285333948, + "loss": 7.6781, + "step": 703300 + }, + { + "epoch": 2.865497016465201, + "grad_norm": 6.136069297790527, + "learning_rate": 0.0030298845477840252, + "loss": 7.7244, + "step": 703400 + }, + { + "epoch": 2.8659043944885827, + "grad_norm": 5.632696151733398, + "learning_rate": 0.003029399546283161, + "loss": 7.7179, + "step": 703500 + }, + { + "epoch": 2.8663117725119642, + "grad_norm": 5.666800022125244, + "learning_rate": 0.003028914524049981, + "loss": 7.7362, + "step": 703600 + }, + { + "epoch": 2.8667191505353458, + "grad_norm": 5.373630523681641, + "learning_rate": 0.003028429481103657, + "loss": 7.7143, + "step": 703700 + }, + { + "epoch": 2.8671265285587273, + "grad_norm": 4.59381628036499, + "learning_rate": 0.0030279444174633632, + "loss": 7.7349, + "step": 703800 + }, + { + "epoch": 2.867533906582109, + "grad_norm": 4.590327262878418, + "learning_rate": 0.0030274593331482817, + "loss": 7.7045, + "step": 703900 + }, + { + "epoch": 2.86794128460549, + "grad_norm": 6.63765811920166, + "learning_rate": 0.003026974228177588, + "loss": 7.7145, + "step": 704000 + }, + { + "epoch": 2.86794128460549, + "eval_MaskedAccuracy": 0.5050363644742049, + "eval_loss": 1.631582260131836, + "eval_runtime": 149.3218, + "eval_samples_per_second": 425.095, + "eval_steps_per_second": 1.661, + "step": 704000 + }, + { + "epoch": 2.8683486626288714, + "grad_norm": 2.6514902114868164, + "learning_rate": 0.0030264891025704615, + "loss": 7.7125, + "step": 704100 + }, + { + "epoch": 2.868756040652253, + "grad_norm": 3.9465410709381104, + "learning_rate": 0.003026003956346082, + "loss": 7.7155, + "step": 704200 + }, + { + "epoch": 2.8691634186756345, + "grad_norm": 14.424043655395508, + "learning_rate": 0.0030255187895236305, + "loss": 7.7102, + "step": 704300 + }, + { + "epoch": 2.8695707966990156, + "grad_norm": 5.492244243621826, + "learning_rate": 0.0030250336021222872, + "loss": 7.7153, + "step": 704400 + }, + { + "epoch": 2.869978174722397, + "grad_norm": 5.44766092300415, + "learning_rate": 0.0030245483941612338, + "loss": 7.7064, + "step": 704500 + }, + { + "epoch": 2.8703855527457787, + "grad_norm": 13.81653118133545, + "learning_rate": 0.003024063165659656, + "loss": 7.7101, + "step": 704600 + }, + { + "epoch": 2.87079293076916, + "grad_norm": 5.759309768676758, + "learning_rate": 0.0030235779166367317, + "loss": 7.7331, + "step": 704700 + }, + { + "epoch": 2.8712003087925417, + "grad_norm": 5.606010913848877, + "learning_rate": 0.003023092647111651, + "loss": 7.6919, + "step": 704800 + }, + { + "epoch": 2.8716076868159233, + "grad_norm": 3.6325557231903076, + "learning_rate": 0.0030226073571035968, + "loss": 7.7012, + "step": 704900 + }, + { + "epoch": 2.872015064839305, + "grad_norm": 5.615848541259766, + "learning_rate": 0.0030221220466317552, + "loss": 7.7007, + "step": 705000 + }, + { + "epoch": 2.872015064839305, + "eval_MaskedAccuracy": 0.5040617252538367, + "eval_loss": 1.6279255151748657, + "eval_runtime": 150.6296, + "eval_samples_per_second": 421.405, + "eval_steps_per_second": 1.646, + "step": 705000 + }, + { + "epoch": 2.8724224428626863, + "grad_norm": 5.490151882171631, + "learning_rate": 0.003021636715715311, + "loss": 7.7074, + "step": 705100 + }, + { + "epoch": 2.8728298208860674, + "grad_norm": 4.993218898773193, + "learning_rate": 0.0030211513643734547, + "loss": 7.6761, + "step": 705200 + }, + { + "epoch": 2.873237198909449, + "grad_norm": 6.155918598175049, + "learning_rate": 0.0030206659926253756, + "loss": 7.7004, + "step": 705300 + }, + { + "epoch": 2.8736445769328305, + "grad_norm": 4.703640937805176, + "learning_rate": 0.003020180600490261, + "loss": 7.7017, + "step": 705400 + }, + { + "epoch": 2.874051954956212, + "grad_norm": 6.884881496429443, + "learning_rate": 0.0030196951879873017, + "loss": 7.7238, + "step": 705500 + }, + { + "epoch": 2.8744593329795936, + "grad_norm": 8.275361061096191, + "learning_rate": 0.00301920975513569, + "loss": 7.7167, + "step": 705600 + }, + { + "epoch": 2.8748667110029746, + "grad_norm": 6.780029296875, + "learning_rate": 0.0030187243019546162, + "loss": 7.7141, + "step": 705700 + }, + { + "epoch": 2.875274089026356, + "grad_norm": 3.2327849864959717, + "learning_rate": 0.0030182388284632745, + "loss": 7.7281, + "step": 705800 + }, + { + "epoch": 2.8756814670497377, + "grad_norm": 4.052231788635254, + "learning_rate": 0.003017753334680855, + "loss": 7.6799, + "step": 705900 + }, + { + "epoch": 2.8760888450731192, + "grad_norm": 6.708811283111572, + "learning_rate": 0.0030172678206265525, + "loss": 7.7126, + "step": 706000 + }, + { + "epoch": 2.8760888450731192, + "eval_MaskedAccuracy": 0.5049096995201287, + "eval_loss": 1.618673324584961, + "eval_runtime": 149.8397, + "eval_samples_per_second": 423.626, + "eval_steps_per_second": 1.655, + "step": 706000 + }, + { + "epoch": 2.8764962230965008, + "grad_norm": 2.240532875061035, + "learning_rate": 0.003016782286319563, + "loss": 7.7333, + "step": 706100 + }, + { + "epoch": 2.8769036011198823, + "grad_norm": 7.174434661865234, + "learning_rate": 0.0030162967317790826, + "loss": 7.7014, + "step": 706200 + }, + { + "epoch": 2.877310979143264, + "grad_norm": 7.849184989929199, + "learning_rate": 0.003015811157024309, + "loss": 7.6906, + "step": 706300 + }, + { + "epoch": 2.8777183571666454, + "grad_norm": 3.4370803833007812, + "learning_rate": 0.0030153255620744357, + "loss": 7.705, + "step": 706400 + }, + { + "epoch": 2.8781257351900265, + "grad_norm": 3.1505165100097656, + "learning_rate": 0.003014839946948669, + "loss": 7.7099, + "step": 706500 + }, + { + "epoch": 2.878533113213408, + "grad_norm": 3.3950932025909424, + "learning_rate": 0.0030143543116662, + "loss": 7.7487, + "step": 706600 + }, + { + "epoch": 2.8789404912367895, + "grad_norm": 5.6509222984313965, + "learning_rate": 0.003013868656246233, + "loss": 7.7353, + "step": 706700 + }, + { + "epoch": 2.879347869260171, + "grad_norm": 4.527373790740967, + "learning_rate": 0.0030133829807079643, + "loss": 7.73, + "step": 706800 + }, + { + "epoch": 2.879755247283552, + "grad_norm": 5.177055835723877, + "learning_rate": 0.0030128972850705974, + "loss": 7.6888, + "step": 706900 + }, + { + "epoch": 2.8801626253069337, + "grad_norm": 6.955810546875, + "learning_rate": 0.0030124115693533357, + "loss": 7.6916, + "step": 707000 + }, + { + "epoch": 2.8801626253069337, + "eval_MaskedAccuracy": 0.5048821537289734, + "eval_loss": 1.6189310550689697, + "eval_runtime": 151.7201, + "eval_samples_per_second": 418.376, + "eval_steps_per_second": 1.635, + "step": 707000 + }, + { + "epoch": 2.880570003330315, + "grad_norm": 3.485917568206787, + "learning_rate": 0.003011925833575381, + "loss": 7.6974, + "step": 707100 + }, + { + "epoch": 2.8809773813536967, + "grad_norm": 5.959170818328857, + "learning_rate": 0.003011440077755936, + "loss": 7.7137, + "step": 707200 + }, + { + "epoch": 2.8813847593770783, + "grad_norm": 6.006525039672852, + "learning_rate": 0.003010954301914205, + "loss": 7.7283, + "step": 707300 + }, + { + "epoch": 2.88179213740046, + "grad_norm": 5.056791305541992, + "learning_rate": 0.003010468506069395, + "loss": 7.7301, + "step": 707400 + }, + { + "epoch": 2.8821995154238413, + "grad_norm": 3.9427425861358643, + "learning_rate": 0.0030099826902407097, + "loss": 7.6731, + "step": 707500 + }, + { + "epoch": 2.882606893447223, + "grad_norm": 11.1943998336792, + "learning_rate": 0.0030094968544473583, + "loss": 7.7174, + "step": 707600 + }, + { + "epoch": 2.883014271470604, + "grad_norm": 4.588169097900391, + "learning_rate": 0.003009010998708546, + "loss": 7.7047, + "step": 707700 + }, + { + "epoch": 2.8834216494939855, + "grad_norm": 8.373878479003906, + "learning_rate": 0.0030085251230434875, + "loss": 7.6711, + "step": 707800 + }, + { + "epoch": 2.883829027517367, + "grad_norm": 6.053554534912109, + "learning_rate": 0.003008039227471388, + "loss": 7.7386, + "step": 707900 + }, + { + "epoch": 2.8842364055407486, + "grad_norm": 6.039653778076172, + "learning_rate": 0.003007553312011456, + "loss": 7.6898, + "step": 708000 + }, + { + "epoch": 2.8842364055407486, + "eval_MaskedAccuracy": 0.5048192362638789, + "eval_loss": 1.631282925605774, + "eval_runtime": 149.2734, + "eval_samples_per_second": 425.233, + "eval_steps_per_second": 1.661, + "step": 708000 + }, + { + "epoch": 2.88464378356413, + "grad_norm": 6.733051776885986, + "learning_rate": 0.0030070673766829014, + "loss": 7.7225, + "step": 708100 + }, + { + "epoch": 2.885051161587511, + "grad_norm": 9.6881103515625, + "learning_rate": 0.0030065814215049385, + "loss": 7.6886, + "step": 708200 + }, + { + "epoch": 2.8854585396108927, + "grad_norm": 8.552311897277832, + "learning_rate": 0.0030060954464967817, + "loss": 7.7346, + "step": 708300 + }, + { + "epoch": 2.8858659176342742, + "grad_norm": 4.68634557723999, + "learning_rate": 0.0030056094516776417, + "loss": 7.6908, + "step": 708400 + }, + { + "epoch": 2.886273295657656, + "grad_norm": 3.3616514205932617, + "learning_rate": 0.003005123437066729, + "loss": 7.718, + "step": 708500 + }, + { + "epoch": 2.8866806736810373, + "grad_norm": 7.538613319396973, + "learning_rate": 0.003004637402683269, + "loss": 7.6986, + "step": 708600 + }, + { + "epoch": 2.887088051704419, + "grad_norm": 7.794840335845947, + "learning_rate": 0.0030041513485464634, + "loss": 7.7061, + "step": 708700 + }, + { + "epoch": 2.8874954297278004, + "grad_norm": 4.480391502380371, + "learning_rate": 0.0030036652746755384, + "loss": 7.685, + "step": 708800 + }, + { + "epoch": 2.887902807751182, + "grad_norm": 7.3373847007751465, + "learning_rate": 0.0030031791810897048, + "loss": 7.6954, + "step": 708900 + }, + { + "epoch": 2.888310185774563, + "grad_norm": 6.190589427947998, + "learning_rate": 0.0030026930678081877, + "loss": 7.7198, + "step": 709000 + }, + { + "epoch": 2.888310185774563, + "eval_MaskedAccuracy": 0.5039708538074354, + "eval_loss": 1.6308400630950928, + "eval_runtime": 152.1451, + "eval_samples_per_second": 417.207, + "eval_steps_per_second": 1.63, + "step": 709000 + }, + { + "epoch": 2.8887175637979445, + "grad_norm": 8.154083251953125, + "learning_rate": 0.003002206934850202, + "loss": 7.7233, + "step": 709100 + }, + { + "epoch": 2.889124941821326, + "grad_norm": 2.867553949356079, + "learning_rate": 0.0030017207822349663, + "loss": 7.6865, + "step": 709200 + }, + { + "epoch": 2.8895323198447076, + "grad_norm": 5.089413642883301, + "learning_rate": 0.0030012346099817003, + "loss": 7.7197, + "step": 709300 + }, + { + "epoch": 2.8899396978680887, + "grad_norm": 3.1376349925994873, + "learning_rate": 0.003000748418109628, + "loss": 7.7022, + "step": 709400 + }, + { + "epoch": 2.89034707589147, + "grad_norm": 3.189527988433838, + "learning_rate": 0.00300026220663797, + "loss": 7.7078, + "step": 709500 + }, + { + "epoch": 2.8907544539148518, + "grad_norm": 2.2897450923919678, + "learning_rate": 0.002999775975585948, + "loss": 7.7159, + "step": 709600 + }, + { + "epoch": 2.8911618319382333, + "grad_norm": 4.6041669845581055, + "learning_rate": 0.002999289724972783, + "loss": 7.7472, + "step": 709700 + }, + { + "epoch": 2.891569209961615, + "grad_norm": 4.715603351593018, + "learning_rate": 0.0029988034548177, + "loss": 7.7378, + "step": 709800 + }, + { + "epoch": 2.8919765879849963, + "grad_norm": 8.691732406616211, + "learning_rate": 0.0029983171651399325, + "loss": 7.6861, + "step": 709900 + }, + { + "epoch": 2.892383966008378, + "grad_norm": 2.508530378341675, + "learning_rate": 0.002997830855958693, + "loss": 7.7132, + "step": 710000 + }, + { + "epoch": 2.892383966008378, + "eval_MaskedAccuracy": 0.5058309831015211, + "eval_loss": 1.6112987995147705, + "eval_runtime": 150.3306, + "eval_samples_per_second": 422.243, + "eval_steps_per_second": 1.65, + "step": 710000 + }, + { + "epoch": 2.8927913440317594, + "grad_norm": 10.949094772338867, + "learning_rate": 0.0029973445272932116, + "loss": 7.7088, + "step": 710100 + }, + { + "epoch": 2.8931987220551405, + "grad_norm": 7.096635818481445, + "learning_rate": 0.0029968581791627196, + "loss": 7.7207, + "step": 710200 + }, + { + "epoch": 2.893606100078522, + "grad_norm": 6.554495334625244, + "learning_rate": 0.002996371811586448, + "loss": 7.694, + "step": 710300 + }, + { + "epoch": 2.8940134781019036, + "grad_norm": 4.622104644775391, + "learning_rate": 0.002995885424583617, + "loss": 7.7164, + "step": 710400 + }, + { + "epoch": 2.894420856125285, + "grad_norm": 4.340527057647705, + "learning_rate": 0.0029953990181734594, + "loss": 7.7106, + "step": 710500 + }, + { + "epoch": 2.8948282341486666, + "grad_norm": 4.099761962890625, + "learning_rate": 0.00299491259237521, + "loss": 7.7194, + "step": 710600 + }, + { + "epoch": 2.8952356121720477, + "grad_norm": 5.374207496643066, + "learning_rate": 0.0029944261472080913, + "loss": 7.7074, + "step": 710700 + }, + { + "epoch": 2.8956429901954293, + "grad_norm": 2.7654128074645996, + "learning_rate": 0.0029939396826913407, + "loss": 7.7106, + "step": 710800 + }, + { + "epoch": 2.896050368218811, + "grad_norm": 1.8438504934310913, + "learning_rate": 0.002993453198844185, + "loss": 7.7102, + "step": 710900 + }, + { + "epoch": 2.8964577462421923, + "grad_norm": 3.5522167682647705, + "learning_rate": 0.002992966695685864, + "loss": 7.722, + "step": 711000 + }, + { + "epoch": 2.8964577462421923, + "eval_MaskedAccuracy": 0.5050682224615328, + "eval_loss": 1.6337037086486816, + "eval_runtime": 162.347, + "eval_samples_per_second": 390.99, + "eval_steps_per_second": 1.528, + "step": 711000 + }, + { + "epoch": 2.896865124265574, + "grad_norm": 7.992817401885986, + "learning_rate": 0.0029924801732356104, + "loss": 7.6995, + "step": 711100 + }, + { + "epoch": 2.8972725022889554, + "grad_norm": 2.883950710296631, + "learning_rate": 0.002991993631512658, + "loss": 7.7226, + "step": 711200 + }, + { + "epoch": 2.897679880312337, + "grad_norm": 6.727240085601807, + "learning_rate": 0.002991507070536246, + "loss": 7.7159, + "step": 711300 + }, + { + "epoch": 2.8980872583357185, + "grad_norm": 7.277162551879883, + "learning_rate": 0.002991020490325604, + "loss": 7.739, + "step": 711400 + }, + { + "epoch": 2.8984946363590995, + "grad_norm": 4.6611151695251465, + "learning_rate": 0.0029905338908999776, + "loss": 7.7172, + "step": 711500 + }, + { + "epoch": 2.898902014382481, + "grad_norm": 2.6746129989624023, + "learning_rate": 0.002990047272278596, + "loss": 7.7064, + "step": 711600 + }, + { + "epoch": 2.8993093924058626, + "grad_norm": 2.9171528816223145, + "learning_rate": 0.0029895606344807007, + "loss": 7.7234, + "step": 711700 + }, + { + "epoch": 2.899716770429244, + "grad_norm": 5.212060928344727, + "learning_rate": 0.002989073977525534, + "loss": 7.7135, + "step": 711800 + }, + { + "epoch": 2.9001241484526252, + "grad_norm": 3.189424753189087, + "learning_rate": 0.002988587301432333, + "loss": 7.7212, + "step": 711900 + }, + { + "epoch": 2.9005315264760068, + "grad_norm": 3.9328625202178955, + "learning_rate": 0.002988100606220339, + "loss": 7.6914, + "step": 712000 + }, + { + "epoch": 2.9005315264760068, + "eval_MaskedAccuracy": 0.5044705051542822, + "eval_loss": 1.6272234916687012, + "eval_runtime": 149.5052, + "eval_samples_per_second": 424.574, + "eval_steps_per_second": 1.659, + "step": 712000 + }, + { + "epoch": 2.9009389044993883, + "grad_norm": 6.704166412353516, + "learning_rate": 0.0029876138919087947, + "loss": 7.6884, + "step": 712100 + }, + { + "epoch": 2.90134628252277, + "grad_norm": 5.157271862030029, + "learning_rate": 0.0029871271585169444, + "loss": 7.6892, + "step": 712200 + }, + { + "epoch": 2.9017536605461514, + "grad_norm": 4.0318284034729, + "learning_rate": 0.0029866404060640245, + "loss": 7.7036, + "step": 712300 + }, + { + "epoch": 2.902161038569533, + "grad_norm": 5.164078235626221, + "learning_rate": 0.002986153634569284, + "loss": 7.7161, + "step": 712400 + }, + { + "epoch": 2.9025684165929144, + "grad_norm": 2.2458889484405518, + "learning_rate": 0.0029856668440519695, + "loss": 7.6755, + "step": 712500 + }, + { + "epoch": 2.902975794616296, + "grad_norm": 3.4589650630950928, + "learning_rate": 0.002985180034531323, + "loss": 7.7568, + "step": 712600 + }, + { + "epoch": 2.903383172639677, + "grad_norm": 6.620082378387451, + "learning_rate": 0.0029846932060265943, + "loss": 7.7145, + "step": 712700 + }, + { + "epoch": 2.9037905506630586, + "grad_norm": 3.3758513927459717, + "learning_rate": 0.002984206358557023, + "loss": 7.6961, + "step": 712800 + }, + { + "epoch": 2.90419792868644, + "grad_norm": 2.730759859085083, + "learning_rate": 0.0029837194921418647, + "loss": 7.7228, + "step": 712900 + }, + { + "epoch": 2.9046053067098216, + "grad_norm": 4.694217205047607, + "learning_rate": 0.0029832326068003623, + "loss": 7.7067, + "step": 713000 + }, + { + "epoch": 2.9046053067098216, + "eval_MaskedAccuracy": 0.5045022081028918, + "eval_loss": 1.6237282752990723, + "eval_runtime": 149.0646, + "eval_samples_per_second": 425.829, + "eval_steps_per_second": 1.664, + "step": 713000 + }, + { + "epoch": 2.905012684733203, + "grad_norm": 3.580455780029297, + "learning_rate": 0.0029827457025517696, + "loss": 7.7108, + "step": 713100 + }, + { + "epoch": 2.9054200627565843, + "grad_norm": 4.593750953674316, + "learning_rate": 0.0029822587794153315, + "loss": 7.7094, + "step": 713200 + }, + { + "epoch": 2.905827440779966, + "grad_norm": 8.8233060836792, + "learning_rate": 0.0029817718374103006, + "loss": 7.6925, + "step": 713300 + }, + { + "epoch": 2.9062348188033473, + "grad_norm": 5.019341945648193, + "learning_rate": 0.0029812848765559327, + "loss": 7.7098, + "step": 713400 + }, + { + "epoch": 2.906642196826729, + "grad_norm": 16.843360900878906, + "learning_rate": 0.002980797896871476, + "loss": 7.724, + "step": 713500 + }, + { + "epoch": 2.9070495748501104, + "grad_norm": 4.91794490814209, + "learning_rate": 0.0029803108983761798, + "loss": 7.7207, + "step": 713600 + }, + { + "epoch": 2.907456952873492, + "grad_norm": 4.787856101989746, + "learning_rate": 0.002979823881089304, + "loss": 7.7097, + "step": 713700 + }, + { + "epoch": 2.9078643308968735, + "grad_norm": 5.718859672546387, + "learning_rate": 0.0029793368450301014, + "loss": 7.6908, + "step": 713800 + }, + { + "epoch": 2.908271708920255, + "grad_norm": 3.3372747898101807, + "learning_rate": 0.002978849790217824, + "loss": 7.7173, + "step": 713900 + }, + { + "epoch": 2.908679086943636, + "grad_norm": 12.192091941833496, + "learning_rate": 0.0029783627166717315, + "loss": 7.6694, + "step": 714000 + }, + { + "epoch": 2.908679086943636, + "eval_MaskedAccuracy": 0.5045802707806957, + "eval_loss": 1.626402735710144, + "eval_runtime": 149.4493, + "eval_samples_per_second": 424.733, + "eval_steps_per_second": 1.659, + "step": 714000 + }, + { + "epoch": 2.9090864649670176, + "grad_norm": 9.249349594116211, + "learning_rate": 0.002977875624411079, + "loss": 7.7004, + "step": 714100 + }, + { + "epoch": 2.909493842990399, + "grad_norm": 2.8941810131073, + "learning_rate": 0.002977388513455125, + "loss": 7.6865, + "step": 714200 + }, + { + "epoch": 2.9099012210137807, + "grad_norm": 2.450652837753296, + "learning_rate": 0.002976901383823124, + "loss": 7.6643, + "step": 714300 + }, + { + "epoch": 2.9103085990371618, + "grad_norm": 3.3011999130249023, + "learning_rate": 0.0029764142355343383, + "loss": 7.7161, + "step": 714400 + }, + { + "epoch": 2.9107159770605433, + "grad_norm": 6.105404853820801, + "learning_rate": 0.0029759270686080252, + "loss": 7.7055, + "step": 714500 + }, + { + "epoch": 2.911123355083925, + "grad_norm": 4.215657711029053, + "learning_rate": 0.0029754398830634447, + "loss": 7.7178, + "step": 714600 + }, + { + "epoch": 2.9115307331073064, + "grad_norm": 4.54106330871582, + "learning_rate": 0.0029749526789198686, + "loss": 7.6772, + "step": 714700 + }, + { + "epoch": 2.911938111130688, + "grad_norm": 9.177712440490723, + "learning_rate": 0.002974465456196543, + "loss": 7.6919, + "step": 714800 + }, + { + "epoch": 2.9123454891540694, + "grad_norm": 5.482707500457764, + "learning_rate": 0.0029739782149127395, + "loss": 7.7002, + "step": 714900 + }, + { + "epoch": 2.912752867177451, + "grad_norm": 8.724689483642578, + "learning_rate": 0.0029734909550877156, + "loss": 7.6898, + "step": 715000 + }, + { + "epoch": 2.912752867177451, + "eval_MaskedAccuracy": 0.5038898235956852, + "eval_loss": 1.6259732246398926, + "eval_runtime": 149.5821, + "eval_samples_per_second": 424.356, + "eval_steps_per_second": 1.658, + "step": 715000 + }, + { + "epoch": 2.9131602452008325, + "grad_norm": 2.1611859798431396, + "learning_rate": 0.0029730036767407412, + "loss": 7.729, + "step": 715100 + }, + { + "epoch": 2.9135676232242136, + "grad_norm": 5.355795860290527, + "learning_rate": 0.0029725163798910797, + "loss": 7.6732, + "step": 715200 + }, + { + "epoch": 2.913975001247595, + "grad_norm": 6.048969745635986, + "learning_rate": 0.0029720290645579945, + "loss": 7.7244, + "step": 715300 + }, + { + "epoch": 2.9143823792709767, + "grad_norm": 4.974484920501709, + "learning_rate": 0.00297154173076075, + "loss": 7.7006, + "step": 715400 + }, + { + "epoch": 2.914789757294358, + "grad_norm": 6.803608417510986, + "learning_rate": 0.0029710543785186174, + "loss": 7.7058, + "step": 715500 + }, + { + "epoch": 2.9151971353177397, + "grad_norm": 5.492311000823975, + "learning_rate": 0.002970567007850859, + "loss": 7.665, + "step": 715600 + }, + { + "epoch": 2.915604513341121, + "grad_norm": 3.787933349609375, + "learning_rate": 0.0029700796187767473, + "loss": 7.7027, + "step": 715700 + }, + { + "epoch": 2.9160118913645023, + "grad_norm": 4.391969203948975, + "learning_rate": 0.0029695922113155524, + "loss": 7.7234, + "step": 715800 + }, + { + "epoch": 2.916419269387884, + "grad_norm": 6.197183609008789, + "learning_rate": 0.0029691047854865445, + "loss": 7.7195, + "step": 715900 + }, + { + "epoch": 2.9168266474112654, + "grad_norm": 4.438564777374268, + "learning_rate": 0.002968617341308992, + "loss": 7.6584, + "step": 716000 + }, + { + "epoch": 2.9168266474112654, + "eval_MaskedAccuracy": 0.5050919345665351, + "eval_loss": 1.617082953453064, + "eval_runtime": 149.1154, + "eval_samples_per_second": 425.684, + "eval_steps_per_second": 1.663, + "step": 716000 + }, + { + "epoch": 2.917234025434647, + "grad_norm": 3.1807377338409424, + "learning_rate": 0.002968129878802167, + "loss": 7.7373, + "step": 716100 + }, + { + "epoch": 2.9176414034580285, + "grad_norm": 3.906308889389038, + "learning_rate": 0.0029676423979853378, + "loss": 7.6929, + "step": 716200 + }, + { + "epoch": 2.91804878148141, + "grad_norm": 5.037497520446777, + "learning_rate": 0.0029671548988777817, + "loss": 7.6899, + "step": 716300 + }, + { + "epoch": 2.9184561595047915, + "grad_norm": 11.539466857910156, + "learning_rate": 0.0029666673814987723, + "loss": 7.7106, + "step": 716400 + }, + { + "epoch": 2.9188635375281726, + "grad_norm": 4.352266788482666, + "learning_rate": 0.002966179845867578, + "loss": 7.7163, + "step": 716500 + }, + { + "epoch": 2.919270915551554, + "grad_norm": 2.791508913040161, + "learning_rate": 0.0029656922920034802, + "loss": 7.7151, + "step": 716600 + }, + { + "epoch": 2.9196782935749357, + "grad_norm": 2.868605852127075, + "learning_rate": 0.0029652047199257544, + "loss": 7.6685, + "step": 716700 + }, + { + "epoch": 2.9200856715983172, + "grad_norm": 2.7636594772338867, + "learning_rate": 0.0029647171296536736, + "loss": 7.7198, + "step": 716800 + }, + { + "epoch": 2.9204930496216983, + "grad_norm": 10.295610427856445, + "learning_rate": 0.0029642295212065142, + "loss": 7.7159, + "step": 716900 + }, + { + "epoch": 2.92090042764508, + "grad_norm": 4.25411319732666, + "learning_rate": 0.0029637418946035572, + "loss": 7.7139, + "step": 717000 + }, + { + "epoch": 2.92090042764508, + "eval_MaskedAccuracy": 0.5046538472804063, + "eval_loss": 1.63292396068573, + "eval_runtime": 149.3438, + "eval_samples_per_second": 425.033, + "eval_steps_per_second": 1.661, + "step": 717000 + }, + { + "epoch": 2.9213078056684614, + "grad_norm": 5.864894390106201, + "learning_rate": 0.002963254249864079, + "loss": 7.7272, + "step": 717100 + }, + { + "epoch": 2.921715183691843, + "grad_norm": 8.386134147644043, + "learning_rate": 0.0029627665870073597, + "loss": 7.7314, + "step": 717200 + }, + { + "epoch": 2.9221225617152244, + "grad_norm": 4.239066123962402, + "learning_rate": 0.0029622789060526787, + "loss": 7.7046, + "step": 717300 + }, + { + "epoch": 2.922529939738606, + "grad_norm": 3.709198236465454, + "learning_rate": 0.0029617912070193154, + "loss": 7.6977, + "step": 717400 + }, + { + "epoch": 2.9229373177619875, + "grad_norm": 3.645662784576416, + "learning_rate": 0.0029613034899265514, + "loss": 7.7026, + "step": 717500 + }, + { + "epoch": 2.923344695785369, + "grad_norm": 3.1036386489868164, + "learning_rate": 0.0029608157547936684, + "loss": 7.6985, + "step": 717600 + }, + { + "epoch": 2.92375207380875, + "grad_norm": 3.0938498973846436, + "learning_rate": 0.002960328001639957, + "loss": 7.6977, + "step": 717700 + }, + { + "epoch": 2.9241594518321317, + "grad_norm": 6.18734073638916, + "learning_rate": 0.002959840230484692, + "loss": 7.7267, + "step": 717800 + }, + { + "epoch": 2.924566829855513, + "grad_norm": 4.151284694671631, + "learning_rate": 0.0029593524413471612, + "loss": 7.6832, + "step": 717900 + }, + { + "epoch": 2.9249742078788947, + "grad_norm": 4.311347961425781, + "learning_rate": 0.0029588646342466485, + "loss": 7.6803, + "step": 718000 + }, + { + "epoch": 2.9249742078788947, + "eval_MaskedAccuracy": 0.5049609116665331, + "eval_loss": 1.6250135898590088, + "eval_runtime": 152.4868, + "eval_samples_per_second": 416.272, + "eval_steps_per_second": 1.626, + "step": 718000 + }, + { + "epoch": 2.9253815859022763, + "grad_norm": 2.8787670135498047, + "learning_rate": 0.0029583768092024364, + "loss": 7.6815, + "step": 718100 + }, + { + "epoch": 2.9257889639256573, + "grad_norm": 5.31884241104126, + "learning_rate": 0.0029578889662338183, + "loss": 7.7168, + "step": 718200 + }, + { + "epoch": 2.926196341949039, + "grad_norm": 12.058183670043945, + "learning_rate": 0.002957401105360075, + "loss": 7.6968, + "step": 718300 + }, + { + "epoch": 2.9266037199724204, + "grad_norm": 6.33865213394165, + "learning_rate": 0.002956913226600497, + "loss": 7.703, + "step": 718400 + }, + { + "epoch": 2.927011097995802, + "grad_norm": 3.841931104660034, + "learning_rate": 0.002956425329974373, + "loss": 7.7032, + "step": 718500 + }, + { + "epoch": 2.9274184760191835, + "grad_norm": 5.193235397338867, + "learning_rate": 0.002955937415500991, + "loss": 7.6898, + "step": 718600 + }, + { + "epoch": 2.927825854042565, + "grad_norm": 4.908641815185547, + "learning_rate": 0.0029554494831996406, + "loss": 7.7301, + "step": 718700 + }, + { + "epoch": 2.9282332320659465, + "grad_norm": 2.531991720199585, + "learning_rate": 0.0029549615330896092, + "loss": 7.7017, + "step": 718800 + }, + { + "epoch": 2.928640610089328, + "grad_norm": 5.351516246795654, + "learning_rate": 0.0029544735651901946, + "loss": 7.6722, + "step": 718900 + }, + { + "epoch": 2.929047988112709, + "grad_norm": 6.742931842803955, + "learning_rate": 0.0029539855795206868, + "loss": 7.6868, + "step": 719000 + }, + { + "epoch": 2.929047988112709, + "eval_MaskedAccuracy": 0.5054865291604324, + "eval_loss": 1.617951512336731, + "eval_runtime": 149.5237, + "eval_samples_per_second": 424.521, + "eval_steps_per_second": 1.659, + "step": 719000 + }, + { + "epoch": 2.9294553661360907, + "grad_norm": 6.731724739074707, + "learning_rate": 0.0029534975761003764, + "loss": 7.7216, + "step": 719100 + }, + { + "epoch": 2.9298627441594722, + "grad_norm": 9.921653747558594, + "learning_rate": 0.0029530095549485575, + "loss": 7.6906, + "step": 719200 + }, + { + "epoch": 2.9302701221828538, + "grad_norm": 4.2861409187316895, + "learning_rate": 0.002952521516084528, + "loss": 7.7046, + "step": 719300 + }, + { + "epoch": 2.930677500206235, + "grad_norm": 4.969663143157959, + "learning_rate": 0.0029520334595275776, + "loss": 7.6951, + "step": 719400 + }, + { + "epoch": 2.9310848782296164, + "grad_norm": 4.280135154724121, + "learning_rate": 0.0029515453852970034, + "loss": 7.7229, + "step": 719500 + }, + { + "epoch": 2.931492256252998, + "grad_norm": 5.155004978179932, + "learning_rate": 0.0029510572934121015, + "loss": 7.7359, + "step": 719600 + }, + { + "epoch": 2.9318996342763795, + "grad_norm": 7.241194248199463, + "learning_rate": 0.0029505691838921672, + "loss": 7.7205, + "step": 719700 + }, + { + "epoch": 2.932307012299761, + "grad_norm": 3.899705410003662, + "learning_rate": 0.002950081056756502, + "loss": 7.6606, + "step": 719800 + }, + { + "epoch": 2.9327143903231425, + "grad_norm": 4.1715264320373535, + "learning_rate": 0.002949592912024402, + "loss": 7.6871, + "step": 719900 + }, + { + "epoch": 2.933121768346524, + "grad_norm": 3.1270625591278076, + "learning_rate": 0.0029491047497151707, + "loss": 7.7235, + "step": 720000 + }, + { + "epoch": 2.933121768346524, + "eval_MaskedAccuracy": 0.5054259339308764, + "eval_loss": 1.6230510473251343, + "eval_runtime": 150.2205, + "eval_samples_per_second": 422.552, + "eval_steps_per_second": 1.651, + "step": 720000 + }, + { + "epoch": 2.9335291463699056, + "grad_norm": 8.253684997558594, + "learning_rate": 0.0029486165698481013, + "loss": 7.7145, + "step": 720100 + }, + { + "epoch": 2.9339365243932867, + "grad_norm": 10.558189392089844, + "learning_rate": 0.0029481283724424948, + "loss": 7.6804, + "step": 720200 + }, + { + "epoch": 2.934343902416668, + "grad_norm": 8.32635498046875, + "learning_rate": 0.0029476401575176544, + "loss": 7.7117, + "step": 720300 + }, + { + "epoch": 2.9347512804400497, + "grad_norm": 6.75897741317749, + "learning_rate": 0.0029471519250928785, + "loss": 7.7012, + "step": 720400 + }, + { + "epoch": 2.9351586584634313, + "grad_norm": 3.6416022777557373, + "learning_rate": 0.0029466636751874758, + "loss": 7.7006, + "step": 720500 + }, + { + "epoch": 2.935566036486813, + "grad_norm": 8.695493698120117, + "learning_rate": 0.002946175407820745, + "loss": 7.7098, + "step": 720600 + }, + { + "epoch": 2.935973414510194, + "grad_norm": 10.675243377685547, + "learning_rate": 0.002945687123011988, + "loss": 7.721, + "step": 720700 + }, + { + "epoch": 2.9363807925335754, + "grad_norm": 4.8481574058532715, + "learning_rate": 0.0029451988207805134, + "loss": 7.6941, + "step": 720800 + }, + { + "epoch": 2.936788170556957, + "grad_norm": 8.68506145477295, + "learning_rate": 0.002944710501145626, + "loss": 7.6994, + "step": 720900 + }, + { + "epoch": 2.9371955485803385, + "grad_norm": 7.147097587585449, + "learning_rate": 0.0029442221641266305, + "loss": 7.6612, + "step": 721000 + }, + { + "epoch": 2.9371955485803385, + "eval_MaskedAccuracy": 0.5051493795452836, + "eval_loss": 1.6206094026565552, + "eval_runtime": 149.6584, + "eval_samples_per_second": 424.139, + "eval_steps_per_second": 1.657, + "step": 721000 + }, + { + "epoch": 2.93760292660372, + "grad_norm": 9.032767295837402, + "learning_rate": 0.0029437338097428327, + "loss": 7.693, + "step": 721100 + }, + { + "epoch": 2.9380103046271016, + "grad_norm": 4.130688667297363, + "learning_rate": 0.0029432454380135427, + "loss": 7.6987, + "step": 721200 + }, + { + "epoch": 2.938417682650483, + "grad_norm": 6.816453456878662, + "learning_rate": 0.002942757048958063, + "loss": 7.6833, + "step": 721300 + }, + { + "epoch": 2.9388250606738646, + "grad_norm": 3.935292959213257, + "learning_rate": 0.0029422686425957093, + "loss": 7.6942, + "step": 721400 + }, + { + "epoch": 2.9392324386972457, + "grad_norm": 3.9741551876068115, + "learning_rate": 0.0029417802189457888, + "loss": 7.7356, + "step": 721500 + }, + { + "epoch": 2.9396398167206272, + "grad_norm": 3.3311500549316406, + "learning_rate": 0.002941291778027609, + "loss": 7.6762, + "step": 721600 + }, + { + "epoch": 2.9400471947440088, + "grad_norm": 4.277311325073242, + "learning_rate": 0.002940803319860483, + "loss": 7.7096, + "step": 721700 + }, + { + "epoch": 2.9404545727673903, + "grad_norm": 3.4445302486419678, + "learning_rate": 0.0029403148444637167, + "loss": 7.6713, + "step": 721800 + }, + { + "epoch": 2.9408619507907714, + "grad_norm": 8.501815795898438, + "learning_rate": 0.002939826351856633, + "loss": 7.7114, + "step": 721900 + }, + { + "epoch": 2.941269328814153, + "grad_norm": 3.7861318588256836, + "learning_rate": 0.002939337842058535, + "loss": 7.6802, + "step": 722000 + }, + { + "epoch": 2.941269328814153, + "eval_MaskedAccuracy": 0.5052984105124683, + "eval_loss": 1.6163740158081055, + "eval_runtime": 152.9085, + "eval_samples_per_second": 415.124, + "eval_steps_per_second": 1.622, + "step": 722000 + }, + { + "epoch": 2.9416767068375345, + "grad_norm": 5.100472450256348, + "learning_rate": 0.002938849315088736, + "loss": 7.6613, + "step": 722100 + }, + { + "epoch": 2.942084084860916, + "grad_norm": 7.634149074554443, + "learning_rate": 0.002938360770966554, + "loss": 7.6993, + "step": 722200 + }, + { + "epoch": 2.9424914628842975, + "grad_norm": 3.8841817378997803, + "learning_rate": 0.002937872209711308, + "loss": 7.6613, + "step": 722300 + }, + { + "epoch": 2.942898840907679, + "grad_norm": 7.857929229736328, + "learning_rate": 0.0029373836313423027, + "loss": 7.7037, + "step": 722400 + }, + { + "epoch": 2.9433062189310606, + "grad_norm": 5.007296562194824, + "learning_rate": 0.002936895035878861, + "loss": 7.7167, + "step": 722500 + }, + { + "epoch": 2.943713596954442, + "grad_norm": 5.84328031539917, + "learning_rate": 0.0029364064233403054, + "loss": 7.6694, + "step": 722600 + }, + { + "epoch": 2.944120974977823, + "grad_norm": 4.893817901611328, + "learning_rate": 0.0029359177937459432, + "loss": 7.7198, + "step": 722700 + }, + { + "epoch": 2.9445283530012047, + "grad_norm": 4.34383487701416, + "learning_rate": 0.002935429147115091, + "loss": 7.7282, + "step": 722800 + }, + { + "epoch": 2.9449357310245863, + "grad_norm": 3.5883665084838867, + "learning_rate": 0.002934940483467072, + "loss": 7.7415, + "step": 722900 + }, + { + "epoch": 2.945343109047968, + "grad_norm": 8.807660102844238, + "learning_rate": 0.0029344518028212078, + "loss": 7.7001, + "step": 723000 + }, + { + "epoch": 2.945343109047968, + "eval_MaskedAccuracy": 0.5059133828639059, + "eval_loss": 1.615801215171814, + "eval_runtime": 151.4336, + "eval_samples_per_second": 419.167, + "eval_steps_per_second": 1.638, + "step": 723000 + }, + { + "epoch": 2.9457504870713493, + "grad_norm": 6.665395259857178, + "learning_rate": 0.002933963105196817, + "loss": 7.6794, + "step": 723100 + }, + { + "epoch": 2.9461578650947304, + "grad_norm": 7.352250099182129, + "learning_rate": 0.0029334743906132203, + "loss": 7.721, + "step": 723200 + }, + { + "epoch": 2.946565243118112, + "grad_norm": 4.369938373565674, + "learning_rate": 0.0029329856590897424, + "loss": 7.6953, + "step": 723300 + }, + { + "epoch": 2.9469726211414935, + "grad_norm": 4.498866081237793, + "learning_rate": 0.0029324969106456996, + "loss": 7.7072, + "step": 723400 + }, + { + "epoch": 2.947379999164875, + "grad_norm": 14.836862564086914, + "learning_rate": 0.0029320081453004187, + "loss": 7.664, + "step": 723500 + }, + { + "epoch": 2.9477873771882566, + "grad_norm": 5.324207782745361, + "learning_rate": 0.0029315193630732213, + "loss": 7.6951, + "step": 723600 + }, + { + "epoch": 2.948194755211638, + "grad_norm": 5.6641716957092285, + "learning_rate": 0.0029310305639834336, + "loss": 7.7168, + "step": 723700 + }, + { + "epoch": 2.9486021332350196, + "grad_norm": 5.0012102127075195, + "learning_rate": 0.002930541748050377, + "loss": 7.7001, + "step": 723800 + }, + { + "epoch": 2.949009511258401, + "grad_norm": 9.163387298583984, + "learning_rate": 0.0029300529152933813, + "loss": 7.6949, + "step": 723900 + }, + { + "epoch": 2.9494168892817823, + "grad_norm": 4.676816463470459, + "learning_rate": 0.0029295640657317663, + "loss": 7.6794, + "step": 724000 + }, + { + "epoch": 2.9494168892817823, + "eval_MaskedAccuracy": 0.5046621195508941, + "eval_loss": 1.620750069618225, + "eval_runtime": 148.7728, + "eval_samples_per_second": 426.664, + "eval_steps_per_second": 1.667, + "step": 724000 + }, + { + "epoch": 2.949824267305164, + "grad_norm": 3.069194793701172, + "learning_rate": 0.002929075199384863, + "loss": 7.7058, + "step": 724100 + }, + { + "epoch": 2.9502316453285453, + "grad_norm": 4.935457229614258, + "learning_rate": 0.0029285863162720025, + "loss": 7.7179, + "step": 724200 + }, + { + "epoch": 2.950639023351927, + "grad_norm": 5.500514984130859, + "learning_rate": 0.00292809741641251, + "loss": 7.714, + "step": 724300 + }, + { + "epoch": 2.951046401375308, + "grad_norm": 3.5026183128356934, + "learning_rate": 0.0029276084998257155, + "loss": 7.703, + "step": 724400 + }, + { + "epoch": 2.9514537793986895, + "grad_norm": 4.186327934265137, + "learning_rate": 0.002927119566530943, + "loss": 7.689, + "step": 724500 + }, + { + "epoch": 2.951861157422071, + "grad_norm": 8.100483894348145, + "learning_rate": 0.002926630616547526, + "loss": 7.7152, + "step": 724600 + }, + { + "epoch": 2.9522685354454525, + "grad_norm": 2.308488607406616, + "learning_rate": 0.002926141649894795, + "loss": 7.6922, + "step": 724700 + }, + { + "epoch": 2.952675913468834, + "grad_norm": 8.073177337646484, + "learning_rate": 0.002925652666592082, + "loss": 7.7232, + "step": 724800 + }, + { + "epoch": 2.9530832914922156, + "grad_norm": 5.914341926574707, + "learning_rate": 0.0029251636666587227, + "loss": 7.6996, + "step": 724900 + }, + { + "epoch": 2.953490669515597, + "grad_norm": 2.3362417221069336, + "learning_rate": 0.002924674650114046, + "loss": 7.7011, + "step": 725000 + }, + { + "epoch": 2.953490669515597, + "eval_MaskedAccuracy": 0.5049072191625641, + "eval_loss": 1.63347589969635, + "eval_runtime": 149.5143, + "eval_samples_per_second": 424.548, + "eval_steps_per_second": 1.659, + "step": 725000 + }, + { + "epoch": 2.9538980475389787, + "grad_norm": 3.9333150386810303, + "learning_rate": 0.0029241856169773817, + "loss": 7.697, + "step": 725100 + }, + { + "epoch": 2.9543054255623598, + "grad_norm": 4.176628589630127, + "learning_rate": 0.0029236965672680723, + "loss": 7.712, + "step": 725200 + }, + { + "epoch": 2.9547128035857413, + "grad_norm": 4.2781596183776855, + "learning_rate": 0.002923207501005447, + "loss": 7.7064, + "step": 725300 + }, + { + "epoch": 2.955120181609123, + "grad_norm": 7.252349853515625, + "learning_rate": 0.002922718418208844, + "loss": 7.6847, + "step": 725400 + }, + { + "epoch": 2.9555275596325044, + "grad_norm": 3.614680528640747, + "learning_rate": 0.0029222293188975934, + "loss": 7.721, + "step": 725500 + }, + { + "epoch": 2.955934937655886, + "grad_norm": 5.9780378341674805, + "learning_rate": 0.0029217402030910404, + "loss": 7.7173, + "step": 725600 + }, + { + "epoch": 2.956342315679267, + "grad_norm": 3.651726722717285, + "learning_rate": 0.0029212510708085198, + "loss": 7.6847, + "step": 725700 + }, + { + "epoch": 2.9567496937026485, + "grad_norm": 2.0712854862213135, + "learning_rate": 0.0029207619220693666, + "loss": 7.6922, + "step": 725800 + }, + { + "epoch": 2.95715707172603, + "grad_norm": 7.380396366119385, + "learning_rate": 0.0029202727568929197, + "loss": 7.7136, + "step": 725900 + }, + { + "epoch": 2.9575644497494116, + "grad_norm": 5.288581371307373, + "learning_rate": 0.002919783575298519, + "loss": 7.6962, + "step": 726000 + }, + { + "epoch": 2.9575644497494116, + "eval_MaskedAccuracy": 0.5056228732656415, + "eval_loss": 1.612118124961853, + "eval_runtime": 149.9322, + "eval_samples_per_second": 423.365, + "eval_steps_per_second": 1.654, + "step": 726000 + }, + { + "epoch": 2.957971827772793, + "grad_norm": 6.535576343536377, + "learning_rate": 0.00291929437730551, + "loss": 7.7225, + "step": 726100 + }, + { + "epoch": 2.9583792057961746, + "grad_norm": 4.8185343742370605, + "learning_rate": 0.0029188051629332273, + "loss": 7.6954, + "step": 726200 + }, + { + "epoch": 2.958786583819556, + "grad_norm": 5.420071601867676, + "learning_rate": 0.0029183159322010107, + "loss": 7.6909, + "step": 726300 + }, + { + "epoch": 2.9591939618429377, + "grad_norm": 5.946316719055176, + "learning_rate": 0.002917826685128204, + "loss": 7.6928, + "step": 726400 + }, + { + "epoch": 2.959601339866319, + "grad_norm": 3.5505311489105225, + "learning_rate": 0.0029173374217341497, + "loss": 7.6879, + "step": 726500 + }, + { + "epoch": 2.9600087178897003, + "grad_norm": 5.27262544631958, + "learning_rate": 0.002916848142038198, + "loss": 7.702, + "step": 726600 + }, + { + "epoch": 2.960416095913082, + "grad_norm": 4.948713302612305, + "learning_rate": 0.0029163588460596837, + "loss": 7.6924, + "step": 726700 + }, + { + "epoch": 2.9608234739364634, + "grad_norm": 4.281991004943848, + "learning_rate": 0.0029158695338179523, + "loss": 7.6952, + "step": 726800 + }, + { + "epoch": 2.9612308519598445, + "grad_norm": 2.798652410507202, + "learning_rate": 0.0029153802053323514, + "loss": 7.6928, + "step": 726900 + }, + { + "epoch": 2.961638229983226, + "grad_norm": 4.923476219177246, + "learning_rate": 0.002914890860622227, + "loss": 7.6702, + "step": 727000 + }, + { + "epoch": 2.961638229983226, + "eval_MaskedAccuracy": 0.5045890129922889, + "eval_loss": 1.6305315494537354, + "eval_runtime": 149.5062, + "eval_samples_per_second": 424.571, + "eval_steps_per_second": 1.659, + "step": 727000 + }, + { + "epoch": 2.9620456080066075, + "grad_norm": 5.6135101318359375, + "learning_rate": 0.002914401499706924, + "loss": 7.6997, + "step": 727100 + }, + { + "epoch": 2.962452986029989, + "grad_norm": 4.787189483642578, + "learning_rate": 0.002913912122605788, + "loss": 7.7305, + "step": 727200 + }, + { + "epoch": 2.9628603640533706, + "grad_norm": 4.7185282707214355, + "learning_rate": 0.002913422729338171, + "loss": 7.6821, + "step": 727300 + }, + { + "epoch": 2.963267742076752, + "grad_norm": 5.087222099304199, + "learning_rate": 0.002912933319923419, + "loss": 7.7056, + "step": 727400 + }, + { + "epoch": 2.9636751201001337, + "grad_norm": 5.190913200378418, + "learning_rate": 0.0029124438943808804, + "loss": 7.7075, + "step": 727500 + }, + { + "epoch": 2.964082498123515, + "grad_norm": 4.4287495613098145, + "learning_rate": 0.002911954452729903, + "loss": 7.7054, + "step": 727600 + }, + { + "epoch": 2.9644898761468963, + "grad_norm": 6.311236381530762, + "learning_rate": 0.0029114649949898395, + "loss": 7.6822, + "step": 727700 + }, + { + "epoch": 2.964897254170278, + "grad_norm": 3.371539831161499, + "learning_rate": 0.0029109755211800415, + "loss": 7.6709, + "step": 727800 + }, + { + "epoch": 2.9653046321936594, + "grad_norm": 3.391474723815918, + "learning_rate": 0.0029104860313198594, + "loss": 7.6815, + "step": 727900 + }, + { + "epoch": 2.965712010217041, + "grad_norm": 5.109682083129883, + "learning_rate": 0.002909996525428648, + "loss": 7.6944, + "step": 728000 + }, + { + "epoch": 2.965712010217041, + "eval_MaskedAccuracy": 0.5048527927644045, + "eval_loss": 1.6240174770355225, + "eval_runtime": 151.7464, + "eval_samples_per_second": 418.303, + "eval_steps_per_second": 1.634, + "step": 728000 + }, + { + "epoch": 2.9661193882404224, + "grad_norm": 8.26157283782959, + "learning_rate": 0.0029095070035257565, + "loss": 7.6758, + "step": 728100 + }, + { + "epoch": 2.9665267662638035, + "grad_norm": 4.0670552253723145, + "learning_rate": 0.0029090174656305414, + "loss": 7.7192, + "step": 728200 + }, + { + "epoch": 2.966934144287185, + "grad_norm": 6.780729293823242, + "learning_rate": 0.0029085279117623533, + "loss": 7.7318, + "step": 728300 + }, + { + "epoch": 2.9673415223105666, + "grad_norm": 7.467378616333008, + "learning_rate": 0.002908038341940552, + "loss": 7.6909, + "step": 728400 + }, + { + "epoch": 2.967748900333948, + "grad_norm": 3.635274648666382, + "learning_rate": 0.002907548756184487, + "loss": 7.6848, + "step": 728500 + }, + { + "epoch": 2.9681562783573296, + "grad_norm": 11.50245475769043, + "learning_rate": 0.002907059154513517, + "loss": 7.7112, + "step": 728600 + }, + { + "epoch": 2.968563656380711, + "grad_norm": 4.017874240875244, + "learning_rate": 0.0029065695369469994, + "loss": 7.7199, + "step": 728700 + }, + { + "epoch": 2.9689710344040927, + "grad_norm": 2.9645016193389893, + "learning_rate": 0.002906079903504291, + "loss": 7.6968, + "step": 728800 + }, + { + "epoch": 2.9693784124274742, + "grad_norm": 5.300781726837158, + "learning_rate": 0.0029055902542047423, + "loss": 7.7147, + "step": 728900 + }, + { + "epoch": 2.9697857904508553, + "grad_norm": 6.1493754386901855, + "learning_rate": 0.002905100589067723, + "loss": 7.7191, + "step": 729000 + }, + { + "epoch": 2.9697857904508553, + "eval_MaskedAccuracy": 0.5048661914267644, + "eval_loss": 1.6320323944091797, + "eval_runtime": 149.8621, + "eval_samples_per_second": 423.563, + "eval_steps_per_second": 1.655, + "step": 729000 + }, + { + "epoch": 2.970193168474237, + "grad_norm": 3.4888110160827637, + "learning_rate": 0.002904610908112584, + "loss": 7.687, + "step": 729100 + }, + { + "epoch": 2.9706005464976184, + "grad_norm": 7.061175346374512, + "learning_rate": 0.0029041212113586914, + "loss": 7.7043, + "step": 729200 + }, + { + "epoch": 2.971007924521, + "grad_norm": 6.764214515686035, + "learning_rate": 0.002903631498825401, + "loss": 7.6733, + "step": 729300 + }, + { + "epoch": 2.971415302544381, + "grad_norm": 6.659168243408203, + "learning_rate": 0.0029031417705320784, + "loss": 7.699, + "step": 729400 + }, + { + "epoch": 2.9718226805677626, + "grad_norm": 4.185555458068848, + "learning_rate": 0.002902652026498082, + "loss": 7.694, + "step": 729500 + }, + { + "epoch": 2.972230058591144, + "grad_norm": 3.2082176208496094, + "learning_rate": 0.0029021622667427735, + "loss": 7.7067, + "step": 729600 + }, + { + "epoch": 2.9726374366145256, + "grad_norm": 3.5333786010742188, + "learning_rate": 0.002901672491285517, + "loss": 7.6632, + "step": 729700 + }, + { + "epoch": 2.973044814637907, + "grad_norm": 4.613351821899414, + "learning_rate": 0.002901182700145677, + "loss": 7.6723, + "step": 729800 + }, + { + "epoch": 2.9734521926612887, + "grad_norm": 8.803238868713379, + "learning_rate": 0.002900692893342618, + "loss": 7.6971, + "step": 729900 + }, + { + "epoch": 2.97385957068467, + "grad_norm": 7.382742881774902, + "learning_rate": 0.0029002030708957, + "loss": 7.6764, + "step": 730000 + }, + { + "epoch": 2.97385957068467, + "eval_MaskedAccuracy": 0.5047144547284493, + "eval_loss": 1.6255278587341309, + "eval_runtime": 150.1483, + "eval_samples_per_second": 422.755, + "eval_steps_per_second": 1.652, + "step": 730000 + }, + { + "epoch": 2.9742669487080517, + "grad_norm": 6.7785725593566895, + "learning_rate": 0.002899713232824287, + "loss": 7.6895, + "step": 730100 + }, + { + "epoch": 2.974674326731433, + "grad_norm": 5.104455471038818, + "learning_rate": 0.002899223379147751, + "loss": 7.6893, + "step": 730200 + }, + { + "epoch": 2.9750817047548144, + "grad_norm": 9.909504890441895, + "learning_rate": 0.002898733509885456, + "loss": 7.7152, + "step": 730300 + }, + { + "epoch": 2.975489082778196, + "grad_norm": 4.498833179473877, + "learning_rate": 0.0028982436250567715, + "loss": 7.6746, + "step": 730400 + }, + { + "epoch": 2.9758964608015774, + "grad_norm": 4.285370826721191, + "learning_rate": 0.0028977537246810637, + "loss": 7.6886, + "step": 730500 + }, + { + "epoch": 2.976303838824959, + "grad_norm": 4.445438861846924, + "learning_rate": 0.002897263808777697, + "loss": 7.6967, + "step": 730600 + }, + { + "epoch": 2.97671121684834, + "grad_norm": 3.821533679962158, + "learning_rate": 0.0028967738773660453, + "loss": 7.7112, + "step": 730700 + }, + { + "epoch": 2.9771185948717216, + "grad_norm": 7.10809850692749, + "learning_rate": 0.0028962839304654763, + "loss": 7.6848, + "step": 730800 + }, + { + "epoch": 2.977525972895103, + "grad_norm": 3.383739709854126, + "learning_rate": 0.0028957939680953606, + "loss": 7.7111, + "step": 730900 + }, + { + "epoch": 2.9779333509184847, + "grad_norm": 2.9453628063201904, + "learning_rate": 0.00289530399027507, + "loss": 7.6914, + "step": 731000 + }, + { + "epoch": 2.9779333509184847, + "eval_MaskedAccuracy": 0.5047596690559836, + "eval_loss": 1.6265870332717896, + "eval_runtime": 149.4462, + "eval_samples_per_second": 424.741, + "eval_steps_per_second": 1.659, + "step": 731000 + }, + { + "epoch": 2.978340728941866, + "grad_norm": 2.5462045669555664, + "learning_rate": 0.002894813997023976, + "loss": 7.7145, + "step": 731100 + }, + { + "epoch": 2.9787481069652477, + "grad_norm": 13.579780578613281, + "learning_rate": 0.002894323988361453, + "loss": 7.7328, + "step": 731200 + }, + { + "epoch": 2.9791554849886293, + "grad_norm": 4.660255432128906, + "learning_rate": 0.002893833964306867, + "loss": 7.7004, + "step": 731300 + }, + { + "epoch": 2.979562863012011, + "grad_norm": 6.878735065460205, + "learning_rate": 0.0028933439248795953, + "loss": 7.7008, + "step": 731400 + }, + { + "epoch": 2.979970241035392, + "grad_norm": 9.91526985168457, + "learning_rate": 0.002892853870099012, + "loss": 7.6857, + "step": 731500 + }, + { + "epoch": 2.9803776190587734, + "grad_norm": 8.592841148376465, + "learning_rate": 0.002892363799984492, + "loss": 7.6917, + "step": 731600 + }, + { + "epoch": 2.980784997082155, + "grad_norm": 8.438247680664062, + "learning_rate": 0.0028918737145554067, + "loss": 7.6953, + "step": 731700 + }, + { + "epoch": 2.9811923751055365, + "grad_norm": 5.771642208099365, + "learning_rate": 0.002891383613831136, + "loss": 7.7254, + "step": 731800 + }, + { + "epoch": 2.9815997531289176, + "grad_norm": 9.10280990600586, + "learning_rate": 0.0028908934978310514, + "loss": 7.6905, + "step": 731900 + }, + { + "epoch": 2.982007131152299, + "grad_norm": 5.338988780975342, + "learning_rate": 0.0028904033665745313, + "loss": 7.6966, + "step": 732000 + }, + { + "epoch": 2.982007131152299, + "eval_MaskedAccuracy": 0.5050782308411959, + "eval_loss": 1.6257840394973755, + "eval_runtime": 148.6556, + "eval_samples_per_second": 427.001, + "eval_steps_per_second": 1.668, + "step": 732000 + }, + { + "epoch": 2.9824145091756806, + "grad_norm": 5.40072774887085, + "learning_rate": 0.0028899132200809587, + "loss": 7.6848, + "step": 732100 + }, + { + "epoch": 2.982821887199062, + "grad_norm": 5.238689422607422, + "learning_rate": 0.0028894230583697057, + "loss": 7.7052, + "step": 732200 + }, + { + "epoch": 2.9832292652224437, + "grad_norm": 8.466293334960938, + "learning_rate": 0.002888932881460157, + "loss": 7.6786, + "step": 732300 + }, + { + "epoch": 2.9836366432458252, + "grad_norm": 6.270270347595215, + "learning_rate": 0.002888442689371685, + "loss": 7.6877, + "step": 732400 + }, + { + "epoch": 2.9840440212692068, + "grad_norm": 6.5726189613342285, + "learning_rate": 0.002887952482123674, + "loss": 7.6897, + "step": 732500 + }, + { + "epoch": 2.9844513992925883, + "grad_norm": 4.553796768188477, + "learning_rate": 0.002887462259735502, + "loss": 7.7027, + "step": 732600 + }, + { + "epoch": 2.9848587773159694, + "grad_norm": 6.129177093505859, + "learning_rate": 0.0028869720222265516, + "loss": 7.707, + "step": 732700 + }, + { + "epoch": 2.985266155339351, + "grad_norm": 4.546405792236328, + "learning_rate": 0.002886481769616207, + "loss": 7.6833, + "step": 732800 + }, + { + "epoch": 2.9856735333627324, + "grad_norm": 3.305152177810669, + "learning_rate": 0.002885991501923847, + "loss": 7.7115, + "step": 732900 + }, + { + "epoch": 2.986080911386114, + "grad_norm": 2.6064453125, + "learning_rate": 0.002885501219168852, + "loss": 7.6836, + "step": 733000 + }, + { + "epoch": 2.986080911386114, + "eval_MaskedAccuracy": 0.5055143008516121, + "eval_loss": 1.6125576496124268, + "eval_runtime": 149.4368, + "eval_samples_per_second": 424.768, + "eval_steps_per_second": 1.66, + "step": 733000 + }, + { + "epoch": 2.9864882894094955, + "grad_norm": 8.053740501403809, + "learning_rate": 0.0028850109213706134, + "loss": 7.7005, + "step": 733100 + }, + { + "epoch": 2.9868956674328766, + "grad_norm": 5.1984686851501465, + "learning_rate": 0.0028845206085485065, + "loss": 7.6784, + "step": 733200 + }, + { + "epoch": 2.987303045456258, + "grad_norm": 10.709946632385254, + "learning_rate": 0.0028840302807219185, + "loss": 7.7334, + "step": 733300 + }, + { + "epoch": 2.9877104234796397, + "grad_norm": 4.176344871520996, + "learning_rate": 0.0028835399379102383, + "loss": 7.6755, + "step": 733400 + }, + { + "epoch": 2.988117801503021, + "grad_norm": 2.9539191722869873, + "learning_rate": 0.002883049580132852, + "loss": 7.7084, + "step": 733500 + }, + { + "epoch": 2.9885251795264027, + "grad_norm": 6.382058143615723, + "learning_rate": 0.002882559207409141, + "loss": 7.6969, + "step": 733600 + }, + { + "epoch": 2.9889325575497843, + "grad_norm": 9.542335510253906, + "learning_rate": 0.002882068819758494, + "loss": 7.7055, + "step": 733700 + }, + { + "epoch": 2.989339935573166, + "grad_norm": 8.789894104003906, + "learning_rate": 0.002881578417200301, + "loss": 7.6694, + "step": 733800 + }, + { + "epoch": 2.9897473135965473, + "grad_norm": 4.893813610076904, + "learning_rate": 0.0028810879997539477, + "loss": 7.6971, + "step": 733900 + }, + { + "epoch": 2.9901546916199284, + "grad_norm": 5.839018821716309, + "learning_rate": 0.0028805975674388276, + "loss": 7.72, + "step": 734000 + }, + { + "epoch": 2.9901546916199284, + "eval_MaskedAccuracy": 0.5056054177429948, + "eval_loss": 1.6218526363372803, + "eval_runtime": 168.423, + "eval_samples_per_second": 376.884, + "eval_steps_per_second": 1.472, + "step": 734000 + }, + { + "epoch": 2.99056206964331, + "grad_norm": 4.137758731842041, + "learning_rate": 0.002880107120274327, + "loss": 7.6845, + "step": 734100 + }, + { + "epoch": 2.9909694476666915, + "grad_norm": 5.157028675079346, + "learning_rate": 0.0028796166582798362, + "loss": 7.7, + "step": 734200 + }, + { + "epoch": 2.991376825690073, + "grad_norm": 5.364386558532715, + "learning_rate": 0.0028791261814747424, + "loss": 7.7323, + "step": 734300 + }, + { + "epoch": 2.991784203713454, + "grad_norm": 3.042241334915161, + "learning_rate": 0.002878635689878441, + "loss": 7.6873, + "step": 734400 + }, + { + "epoch": 2.9921915817368356, + "grad_norm": 4.499843597412109, + "learning_rate": 0.002878145183510324, + "loss": 7.6892, + "step": 734500 + }, + { + "epoch": 2.992598959760217, + "grad_norm": 3.1978838443756104, + "learning_rate": 0.0028776546623897827, + "loss": 7.6834, + "step": 734600 + }, + { + "epoch": 2.9930063377835987, + "grad_norm": 4.563478469848633, + "learning_rate": 0.002877164126536209, + "loss": 7.6761, + "step": 734700 + }, + { + "epoch": 2.9934137158069802, + "grad_norm": 3.513648271560669, + "learning_rate": 0.0028766735759689966, + "loss": 7.6972, + "step": 734800 + }, + { + "epoch": 2.9938210938303618, + "grad_norm": 5.518065452575684, + "learning_rate": 0.0028761830107075386, + "loss": 7.707, + "step": 734900 + }, + { + "epoch": 2.9942284718537433, + "grad_norm": 5.362910747528076, + "learning_rate": 0.00287569243077123, + "loss": 7.7155, + "step": 735000 + }, + { + "epoch": 2.9942284718537433, + "eval_MaskedAccuracy": 0.505939511371849, + "eval_loss": 1.618703007698059, + "eval_runtime": 150.1441, + "eval_samples_per_second": 422.767, + "eval_steps_per_second": 1.652, + "step": 735000 + }, + { + "epoch": 2.994635849877125, + "grad_norm": 3.5753090381622314, + "learning_rate": 0.002875201836179469, + "loss": 7.7045, + "step": 735100 + }, + { + "epoch": 2.995043227900506, + "grad_norm": 8.037628173828125, + "learning_rate": 0.0028747112269516483, + "loss": 7.6737, + "step": 735200 + }, + { + "epoch": 2.9954506059238875, + "grad_norm": 6.65151309967041, + "learning_rate": 0.002874220603107168, + "loss": 7.6828, + "step": 735300 + }, + { + "epoch": 2.995857983947269, + "grad_norm": 3.9218015670776367, + "learning_rate": 0.00287372996466542, + "loss": 7.6996, + "step": 735400 + }, + { + "epoch": 2.9962653619706505, + "grad_norm": 6.109340667724609, + "learning_rate": 0.0028732393116458056, + "loss": 7.7094, + "step": 735500 + }, + { + "epoch": 2.996672739994032, + "grad_norm": 4.062647342681885, + "learning_rate": 0.0028727486440677207, + "loss": 7.7047, + "step": 735600 + }, + { + "epoch": 2.997080118017413, + "grad_norm": 3.483187437057495, + "learning_rate": 0.002872257961950565, + "loss": 7.6908, + "step": 735700 + }, + { + "epoch": 2.9974874960407947, + "grad_norm": 2.671337366104126, + "learning_rate": 0.0028717672653137375, + "loss": 7.6875, + "step": 735800 + }, + { + "epoch": 2.997894874064176, + "grad_norm": 5.355167388916016, + "learning_rate": 0.0028712765541766424, + "loss": 7.6624, + "step": 735900 + }, + { + "epoch": 2.9983022520875577, + "grad_norm": 6.177648544311523, + "learning_rate": 0.0028707858285586725, + "loss": 7.691, + "step": 736000 + }, + { + "epoch": 2.9983022520875577, + "eval_MaskedAccuracy": 0.5055798926796584, + "eval_loss": 1.6123625040054321, + "eval_runtime": 149.3746, + "eval_samples_per_second": 424.945, + "eval_steps_per_second": 1.66, + "step": 736000 + }, + { + "epoch": 2.9987096301109393, + "grad_norm": 5.381433486938477, + "learning_rate": 0.0028702950884792336, + "loss": 7.693, + "step": 736100 + }, + { + "epoch": 2.999117008134321, + "grad_norm": 3.4278206825256348, + "learning_rate": 0.0028698043339577257, + "loss": 7.6852, + "step": 736200 + }, + { + "epoch": 2.9995243861577023, + "grad_norm": 5.128978252410889, + "learning_rate": 0.0028693135650135534, + "loss": 7.6878, + "step": 736300 + }, + { + "epoch": 2.999931764181084, + "grad_norm": 5.375679016113281, + "learning_rate": 0.0028688227816661164, + "loss": 7.6914, + "step": 736400 + }, + { + "epoch": 3.000339142204465, + "grad_norm": 4.731819152832031, + "learning_rate": 0.002868331983934817, + "loss": 7.7202, + "step": 736500 + }, + { + "epoch": 3.0007465202278465, + "grad_norm": 3.3854033946990967, + "learning_rate": 0.00286784117183906, + "loss": 7.7265, + "step": 736600 + }, + { + "epoch": 3.001153898251228, + "grad_norm": 5.151326656341553, + "learning_rate": 0.0028673503453982515, + "loss": 7.6926, + "step": 736700 + }, + { + "epoch": 3.0015612762746096, + "grad_norm": 3.308462619781494, + "learning_rate": 0.002866859504631797, + "loss": 7.6979, + "step": 736800 + }, + { + "epoch": 3.001968654297991, + "grad_norm": 4.403634548187256, + "learning_rate": 0.002866368649559097, + "loss": 7.7056, + "step": 736900 + }, + { + "epoch": 3.002376032321372, + "grad_norm": 9.887831687927246, + "learning_rate": 0.0028658777801995627, + "loss": 7.7114, + "step": 737000 + }, + { + "epoch": 3.002376032321372, + "eval_MaskedAccuracy": 0.505611609193833, + "eval_loss": 1.6212120056152344, + "eval_runtime": 149.3676, + "eval_samples_per_second": 424.965, + "eval_steps_per_second": 1.66, + "step": 737000 + }, + { + "epoch": 3.0027834103447537, + "grad_norm": 4.206628799438477, + "learning_rate": 0.0028653868965725997, + "loss": 7.6915, + "step": 737100 + }, + { + "epoch": 3.0031907883681352, + "grad_norm": 4.031418323516846, + "learning_rate": 0.0028648959986976195, + "loss": 7.7405, + "step": 737200 + }, + { + "epoch": 3.0035981663915168, + "grad_norm": 4.3671746253967285, + "learning_rate": 0.002864405086594023, + "loss": 7.7004, + "step": 737300 + }, + { + "epoch": 3.0040055444148983, + "grad_norm": 4.248547077178955, + "learning_rate": 0.0028639141602812205, + "loss": 7.6832, + "step": 737400 + }, + { + "epoch": 3.00441292243828, + "grad_norm": 4.220926761627197, + "learning_rate": 0.002863423219778623, + "loss": 7.6934, + "step": 737500 + }, + { + "epoch": 3.004820300461661, + "grad_norm": 10.080137252807617, + "learning_rate": 0.0028629322651056413, + "loss": 7.7249, + "step": 737600 + }, + { + "epoch": 3.0052276784850425, + "grad_norm": 5.022657871246338, + "learning_rate": 0.002862441296281681, + "loss": 7.6981, + "step": 737700 + }, + { + "epoch": 3.005635056508424, + "grad_norm": 3.362440347671509, + "learning_rate": 0.0028619503133261543, + "loss": 7.724, + "step": 737800 + }, + { + "epoch": 3.0060424345318055, + "grad_norm": 6.001651287078857, + "learning_rate": 0.00286145931625847, + "loss": 7.6872, + "step": 737900 + }, + { + "epoch": 3.006449812555187, + "grad_norm": 3.4034435749053955, + "learning_rate": 0.0028609683050980477, + "loss": 7.7149, + "step": 738000 + }, + { + "epoch": 3.006449812555187, + "eval_MaskedAccuracy": 0.5058157231836666, + "eval_loss": 1.6164454221725464, + "eval_runtime": 148.4915, + "eval_samples_per_second": 427.472, + "eval_steps_per_second": 1.67, + "step": 738000 + }, + { + "epoch": 3.0068571905785686, + "grad_norm": 6.026236534118652, + "learning_rate": 0.00286047727986429, + "loss": 7.6771, + "step": 738100 + }, + { + "epoch": 3.00726456860195, + "grad_norm": 4.825756072998047, + "learning_rate": 0.0028599862405766143, + "loss": 7.7141, + "step": 738200 + }, + { + "epoch": 3.007671946625331, + "grad_norm": 3.3585920333862305, + "learning_rate": 0.0028594951872544343, + "loss": 7.6962, + "step": 738300 + }, + { + "epoch": 3.0080793246487127, + "grad_norm": 4.888416290283203, + "learning_rate": 0.0028590041199171635, + "loss": 7.6803, + "step": 738400 + }, + { + "epoch": 3.0084867026720943, + "grad_norm": 4.891050815582275, + "learning_rate": 0.002858513038584217, + "loss": 7.7202, + "step": 738500 + }, + { + "epoch": 3.008894080695476, + "grad_norm": 3.14123797416687, + "learning_rate": 0.0028580219432750094, + "loss": 7.7039, + "step": 738600 + }, + { + "epoch": 3.0093014587188573, + "grad_norm": 3.5715818405151367, + "learning_rate": 0.002857530834008957, + "loss": 7.7042, + "step": 738700 + }, + { + "epoch": 3.009708836742239, + "grad_norm": 5.466462135314941, + "learning_rate": 0.002857039710805476, + "loss": 7.7214, + "step": 738800 + }, + { + "epoch": 3.01011621476562, + "grad_norm": 5.804788112640381, + "learning_rate": 0.002856548573683981, + "loss": 7.7363, + "step": 738900 + }, + { + "epoch": 3.0105235927890015, + "grad_norm": 8.0160493850708, + "learning_rate": 0.0028560574226638917, + "loss": 7.7206, + "step": 739000 + }, + { + "epoch": 3.0105235927890015, + "eval_MaskedAccuracy": 0.5056501302084144, + "eval_loss": 1.6164095401763916, + "eval_runtime": 149.2281, + "eval_samples_per_second": 425.362, + "eval_steps_per_second": 1.662, + "step": 739000 + }, + { + "epoch": 3.010930970812383, + "grad_norm": 3.4310660362243652, + "learning_rate": 0.002855566257764624, + "loss": 7.7148, + "step": 739100 + }, + { + "epoch": 3.0113383488357646, + "grad_norm": 5.341206073760986, + "learning_rate": 0.0028550750790055975, + "loss": 7.6797, + "step": 739200 + }, + { + "epoch": 3.011745726859146, + "grad_norm": 4.394791603088379, + "learning_rate": 0.0028545838864062318, + "loss": 7.6749, + "step": 739300 + }, + { + "epoch": 3.0121531048825276, + "grad_norm": 4.237307548522949, + "learning_rate": 0.002854092679985943, + "loss": 7.7006, + "step": 739400 + }, + { + "epoch": 3.0125604829059087, + "grad_norm": 2.5844573974609375, + "learning_rate": 0.002853601459764154, + "loss": 7.6835, + "step": 739500 + }, + { + "epoch": 3.0129678609292903, + "grad_norm": 5.240605354309082, + "learning_rate": 0.002853110225760285, + "loss": 7.6913, + "step": 739600 + }, + { + "epoch": 3.013375238952672, + "grad_norm": 8.880341529846191, + "learning_rate": 0.002852618977993756, + "loss": 7.6997, + "step": 739700 + }, + { + "epoch": 3.0137826169760533, + "grad_norm": 5.894056797027588, + "learning_rate": 0.0028521277164839922, + "loss": 7.6919, + "step": 739800 + }, + { + "epoch": 3.014189994999435, + "grad_norm": 2.679236650466919, + "learning_rate": 0.0028516364412504132, + "loss": 7.6828, + "step": 739900 + }, + { + "epoch": 3.0145973730228164, + "grad_norm": 3.015913724899292, + "learning_rate": 0.002851145152312444, + "loss": 7.7108, + "step": 740000 + }, + { + "epoch": 3.0145973730228164, + "eval_MaskedAccuracy": 0.5061662086205887, + "eval_loss": 1.6264451742172241, + "eval_runtime": 149.7792, + "eval_samples_per_second": 423.797, + "eval_steps_per_second": 1.656, + "step": 740000 + }, + { + "epoch": 3.0150047510461975, + "grad_norm": 4.6664228439331055, + "learning_rate": 0.0028506538496895, + "loss": 7.6873, + "step": 740100 + }, + { + "epoch": 3.015412129069579, + "grad_norm": 3.0084333419799805, + "learning_rate": 0.0028501625334010137, + "loss": 7.7015, + "step": 740200 + }, + { + "epoch": 3.0158195070929605, + "grad_norm": 6.42466926574707, + "learning_rate": 0.002849671203466406, + "loss": 7.6992, + "step": 740300 + }, + { + "epoch": 3.016226885116342, + "grad_norm": 5.211976528167725, + "learning_rate": 0.002849179859905102, + "loss": 7.6955, + "step": 740400 + }, + { + "epoch": 3.0166342631397236, + "grad_norm": 8.872745513916016, + "learning_rate": 0.002848688502736526, + "loss": 7.7122, + "step": 740500 + }, + { + "epoch": 3.017041641163105, + "grad_norm": 5.2570695877075195, + "learning_rate": 0.0028481971319801076, + "loss": 7.7019, + "step": 740600 + }, + { + "epoch": 3.0174490191864867, + "grad_norm": 6.204245090484619, + "learning_rate": 0.00284770574765527, + "loss": 7.7023, + "step": 740700 + }, + { + "epoch": 3.0178563972098678, + "grad_norm": 4.359434604644775, + "learning_rate": 0.00284721434978144, + "loss": 7.6523, + "step": 740800 + }, + { + "epoch": 3.0182637752332493, + "grad_norm": 8.607190132141113, + "learning_rate": 0.002846722938378047, + "loss": 7.7324, + "step": 740900 + }, + { + "epoch": 3.018671153256631, + "grad_norm": 5.038397789001465, + "learning_rate": 0.0028462315134645177, + "loss": 7.6877, + "step": 741000 + }, + { + "epoch": 3.018671153256631, + "eval_MaskedAccuracy": 0.5055186695738466, + "eval_loss": 1.620163917541504, + "eval_runtime": 150.7644, + "eval_samples_per_second": 421.028, + "eval_steps_per_second": 1.645, + "step": 741000 + }, + { + "epoch": 3.0190785312800124, + "grad_norm": 8.162409782409668, + "learning_rate": 0.002845740075060283, + "loss": 7.6638, + "step": 741100 + }, + { + "epoch": 3.019485909303394, + "grad_norm": 5.22674036026001, + "learning_rate": 0.002845248623184771, + "loss": 7.6913, + "step": 741200 + }, + { + "epoch": 3.0198932873267754, + "grad_norm": 7.2996907234191895, + "learning_rate": 0.00284475715785741, + "loss": 7.6909, + "step": 741300 + }, + { + "epoch": 3.0203006653501565, + "grad_norm": 4.533921718597412, + "learning_rate": 0.0028442656790976305, + "loss": 7.6845, + "step": 741400 + }, + { + "epoch": 3.020708043373538, + "grad_norm": 7.809845447540283, + "learning_rate": 0.0028437741869248656, + "loss": 7.6783, + "step": 741500 + }, + { + "epoch": 3.0211154213969196, + "grad_norm": 3.4706003665924072, + "learning_rate": 0.002843282681358549, + "loss": 7.6951, + "step": 741600 + }, + { + "epoch": 3.021522799420301, + "grad_norm": 2.5168254375457764, + "learning_rate": 0.0028427911624181066, + "loss": 7.6831, + "step": 741700 + }, + { + "epoch": 3.0219301774436826, + "grad_norm": 3.419708728790283, + "learning_rate": 0.00284229963012297, + "loss": 7.6986, + "step": 741800 + }, + { + "epoch": 3.022337555467064, + "grad_norm": 5.101961135864258, + "learning_rate": 0.002841808084492575, + "loss": 7.7074, + "step": 741900 + }, + { + "epoch": 3.0227449334904453, + "grad_norm": 3.0421252250671387, + "learning_rate": 0.0028413165255463564, + "loss": 7.6789, + "step": 742000 + }, + { + "epoch": 3.0227449334904453, + "eval_MaskedAccuracy": 0.5056500423693558, + "eval_loss": 1.6271016597747803, + "eval_runtime": 151.2846, + "eval_samples_per_second": 419.58, + "eval_steps_per_second": 1.639, + "step": 742000 + }, + { + "epoch": 3.023152311513827, + "grad_norm": 3.4731786251068115, + "learning_rate": 0.002840824953303746, + "loss": 7.6852, + "step": 742100 + }, + { + "epoch": 3.0235596895372083, + "grad_norm": 3.8895857334136963, + "learning_rate": 0.002840333367784181, + "loss": 7.7051, + "step": 742200 + }, + { + "epoch": 3.02396706756059, + "grad_norm": 5.0788493156433105, + "learning_rate": 0.0028398417690070977, + "loss": 7.7234, + "step": 742300 + }, + { + "epoch": 3.0243744455839714, + "grad_norm": 4.581942558288574, + "learning_rate": 0.002839350156991924, + "loss": 7.693, + "step": 742400 + }, + { + "epoch": 3.024781823607353, + "grad_norm": 2.6827476024627686, + "learning_rate": 0.002838858531758098, + "loss": 7.7087, + "step": 742500 + }, + { + "epoch": 3.025189201630734, + "grad_norm": 3.306877613067627, + "learning_rate": 0.002838366893325062, + "loss": 7.7229, + "step": 742600 + }, + { + "epoch": 3.0255965796541155, + "grad_norm": 4.539236545562744, + "learning_rate": 0.0028378752417122493, + "loss": 7.6909, + "step": 742700 + }, + { + "epoch": 3.026003957677497, + "grad_norm": 3.317619562149048, + "learning_rate": 0.0028373835769391, + "loss": 7.6756, + "step": 742800 + }, + { + "epoch": 3.0264113357008786, + "grad_norm": 6.079135417938232, + "learning_rate": 0.002836891899025048, + "loss": 7.7073, + "step": 742900 + }, + { + "epoch": 3.02681871372426, + "grad_norm": 8.416139602661133, + "learning_rate": 0.002836400207989534, + "loss": 7.693, + "step": 743000 + }, + { + "epoch": 3.02681871372426, + "eval_MaskedAccuracy": 0.5055632710561418, + "eval_loss": 1.6125744581222534, + "eval_runtime": 152.4589, + "eval_samples_per_second": 416.348, + "eval_steps_per_second": 1.627, + "step": 743000 + }, + { + "epoch": 3.0272260917476417, + "grad_norm": 7.413698673248291, + "learning_rate": 0.0028359085038519977, + "loss": 7.6778, + "step": 743100 + }, + { + "epoch": 3.027633469771023, + "grad_norm": 6.549065113067627, + "learning_rate": 0.0028354167866318776, + "loss": 7.6899, + "step": 743200 + }, + { + "epoch": 3.0280408477944043, + "grad_norm": 5.39340353012085, + "learning_rate": 0.0028349250563486157, + "loss": 7.7047, + "step": 743300 + }, + { + "epoch": 3.028448225817786, + "grad_norm": 5.896380424499512, + "learning_rate": 0.0028344333130216506, + "loss": 7.7133, + "step": 743400 + }, + { + "epoch": 3.0288556038411674, + "grad_norm": 6.32306432723999, + "learning_rate": 0.0028339415566704246, + "loss": 7.7131, + "step": 743500 + }, + { + "epoch": 3.029262981864549, + "grad_norm": 3.8068430423736572, + "learning_rate": 0.0028334497873143795, + "loss": 7.7373, + "step": 743600 + }, + { + "epoch": 3.0296703598879304, + "grad_norm": 10.210831642150879, + "learning_rate": 0.0028329580049729595, + "loss": 7.6972, + "step": 743700 + }, + { + "epoch": 3.030077737911312, + "grad_norm": 10.17298412322998, + "learning_rate": 0.0028324662096656044, + "loss": 7.7149, + "step": 743800 + }, + { + "epoch": 3.030485115934693, + "grad_norm": 4.248089790344238, + "learning_rate": 0.002831974401411758, + "loss": 7.6869, + "step": 743900 + }, + { + "epoch": 3.0308924939580746, + "grad_norm": 3.900387763977051, + "learning_rate": 0.0028314825802308657, + "loss": 7.7162, + "step": 744000 + }, + { + "epoch": 3.0308924939580746, + "eval_MaskedAccuracy": 0.5062941364730348, + "eval_loss": 1.6123100519180298, + "eval_runtime": 154.1937, + "eval_samples_per_second": 411.664, + "eval_steps_per_second": 1.608, + "step": 744000 + }, + { + "epoch": 3.031299871981456, + "grad_norm": 3.7343649864196777, + "learning_rate": 0.0028309907461423704, + "loss": 7.7208, + "step": 744100 + }, + { + "epoch": 3.0317072500048377, + "grad_norm": 3.5612926483154297, + "learning_rate": 0.0028304988991657153, + "loss": 7.6932, + "step": 744200 + }, + { + "epoch": 3.032114628028219, + "grad_norm": 6.879009246826172, + "learning_rate": 0.0028300070393203476, + "loss": 7.7268, + "step": 744300 + }, + { + "epoch": 3.0325220060516007, + "grad_norm": 4.398885726928711, + "learning_rate": 0.002829515166625715, + "loss": 7.7104, + "step": 744400 + }, + { + "epoch": 3.032929384074982, + "grad_norm": 4.473330974578857, + "learning_rate": 0.002829023281101265, + "loss": 7.7114, + "step": 744500 + }, + { + "epoch": 3.0333367620983633, + "grad_norm": 4.854863166809082, + "learning_rate": 0.0028285313827664384, + "loss": 7.7086, + "step": 744600 + }, + { + "epoch": 3.033744140121745, + "grad_norm": 6.374593257904053, + "learning_rate": 0.0028280394716406884, + "loss": 7.7386, + "step": 744700 + }, + { + "epoch": 3.0341515181451264, + "grad_norm": 3.94598126411438, + "learning_rate": 0.0028275475477434627, + "loss": 7.7082, + "step": 744800 + }, + { + "epoch": 3.034558896168508, + "grad_norm": 4.2861409187316895, + "learning_rate": 0.0028270556110942045, + "loss": 7.659, + "step": 744900 + }, + { + "epoch": 3.0349662741918895, + "grad_norm": 4.4543023109436035, + "learning_rate": 0.0028265636617123653, + "loss": 7.7032, + "step": 745000 + }, + { + "epoch": 3.0349662741918895, + "eval_MaskedAccuracy": 0.5057628963797117, + "eval_loss": 1.6194565296173096, + "eval_runtime": 151.1422, + "eval_samples_per_second": 419.975, + "eval_steps_per_second": 1.641, + "step": 745000 + }, + { + "epoch": 3.0353736522152706, + "grad_norm": 5.848895072937012, + "learning_rate": 0.002826071699617396, + "loss": 7.6823, + "step": 745100 + }, + { + "epoch": 3.035781030238652, + "grad_norm": 5.476065635681152, + "learning_rate": 0.002825579724828742, + "loss": 7.7301, + "step": 745200 + }, + { + "epoch": 3.0361884082620336, + "grad_norm": 9.30887222290039, + "learning_rate": 0.0028250877373658563, + "loss": 7.7132, + "step": 745300 + }, + { + "epoch": 3.036595786285415, + "grad_norm": 5.337265968322754, + "learning_rate": 0.0028245957372481927, + "loss": 7.6611, + "step": 745400 + }, + { + "epoch": 3.0370031643087967, + "grad_norm": 3.821563243865967, + "learning_rate": 0.0028241037244951996, + "loss": 7.7002, + "step": 745500 + }, + { + "epoch": 3.037410542332178, + "grad_norm": 7.907029151916504, + "learning_rate": 0.0028236116991263295, + "loss": 7.6754, + "step": 745600 + }, + { + "epoch": 3.0378179203555598, + "grad_norm": 9.829130172729492, + "learning_rate": 0.0028231196611610363, + "loss": 7.7387, + "step": 745700 + }, + { + "epoch": 3.038225298378941, + "grad_norm": 4.472766399383545, + "learning_rate": 0.0028226276106187685, + "loss": 7.6909, + "step": 745800 + }, + { + "epoch": 3.0386326764023224, + "grad_norm": 8.586014747619629, + "learning_rate": 0.0028221355475189845, + "loss": 7.6959, + "step": 745900 + }, + { + "epoch": 3.039040054425704, + "grad_norm": 3.3005423545837402, + "learning_rate": 0.002821643471881136, + "loss": 7.7071, + "step": 746000 + }, + { + "epoch": 3.039040054425704, + "eval_MaskedAccuracy": 0.5054460142927345, + "eval_loss": 1.6223331689834595, + "eval_runtime": 151.3643, + "eval_samples_per_second": 419.359, + "eval_steps_per_second": 1.638, + "step": 746000 + }, + { + "epoch": 3.0394474324490854, + "grad_norm": 7.866025924682617, + "learning_rate": 0.0028211513837246766, + "loss": 7.7215, + "step": 746100 + }, + { + "epoch": 3.039854810472467, + "grad_norm": 12.360164642333984, + "learning_rate": 0.0028206592830690626, + "loss": 7.6792, + "step": 746200 + }, + { + "epoch": 3.0402621884958485, + "grad_norm": 5.763864040374756, + "learning_rate": 0.0028201671699337504, + "loss": 7.7254, + "step": 746300 + }, + { + "epoch": 3.0406695665192296, + "grad_norm": 4.453338623046875, + "learning_rate": 0.002819675044338191, + "loss": 7.6742, + "step": 746400 + }, + { + "epoch": 3.041076944542611, + "grad_norm": 10.988475799560547, + "learning_rate": 0.002819182906301844, + "loss": 7.7044, + "step": 746500 + }, + { + "epoch": 3.0414843225659927, + "grad_norm": 3.7080917358398438, + "learning_rate": 0.002818690755844169, + "loss": 7.71, + "step": 746600 + }, + { + "epoch": 3.041891700589374, + "grad_norm": 7.433635234832764, + "learning_rate": 0.0028181985929846215, + "loss": 7.6761, + "step": 746700 + }, + { + "epoch": 3.0422990786127557, + "grad_norm": 6.572902202606201, + "learning_rate": 0.002817706417742658, + "loss": 7.6797, + "step": 746800 + }, + { + "epoch": 3.0427064566361373, + "grad_norm": 3.7825119495391846, + "learning_rate": 0.0028172142301377398, + "loss": 7.6901, + "step": 746900 + }, + { + "epoch": 3.0431138346595183, + "grad_norm": 7.348376274108887, + "learning_rate": 0.0028167220301893196, + "loss": 7.6862, + "step": 747000 + }, + { + "epoch": 3.0431138346595183, + "eval_MaskedAccuracy": 0.5059655368405178, + "eval_loss": 1.6155526638031006, + "eval_runtime": 163.344, + "eval_samples_per_second": 388.603, + "eval_steps_per_second": 1.518, + "step": 747000 + }, + { + "epoch": 3.0435212126829, + "grad_norm": 4.178409576416016, + "learning_rate": 0.0028162298179168588, + "loss": 7.7009, + "step": 747100 + }, + { + "epoch": 3.0439285907062814, + "grad_norm": 9.232433319091797, + "learning_rate": 0.002815737593339823, + "loss": 7.7132, + "step": 747200 + }, + { + "epoch": 3.044335968729663, + "grad_norm": 5.816783905029297, + "learning_rate": 0.002815245356477669, + "loss": 7.6965, + "step": 747300 + }, + { + "epoch": 3.0447433467530445, + "grad_norm": 4.057966232299805, + "learning_rate": 0.0028147531073498535, + "loss": 7.7127, + "step": 747400 + }, + { + "epoch": 3.045150724776426, + "grad_norm": 4.916996479034424, + "learning_rate": 0.002814260845975841, + "loss": 7.6831, + "step": 747500 + }, + { + "epoch": 3.045558102799807, + "grad_norm": 2.2980215549468994, + "learning_rate": 0.0028137685723750943, + "loss": 7.7005, + "step": 747600 + }, + { + "epoch": 3.0459654808231886, + "grad_norm": 4.099763870239258, + "learning_rate": 0.0028132762865670734, + "loss": 7.7293, + "step": 747700 + }, + { + "epoch": 3.04637285884657, + "grad_norm": 3.3498318195343018, + "learning_rate": 0.002812783988571241, + "loss": 7.6964, + "step": 747800 + }, + { + "epoch": 3.0467802368699517, + "grad_norm": 4.06113862991333, + "learning_rate": 0.0028122916784070607, + "loss": 7.6875, + "step": 747900 + }, + { + "epoch": 3.0471876148933332, + "grad_norm": 4.0826239585876465, + "learning_rate": 0.002811799356093999, + "loss": 7.7025, + "step": 748000 + }, + { + "epoch": 3.0471876148933332, + "eval_MaskedAccuracy": 0.5053813683576247, + "eval_loss": 1.6258487701416016, + "eval_runtime": 152.3978, + "eval_samples_per_second": 416.515, + "eval_steps_per_second": 1.627, + "step": 748000 + }, + { + "epoch": 3.0475949929167148, + "grad_norm": 4.380719184875488, + "learning_rate": 0.0028113070216515173, + "loss": 7.7252, + "step": 748100 + }, + { + "epoch": 3.0480023709400963, + "grad_norm": 17.466968536376953, + "learning_rate": 0.002810814675099083, + "loss": 7.7, + "step": 748200 + }, + { + "epoch": 3.0484097489634774, + "grad_norm": 3.0531888008117676, + "learning_rate": 0.0028103223164561574, + "loss": 7.7112, + "step": 748300 + }, + { + "epoch": 3.048817126986859, + "grad_norm": 5.355355262756348, + "learning_rate": 0.0028098299457422074, + "loss": 7.7003, + "step": 748400 + }, + { + "epoch": 3.0492245050102404, + "grad_norm": 7.948986530303955, + "learning_rate": 0.0028093375629766994, + "loss": 7.7229, + "step": 748500 + }, + { + "epoch": 3.049631883033622, + "grad_norm": 4.31436824798584, + "learning_rate": 0.0028088451681790986, + "loss": 7.6893, + "step": 748600 + }, + { + "epoch": 3.0500392610570035, + "grad_norm": 6.25513219833374, + "learning_rate": 0.0028083527613688743, + "loss": 7.7159, + "step": 748700 + }, + { + "epoch": 3.050446639080385, + "grad_norm": 5.565917491912842, + "learning_rate": 0.0028078603425654904, + "loss": 7.6689, + "step": 748800 + }, + { + "epoch": 3.050854017103766, + "grad_norm": 3.3782989978790283, + "learning_rate": 0.0028073679117884202, + "loss": 7.7223, + "step": 748900 + }, + { + "epoch": 3.0512613951271477, + "grad_norm": 3.9914166927337646, + "learning_rate": 0.002806875469057129, + "loss": 7.6964, + "step": 749000 + }, + { + "epoch": 3.0512613951271477, + "eval_MaskedAccuracy": 0.506501001429651, + "eval_loss": 1.6175804138183594, + "eval_runtime": 150.5201, + "eval_samples_per_second": 421.711, + "eval_steps_per_second": 1.648, + "step": 749000 + }, + { + "epoch": 3.051668773150529, + "grad_norm": 2.5959930419921875, + "learning_rate": 0.0028063830143910864, + "loss": 7.7105, + "step": 749100 + }, + { + "epoch": 3.0520761511739107, + "grad_norm": 5.61108922958374, + "learning_rate": 0.0028058905478097595, + "loss": 7.6973, + "step": 749200 + }, + { + "epoch": 3.0524835291972923, + "grad_norm": 3.186306953430176, + "learning_rate": 0.0028053980693326223, + "loss": 7.7117, + "step": 749300 + }, + { + "epoch": 3.052890907220674, + "grad_norm": 6.981227397918701, + "learning_rate": 0.0028049055789791427, + "loss": 7.7288, + "step": 749400 + }, + { + "epoch": 3.053298285244055, + "grad_norm": 5.489975929260254, + "learning_rate": 0.0028044130767687914, + "loss": 7.7296, + "step": 749500 + }, + { + "epoch": 3.0537056632674364, + "grad_norm": 5.899916172027588, + "learning_rate": 0.002803920562721041, + "loss": 7.7182, + "step": 749600 + }, + { + "epoch": 3.054113041290818, + "grad_norm": 2.500195026397705, + "learning_rate": 0.0028034280368553583, + "loss": 7.7013, + "step": 749700 + }, + { + "epoch": 3.0545204193141995, + "grad_norm": 6.336365699768066, + "learning_rate": 0.0028029354991912204, + "loss": 7.6773, + "step": 749800 + }, + { + "epoch": 3.054927797337581, + "grad_norm": 8.86232852935791, + "learning_rate": 0.0028024429497481023, + "loss": 7.697, + "step": 749900 + }, + { + "epoch": 3.0553351753609626, + "grad_norm": 8.056133270263672, + "learning_rate": 0.002801950388545474, + "loss": 7.7169, + "step": 750000 + }, + { + "epoch": 3.0553351753609626, + "eval_MaskedAccuracy": 0.5053425370105782, + "eval_loss": 1.6197924613952637, + "eval_runtime": 151.6657, + "eval_samples_per_second": 418.526, + "eval_steps_per_second": 1.635, + "step": 750000 + }, + { + "epoch": 3.0557425533843436, + "grad_norm": 5.945562362670898, + "learning_rate": 0.0028014578156028043, + "loss": 7.7112, + "step": 750100 + }, + { + "epoch": 3.056149931407725, + "grad_norm": 4.3121657371521, + "learning_rate": 0.0028009652309395716, + "loss": 7.6802, + "step": 750200 + }, + { + "epoch": 3.0565573094311067, + "grad_norm": 6.809252738952637, + "learning_rate": 0.0028004726345752513, + "loss": 7.7059, + "step": 750300 + }, + { + "epoch": 3.0569646874544882, + "grad_norm": 6.637779712677002, + "learning_rate": 0.002799980026529318, + "loss": 7.6848, + "step": 750400 + }, + { + "epoch": 3.0573720654778698, + "grad_norm": 5.115314483642578, + "learning_rate": 0.0027994874068212436, + "loss": 7.6937, + "step": 750500 + }, + { + "epoch": 3.0577794435012513, + "grad_norm": 3.4024839401245117, + "learning_rate": 0.0027989947754705094, + "loss": 7.6794, + "step": 750600 + }, + { + "epoch": 3.058186821524633, + "grad_norm": 7.3663482666015625, + "learning_rate": 0.002798502132496588, + "loss": 7.6701, + "step": 750700 + }, + { + "epoch": 3.058594199548014, + "grad_norm": 4.044422626495361, + "learning_rate": 0.0027980094779189585, + "loss": 7.6958, + "step": 750800 + }, + { + "epoch": 3.0590015775713955, + "grad_norm": 2.6503303050994873, + "learning_rate": 0.002797516811757099, + "loss": 7.7023, + "step": 750900 + }, + { + "epoch": 3.059408955594777, + "grad_norm": 8.92728042602539, + "learning_rate": 0.002797024134030482, + "loss": 7.6873, + "step": 751000 + }, + { + "epoch": 3.059408955594777, + "eval_MaskedAccuracy": 0.5059805377342768, + "eval_loss": 1.624422550201416, + "eval_runtime": 151.4307, + "eval_samples_per_second": 419.175, + "eval_steps_per_second": 1.638, + "step": 751000 + }, + { + "epoch": 3.0598163336181585, + "grad_norm": 6.3868818283081055, + "learning_rate": 0.0027965314447585925, + "loss": 7.711, + "step": 751100 + }, + { + "epoch": 3.06022371164154, + "grad_norm": 7.902709484100342, + "learning_rate": 0.0027960387439609036, + "loss": 7.7238, + "step": 751200 + }, + { + "epoch": 3.0606310896649216, + "grad_norm": 5.930760383605957, + "learning_rate": 0.002795546031656899, + "loss": 7.6997, + "step": 751300 + }, + { + "epoch": 3.0610384676883027, + "grad_norm": 3.6389553546905518, + "learning_rate": 0.0027950533078660534, + "loss": 7.7025, + "step": 751400 + }, + { + "epoch": 3.061445845711684, + "grad_norm": 2.4691882133483887, + "learning_rate": 0.00279456057260785, + "loss": 7.6794, + "step": 751500 + }, + { + "epoch": 3.0618532237350657, + "grad_norm": 5.3170366287231445, + "learning_rate": 0.0027940678259017705, + "loss": 7.7213, + "step": 751600 + }, + { + "epoch": 3.0622606017584473, + "grad_norm": 4.524278163909912, + "learning_rate": 0.00279357506776729, + "loss": 7.7152, + "step": 751700 + }, + { + "epoch": 3.062667979781829, + "grad_norm": 5.752650260925293, + "learning_rate": 0.002793082298223896, + "loss": 7.6986, + "step": 751800 + }, + { + "epoch": 3.0630753578052103, + "grad_norm": 4.981680870056152, + "learning_rate": 0.002792589517291067, + "loss": 7.7098, + "step": 751900 + }, + { + "epoch": 3.0634827358285914, + "grad_norm": 3.780088186264038, + "learning_rate": 0.0027920967249882873, + "loss": 7.7109, + "step": 752000 + }, + { + "epoch": 3.0634827358285914, + "eval_MaskedAccuracy": 0.5055275134090221, + "eval_loss": 1.6219958066940308, + "eval_runtime": 159.1803, + "eval_samples_per_second": 398.768, + "eval_steps_per_second": 1.558, + "step": 752000 + }, + { + "epoch": 3.063890113851973, + "grad_norm": 5.787014007568359, + "learning_rate": 0.0027916039213350397, + "loss": 7.6746, + "step": 752100 + }, + { + "epoch": 3.0642974918753545, + "grad_norm": 7.504022121429443, + "learning_rate": 0.0027911111063508025, + "loss": 7.6924, + "step": 752200 + }, + { + "epoch": 3.064704869898736, + "grad_norm": 5.298858642578125, + "learning_rate": 0.0027906182800550637, + "loss": 7.6964, + "step": 752300 + }, + { + "epoch": 3.0651122479221176, + "grad_norm": 2.5735104084014893, + "learning_rate": 0.002790125442467312, + "loss": 7.7175, + "step": 752400 + }, + { + "epoch": 3.065519625945499, + "grad_norm": 5.3041558265686035, + "learning_rate": 0.0027896325936070214, + "loss": 7.7047, + "step": 752500 + }, + { + "epoch": 3.06592700396888, + "grad_norm": 6.335773944854736, + "learning_rate": 0.0027891397334936833, + "loss": 7.7084, + "step": 752600 + }, + { + "epoch": 3.0663343819922617, + "grad_norm": 3.198148012161255, + "learning_rate": 0.002788646862146781, + "loss": 7.701, + "step": 752700 + }, + { + "epoch": 3.0667417600156432, + "grad_norm": 9.311052322387695, + "learning_rate": 0.002788153979585801, + "loss": 7.72, + "step": 752800 + }, + { + "epoch": 3.067149138039025, + "grad_norm": 4.54910945892334, + "learning_rate": 0.0027876610858302304, + "loss": 7.6623, + "step": 752900 + }, + { + "epoch": 3.0675565160624063, + "grad_norm": 5.994970321655273, + "learning_rate": 0.0027871681808995566, + "loss": 7.7136, + "step": 753000 + }, + { + "epoch": 3.0675565160624063, + "eval_MaskedAccuracy": 0.5054040414939257, + "eval_loss": 1.6254481077194214, + "eval_runtime": 183.8547, + "eval_samples_per_second": 345.251, + "eval_steps_per_second": 1.349, + "step": 753000 + }, + { + "epoch": 3.067963894085788, + "grad_norm": 7.91969633102417, + "learning_rate": 0.0027866752648132657, + "loss": 7.7479, + "step": 753100 + }, + { + "epoch": 3.068371272109169, + "grad_norm": 6.5430731773376465, + "learning_rate": 0.0027861823375908445, + "loss": 7.7175, + "step": 753200 + }, + { + "epoch": 3.0687786501325505, + "grad_norm": 3.526317596435547, + "learning_rate": 0.002785689399251782, + "loss": 7.7156, + "step": 753300 + }, + { + "epoch": 3.069186028155932, + "grad_norm": 5.19420862197876, + "learning_rate": 0.0027851964498155656, + "loss": 7.6909, + "step": 753400 + }, + { + "epoch": 3.0695934061793135, + "grad_norm": 8.49240493774414, + "learning_rate": 0.002784703489301686, + "loss": 7.7331, + "step": 753500 + }, + { + "epoch": 3.070000784202695, + "grad_norm": 3.6290364265441895, + "learning_rate": 0.0027842105177296314, + "loss": 7.7022, + "step": 753600 + }, + { + "epoch": 3.0704081622260766, + "grad_norm": 9.527637481689453, + "learning_rate": 0.002783717535118892, + "loss": 7.7132, + "step": 753700 + }, + { + "epoch": 3.070815540249458, + "grad_norm": 11.094533920288086, + "learning_rate": 0.0027832245414889578, + "loss": 7.7039, + "step": 753800 + }, + { + "epoch": 3.071222918272839, + "grad_norm": 5.062388896942139, + "learning_rate": 0.0027827315368593158, + "loss": 7.7081, + "step": 753900 + }, + { + "epoch": 3.0716302962962208, + "grad_norm": 4.398228645324707, + "learning_rate": 0.002782238521249465, + "loss": 7.7018, + "step": 754000 + }, + { + "epoch": 3.0716302962962208, + "eval_MaskedAccuracy": 0.5057051822216122, + "eval_loss": 1.6179014444351196, + "eval_runtime": 150.941, + "eval_samples_per_second": 420.535, + "eval_steps_per_second": 1.643, + "step": 754000 + }, + { + "epoch": 3.0720376743196023, + "grad_norm": 4.103709697723389, + "learning_rate": 0.0027817454946788894, + "loss": 7.7045, + "step": 754100 + }, + { + "epoch": 3.072445052342984, + "grad_norm": 7.122503757476807, + "learning_rate": 0.0027812524571670847, + "loss": 7.7191, + "step": 754200 + }, + { + "epoch": 3.0728524303663654, + "grad_norm": 2.7399961948394775, + "learning_rate": 0.0027807594087335425, + "loss": 7.7057, + "step": 754300 + }, + { + "epoch": 3.073259808389747, + "grad_norm": 3.4263041019439697, + "learning_rate": 0.0027802663493977546, + "loss": 7.7328, + "step": 754400 + }, + { + "epoch": 3.073667186413128, + "grad_norm": 2.858315944671631, + "learning_rate": 0.0027797732791792202, + "loss": 7.7174, + "step": 754500 + }, + { + "epoch": 3.0740745644365095, + "grad_norm": 10.019489288330078, + "learning_rate": 0.002779280198097426, + "loss": 7.7021, + "step": 754600 + }, + { + "epoch": 3.074481942459891, + "grad_norm": 6.572674751281738, + "learning_rate": 0.002778787106171868, + "loss": 7.743, + "step": 754700 + }, + { + "epoch": 3.0748893204832726, + "grad_norm": 3.3612263202667236, + "learning_rate": 0.002778294003422041, + "loss": 7.7244, + "step": 754800 + }, + { + "epoch": 3.075296698506654, + "grad_norm": 2.9108200073242188, + "learning_rate": 0.002777800889867441, + "loss": 7.7064, + "step": 754900 + }, + { + "epoch": 3.0757040765300356, + "grad_norm": 7.32778787612915, + "learning_rate": 0.0027773077655275646, + "loss": 7.7206, + "step": 755000 + }, + { + "epoch": 3.0757040765300356, + "eval_MaskedAccuracy": 0.50512931439093, + "eval_loss": 1.6299889087677002, + "eval_runtime": 150.4053, + "eval_samples_per_second": 422.033, + "eval_steps_per_second": 1.649, + "step": 755000 + }, + { + "epoch": 3.0761114545534167, + "grad_norm": 4.767580509185791, + "learning_rate": 0.0027768146304219046, + "loss": 7.6982, + "step": 755100 + }, + { + "epoch": 3.0765188325767983, + "grad_norm": 3.6486380100250244, + "learning_rate": 0.002776321484569961, + "loss": 7.706, + "step": 755200 + }, + { + "epoch": 3.07692621060018, + "grad_norm": 4.942773342132568, + "learning_rate": 0.0027758283279912255, + "loss": 7.6904, + "step": 755300 + }, + { + "epoch": 3.0773335886235613, + "grad_norm": 3.760831832885742, + "learning_rate": 0.0027753351607051977, + "loss": 7.7028, + "step": 755400 + }, + { + "epoch": 3.077740966646943, + "grad_norm": 7.592773914337158, + "learning_rate": 0.0027748419827313734, + "loss": 7.7309, + "step": 755500 + }, + { + "epoch": 3.0781483446703244, + "grad_norm": 3.427999258041382, + "learning_rate": 0.0027743487940892543, + "loss": 7.7191, + "step": 755600 + }, + { + "epoch": 3.0785557226937055, + "grad_norm": 4.776392459869385, + "learning_rate": 0.002773855594798336, + "loss": 7.7126, + "step": 755700 + }, + { + "epoch": 3.078963100717087, + "grad_norm": 5.972330093383789, + "learning_rate": 0.00277336238487812, + "loss": 7.7021, + "step": 755800 + }, + { + "epoch": 3.0793704787404685, + "grad_norm": 6.197175025939941, + "learning_rate": 0.0027728691643481036, + "loss": 7.7335, + "step": 755900 + }, + { + "epoch": 3.07977785676385, + "grad_norm": 3.0436811447143555, + "learning_rate": 0.0027723759332277846, + "loss": 7.731, + "step": 756000 + }, + { + "epoch": 3.07977785676385, + "eval_MaskedAccuracy": 0.505950147114793, + "eval_loss": 1.6236408948898315, + "eval_runtime": 155.3747, + "eval_samples_per_second": 408.535, + "eval_steps_per_second": 1.596, + "step": 756000 + }, + { + "epoch": 3.0801852347872316, + "grad_norm": 6.094714641571045, + "learning_rate": 0.0027718826915366663, + "loss": 7.6971, + "step": 756100 + }, + { + "epoch": 3.080592612810613, + "grad_norm": 4.8946146965026855, + "learning_rate": 0.0027713894392942477, + "loss": 7.7232, + "step": 756200 + }, + { + "epoch": 3.0809999908339947, + "grad_norm": 4.392921447753906, + "learning_rate": 0.002770896176520028, + "loss": 7.7185, + "step": 756300 + }, + { + "epoch": 3.0814073688573758, + "grad_norm": 8.929402351379395, + "learning_rate": 0.0027704029032335088, + "loss": 7.7144, + "step": 756400 + }, + { + "epoch": 3.0818147468807573, + "grad_norm": 6.839146137237549, + "learning_rate": 0.0027699096194541977, + "loss": 7.7279, + "step": 756500 + }, + { + "epoch": 3.082222124904139, + "grad_norm": 8.299478530883789, + "learning_rate": 0.002769416325201593, + "loss": 7.7313, + "step": 756600 + }, + { + "epoch": 3.0826295029275204, + "grad_norm": 5.999608516693115, + "learning_rate": 0.002768923020495192, + "loss": 7.7132, + "step": 756700 + }, + { + "epoch": 3.083036880950902, + "grad_norm": 2.693417549133301, + "learning_rate": 0.002768429705354503, + "loss": 7.7067, + "step": 756800 + }, + { + "epoch": 3.0834442589742834, + "grad_norm": 4.526899814605713, + "learning_rate": 0.0027679363797990285, + "loss": 7.688, + "step": 756900 + }, + { + "epoch": 3.0838516369976645, + "grad_norm": 4.774698257446289, + "learning_rate": 0.002767443043848276, + "loss": 7.7004, + "step": 757000 + }, + { + "epoch": 3.0838516369976645, + "eval_MaskedAccuracy": 0.5057957761377743, + "eval_loss": 1.6153463125228882, + "eval_runtime": 154.1542, + "eval_samples_per_second": 411.769, + "eval_steps_per_second": 1.609, + "step": 757000 + }, + { + "epoch": 3.084259015021046, + "grad_norm": 3.5766282081604004, + "learning_rate": 0.002766949697521747, + "loss": 7.7101, + "step": 757100 + }, + { + "epoch": 3.0846663930444276, + "grad_norm": 6.593492031097412, + "learning_rate": 0.0027664563408389474, + "loss": 7.722, + "step": 757200 + }, + { + "epoch": 3.085073771067809, + "grad_norm": 3.907104015350342, + "learning_rate": 0.002765962973819378, + "loss": 7.696, + "step": 757300 + }, + { + "epoch": 3.0854811490911906, + "grad_norm": 2.727769136428833, + "learning_rate": 0.002765469596482544, + "loss": 7.6926, + "step": 757400 + }, + { + "epoch": 3.085888527114572, + "grad_norm": 5.050406455993652, + "learning_rate": 0.0027649762088479564, + "loss": 7.6885, + "step": 757500 + }, + { + "epoch": 3.0862959051379533, + "grad_norm": 3.762338876724243, + "learning_rate": 0.0027644828109351194, + "loss": 7.6738, + "step": 757600 + }, + { + "epoch": 3.086703283161335, + "grad_norm": 2.7442774772644043, + "learning_rate": 0.0027639894027635357, + "loss": 7.654, + "step": 757700 + }, + { + "epoch": 3.0871106611847163, + "grad_norm": 4.210478782653809, + "learning_rate": 0.002763495984352718, + "loss": 7.6858, + "step": 757800 + }, + { + "epoch": 3.087518039208098, + "grad_norm": 2.4720959663391113, + "learning_rate": 0.002763002555722172, + "loss": 7.713, + "step": 757900 + }, + { + "epoch": 3.0879254172314794, + "grad_norm": 7.692282199859619, + "learning_rate": 0.002762509116891405, + "loss": 7.7458, + "step": 758000 + }, + { + "epoch": 3.0879254172314794, + "eval_MaskedAccuracy": 0.5056275767802583, + "eval_loss": 1.6217570304870605, + "eval_runtime": 151.3128, + "eval_samples_per_second": 419.502, + "eval_steps_per_second": 1.639, + "step": 758000 + }, + { + "epoch": 3.088332795254861, + "grad_norm": 6.1380205154418945, + "learning_rate": 0.002762015667879925, + "loss": 7.6846, + "step": 758100 + }, + { + "epoch": 3.088740173278242, + "grad_norm": 4.44442081451416, + "learning_rate": 0.002761522208707242, + "loss": 7.7066, + "step": 758200 + }, + { + "epoch": 3.0891475513016236, + "grad_norm": 3.9880988597869873, + "learning_rate": 0.002761028739392862, + "loss": 7.6778, + "step": 758300 + }, + { + "epoch": 3.089554929325005, + "grad_norm": 4.732944965362549, + "learning_rate": 0.002760535259956298, + "loss": 7.6766, + "step": 758400 + }, + { + "epoch": 3.0899623073483866, + "grad_norm": 2.3220407962799072, + "learning_rate": 0.0027600417704170587, + "loss": 7.6579, + "step": 758500 + }, + { + "epoch": 3.090369685371768, + "grad_norm": 4.528346538543701, + "learning_rate": 0.0027595482707946546, + "loss": 7.7114, + "step": 758600 + }, + { + "epoch": 3.0907770633951497, + "grad_norm": 5.072365760803223, + "learning_rate": 0.002759054761108593, + "loss": 7.6787, + "step": 758700 + }, + { + "epoch": 3.091184441418531, + "grad_norm": 2.202188014984131, + "learning_rate": 0.0027585612413783887, + "loss": 7.7212, + "step": 758800 + }, + { + "epoch": 3.0915918194419123, + "grad_norm": 4.132675647735596, + "learning_rate": 0.0027580677116235534, + "loss": 7.6815, + "step": 758900 + }, + { + "epoch": 3.091999197465294, + "grad_norm": 4.466198921203613, + "learning_rate": 0.0027575741718635963, + "loss": 7.706, + "step": 759000 + }, + { + "epoch": 3.091999197465294, + "eval_MaskedAccuracy": 0.5057517451551324, + "eval_loss": 1.625289797782898, + "eval_runtime": 150.9548, + "eval_samples_per_second": 420.497, + "eval_steps_per_second": 1.643, + "step": 759000 + }, + { + "epoch": 3.0924065754886754, + "grad_norm": 8.609400749206543, + "learning_rate": 0.002757080622118032, + "loss": 7.7104, + "step": 759100 + }, + { + "epoch": 3.092813953512057, + "grad_norm": 3.13914155960083, + "learning_rate": 0.002756587062406375, + "loss": 7.7002, + "step": 759200 + }, + { + "epoch": 3.0932213315354384, + "grad_norm": 5.4335432052612305, + "learning_rate": 0.002756093492748131, + "loss": 7.7047, + "step": 759300 + }, + { + "epoch": 3.09362870955882, + "grad_norm": 8.296807289123535, + "learning_rate": 0.002755599913162822, + "loss": 7.7104, + "step": 759400 + }, + { + "epoch": 3.094036087582201, + "grad_norm": 6.492187976837158, + "learning_rate": 0.002755106323669959, + "loss": 7.707, + "step": 759500 + }, + { + "epoch": 3.0944434656055826, + "grad_norm": 3.22277569770813, + "learning_rate": 0.0027546127242890554, + "loss": 7.6715, + "step": 759600 + }, + { + "epoch": 3.094850843628964, + "grad_norm": 7.480574131011963, + "learning_rate": 0.002754119115039624, + "loss": 7.6811, + "step": 759700 + }, + { + "epoch": 3.0952582216523457, + "grad_norm": 3.214850664138794, + "learning_rate": 0.0027536254959411826, + "loss": 7.7066, + "step": 759800 + }, + { + "epoch": 3.095665599675727, + "grad_norm": 4.112349510192871, + "learning_rate": 0.002753131867013247, + "loss": 7.699, + "step": 759900 + }, + { + "epoch": 3.0960729776991087, + "grad_norm": 7.638454437255859, + "learning_rate": 0.0027526382282753274, + "loss": 7.7022, + "step": 760000 + }, + { + "epoch": 3.0960729776991087, + "eval_MaskedAccuracy": 0.5060889775801105, + "eval_loss": 1.6206738948822021, + "eval_runtime": 159.5708, + "eval_samples_per_second": 397.792, + "eval_steps_per_second": 1.554, + "step": 760000 + }, + { + "epoch": 3.09648035572249, + "grad_norm": 4.234745502471924, + "learning_rate": 0.002752144579746949, + "loss": 7.6631, + "step": 760100 + }, + { + "epoch": 3.0968877337458713, + "grad_norm": 4.058956623077393, + "learning_rate": 0.002751650921447622, + "loss": 7.6858, + "step": 760200 + }, + { + "epoch": 3.097295111769253, + "grad_norm": 2.646798610687256, + "learning_rate": 0.0027511572533968623, + "loss": 7.6926, + "step": 760300 + }, + { + "epoch": 3.0977024897926344, + "grad_norm": 6.737486839294434, + "learning_rate": 0.002750663575614189, + "loss": 7.7034, + "step": 760400 + }, + { + "epoch": 3.098109867816016, + "grad_norm": 5.023601531982422, + "learning_rate": 0.0027501698881191227, + "loss": 7.6799, + "step": 760500 + }, + { + "epoch": 3.0985172458393975, + "grad_norm": 5.718828201293945, + "learning_rate": 0.0027496761909311792, + "loss": 7.6983, + "step": 760600 + }, + { + "epoch": 3.0989246238627786, + "grad_norm": 7.084863185882568, + "learning_rate": 0.0027491824840698794, + "loss": 7.654, + "step": 760700 + }, + { + "epoch": 3.09933200188616, + "grad_norm": 7.82147741317749, + "learning_rate": 0.002748688767554738, + "loss": 7.6854, + "step": 760800 + }, + { + "epoch": 3.0997393799095416, + "grad_norm": 6.596715927124023, + "learning_rate": 0.0027481950414052743, + "loss": 7.6855, + "step": 760900 + }, + { + "epoch": 3.100146757932923, + "grad_norm": 8.626282691955566, + "learning_rate": 0.002747701305641013, + "loss": 7.726, + "step": 761000 + }, + { + "epoch": 3.100146757932923, + "eval_MaskedAccuracy": 0.5060363108688771, + "eval_loss": 1.615813136100769, + "eval_runtime": 157.5171, + "eval_samples_per_second": 402.978, + "eval_steps_per_second": 1.574, + "step": 761000 + }, + { + "epoch": 3.1005541359563047, + "grad_norm": 3.1481432914733887, + "learning_rate": 0.002747207560281467, + "loss": 7.6813, + "step": 761100 + }, + { + "epoch": 3.1009615139796862, + "grad_norm": 2.801621675491333, + "learning_rate": 0.002746713805346164, + "loss": 7.7187, + "step": 761200 + }, + { + "epoch": 3.1013688920030678, + "grad_norm": 10.537528038024902, + "learning_rate": 0.0027462200408546185, + "loss": 7.729, + "step": 761300 + }, + { + "epoch": 3.101776270026449, + "grad_norm": 3.829329013824463, + "learning_rate": 0.002745726266826354, + "loss": 7.6956, + "step": 761400 + }, + { + "epoch": 3.1021836480498304, + "grad_norm": 5.185670375823975, + "learning_rate": 0.002745232483280894, + "loss": 7.6825, + "step": 761500 + }, + { + "epoch": 3.102591026073212, + "grad_norm": 3.894953727722168, + "learning_rate": 0.0027447386902377615, + "loss": 7.6839, + "step": 761600 + }, + { + "epoch": 3.1029984040965934, + "grad_norm": 4.861565589904785, + "learning_rate": 0.002744244887716477, + "loss": 7.6802, + "step": 761700 + }, + { + "epoch": 3.103405782119975, + "grad_norm": 7.542404651641846, + "learning_rate": 0.002743751075736563, + "loss": 7.6918, + "step": 761800 + }, + { + "epoch": 3.1038131601433565, + "grad_norm": 5.748804092407227, + "learning_rate": 0.0027432572543175425, + "loss": 7.6957, + "step": 761900 + }, + { + "epoch": 3.1042205381667376, + "grad_norm": 5.699087142944336, + "learning_rate": 0.0027427634234789377, + "loss": 7.7039, + "step": 762000 + }, + { + "epoch": 3.1042205381667376, + "eval_MaskedAccuracy": 0.5066195956401219, + "eval_loss": 1.618896484375, + "eval_runtime": 151.8099, + "eval_samples_per_second": 418.128, + "eval_steps_per_second": 1.634, + "step": 762000 + }, + { + "epoch": 3.104627916190119, + "grad_norm": 4.970003604888916, + "learning_rate": 0.002742269583240274, + "loss": 7.6822, + "step": 762100 + }, + { + "epoch": 3.1050352942135007, + "grad_norm": 10.687162399291992, + "learning_rate": 0.0027417757336210733, + "loss": 7.6709, + "step": 762200 + }, + { + "epoch": 3.105442672236882, + "grad_norm": 6.89406681060791, + "learning_rate": 0.0027412818746408592, + "loss": 7.6976, + "step": 762300 + }, + { + "epoch": 3.1058500502602637, + "grad_norm": 5.004439353942871, + "learning_rate": 0.002740788006319161, + "loss": 7.6877, + "step": 762400 + }, + { + "epoch": 3.1062574282836453, + "grad_norm": 7.998891830444336, + "learning_rate": 0.002740294128675499, + "loss": 7.647, + "step": 762500 + }, + { + "epoch": 3.1066648063070263, + "grad_norm": 6.7538161277771, + "learning_rate": 0.002739800241729404, + "loss": 7.6801, + "step": 762600 + }, + { + "epoch": 3.107072184330408, + "grad_norm": 8.487037658691406, + "learning_rate": 0.002739306345500401, + "loss": 7.7004, + "step": 762700 + }, + { + "epoch": 3.1074795623537894, + "grad_norm": 3.9993841648101807, + "learning_rate": 0.0027388124400080137, + "loss": 7.6975, + "step": 762800 + }, + { + "epoch": 3.107886940377171, + "grad_norm": 4.43471097946167, + "learning_rate": 0.0027383185252717714, + "loss": 7.711, + "step": 762900 + }, + { + "epoch": 3.1082943184005525, + "grad_norm": 7.731508731842041, + "learning_rate": 0.0027378246013111964, + "loss": 7.6932, + "step": 763000 + }, + { + "epoch": 3.1082943184005525, + "eval_MaskedAccuracy": 0.5056539368572125, + "eval_loss": 1.6123262643814087, + "eval_runtime": 149.35, + "eval_samples_per_second": 425.015, + "eval_steps_per_second": 1.661, + "step": 763000 + }, + { + "epoch": 3.108701696423934, + "grad_norm": 9.151464462280273, + "learning_rate": 0.00273733066814582, + "loss": 7.6852, + "step": 763100 + }, + { + "epoch": 3.109109074447315, + "grad_norm": 3.820744752883911, + "learning_rate": 0.0027368367257951686, + "loss": 7.7124, + "step": 763200 + }, + { + "epoch": 3.1095164524706966, + "grad_norm": 4.586390018463135, + "learning_rate": 0.0027363427742787745, + "loss": 7.6838, + "step": 763300 + }, + { + "epoch": 3.109923830494078, + "grad_norm": 6.306432247161865, + "learning_rate": 0.002735848813616161, + "loss": 7.7275, + "step": 763400 + }, + { + "epoch": 3.1103312085174597, + "grad_norm": 2.7275564670562744, + "learning_rate": 0.0027353548438268597, + "loss": 7.7084, + "step": 763500 + }, + { + "epoch": 3.1107385865408412, + "grad_norm": 4.9357991218566895, + "learning_rate": 0.002734860864930402, + "loss": 7.7172, + "step": 763600 + }, + { + "epoch": 3.1111459645642228, + "grad_norm": 12.564998626708984, + "learning_rate": 0.002734366876946315, + "loss": 7.7076, + "step": 763700 + }, + { + "epoch": 3.1115533425876043, + "grad_norm": 5.375897407531738, + "learning_rate": 0.002733872879894126, + "loss": 7.6727, + "step": 763800 + }, + { + "epoch": 3.1119607206109854, + "grad_norm": 3.797677755355835, + "learning_rate": 0.0027333788737933682, + "loss": 7.6872, + "step": 763900 + }, + { + "epoch": 3.112368098634367, + "grad_norm": 5.234559059143066, + "learning_rate": 0.0027328848586635744, + "loss": 7.6786, + "step": 764000 + }, + { + "epoch": 3.112368098634367, + "eval_MaskedAccuracy": 0.5064838590018905, + "eval_loss": 1.6195224523544312, + "eval_runtime": 148.4772, + "eval_samples_per_second": 427.514, + "eval_steps_per_second": 1.67, + "step": 764000 + }, + { + "epoch": 3.1127754766577485, + "grad_norm": 5.013713359832764, + "learning_rate": 0.002732390834524275, + "loss": 7.7024, + "step": 764100 + }, + { + "epoch": 3.11318285468113, + "grad_norm": 6.045117378234863, + "learning_rate": 0.002731896801394997, + "loss": 7.7021, + "step": 764200 + }, + { + "epoch": 3.1135902327045115, + "grad_norm": 3.4436819553375244, + "learning_rate": 0.0027314027592952733, + "loss": 7.7039, + "step": 764300 + }, + { + "epoch": 3.113997610727893, + "grad_norm": 6.984184741973877, + "learning_rate": 0.00273090870824464, + "loss": 7.6958, + "step": 764400 + }, + { + "epoch": 3.114404988751274, + "grad_norm": 2.6970698833465576, + "learning_rate": 0.0027304146482626263, + "loss": 7.6822, + "step": 764500 + }, + { + "epoch": 3.1148123667746557, + "grad_norm": 8.229776382446289, + "learning_rate": 0.002729920579368766, + "loss": 7.7039, + "step": 764600 + }, + { + "epoch": 3.115219744798037, + "grad_norm": 5.920711994171143, + "learning_rate": 0.0027294265015825932, + "loss": 7.698, + "step": 764700 + }, + { + "epoch": 3.1156271228214187, + "grad_norm": 3.3548264503479004, + "learning_rate": 0.0027289324149236405, + "loss": 7.6951, + "step": 764800 + }, + { + "epoch": 3.1160345008448003, + "grad_norm": 2.312197685241699, + "learning_rate": 0.0027284383194114427, + "loss": 7.6999, + "step": 764900 + }, + { + "epoch": 3.116441878868182, + "grad_norm": 3.858964681625366, + "learning_rate": 0.0027279442150655296, + "loss": 7.6669, + "step": 765000 + }, + { + "epoch": 3.116441878868182, + "eval_MaskedAccuracy": 0.5051968226329966, + "eval_loss": 1.6254647970199585, + "eval_runtime": 150.8261, + "eval_samples_per_second": 420.856, + "eval_steps_per_second": 1.644, + "step": 765000 + }, + { + "epoch": 3.116849256891563, + "grad_norm": 11.507035255432129, + "learning_rate": 0.0027274501019054413, + "loss": 7.7111, + "step": 765100 + }, + { + "epoch": 3.1172566349149444, + "grad_norm": 3.928722858428955, + "learning_rate": 0.00272695597995071, + "loss": 7.6865, + "step": 765200 + }, + { + "epoch": 3.117664012938326, + "grad_norm": 4.361135005950928, + "learning_rate": 0.002726461849220871, + "loss": 7.6899, + "step": 765300 + }, + { + "epoch": 3.1180713909617075, + "grad_norm": 7.6984357833862305, + "learning_rate": 0.002725967709735463, + "loss": 7.7138, + "step": 765400 + }, + { + "epoch": 3.118478768985089, + "grad_norm": 4.049012660980225, + "learning_rate": 0.002725473561514019, + "loss": 7.6653, + "step": 765500 + }, + { + "epoch": 3.1188861470084706, + "grad_norm": 5.440489292144775, + "learning_rate": 0.0027249794045760764, + "loss": 7.6729, + "step": 765600 + }, + { + "epoch": 3.1192935250318516, + "grad_norm": 6.268097400665283, + "learning_rate": 0.0027244852389411707, + "loss": 7.7095, + "step": 765700 + }, + { + "epoch": 3.119700903055233, + "grad_norm": 5.593329429626465, + "learning_rate": 0.0027239910646288373, + "loss": 7.7053, + "step": 765800 + }, + { + "epoch": 3.1201082810786147, + "grad_norm": 5.340170383453369, + "learning_rate": 0.0027234968816586174, + "loss": 7.6662, + "step": 765900 + }, + { + "epoch": 3.1205156591019962, + "grad_norm": 4.175409317016602, + "learning_rate": 0.002723002690050049, + "loss": 7.7263, + "step": 766000 + }, + { + "epoch": 3.1205156591019962, + "eval_MaskedAccuracy": 0.5056226058133293, + "eval_loss": 1.6213122606277466, + "eval_runtime": 151.8872, + "eval_samples_per_second": 417.915, + "eval_steps_per_second": 1.633, + "step": 766000 + }, + { + "epoch": 3.1209230371253778, + "grad_norm": 10.50373363494873, + "learning_rate": 0.0027225084898226683, + "loss": 7.7059, + "step": 766100 + }, + { + "epoch": 3.1213304151487593, + "grad_norm": 3.3283989429473877, + "learning_rate": 0.0027220142809960125, + "loss": 7.6871, + "step": 766200 + }, + { + "epoch": 3.121737793172141, + "grad_norm": 6.762508392333984, + "learning_rate": 0.002721520063589619, + "loss": 7.6919, + "step": 766300 + }, + { + "epoch": 3.122145171195522, + "grad_norm": 4.057222843170166, + "learning_rate": 0.0027210258376230314, + "loss": 7.7004, + "step": 766400 + }, + { + "epoch": 3.1225525492189035, + "grad_norm": 5.02310848236084, + "learning_rate": 0.0027205316031157895, + "loss": 7.6696, + "step": 766500 + }, + { + "epoch": 3.122959927242285, + "grad_norm": 4.003963947296143, + "learning_rate": 0.0027200373600874284, + "loss": 7.7151, + "step": 766600 + }, + { + "epoch": 3.1233673052656665, + "grad_norm": 4.325287818908691, + "learning_rate": 0.00271954310855749, + "loss": 7.6745, + "step": 766700 + }, + { + "epoch": 3.123774683289048, + "grad_norm": 3.2648262977600098, + "learning_rate": 0.0027190488485455146, + "loss": 7.714, + "step": 766800 + }, + { + "epoch": 3.1241820613124296, + "grad_norm": 7.260473728179932, + "learning_rate": 0.0027185545800710443, + "loss": 7.7021, + "step": 766900 + }, + { + "epoch": 3.1245894393358107, + "grad_norm": 3.1286821365356445, + "learning_rate": 0.002718060303153616, + "loss": 7.6875, + "step": 767000 + }, + { + "epoch": 3.1245894393358107, + "eval_MaskedAccuracy": 0.5059939406712075, + "eval_loss": 1.6196115016937256, + "eval_runtime": 155.225, + "eval_samples_per_second": 408.929, + "eval_steps_per_second": 1.598, + "step": 767000 + }, + { + "epoch": 3.124996817359192, + "grad_norm": 9.294156074523926, + "learning_rate": 0.0027175660178127767, + "loss": 7.7038, + "step": 767100 + }, + { + "epoch": 3.1254041953825737, + "grad_norm": 6.32208776473999, + "learning_rate": 0.002717071724068067, + "loss": 7.6961, + "step": 767200 + }, + { + "epoch": 3.1258115734059553, + "grad_norm": 3.045856237411499, + "learning_rate": 0.002716577421939028, + "loss": 7.6998, + "step": 767300 + }, + { + "epoch": 3.126218951429337, + "grad_norm": 3.8133158683776855, + "learning_rate": 0.0027160831114452, + "loss": 7.6928, + "step": 767400 + }, + { + "epoch": 3.1266263294527183, + "grad_norm": 3.6581594944000244, + "learning_rate": 0.002715588792606129, + "loss": 7.6863, + "step": 767500 + }, + { + "epoch": 3.1270337074760994, + "grad_norm": 5.351816654205322, + "learning_rate": 0.0027150944654413573, + "loss": 7.6866, + "step": 767600 + }, + { + "epoch": 3.127441085499481, + "grad_norm": 6.225848197937012, + "learning_rate": 0.0027146001299704265, + "loss": 7.7126, + "step": 767700 + }, + { + "epoch": 3.1278484635228625, + "grad_norm": 3.376725912094116, + "learning_rate": 0.002714105786212884, + "loss": 7.7209, + "step": 767800 + }, + { + "epoch": 3.128255841546244, + "grad_norm": 2.856078863143921, + "learning_rate": 0.002713611434188269, + "loss": 7.7319, + "step": 767900 + }, + { + "epoch": 3.1286632195696256, + "grad_norm": 3.4593234062194824, + "learning_rate": 0.0027131170739161264, + "loss": 7.6623, + "step": 768000 + }, + { + "epoch": 3.1286632195696256, + "eval_MaskedAccuracy": 0.5066317843810505, + "eval_loss": 1.6134639978408813, + "eval_runtime": 353.3584, + "eval_samples_per_second": 179.636, + "eval_steps_per_second": 0.702, + "step": 768000 + }, + { + "epoch": 3.129070597593007, + "grad_norm": 4.538331031799316, + "learning_rate": 0.0027126227054160056, + "loss": 7.7282, + "step": 768100 + }, + { + "epoch": 3.129477975616388, + "grad_norm": 3.409496784210205, + "learning_rate": 0.00271212832870745, + "loss": 7.7104, + "step": 768200 + }, + { + "epoch": 3.1298853536397697, + "grad_norm": 2.624006509780884, + "learning_rate": 0.0027116339438100017, + "loss": 7.6978, + "step": 768300 + }, + { + "epoch": 3.1302927316631513, + "grad_norm": 5.233662128448486, + "learning_rate": 0.002711139550743207, + "loss": 7.6748, + "step": 768400 + }, + { + "epoch": 3.130700109686533, + "grad_norm": 4.042091369628906, + "learning_rate": 0.002710645149526616, + "loss": 7.6943, + "step": 768500 + }, + { + "epoch": 3.1311074877099143, + "grad_norm": 4.5074143409729, + "learning_rate": 0.002710150740179772, + "loss": 7.6773, + "step": 768600 + }, + { + "epoch": 3.131514865733296, + "grad_norm": 3.0886244773864746, + "learning_rate": 0.00270965632272222, + "loss": 7.7103, + "step": 768700 + }, + { + "epoch": 3.1319222437566774, + "grad_norm": 8.190417289733887, + "learning_rate": 0.0027091618971735124, + "loss": 7.6885, + "step": 768800 + }, + { + "epoch": 3.1323296217800585, + "grad_norm": 5.727116584777832, + "learning_rate": 0.0027086674635531933, + "loss": 7.6754, + "step": 768900 + }, + { + "epoch": 3.13273699980344, + "grad_norm": 7.181107521057129, + "learning_rate": 0.0027081730218808096, + "loss": 7.6879, + "step": 769000 + }, + { + "epoch": 3.13273699980344, + "eval_MaskedAccuracy": 0.5056934334928747, + "eval_loss": 1.6126961708068848, + "eval_runtime": 157.6959, + "eval_samples_per_second": 402.521, + "eval_steps_per_second": 1.573, + "step": 769000 + }, + { + "epoch": 3.1331443778268215, + "grad_norm": 4.570808410644531, + "learning_rate": 0.0027076785721759067, + "loss": 7.6934, + "step": 769100 + }, + { + "epoch": 3.133551755850203, + "grad_norm": 3.8933639526367188, + "learning_rate": 0.0027071841144580377, + "loss": 7.6887, + "step": 769200 + }, + { + "epoch": 3.1339591338735846, + "grad_norm": 6.667581081390381, + "learning_rate": 0.0027066896487467486, + "loss": 7.6811, + "step": 769300 + }, + { + "epoch": 3.134366511896966, + "grad_norm": 3.5448014736175537, + "learning_rate": 0.00270619517506159, + "loss": 7.7026, + "step": 769400 + }, + { + "epoch": 3.1347738899203472, + "grad_norm": 4.684604644775391, + "learning_rate": 0.002705700693422108, + "loss": 7.6957, + "step": 769500 + }, + { + "epoch": 3.1351812679437288, + "grad_norm": 4.270644187927246, + "learning_rate": 0.002705206203847856, + "loss": 7.7039, + "step": 769600 + }, + { + "epoch": 3.1355886459671103, + "grad_norm": 5.729297161102295, + "learning_rate": 0.002704711706358382, + "loss": 7.7222, + "step": 769700 + }, + { + "epoch": 3.135996023990492, + "grad_norm": 4.039698123931885, + "learning_rate": 0.0027042172009732347, + "loss": 7.7137, + "step": 769800 + }, + { + "epoch": 3.1364034020138734, + "grad_norm": 4.518555641174316, + "learning_rate": 0.002703722687711965, + "loss": 7.7264, + "step": 769900 + }, + { + "epoch": 3.136810780037255, + "grad_norm": 4.014138698577881, + "learning_rate": 0.002703228166594126, + "loss": 7.682, + "step": 770000 + }, + { + "epoch": 3.136810780037255, + "eval_MaskedAccuracy": 0.5060256769241583, + "eval_loss": 1.619013786315918, + "eval_runtime": 154.8899, + "eval_samples_per_second": 409.814, + "eval_steps_per_second": 1.601, + "step": 770000 + }, + { + "epoch": 3.137218158060636, + "grad_norm": 5.116852283477783, + "learning_rate": 0.0027027336376392696, + "loss": 7.6818, + "step": 770100 + }, + { + "epoch": 3.1376255360840175, + "grad_norm": 3.505202293395996, + "learning_rate": 0.002702239100866942, + "loss": 7.682, + "step": 770200 + }, + { + "epoch": 3.138032914107399, + "grad_norm": 3.6708834171295166, + "learning_rate": 0.002701744556296698, + "loss": 7.6966, + "step": 770300 + }, + { + "epoch": 3.1384402921307806, + "grad_norm": 4.903164386749268, + "learning_rate": 0.002701250003948092, + "loss": 7.7207, + "step": 770400 + }, + { + "epoch": 3.138847670154162, + "grad_norm": 3.7474172115325928, + "learning_rate": 0.0027007554438406704, + "loss": 7.6756, + "step": 770500 + }, + { + "epoch": 3.1392550481775436, + "grad_norm": 3.6070735454559326, + "learning_rate": 0.0027002608759939912, + "loss": 7.6811, + "step": 770600 + }, + { + "epoch": 3.1396624262009247, + "grad_norm": 2.534743547439575, + "learning_rate": 0.002699766300427606, + "loss": 7.6946, + "step": 770700 + }, + { + "epoch": 3.1400698042243063, + "grad_norm": 6.393507480621338, + "learning_rate": 0.002699271717161066, + "loss": 7.7245, + "step": 770800 + }, + { + "epoch": 3.140477182247688, + "grad_norm": 6.294739723205566, + "learning_rate": 0.0026987771262139255, + "loss": 7.6895, + "step": 770900 + }, + { + "epoch": 3.1408845602710693, + "grad_norm": 3.527824640274048, + "learning_rate": 0.0026982825276057397, + "loss": 7.68, + "step": 771000 + }, + { + "epoch": 3.1408845602710693, + "eval_MaskedAccuracy": 0.5066774540976102, + "eval_loss": 1.6144894361495972, + "eval_runtime": 152.2406, + "eval_samples_per_second": 416.945, + "eval_steps_per_second": 1.629, + "step": 771000 + }, + { + "epoch": 3.141291938294451, + "grad_norm": 6.735122203826904, + "learning_rate": 0.0026977879213560636, + "loss": 7.7008, + "step": 771100 + }, + { + "epoch": 3.1416993163178324, + "grad_norm": 4.2013115882873535, + "learning_rate": 0.00269729330748445, + "loss": 7.7151, + "step": 771200 + }, + { + "epoch": 3.142106694341214, + "grad_norm": 9.52634334564209, + "learning_rate": 0.0026967986860104514, + "loss": 7.6884, + "step": 771300 + }, + { + "epoch": 3.142514072364595, + "grad_norm": 7.722685813903809, + "learning_rate": 0.0026963040569536246, + "loss": 7.6874, + "step": 771400 + }, + { + "epoch": 3.1429214503879765, + "grad_norm": 3.3890602588653564, + "learning_rate": 0.002695809420333526, + "loss": 7.6941, + "step": 771500 + }, + { + "epoch": 3.143328828411358, + "grad_norm": 6.16286563873291, + "learning_rate": 0.0026953147761697113, + "loss": 7.7346, + "step": 771600 + }, + { + "epoch": 3.1437362064347396, + "grad_norm": 4.1274590492248535, + "learning_rate": 0.002694820124481733, + "loss": 7.7027, + "step": 771700 + }, + { + "epoch": 3.144143584458121, + "grad_norm": 7.944971561431885, + "learning_rate": 0.002694325465289152, + "loss": 7.7287, + "step": 771800 + }, + { + "epoch": 3.1445509624815027, + "grad_norm": 4.1289873123168945, + "learning_rate": 0.00269383079861152, + "loss": 7.7105, + "step": 771900 + }, + { + "epoch": 3.1449583405048838, + "grad_norm": 4.869955539703369, + "learning_rate": 0.0026933361244684028, + "loss": 7.6976, + "step": 772000 + }, + { + "epoch": 3.1449583405048838, + "eval_MaskedAccuracy": 0.5060203453393523, + "eval_loss": 1.620253324508667, + "eval_runtime": 149.7289, + "eval_samples_per_second": 423.939, + "eval_steps_per_second": 1.656, + "step": 772000 + }, + { + "epoch": 3.1453657185282653, + "grad_norm": 4.264239311218262, + "learning_rate": 0.0026928414428793474, + "loss": 7.6762, + "step": 772100 + }, + { + "epoch": 3.145773096551647, + "grad_norm": 7.852622985839844, + "learning_rate": 0.002692346753863915, + "loss": 7.6972, + "step": 772200 + }, + { + "epoch": 3.1461804745750284, + "grad_norm": 10.748968124389648, + "learning_rate": 0.0026918520574416615, + "loss": 7.6776, + "step": 772300 + }, + { + "epoch": 3.14658785259841, + "grad_norm": 7.000277042388916, + "learning_rate": 0.002691357353632149, + "loss": 7.7146, + "step": 772400 + }, + { + "epoch": 3.1469952306217914, + "grad_norm": 8.364676475524902, + "learning_rate": 0.0026908626424549365, + "loss": 7.7151, + "step": 772500 + }, + { + "epoch": 3.1474026086451725, + "grad_norm": 9.26862621307373, + "learning_rate": 0.0026903679239295773, + "loss": 7.6791, + "step": 772600 + }, + { + "epoch": 3.147809986668554, + "grad_norm": 10.48083782196045, + "learning_rate": 0.0026898731980756316, + "loss": 7.6928, + "step": 772700 + }, + { + "epoch": 3.1482173646919356, + "grad_norm": 4.311800479888916, + "learning_rate": 0.002689378464912659, + "loss": 7.7025, + "step": 772800 + }, + { + "epoch": 3.148624742715317, + "grad_norm": 12.309150695800781, + "learning_rate": 0.002688883724460219, + "loss": 7.6861, + "step": 772900 + }, + { + "epoch": 3.1490321207386986, + "grad_norm": 7.959762096405029, + "learning_rate": 0.0026883889767378728, + "loss": 7.6916, + "step": 773000 + }, + { + "epoch": 3.1490321207386986, + "eval_MaskedAccuracy": 0.5059848410346622, + "eval_loss": 1.614486575126648, + "eval_runtime": 153.1919, + "eval_samples_per_second": 414.356, + "eval_steps_per_second": 1.619, + "step": 773000 + }, + { + "epoch": 3.14943949876208, + "grad_norm": 8.189927101135254, + "learning_rate": 0.0026878942217651793, + "loss": 7.6946, + "step": 773100 + }, + { + "epoch": 3.1498468767854613, + "grad_norm": 3.512836456298828, + "learning_rate": 0.002687399459561704, + "loss": 7.6772, + "step": 773200 + }, + { + "epoch": 3.150254254808843, + "grad_norm": 4.427940845489502, + "learning_rate": 0.0026869046901469977, + "loss": 7.7165, + "step": 773300 + }, + { + "epoch": 3.1506616328322243, + "grad_norm": 13.9074125289917, + "learning_rate": 0.002686409913540624, + "loss": 7.6802, + "step": 773400 + }, + { + "epoch": 3.151069010855606, + "grad_norm": 4.601502895355225, + "learning_rate": 0.0026859151297621477, + "loss": 7.6944, + "step": 773500 + }, + { + "epoch": 3.1514763888789874, + "grad_norm": 4.3032732009887695, + "learning_rate": 0.00268542033883113, + "loss": 7.7127, + "step": 773600 + }, + { + "epoch": 3.151883766902369, + "grad_norm": 11.035786628723145, + "learning_rate": 0.0026849255407671332, + "loss": 7.7172, + "step": 773700 + }, + { + "epoch": 3.1522911449257505, + "grad_norm": 5.978248596191406, + "learning_rate": 0.0026844307355897152, + "loss": 7.6879, + "step": 773800 + }, + { + "epoch": 3.1526985229491316, + "grad_norm": 3.032963514328003, + "learning_rate": 0.0026839359233184405, + "loss": 7.6867, + "step": 773900 + }, + { + "epoch": 3.153105900972513, + "grad_norm": 11.734149932861328, + "learning_rate": 0.002683441103972874, + "loss": 7.7394, + "step": 774000 + }, + { + "epoch": 3.153105900972513, + "eval_MaskedAccuracy": 0.5058565238362183, + "eval_loss": 1.614915132522583, + "eval_runtime": 148.9732, + "eval_samples_per_second": 426.09, + "eval_steps_per_second": 1.665, + "step": 774000 + }, + { + "epoch": 3.1535132789958946, + "grad_norm": 3.2387919425964355, + "learning_rate": 0.0026829462775725737, + "loss": 7.6518, + "step": 774100 + }, + { + "epoch": 3.153920657019276, + "grad_norm": 11.412351608276367, + "learning_rate": 0.0026824514441371084, + "loss": 7.6718, + "step": 774200 + }, + { + "epoch": 3.1543280350426577, + "grad_norm": 2.723578929901123, + "learning_rate": 0.0026819566036860366, + "loss": 7.698, + "step": 774300 + }, + { + "epoch": 3.154735413066039, + "grad_norm": 4.925586700439453, + "learning_rate": 0.002681461756238926, + "loss": 7.6967, + "step": 774400 + }, + { + "epoch": 3.1551427910894203, + "grad_norm": 3.7491700649261475, + "learning_rate": 0.0026809669018153385, + "loss": 7.6531, + "step": 774500 + }, + { + "epoch": 3.155550169112802, + "grad_norm": 2.9871318340301514, + "learning_rate": 0.002680472040434838, + "loss": 7.704, + "step": 774600 + }, + { + "epoch": 3.1559575471361834, + "grad_norm": 2.868842840194702, + "learning_rate": 0.002679977172116985, + "loss": 7.7028, + "step": 774700 + }, + { + "epoch": 3.156364925159565, + "grad_norm": 4.306030750274658, + "learning_rate": 0.002679482296881351, + "loss": 7.6876, + "step": 774800 + }, + { + "epoch": 3.1567723031829464, + "grad_norm": 9.307113647460938, + "learning_rate": 0.002678987414747501, + "loss": 7.6928, + "step": 774900 + }, + { + "epoch": 3.157179681206328, + "grad_norm": 4.642914772033691, + "learning_rate": 0.0026784925257349944, + "loss": 7.7102, + "step": 775000 + }, + { + "epoch": 3.157179681206328, + "eval_MaskedAccuracy": 0.5066560003822872, + "eval_loss": 1.6202186346054077, + "eval_runtime": 149.5397, + "eval_samples_per_second": 424.476, + "eval_steps_per_second": 1.658, + "step": 775000 + }, + { + "epoch": 3.157587059229709, + "grad_norm": 8.615238189697266, + "learning_rate": 0.0026779976298634024, + "loss": 7.6996, + "step": 775100 + }, + { + "epoch": 3.1579944372530906, + "grad_norm": 4.410147190093994, + "learning_rate": 0.0026775027271522906, + "loss": 7.7021, + "step": 775200 + }, + { + "epoch": 3.158401815276472, + "grad_norm": 8.252384185791016, + "learning_rate": 0.0026770078176212193, + "loss": 7.6904, + "step": 775300 + }, + { + "epoch": 3.1588091932998537, + "grad_norm": 3.639887571334839, + "learning_rate": 0.0026765129012897592, + "loss": 7.6941, + "step": 775400 + }, + { + "epoch": 3.159216571323235, + "grad_norm": 3.2815327644348145, + "learning_rate": 0.0026760179781774736, + "loss": 7.7047, + "step": 775500 + }, + { + "epoch": 3.1596239493466167, + "grad_norm": 7.7857818603515625, + "learning_rate": 0.002675523048303935, + "loss": 7.6927, + "step": 775600 + }, + { + "epoch": 3.160031327369998, + "grad_norm": 5.169656753540039, + "learning_rate": 0.0026750281116887085, + "loss": 7.6722, + "step": 775700 + }, + { + "epoch": 3.1604387053933793, + "grad_norm": 3.543030023574829, + "learning_rate": 0.0026745331683513626, + "loss": 7.7149, + "step": 775800 + }, + { + "epoch": 3.160846083416761, + "grad_norm": 4.759836673736572, + "learning_rate": 0.0026740382183114615, + "loss": 7.7005, + "step": 775900 + }, + { + "epoch": 3.1612534614401424, + "grad_norm": 4.95238733291626, + "learning_rate": 0.0026735432615885776, + "loss": 7.6842, + "step": 776000 + }, + { + "epoch": 3.1612534614401424, + "eval_MaskedAccuracy": 0.5044781518966327, + "eval_loss": 1.6273850202560425, + "eval_runtime": 149.6541, + "eval_samples_per_second": 424.151, + "eval_steps_per_second": 1.657, + "step": 776000 + }, + { + "epoch": 3.161660839463524, + "grad_norm": 7.366076946258545, + "learning_rate": 0.002673048298202278, + "loss": 7.6985, + "step": 776100 + }, + { + "epoch": 3.1620682174869055, + "grad_norm": 3.573741912841797, + "learning_rate": 0.002672553328172129, + "loss": 7.692, + "step": 776200 + }, + { + "epoch": 3.162475595510287, + "grad_norm": 6.183897972106934, + "learning_rate": 0.0026720583515177014, + "loss": 7.6938, + "step": 776300 + }, + { + "epoch": 3.162882973533668, + "grad_norm": 5.623685359954834, + "learning_rate": 0.002671563368258564, + "loss": 7.6706, + "step": 776400 + }, + { + "epoch": 3.1632903515570496, + "grad_norm": 3.4732744693756104, + "learning_rate": 0.002671068378414285, + "loss": 7.6934, + "step": 776500 + }, + { + "epoch": 3.163697729580431, + "grad_norm": 3.870776891708374, + "learning_rate": 0.002670573382004436, + "loss": 7.6801, + "step": 776600 + }, + { + "epoch": 3.1641051076038127, + "grad_norm": 9.050191879272461, + "learning_rate": 0.002670078379048585, + "loss": 7.7044, + "step": 776700 + }, + { + "epoch": 3.1645124856271942, + "grad_norm": 5.239398956298828, + "learning_rate": 0.0026695833695663043, + "loss": 7.6659, + "step": 776800 + }, + { + "epoch": 3.1649198636505758, + "grad_norm": 3.975712299346924, + "learning_rate": 0.0026690883535771637, + "loss": 7.7076, + "step": 776900 + }, + { + "epoch": 3.165327241673957, + "grad_norm": 4.15570068359375, + "learning_rate": 0.0026685933311007314, + "loss": 7.6763, + "step": 777000 + }, + { + "epoch": 3.165327241673957, + "eval_MaskedAccuracy": 0.5066619214389548, + "eval_loss": 1.6140919923782349, + "eval_runtime": 160.6162, + "eval_samples_per_second": 395.203, + "eval_steps_per_second": 1.544, + "step": 777000 + }, + { + "epoch": 3.1657346196973384, + "grad_norm": 5.735141754150391, + "learning_rate": 0.0026680983021565804, + "loss": 7.7132, + "step": 777100 + }, + { + "epoch": 3.16614199772072, + "grad_norm": 3.445868730545044, + "learning_rate": 0.0026676032667642792, + "loss": 7.6667, + "step": 777200 + }, + { + "epoch": 3.1665493757441014, + "grad_norm": 3.7501468658447266, + "learning_rate": 0.0026671082249434025, + "loss": 7.6929, + "step": 777300 + }, + { + "epoch": 3.166956753767483, + "grad_norm": 8.324304580688477, + "learning_rate": 0.0026666131767135214, + "loss": 7.6802, + "step": 777400 + }, + { + "epoch": 3.1673641317908645, + "grad_norm": 5.689042568206787, + "learning_rate": 0.0026661181220942053, + "loss": 7.7014, + "step": 777500 + }, + { + "epoch": 3.1677715098142456, + "grad_norm": 6.338901996612549, + "learning_rate": 0.00266562306110503, + "loss": 7.6994, + "step": 777600 + }, + { + "epoch": 3.168178887837627, + "grad_norm": 7.056424617767334, + "learning_rate": 0.0026651279937655644, + "loss": 7.6646, + "step": 777700 + }, + { + "epoch": 3.1685862658610087, + "grad_norm": 6.031956672668457, + "learning_rate": 0.002664632920095385, + "loss": 7.6769, + "step": 777800 + }, + { + "epoch": 3.16899364388439, + "grad_norm": 3.548583984375, + "learning_rate": 0.0026641378401140612, + "loss": 7.6747, + "step": 777900 + }, + { + "epoch": 3.1694010219077717, + "grad_norm": 3.889172315597534, + "learning_rate": 0.0026636427538411643, + "loss": 7.6888, + "step": 778000 + }, + { + "epoch": 3.1694010219077717, + "eval_MaskedAccuracy": 0.5056879036485077, + "eval_loss": 1.6238088607788086, + "eval_runtime": 154.305, + "eval_samples_per_second": 411.367, + "eval_steps_per_second": 1.607, + "step": 778000 + }, + { + "epoch": 3.1698083999311533, + "grad_norm": 6.30963134765625, + "learning_rate": 0.0026631476612962745, + "loss": 7.6877, + "step": 778100 + }, + { + "epoch": 3.1702157779545344, + "grad_norm": 2.9132330417633057, + "learning_rate": 0.0026626525624989616, + "loss": 7.69, + "step": 778200 + }, + { + "epoch": 3.170623155977916, + "grad_norm": 5.352282524108887, + "learning_rate": 0.0026621574574687986, + "loss": 7.6722, + "step": 778300 + }, + { + "epoch": 3.1710305340012974, + "grad_norm": 5.92992639541626, + "learning_rate": 0.0026616623462253626, + "loss": 7.6795, + "step": 778400 + }, + { + "epoch": 3.171437912024679, + "grad_norm": 5.393660545349121, + "learning_rate": 0.0026611672287882235, + "loss": 7.6734, + "step": 778500 + }, + { + "epoch": 3.1718452900480605, + "grad_norm": 5.707131385803223, + "learning_rate": 0.002660672105176957, + "loss": 7.7077, + "step": 778600 + }, + { + "epoch": 3.172252668071442, + "grad_norm": 3.5185000896453857, + "learning_rate": 0.0026601769754111392, + "loss": 7.6781, + "step": 778700 + }, + { + "epoch": 3.1726600460948235, + "grad_norm": 2.8914966583251953, + "learning_rate": 0.0026596818395103468, + "loss": 7.7267, + "step": 778800 + }, + { + "epoch": 3.1730674241182046, + "grad_norm": 2.5010547637939453, + "learning_rate": 0.0026591866974941558, + "loss": 7.6956, + "step": 778900 + }, + { + "epoch": 3.173474802141586, + "grad_norm": 4.04721212387085, + "learning_rate": 0.00265869154938214, + "loss": 7.6819, + "step": 779000 + }, + { + "epoch": 3.173474802141586, + "eval_MaskedAccuracy": 0.5064465607655882, + "eval_loss": 1.6194339990615845, + "eval_runtime": 150.4166, + "eval_samples_per_second": 422.001, + "eval_steps_per_second": 1.649, + "step": 779000 + }, + { + "epoch": 3.1738821801649677, + "grad_norm": 8.249896049499512, + "learning_rate": 0.002658196395193876, + "loss": 7.6997, + "step": 779100 + }, + { + "epoch": 3.1742895581883492, + "grad_norm": 10.266298294067383, + "learning_rate": 0.002657701234948938, + "loss": 7.7037, + "step": 779200 + }, + { + "epoch": 3.1746969362117308, + "grad_norm": 4.910303592681885, + "learning_rate": 0.0026572060686669047, + "loss": 7.6826, + "step": 779300 + }, + { + "epoch": 3.1751043142351123, + "grad_norm": 5.045969009399414, + "learning_rate": 0.002656710896367351, + "loss": 7.6876, + "step": 779400 + }, + { + "epoch": 3.1755116922584934, + "grad_norm": 6.6033616065979, + "learning_rate": 0.0026562157180698522, + "loss": 7.6794, + "step": 779500 + }, + { + "epoch": 3.175919070281875, + "grad_norm": 3.0145375728607178, + "learning_rate": 0.0026557205337939854, + "loss": 7.6678, + "step": 779600 + }, + { + "epoch": 3.1763264483052565, + "grad_norm": 3.813138484954834, + "learning_rate": 0.00265522534355933, + "loss": 7.6944, + "step": 779700 + }, + { + "epoch": 3.176733826328638, + "grad_norm": 6.576977252960205, + "learning_rate": 0.0026547301473854638, + "loss": 7.6928, + "step": 779800 + }, + { + "epoch": 3.1771412043520195, + "grad_norm": 3.4925901889801025, + "learning_rate": 0.0026542349452919694, + "loss": 7.7008, + "step": 779900 + }, + { + "epoch": 3.177548582375401, + "grad_norm": 4.209640979766846, + "learning_rate": 0.002653739737298417, + "loss": 7.6801, + "step": 780000 + }, + { + "epoch": 3.177548582375401, + "eval_MaskedAccuracy": 0.5061531277212185, + "eval_loss": 1.6210638284683228, + "eval_runtime": 152.9389, + "eval_samples_per_second": 415.042, + "eval_steps_per_second": 1.622, + "step": 780000 + }, + { + "epoch": 3.177955960398782, + "grad_norm": 4.764199256896973, + "learning_rate": 0.0026532445234243858, + "loss": 7.6904, + "step": 780100 + }, + { + "epoch": 3.1783633384221637, + "grad_norm": 6.731513023376465, + "learning_rate": 0.0026527493036894545, + "loss": 7.7001, + "step": 780200 + }, + { + "epoch": 3.178770716445545, + "grad_norm": 6.768033504486084, + "learning_rate": 0.002652254078113203, + "loss": 7.7091, + "step": 780300 + }, + { + "epoch": 3.1791780944689267, + "grad_norm": 4.610159873962402, + "learning_rate": 0.0026517588467152133, + "loss": 7.6851, + "step": 780400 + }, + { + "epoch": 3.1795854724923083, + "grad_norm": 3.216808319091797, + "learning_rate": 0.002651263609515061, + "loss": 7.7173, + "step": 780500 + }, + { + "epoch": 3.17999285051569, + "grad_norm": 5.673268795013428, + "learning_rate": 0.002650768366532327, + "loss": 7.6733, + "step": 780600 + }, + { + "epoch": 3.180400228539071, + "grad_norm": 3.343284845352173, + "learning_rate": 0.002650273117786585, + "loss": 7.6909, + "step": 780700 + }, + { + "epoch": 3.1808076065624524, + "grad_norm": 6.623497009277344, + "learning_rate": 0.0026497778632974243, + "loss": 7.6662, + "step": 780800 + }, + { + "epoch": 3.181214984585834, + "grad_norm": 3.500135660171509, + "learning_rate": 0.002649282603084417, + "loss": 7.6678, + "step": 780900 + }, + { + "epoch": 3.1816223626092155, + "grad_norm": 7.12910795211792, + "learning_rate": 0.002648787337167148, + "loss": 7.691, + "step": 781000 + }, + { + "epoch": 3.1816223626092155, + "eval_MaskedAccuracy": 0.5053428490295067, + "eval_loss": 1.6209814548492432, + "eval_runtime": 150.3438, + "eval_samples_per_second": 422.206, + "eval_steps_per_second": 1.65, + "step": 781000 + }, + { + "epoch": 3.182029740632597, + "grad_norm": 4.574215412139893, + "learning_rate": 0.002648292065565196, + "loss": 7.6977, + "step": 781100 + }, + { + "epoch": 3.1824371186559786, + "grad_norm": 6.274864673614502, + "learning_rate": 0.0026477967882981464, + "loss": 7.698, + "step": 781200 + }, + { + "epoch": 3.18284449667936, + "grad_norm": 4.036904335021973, + "learning_rate": 0.0026473015053855723, + "loss": 7.6828, + "step": 781300 + }, + { + "epoch": 3.183251874702741, + "grad_norm": 3.259281873703003, + "learning_rate": 0.002646806216847059, + "loss": 7.6906, + "step": 781400 + }, + { + "epoch": 3.1836592527261227, + "grad_norm": 3.483506202697754, + "learning_rate": 0.0026463109227021853, + "loss": 7.6865, + "step": 781500 + }, + { + "epoch": 3.1840666307495042, + "grad_norm": 3.4609780311584473, + "learning_rate": 0.0026458156229705373, + "loss": 7.679, + "step": 781600 + }, + { + "epoch": 3.184474008772886, + "grad_norm": 10.09542465209961, + "learning_rate": 0.002645320317671695, + "loss": 7.689, + "step": 781700 + }, + { + "epoch": 3.1848813867962673, + "grad_norm": 5.758722305297852, + "learning_rate": 0.002644825006825238, + "loss": 7.6808, + "step": 781800 + }, + { + "epoch": 3.185288764819649, + "grad_norm": 5.515896797180176, + "learning_rate": 0.002644329690450753, + "loss": 7.6765, + "step": 781900 + }, + { + "epoch": 3.18569614284303, + "grad_norm": 3.3453032970428467, + "learning_rate": 0.0026438343685678193, + "loss": 7.6858, + "step": 782000 + }, + { + "epoch": 3.18569614284303, + "eval_MaskedAccuracy": 0.5066394594667833, + "eval_loss": 1.617942214012146, + "eval_runtime": 150.4516, + "eval_samples_per_second": 421.903, + "eval_steps_per_second": 1.648, + "step": 782000 + }, + { + "epoch": 3.1861035208664115, + "grad_norm": 5.306074142456055, + "learning_rate": 0.0026433390411960207, + "loss": 7.6757, + "step": 782100 + }, + { + "epoch": 3.186510898889793, + "grad_norm": 4.115302085876465, + "learning_rate": 0.002642843708354939, + "loss": 7.6585, + "step": 782200 + }, + { + "epoch": 3.1869182769131745, + "grad_norm": 5.047887325286865, + "learning_rate": 0.0026423483700641587, + "loss": 7.7016, + "step": 782300 + }, + { + "epoch": 3.187325654936556, + "grad_norm": 5.8126397132873535, + "learning_rate": 0.0026418530263432644, + "loss": 7.694, + "step": 782400 + }, + { + "epoch": 3.1877330329599376, + "grad_norm": 6.155248165130615, + "learning_rate": 0.002641357677211836, + "loss": 7.7, + "step": 782500 + }, + { + "epoch": 3.1881404109833187, + "grad_norm": 4.365386486053467, + "learning_rate": 0.002640862322689461, + "loss": 7.6771, + "step": 782600 + }, + { + "epoch": 3.1885477890067, + "grad_norm": 3.0681233406066895, + "learning_rate": 0.002640366962795722, + "loss": 7.67, + "step": 782700 + }, + { + "epoch": 3.1889551670300817, + "grad_norm": 7.4326300621032715, + "learning_rate": 0.0026398715975502036, + "loss": 7.6714, + "step": 782800 + }, + { + "epoch": 3.1893625450534633, + "grad_norm": 3.5183217525482178, + "learning_rate": 0.002639376226972487, + "loss": 7.6961, + "step": 782900 + }, + { + "epoch": 3.189769923076845, + "grad_norm": 6.676580905914307, + "learning_rate": 0.0026388808510821593, + "loss": 7.7003, + "step": 783000 + }, + { + "epoch": 3.189769923076845, + "eval_MaskedAccuracy": 0.5067654794652767, + "eval_loss": 1.6153063774108887, + "eval_runtime": 150.0364, + "eval_samples_per_second": 423.071, + "eval_steps_per_second": 1.653, + "step": 783000 + }, + { + "epoch": 3.1901773011002263, + "grad_norm": 6.243706703186035, + "learning_rate": 0.0026383854698988074, + "loss": 7.6941, + "step": 783100 + }, + { + "epoch": 3.1905846791236074, + "grad_norm": 8.316851615905762, + "learning_rate": 0.0026378900834420154, + "loss": 7.6673, + "step": 783200 + }, + { + "epoch": 3.190992057146989, + "grad_norm": 5.983206272125244, + "learning_rate": 0.0026373946917313657, + "loss": 7.6855, + "step": 783300 + }, + { + "epoch": 3.1913994351703705, + "grad_norm": 5.882169246673584, + "learning_rate": 0.0026368992947864447, + "loss": 7.7042, + "step": 783400 + }, + { + "epoch": 3.191806813193752, + "grad_norm": 7.513192653656006, + "learning_rate": 0.002636403892626837, + "loss": 7.7099, + "step": 783500 + }, + { + "epoch": 3.1922141912171336, + "grad_norm": 6.345666885375977, + "learning_rate": 0.0026359084852721365, + "loss": 7.6897, + "step": 783600 + }, + { + "epoch": 3.192621569240515, + "grad_norm": 7.76584005355835, + "learning_rate": 0.002635413072741921, + "loss": 7.6796, + "step": 783700 + }, + { + "epoch": 3.1930289472638966, + "grad_norm": 3.749234914779663, + "learning_rate": 0.0026349176550557797, + "loss": 7.6882, + "step": 783800 + }, + { + "epoch": 3.1934363252872777, + "grad_norm": 3.1690194606781006, + "learning_rate": 0.002634422232233299, + "loss": 7.6791, + "step": 783900 + }, + { + "epoch": 3.1938437033106593, + "grad_norm": 4.605543613433838, + "learning_rate": 0.002633926804294065, + "loss": 7.701, + "step": 784000 + }, + { + "epoch": 3.1938437033106593, + "eval_MaskedAccuracy": 0.5065523382952148, + "eval_loss": 1.6239582300186157, + "eval_runtime": 152.2555, + "eval_samples_per_second": 416.905, + "eval_steps_per_second": 1.629, + "step": 784000 + }, + { + "epoch": 3.194251081334041, + "grad_norm": 6.241485118865967, + "learning_rate": 0.002633431371257664, + "loss": 7.7153, + "step": 784100 + }, + { + "epoch": 3.1946584593574223, + "grad_norm": 2.8318705558776855, + "learning_rate": 0.0026329359331436855, + "loss": 7.6827, + "step": 784200 + }, + { + "epoch": 3.195065837380804, + "grad_norm": 3.8722984790802, + "learning_rate": 0.002632440489971713, + "loss": 7.6778, + "step": 784300 + }, + { + "epoch": 3.1954732154041854, + "grad_norm": 6.953472137451172, + "learning_rate": 0.0026319450417613363, + "loss": 7.6649, + "step": 784400 + }, + { + "epoch": 3.1958805934275665, + "grad_norm": 4.383271217346191, + "learning_rate": 0.0026314495885321415, + "loss": 7.6842, + "step": 784500 + }, + { + "epoch": 3.196287971450948, + "grad_norm": 6.655606746673584, + "learning_rate": 0.002630954130303722, + "loss": 7.6752, + "step": 784600 + }, + { + "epoch": 3.1966953494743295, + "grad_norm": 5.002296447753906, + "learning_rate": 0.0026304586670956617, + "loss": 7.6938, + "step": 784700 + }, + { + "epoch": 3.197102727497711, + "grad_norm": 8.698090553283691, + "learning_rate": 0.0026299631989275493, + "loss": 7.6592, + "step": 784800 + }, + { + "epoch": 3.1975101055210926, + "grad_norm": 4.085105895996094, + "learning_rate": 0.002629467725818974, + "loss": 7.7082, + "step": 784900 + }, + { + "epoch": 3.197917483544474, + "grad_norm": 5.493113994598389, + "learning_rate": 0.0026289722477895227, + "loss": 7.6974, + "step": 785000 + }, + { + "epoch": 3.197917483544474, + "eval_MaskedAccuracy": 0.5063409749369749, + "eval_loss": 1.6177254915237427, + "eval_runtime": 154.8526, + "eval_samples_per_second": 409.912, + "eval_steps_per_second": 1.602, + "step": 785000 + }, + { + "epoch": 3.1983248615678552, + "grad_norm": 5.456820487976074, + "learning_rate": 0.0026284767648587846, + "loss": 7.6985, + "step": 785100 + }, + { + "epoch": 3.1987322395912368, + "grad_norm": 7.716866493225098, + "learning_rate": 0.0026279812770463487, + "loss": 7.7319, + "step": 785200 + }, + { + "epoch": 3.1991396176146183, + "grad_norm": 5.643834590911865, + "learning_rate": 0.0026274857843718077, + "loss": 7.6988, + "step": 785300 + }, + { + "epoch": 3.199546995638, + "grad_norm": 4.009466648101807, + "learning_rate": 0.002626990286854748, + "loss": 7.672, + "step": 785400 + }, + { + "epoch": 3.1999543736613814, + "grad_norm": 6.094020843505859, + "learning_rate": 0.0026264947845147605, + "loss": 7.6919, + "step": 785500 + }, + { + "epoch": 3.200361751684763, + "grad_norm": 5.463302135467529, + "learning_rate": 0.002625999277371434, + "loss": 7.6692, + "step": 785600 + }, + { + "epoch": 3.200769129708144, + "grad_norm": 3.863553047180176, + "learning_rate": 0.0026255037654443593, + "loss": 7.6706, + "step": 785700 + }, + { + "epoch": 3.2011765077315255, + "grad_norm": 5.52654504776001, + "learning_rate": 0.002625008248753126, + "loss": 7.6779, + "step": 785800 + }, + { + "epoch": 3.201583885754907, + "grad_norm": 5.104406356811523, + "learning_rate": 0.002624512727317326, + "loss": 7.6878, + "step": 785900 + }, + { + "epoch": 3.2019912637782886, + "grad_norm": 4.26261568069458, + "learning_rate": 0.0026240172011565456, + "loss": 7.7023, + "step": 786000 + }, + { + "epoch": 3.2019912637782886, + "eval_MaskedAccuracy": 0.5069396359302315, + "eval_loss": 1.6208125352859497, + "eval_runtime": 150.9245, + "eval_samples_per_second": 420.581, + "eval_steps_per_second": 1.643, + "step": 786000 + }, + { + "epoch": 3.20239864180167, + "grad_norm": 2.636861562728882, + "learning_rate": 0.0026235216702903785, + "loss": 7.7174, + "step": 786100 + }, + { + "epoch": 3.2028060198250516, + "grad_norm": 5.463211536407471, + "learning_rate": 0.002623026134738419, + "loss": 7.7076, + "step": 786200 + }, + { + "epoch": 3.203213397848433, + "grad_norm": 11.473145484924316, + "learning_rate": 0.002622530594520253, + "loss": 7.7009, + "step": 786300 + }, + { + "epoch": 3.2036207758718143, + "grad_norm": 11.875235557556152, + "learning_rate": 0.0026220350496554724, + "loss": 7.71, + "step": 786400 + }, + { + "epoch": 3.204028153895196, + "grad_norm": 3.7144064903259277, + "learning_rate": 0.0026215395001636735, + "loss": 7.7101, + "step": 786500 + }, + { + "epoch": 3.2044355319185773, + "grad_norm": 4.499942779541016, + "learning_rate": 0.002621043946064445, + "loss": 7.6817, + "step": 786600 + }, + { + "epoch": 3.204842909941959, + "grad_norm": 4.006444931030273, + "learning_rate": 0.002620548387377378, + "loss": 7.6984, + "step": 786700 + }, + { + "epoch": 3.2052502879653404, + "grad_norm": 5.552825927734375, + "learning_rate": 0.0026200528241220634, + "loss": 7.6526, + "step": 786800 + }, + { + "epoch": 3.205657665988722, + "grad_norm": 5.5530781745910645, + "learning_rate": 0.002619557256318098, + "loss": 7.6713, + "step": 786900 + }, + { + "epoch": 3.206065044012103, + "grad_norm": 3.4495584964752197, + "learning_rate": 0.0026190616839850706, + "loss": 7.7045, + "step": 787000 + }, + { + "epoch": 3.206065044012103, + "eval_MaskedAccuracy": 0.5069079826713474, + "eval_loss": 1.6114245653152466, + "eval_runtime": 152.2694, + "eval_samples_per_second": 416.867, + "eval_steps_per_second": 1.629, + "step": 787000 + }, + { + "epoch": 3.2064724220354845, + "grad_norm": 6.009396553039551, + "learning_rate": 0.0026185661071425736, + "loss": 7.6799, + "step": 787100 + }, + { + "epoch": 3.206879800058866, + "grad_norm": 6.559325695037842, + "learning_rate": 0.0026180705258102006, + "loss": 7.6822, + "step": 787200 + }, + { + "epoch": 3.2072871780822476, + "grad_norm": 3.152937173843384, + "learning_rate": 0.0026175749400075437, + "loss": 7.689, + "step": 787300 + }, + { + "epoch": 3.207694556105629, + "grad_norm": 7.587808609008789, + "learning_rate": 0.0026170793497542013, + "loss": 7.6745, + "step": 787400 + }, + { + "epoch": 3.2081019341290107, + "grad_norm": 7.427546501159668, + "learning_rate": 0.0026165837550697624, + "loss": 7.6911, + "step": 787500 + }, + { + "epoch": 3.2085093121523918, + "grad_norm": 6.7571563720703125, + "learning_rate": 0.00261608815597382, + "loss": 7.7035, + "step": 787600 + }, + { + "epoch": 3.2089166901757733, + "grad_norm": 5.109492778778076, + "learning_rate": 0.0026155925524859684, + "loss": 7.7036, + "step": 787700 + }, + { + "epoch": 3.209324068199155, + "grad_norm": 5.578980922698975, + "learning_rate": 0.0026150969446258015, + "loss": 7.6811, + "step": 787800 + }, + { + "epoch": 3.2097314462225364, + "grad_norm": 8.288555145263672, + "learning_rate": 0.0026146013324129156, + "loss": 7.6812, + "step": 787900 + }, + { + "epoch": 3.210138824245918, + "grad_norm": 3.1266098022460938, + "learning_rate": 0.0026141057158668994, + "loss": 7.6949, + "step": 788000 + }, + { + "epoch": 3.210138824245918, + "eval_MaskedAccuracy": 0.5067794550282789, + "eval_loss": 1.6136877536773682, + "eval_runtime": 155.6805, + "eval_samples_per_second": 407.733, + "eval_steps_per_second": 1.593, + "step": 788000 + }, + { + "epoch": 3.2105462022692994, + "grad_norm": 7.264710903167725, + "learning_rate": 0.0026136100950073533, + "loss": 7.6616, + "step": 788100 + }, + { + "epoch": 3.2109535802926805, + "grad_norm": 3.103065013885498, + "learning_rate": 0.0026131144698538656, + "loss": 7.6963, + "step": 788200 + }, + { + "epoch": 3.211360958316062, + "grad_norm": 3.8947644233703613, + "learning_rate": 0.0026126188404260397, + "loss": 7.6848, + "step": 788300 + }, + { + "epoch": 3.2117683363394436, + "grad_norm": 5.128793239593506, + "learning_rate": 0.002612123206743462, + "loss": 7.6758, + "step": 788400 + }, + { + "epoch": 3.212175714362825, + "grad_norm": 10.643733024597168, + "learning_rate": 0.002611627568825732, + "loss": 7.6847, + "step": 788500 + }, + { + "epoch": 3.2125830923862067, + "grad_norm": 7.177453994750977, + "learning_rate": 0.0026111319266924433, + "loss": 7.706, + "step": 788600 + }, + { + "epoch": 3.212990470409588, + "grad_norm": 6.658900260925293, + "learning_rate": 0.002610636280363193, + "loss": 7.7116, + "step": 788700 + }, + { + "epoch": 3.2133978484329697, + "grad_norm": 3.2530908584594727, + "learning_rate": 0.0026101406298575753, + "loss": 7.6861, + "step": 788800 + }, + { + "epoch": 3.213805226456351, + "grad_norm": 2.9466047286987305, + "learning_rate": 0.0026096449751951863, + "loss": 7.6951, + "step": 788900 + }, + { + "epoch": 3.2142126044797323, + "grad_norm": 8.129348754882812, + "learning_rate": 0.002609149316395622, + "loss": 7.6805, + "step": 789000 + }, + { + "epoch": 3.2142126044797323, + "eval_MaskedAccuracy": 0.5062638108644215, + "eval_loss": 1.624267339706421, + "eval_runtime": 173.9503, + "eval_samples_per_second": 364.909, + "eval_steps_per_second": 1.426, + "step": 789000 + }, + { + "epoch": 3.214619982503114, + "grad_norm": 4.266531944274902, + "learning_rate": 0.002608653653478478, + "loss": 7.6982, + "step": 789100 + }, + { + "epoch": 3.2150273605264954, + "grad_norm": 5.689974308013916, + "learning_rate": 0.002608157986463352, + "loss": 7.6479, + "step": 789200 + }, + { + "epoch": 3.215434738549877, + "grad_norm": 5.786675930023193, + "learning_rate": 0.0026076623153698363, + "loss": 7.6495, + "step": 789300 + }, + { + "epoch": 3.2158421165732585, + "grad_norm": 4.292298316955566, + "learning_rate": 0.002607166640217529, + "loss": 7.6508, + "step": 789400 + }, + { + "epoch": 3.2162494945966396, + "grad_norm": 7.28933572769165, + "learning_rate": 0.0026066709610260298, + "loss": 7.6981, + "step": 789500 + }, + { + "epoch": 3.216656872620021, + "grad_norm": 5.634224891662598, + "learning_rate": 0.002606175277814936, + "loss": 7.6803, + "step": 789600 + }, + { + "epoch": 3.2170642506434026, + "grad_norm": 9.318557739257812, + "learning_rate": 0.0026056795906038385, + "loss": 7.6912, + "step": 789700 + }, + { + "epoch": 3.217471628666784, + "grad_norm": 5.599298477172852, + "learning_rate": 0.002605183899412342, + "loss": 7.6777, + "step": 789800 + }, + { + "epoch": 3.2178790066901657, + "grad_norm": 7.910877227783203, + "learning_rate": 0.002604688204260037, + "loss": 7.6685, + "step": 789900 + }, + { + "epoch": 3.218286384713547, + "grad_norm": 9.06559944152832, + "learning_rate": 0.002604192505166525, + "loss": 7.6621, + "step": 790000 + }, + { + "epoch": 3.218286384713547, + "eval_MaskedAccuracy": 0.5063175242791842, + "eval_loss": 1.6187844276428223, + "eval_runtime": 168.5787, + "eval_samples_per_second": 376.536, + "eval_steps_per_second": 1.471, + "step": 790000 + }, + { + "epoch": 3.2186937627369283, + "grad_norm": 4.518896579742432, + "learning_rate": 0.002603696802151403, + "loss": 7.6735, + "step": 790100 + }, + { + "epoch": 3.21910114076031, + "grad_norm": 6.1575188636779785, + "learning_rate": 0.002603201095234266, + "loss": 7.6557, + "step": 790200 + }, + { + "epoch": 3.2195085187836914, + "grad_norm": 5.235797882080078, + "learning_rate": 0.002602705384434718, + "loss": 7.6597, + "step": 790300 + }, + { + "epoch": 3.219915896807073, + "grad_norm": 7.212177276611328, + "learning_rate": 0.0026022096697723538, + "loss": 7.6782, + "step": 790400 + }, + { + "epoch": 3.2203232748304544, + "grad_norm": 15.714856147766113, + "learning_rate": 0.002601713951266769, + "loss": 7.677, + "step": 790500 + }, + { + "epoch": 3.220730652853836, + "grad_norm": 6.259230136871338, + "learning_rate": 0.002601218228937566, + "loss": 7.6816, + "step": 790600 + }, + { + "epoch": 3.221138030877217, + "grad_norm": 4.9577178955078125, + "learning_rate": 0.002600722502804343, + "loss": 7.6754, + "step": 790700 + }, + { + "epoch": 3.2215454089005986, + "grad_norm": 6.513192653656006, + "learning_rate": 0.0026002267728866962, + "loss": 7.6776, + "step": 790800 + }, + { + "epoch": 3.22195278692398, + "grad_norm": 4.687143325805664, + "learning_rate": 0.0025997310392042233, + "loss": 7.6789, + "step": 790900 + }, + { + "epoch": 3.2223601649473617, + "grad_norm": 6.113856315612793, + "learning_rate": 0.002599235301776526, + "loss": 7.6858, + "step": 791000 + }, + { + "epoch": 3.2223601649473617, + "eval_MaskedAccuracy": 0.5065886525316977, + "eval_loss": 1.6032770872116089, + "eval_runtime": 157.6746, + "eval_samples_per_second": 402.576, + "eval_steps_per_second": 1.573, + "step": 791000 + }, + { + "epoch": 3.222767542970743, + "grad_norm": 6.812439441680908, + "learning_rate": 0.002598739560623205, + "loss": 7.6818, + "step": 791100 + }, + { + "epoch": 3.2231749209941247, + "grad_norm": 6.572329044342041, + "learning_rate": 0.002598243815763861, + "loss": 7.6804, + "step": 791200 + }, + { + "epoch": 3.2235822990175063, + "grad_norm": 10.578540802001953, + "learning_rate": 0.0025977480672180852, + "loss": 7.6719, + "step": 791300 + }, + { + "epoch": 3.2239896770408873, + "grad_norm": 5.3097381591796875, + "learning_rate": 0.0025972523150054817, + "loss": 7.6551, + "step": 791400 + }, + { + "epoch": 3.224397055064269, + "grad_norm": 6.691742420196533, + "learning_rate": 0.002596756559145654, + "loss": 7.7099, + "step": 791500 + }, + { + "epoch": 3.2248044330876504, + "grad_norm": 7.910216808319092, + "learning_rate": 0.0025962607996581947, + "loss": 7.6928, + "step": 791600 + }, + { + "epoch": 3.225211811111032, + "grad_norm": 3.708089828491211, + "learning_rate": 0.002595765036562709, + "loss": 7.6641, + "step": 791700 + }, + { + "epoch": 3.2256191891344135, + "grad_norm": 5.971957683563232, + "learning_rate": 0.0025952692698788, + "loss": 7.6519, + "step": 791800 + }, + { + "epoch": 3.226026567157795, + "grad_norm": 6.111754894256592, + "learning_rate": 0.0025947734996260657, + "loss": 7.6577, + "step": 791900 + }, + { + "epoch": 3.226433945181176, + "grad_norm": 9.812180519104004, + "learning_rate": 0.0025942777258241047, + "loss": 7.6786, + "step": 792000 + }, + { + "epoch": 3.226433945181176, + "eval_MaskedAccuracy": 0.5064100908461951, + "eval_loss": 1.6158021688461304, + "eval_runtime": 152.3994, + "eval_samples_per_second": 416.511, + "eval_steps_per_second": 1.627, + "step": 792000 + }, + { + "epoch": 3.2268413232045576, + "grad_norm": 3.814610481262207, + "learning_rate": 0.0025937819484925166, + "loss": 7.6836, + "step": 792100 + }, + { + "epoch": 3.227248701227939, + "grad_norm": 7.241386890411377, + "learning_rate": 0.0025932861676508994, + "loss": 7.6843, + "step": 792200 + }, + { + "epoch": 3.2276560792513207, + "grad_norm": 9.676058769226074, + "learning_rate": 0.0025927903833188573, + "loss": 7.6763, + "step": 792300 + }, + { + "epoch": 3.2280634572747022, + "grad_norm": 9.320780754089355, + "learning_rate": 0.002592294595515992, + "loss": 7.682, + "step": 792400 + }, + { + "epoch": 3.2284708352980838, + "grad_norm": 7.394233703613281, + "learning_rate": 0.002591798804261906, + "loss": 7.6691, + "step": 792500 + }, + { + "epoch": 3.228878213321465, + "grad_norm": 3.47312331199646, + "learning_rate": 0.0025913030095761993, + "loss": 7.6674, + "step": 792600 + }, + { + "epoch": 3.2292855913448464, + "grad_norm": 3.3234751224517822, + "learning_rate": 0.002590807211478471, + "loss": 7.6895, + "step": 792700 + }, + { + "epoch": 3.229692969368228, + "grad_norm": 4.033292770385742, + "learning_rate": 0.0025903114099883264, + "loss": 7.6742, + "step": 792800 + }, + { + "epoch": 3.2301003473916094, + "grad_norm": 9.050682067871094, + "learning_rate": 0.002589815605125367, + "loss": 7.6805, + "step": 792900 + }, + { + "epoch": 3.230507725414991, + "grad_norm": 3.7985522747039795, + "learning_rate": 0.0025893197969091937, + "loss": 7.6909, + "step": 793000 + }, + { + "epoch": 3.230507725414991, + "eval_MaskedAccuracy": 0.5060678756069075, + "eval_loss": 1.6170668601989746, + "eval_runtime": 157.5492, + "eval_samples_per_second": 402.896, + "eval_steps_per_second": 1.574, + "step": 793000 + }, + { + "epoch": 3.2309151034383725, + "grad_norm": 11.379815101623535, + "learning_rate": 0.0025888239853594087, + "loss": 7.6679, + "step": 793100 + }, + { + "epoch": 3.2313224814617536, + "grad_norm": 4.251285552978516, + "learning_rate": 0.0025883281704956094, + "loss": 7.6722, + "step": 793200 + }, + { + "epoch": 3.231729859485135, + "grad_norm": 3.687894582748413, + "learning_rate": 0.0025878323523374014, + "loss": 7.6736, + "step": 793300 + }, + { + "epoch": 3.2321372375085167, + "grad_norm": 3.4797394275665283, + "learning_rate": 0.002587336530904386, + "loss": 7.6828, + "step": 793400 + }, + { + "epoch": 3.232544615531898, + "grad_norm": 7.058933258056641, + "learning_rate": 0.0025868407062161723, + "loss": 7.6808, + "step": 793500 + }, + { + "epoch": 3.2329519935552797, + "grad_norm": 4.875361919403076, + "learning_rate": 0.002586344878292359, + "loss": 7.6649, + "step": 793600 + }, + { + "epoch": 3.2333593715786613, + "grad_norm": 5.618971824645996, + "learning_rate": 0.002585849047152545, + "loss": 7.6777, + "step": 793700 + }, + { + "epoch": 3.233766749602043, + "grad_norm": 6.132779598236084, + "learning_rate": 0.002585353212816335, + "loss": 7.6948, + "step": 793800 + }, + { + "epoch": 3.234174127625424, + "grad_norm": 3.53932785987854, + "learning_rate": 0.0025848573753033334, + "loss": 7.6862, + "step": 793900 + }, + { + "epoch": 3.2345815056488054, + "grad_norm": 5.543338298797607, + "learning_rate": 0.002584361534633141, + "loss": 7.6988, + "step": 794000 + }, + { + "epoch": 3.2345815056488054, + "eval_MaskedAccuracy": 0.50695645326961, + "eval_loss": 1.6186436414718628, + "eval_runtime": 161.857, + "eval_samples_per_second": 392.173, + "eval_steps_per_second": 1.532, + "step": 794000 + }, + { + "epoch": 3.234988883672187, + "grad_norm": 17.440731048583984, + "learning_rate": 0.0025838656908253664, + "loss": 7.6754, + "step": 794100 + }, + { + "epoch": 3.2353962616955685, + "grad_norm": 3.4629099369049072, + "learning_rate": 0.00258336984389961, + "loss": 7.6435, + "step": 794200 + }, + { + "epoch": 3.23580363971895, + "grad_norm": 9.417774200439453, + "learning_rate": 0.0025828739938754688, + "loss": 7.6781, + "step": 794300 + }, + { + "epoch": 3.2362110177423316, + "grad_norm": 6.153289794921875, + "learning_rate": 0.0025823781407725553, + "loss": 7.6774, + "step": 794400 + }, + { + "epoch": 3.2366183957657126, + "grad_norm": 12.250358581542969, + "learning_rate": 0.002581882284610468, + "loss": 7.6891, + "step": 794500 + }, + { + "epoch": 3.237025773789094, + "grad_norm": 10.045564651489258, + "learning_rate": 0.0025813864254088134, + "loss": 7.6464, + "step": 794600 + }, + { + "epoch": 3.2374331518124757, + "grad_norm": 4.859644889831543, + "learning_rate": 0.002580890563187195, + "loss": 7.6857, + "step": 794700 + }, + { + "epoch": 3.2378405298358572, + "grad_norm": 3.5310721397399902, + "learning_rate": 0.0025803946979652197, + "loss": 7.6961, + "step": 794800 + }, + { + "epoch": 3.2382479078592388, + "grad_norm": 5.769550323486328, + "learning_rate": 0.0025798988297624876, + "loss": 7.6683, + "step": 794900 + }, + { + "epoch": 3.2386552858826203, + "grad_norm": 8.49172592163086, + "learning_rate": 0.0025794029585986034, + "loss": 7.6626, + "step": 795000 + }, + { + "epoch": 3.2386552858826203, + "eval_MaskedAccuracy": 0.506165139490274, + "eval_loss": 1.6166064739227295, + "eval_runtime": 155.8636, + "eval_samples_per_second": 407.253, + "eval_steps_per_second": 1.591, + "step": 795000 + }, + { + "epoch": 3.2390626639060014, + "grad_norm": 3.65195369720459, + "learning_rate": 0.002578907084493171, + "loss": 7.6849, + "step": 795100 + }, + { + "epoch": 3.239470041929383, + "grad_norm": 7.123879909515381, + "learning_rate": 0.0025784112074657994, + "loss": 7.6886, + "step": 795200 + }, + { + "epoch": 3.2398774199527645, + "grad_norm": 3.4789581298828125, + "learning_rate": 0.0025779153275360886, + "loss": 7.657, + "step": 795300 + }, + { + "epoch": 3.240284797976146, + "grad_norm": 6.556728839874268, + "learning_rate": 0.0025774194447236456, + "loss": 7.6918, + "step": 795400 + }, + { + "epoch": 3.2406921759995275, + "grad_norm": 4.326566219329834, + "learning_rate": 0.0025769235590480735, + "loss": 7.66, + "step": 795500 + }, + { + "epoch": 3.241099554022909, + "grad_norm": 5.212819576263428, + "learning_rate": 0.0025764276705289796, + "loss": 7.6641, + "step": 795600 + }, + { + "epoch": 3.24150693204629, + "grad_norm": 11.032466888427734, + "learning_rate": 0.002575931779185968, + "loss": 7.6738, + "step": 795700 + }, + { + "epoch": 3.2419143100696717, + "grad_norm": 3.8308167457580566, + "learning_rate": 0.0025754358850386423, + "loss": 7.6656, + "step": 795800 + }, + { + "epoch": 3.242321688093053, + "grad_norm": 5.099149227142334, + "learning_rate": 0.002574939988106612, + "loss": 7.6669, + "step": 795900 + }, + { + "epoch": 3.2427290661164347, + "grad_norm": 6.100481986999512, + "learning_rate": 0.002574444088409479, + "loss": 7.7018, + "step": 796000 + }, + { + "epoch": 3.2427290661164347, + "eval_MaskedAccuracy": 0.5066901859609768, + "eval_loss": 1.6176397800445557, + "eval_runtime": 154.6266, + "eval_samples_per_second": 410.512, + "eval_steps_per_second": 1.604, + "step": 796000 + }, + { + "epoch": 3.2431364441398163, + "grad_norm": 3.527095079421997, + "learning_rate": 0.0025739481859668495, + "loss": 7.6949, + "step": 796100 + }, + { + "epoch": 3.243543822163198, + "grad_norm": 6.980803966522217, + "learning_rate": 0.0025734522807983303, + "loss": 7.6947, + "step": 796200 + }, + { + "epoch": 3.2439512001865793, + "grad_norm": 5.4837212562561035, + "learning_rate": 0.002572956372923524, + "loss": 7.6714, + "step": 796300 + }, + { + "epoch": 3.2443585782099604, + "grad_norm": 4.07946252822876, + "learning_rate": 0.002572460462362038, + "loss": 7.6724, + "step": 796400 + }, + { + "epoch": 3.244765956233342, + "grad_norm": 3.421415090560913, + "learning_rate": 0.0025719645491334786, + "loss": 7.6686, + "step": 796500 + }, + { + "epoch": 3.2451733342567235, + "grad_norm": 6.321123123168945, + "learning_rate": 0.0025714686332574534, + "loss": 7.693, + "step": 796600 + }, + { + "epoch": 3.245580712280105, + "grad_norm": 6.269840717315674, + "learning_rate": 0.002570972714753567, + "loss": 7.6786, + "step": 796700 + }, + { + "epoch": 3.2459880903034866, + "grad_norm": 4.550161361694336, + "learning_rate": 0.002570476793641428, + "loss": 7.6605, + "step": 796800 + }, + { + "epoch": 3.246395468326868, + "grad_norm": 8.534757614135742, + "learning_rate": 0.002569980869940643, + "loss": 7.6706, + "step": 796900 + }, + { + "epoch": 3.246802846350249, + "grad_norm": 11.600919723510742, + "learning_rate": 0.0025694849436708168, + "loss": 7.6493, + "step": 797000 + }, + { + "epoch": 3.246802846350249, + "eval_MaskedAccuracy": 0.5071739173757946, + "eval_loss": 1.6162763833999634, + "eval_runtime": 159.0367, + "eval_samples_per_second": 399.128, + "eval_steps_per_second": 1.559, + "step": 797000 + }, + { + "epoch": 3.2472102243736307, + "grad_norm": 3.2398877143859863, + "learning_rate": 0.002568989014851553, + "loss": 7.6875, + "step": 797100 + }, + { + "epoch": 3.2476176023970122, + "grad_norm": 4.973508358001709, + "learning_rate": 0.0025684930835024604, + "loss": 7.7187, + "step": 797200 + }, + { + "epoch": 3.248024980420394, + "grad_norm": 2.883657693862915, + "learning_rate": 0.0025679971496431497, + "loss": 7.6984, + "step": 797300 + }, + { + "epoch": 3.2484323584437753, + "grad_norm": 6.177520275115967, + "learning_rate": 0.0025675012132932214, + "loss": 7.6635, + "step": 797400 + }, + { + "epoch": 3.248839736467157, + "grad_norm": 5.778260707855225, + "learning_rate": 0.0025670052744722867, + "loss": 7.6964, + "step": 797500 + }, + { + "epoch": 3.249247114490538, + "grad_norm": 8.930655479431152, + "learning_rate": 0.002566509333199952, + "loss": 7.7054, + "step": 797600 + }, + { + "epoch": 3.2496544925139195, + "grad_norm": 9.496335983276367, + "learning_rate": 0.002566013389495825, + "loss": 7.6869, + "step": 797700 + }, + { + "epoch": 3.250061870537301, + "grad_norm": 5.685776710510254, + "learning_rate": 0.0025655174433795104, + "loss": 7.6672, + "step": 797800 + }, + { + "epoch": 3.2504692485606825, + "grad_norm": 4.112448692321777, + "learning_rate": 0.002565021494870619, + "loss": 7.6707, + "step": 797900 + }, + { + "epoch": 3.250876626584064, + "grad_norm": 9.60294246673584, + "learning_rate": 0.0025645255439887553, + "loss": 7.6897, + "step": 798000 + }, + { + "epoch": 3.250876626584064, + "eval_MaskedAccuracy": 0.5067806243928795, + "eval_loss": 1.6066287755966187, + "eval_runtime": 151.4015, + "eval_samples_per_second": 419.256, + "eval_steps_per_second": 1.638, + "step": 798000 + }, + { + "epoch": 3.2512840046074456, + "grad_norm": 11.241584777832031, + "learning_rate": 0.0025640295907535323, + "loss": 7.6509, + "step": 798100 + }, + { + "epoch": 3.2516913826308267, + "grad_norm": 5.0842461585998535, + "learning_rate": 0.0025635336351845506, + "loss": 7.6853, + "step": 798200 + }, + { + "epoch": 3.252098760654208, + "grad_norm": 3.7420406341552734, + "learning_rate": 0.0025630376773014226, + "loss": 7.6807, + "step": 798300 + }, + { + "epoch": 3.2525061386775898, + "grad_norm": 4.594023704528809, + "learning_rate": 0.002562541717123752, + "loss": 7.6355, + "step": 798400 + }, + { + "epoch": 3.2529135167009713, + "grad_norm": 3.3977713584899902, + "learning_rate": 0.0025620457546711517, + "loss": 7.7051, + "step": 798500 + }, + { + "epoch": 3.253320894724353, + "grad_norm": 3.1453049182891846, + "learning_rate": 0.0025615497899632296, + "loss": 7.7023, + "step": 798600 + }, + { + "epoch": 3.2537282727477344, + "grad_norm": 2.8873097896575928, + "learning_rate": 0.0025610538230195906, + "loss": 7.6959, + "step": 798700 + }, + { + "epoch": 3.254135650771116, + "grad_norm": 4.299108982086182, + "learning_rate": 0.002560557853859846, + "loss": 7.6753, + "step": 798800 + }, + { + "epoch": 3.254543028794497, + "grad_norm": 5.521244049072266, + "learning_rate": 0.0025600618825036025, + "loss": 7.7243, + "step": 798900 + }, + { + "epoch": 3.2549504068178785, + "grad_norm": 2.167487144470215, + "learning_rate": 0.002559565908970463, + "loss": 7.6554, + "step": 799000 + }, + { + "epoch": 3.2549504068178785, + "eval_MaskedAccuracy": 0.5067806013398966, + "eval_loss": 1.620605230331421, + "eval_runtime": 153.664, + "eval_samples_per_second": 413.083, + "eval_steps_per_second": 1.614, + "step": 799000 + }, + { + "epoch": 3.25535778484126, + "grad_norm": 4.254641532897949, + "learning_rate": 0.002559069933280044, + "loss": 7.6711, + "step": 799100 + }, + { + "epoch": 3.2557651628646416, + "grad_norm": 8.08716869354248, + "learning_rate": 0.002558573955451954, + "loss": 7.692, + "step": 799200 + }, + { + "epoch": 3.256172540888023, + "grad_norm": 6.920125484466553, + "learning_rate": 0.0025580779755057944, + "loss": 7.6689, + "step": 799300 + }, + { + "epoch": 3.2565799189114046, + "grad_norm": 12.316925048828125, + "learning_rate": 0.002557581993461181, + "loss": 7.6866, + "step": 799400 + }, + { + "epoch": 3.2569872969347857, + "grad_norm": 4.028006553649902, + "learning_rate": 0.0025570860093377203, + "loss": 7.7037, + "step": 799500 + }, + { + "epoch": 3.2573946749581673, + "grad_norm": 4.205632209777832, + "learning_rate": 0.0025565900231550196, + "loss": 7.6766, + "step": 799600 + }, + { + "epoch": 3.257802052981549, + "grad_norm": 3.163806200027466, + "learning_rate": 0.0025560940349326935, + "loss": 7.6651, + "step": 799700 + }, + { + "epoch": 3.2582094310049303, + "grad_norm": 4.043802261352539, + "learning_rate": 0.0025555980446903444, + "loss": 7.6903, + "step": 799800 + }, + { + "epoch": 3.258616809028312, + "grad_norm": 5.942582130432129, + "learning_rate": 0.0025551020524475824, + "loss": 7.714, + "step": 799900 + }, + { + "epoch": 3.2590241870516934, + "grad_norm": 10.770370483398438, + "learning_rate": 0.002554606058224021, + "loss": 7.7052, + "step": 800000 + }, + { + "epoch": 3.2590241870516934, + "eval_MaskedAccuracy": 0.5065970476477271, + "eval_loss": 1.621268630027771, + "eval_runtime": 151.7535, + "eval_samples_per_second": 418.284, + "eval_steps_per_second": 1.634, + "step": 800000 + }, + { + "epoch": 3.259431565075075, + "grad_norm": 6.294545650482178, + "learning_rate": 0.002554110062039264, + "loss": 7.6744, + "step": 800100 + }, + { + "epoch": 3.259838943098456, + "grad_norm": 14.498682022094727, + "learning_rate": 0.0025536140639129245, + "loss": 7.6987, + "step": 800200 + }, + { + "epoch": 3.2602463211218375, + "grad_norm": 3.168377637863159, + "learning_rate": 0.0025531180638646103, + "loss": 7.6485, + "step": 800300 + }, + { + "epoch": 3.260653699145219, + "grad_norm": 12.571187973022461, + "learning_rate": 0.0025526220619139315, + "loss": 7.722, + "step": 800400 + }, + { + "epoch": 3.2610610771686006, + "grad_norm": 9.39305591583252, + "learning_rate": 0.0025521260580804992, + "loss": 7.6599, + "step": 800500 + }, + { + "epoch": 3.261468455191982, + "grad_norm": 4.574954032897949, + "learning_rate": 0.0025516300523839228, + "loss": 7.6529, + "step": 800600 + }, + { + "epoch": 3.2618758332153632, + "grad_norm": 7.393759250640869, + "learning_rate": 0.002551134044843811, + "loss": 7.6694, + "step": 800700 + }, + { + "epoch": 3.2622832112387448, + "grad_norm": 10.334128379821777, + "learning_rate": 0.002550638035479774, + "loss": 7.6941, + "step": 800800 + }, + { + "epoch": 3.2626905892621263, + "grad_norm": 3.6924545764923096, + "learning_rate": 0.0025501420243114205, + "loss": 7.663, + "step": 800900 + }, + { + "epoch": 3.263097967285508, + "grad_norm": 5.345485687255859, + "learning_rate": 0.002549646011358361, + "loss": 7.645, + "step": 801000 + }, + { + "epoch": 3.263097967285508, + "eval_MaskedAccuracy": 0.5065235927475761, + "eval_loss": 1.615440845489502, + "eval_runtime": 151.0628, + "eval_samples_per_second": 420.196, + "eval_steps_per_second": 1.642, + "step": 801000 + }, + { + "epoch": 3.2635053453088894, + "grad_norm": 6.097316741943359, + "learning_rate": 0.002549149996640208, + "loss": 7.6828, + "step": 801100 + }, + { + "epoch": 3.263912723332271, + "grad_norm": 5.069605827331543, + "learning_rate": 0.0025486539801765685, + "loss": 7.7135, + "step": 801200 + }, + { + "epoch": 3.2643201013556524, + "grad_norm": 3.518381118774414, + "learning_rate": 0.0025481579619870518, + "loss": 7.6617, + "step": 801300 + }, + { + "epoch": 3.2647274793790335, + "grad_norm": 3.3528285026550293, + "learning_rate": 0.0025476619420912736, + "loss": 7.6659, + "step": 801400 + }, + { + "epoch": 3.265134857402415, + "grad_norm": 6.6037421226501465, + "learning_rate": 0.0025471659205088355, + "loss": 7.7063, + "step": 801500 + }, + { + "epoch": 3.2655422354257966, + "grad_norm": 5.243923664093018, + "learning_rate": 0.0025466698972593563, + "loss": 7.6827, + "step": 801600 + }, + { + "epoch": 3.265949613449178, + "grad_norm": 6.573047637939453, + "learning_rate": 0.0025461738723624432, + "loss": 7.6895, + "step": 801700 + }, + { + "epoch": 3.2663569914725596, + "grad_norm": 5.855910778045654, + "learning_rate": 0.0025456778458377055, + "loss": 7.673, + "step": 801800 + }, + { + "epoch": 3.266764369495941, + "grad_norm": 5.195866584777832, + "learning_rate": 0.0025451818177047547, + "loss": 7.6784, + "step": 801900 + }, + { + "epoch": 3.2671717475193223, + "grad_norm": 4.261105060577393, + "learning_rate": 0.002544685787983205, + "loss": 7.6847, + "step": 802000 + }, + { + "epoch": 3.2671717475193223, + "eval_MaskedAccuracy": 0.5070783132797549, + "eval_loss": 1.6130510568618774, + "eval_runtime": 178.8347, + "eval_samples_per_second": 354.942, + "eval_steps_per_second": 1.387, + "step": 802000 + }, + { + "epoch": 3.267579125542704, + "grad_norm": 9.53943157196045, + "learning_rate": 0.002544189756692662, + "loss": 7.7014, + "step": 802100 + }, + { + "epoch": 3.2679865035660853, + "grad_norm": 4.8277130126953125, + "learning_rate": 0.002543693723852737, + "loss": 7.6575, + "step": 802200 + }, + { + "epoch": 3.268393881589467, + "grad_norm": 2.8353869915008545, + "learning_rate": 0.0025431976894830424, + "loss": 7.7079, + "step": 802300 + }, + { + "epoch": 3.2688012596128484, + "grad_norm": 4.182636260986328, + "learning_rate": 0.0025427016536031886, + "loss": 7.6707, + "step": 802400 + }, + { + "epoch": 3.26920863763623, + "grad_norm": 4.57123327255249, + "learning_rate": 0.0025422056162327854, + "loss": 7.6783, + "step": 802500 + }, + { + "epoch": 3.2696160156596115, + "grad_norm": 4.688640594482422, + "learning_rate": 0.002541709577391448, + "loss": 7.6814, + "step": 802600 + }, + { + "epoch": 3.2700233936829926, + "grad_norm": 6.715690612792969, + "learning_rate": 0.002541213537098782, + "loss": 7.6787, + "step": 802700 + }, + { + "epoch": 3.270430771706374, + "grad_norm": 12.053544044494629, + "learning_rate": 0.0025407174953744035, + "loss": 7.669, + "step": 802800 + }, + { + "epoch": 3.2708381497297556, + "grad_norm": 4.029909133911133, + "learning_rate": 0.0025402214522379195, + "loss": 7.6633, + "step": 802900 + }, + { + "epoch": 3.271245527753137, + "grad_norm": 3.22373366355896, + "learning_rate": 0.0025397254077089435, + "loss": 7.6628, + "step": 803000 + }, + { + "epoch": 3.271245527753137, + "eval_MaskedAccuracy": 0.5067959192123417, + "eval_loss": 1.615372657775879, + "eval_runtime": 152.4193, + "eval_samples_per_second": 416.456, + "eval_steps_per_second": 1.627, + "step": 803000 + }, + { + "epoch": 3.2716529057765187, + "grad_norm": 4.651641368865967, + "learning_rate": 0.002539229361807089, + "loss": 7.6845, + "step": 803100 + }, + { + "epoch": 3.2720602837998998, + "grad_norm": 5.103263854980469, + "learning_rate": 0.002538733314551964, + "loss": 7.6793, + "step": 803200 + }, + { + "epoch": 3.2724676618232813, + "grad_norm": 5.449814319610596, + "learning_rate": 0.002538237265963181, + "loss": 7.666, + "step": 803300 + }, + { + "epoch": 3.272875039846663, + "grad_norm": 8.951945304870605, + "learning_rate": 0.002537741216060347, + "loss": 7.6829, + "step": 803400 + }, + { + "epoch": 3.2732824178700444, + "grad_norm": 10.639776229858398, + "learning_rate": 0.002537245164863083, + "loss": 7.6379, + "step": 803500 + }, + { + "epoch": 3.273689795893426, + "grad_norm": 4.770523548126221, + "learning_rate": 0.002536749112390995, + "loss": 7.6708, + "step": 803600 + }, + { + "epoch": 3.2740971739168074, + "grad_norm": 6.401170253753662, + "learning_rate": 0.0025362530586636903, + "loss": 7.6789, + "step": 803700 + }, + { + "epoch": 3.274504551940189, + "grad_norm": 10.611383438110352, + "learning_rate": 0.0025357570037007884, + "loss": 7.6818, + "step": 803800 + }, + { + "epoch": 3.27491192996357, + "grad_norm": 4.672033786773682, + "learning_rate": 0.002535260947521895, + "loss": 7.6442, + "step": 803900 + }, + { + "epoch": 3.2753193079869516, + "grad_norm": 4.015482425689697, + "learning_rate": 0.002534764890146625, + "loss": 7.6568, + "step": 804000 + }, + { + "epoch": 3.2753193079869516, + "eval_MaskedAccuracy": 0.5067048275041367, + "eval_loss": 1.618336796760559, + "eval_runtime": 150.6131, + "eval_samples_per_second": 421.451, + "eval_steps_per_second": 1.647, + "step": 804000 + }, + { + "epoch": 3.275726686010333, + "grad_norm": 4.1908769607543945, + "learning_rate": 0.0025342688315945897, + "loss": 7.6986, + "step": 804100 + }, + { + "epoch": 3.2761340640337147, + "grad_norm": 8.904797554016113, + "learning_rate": 0.002533772771885403, + "loss": 7.6702, + "step": 804200 + }, + { + "epoch": 3.276541442057096, + "grad_norm": 3.950209856033325, + "learning_rate": 0.0025332767110386716, + "loss": 7.679, + "step": 804300 + }, + { + "epoch": 3.2769488200804777, + "grad_norm": 5.024580955505371, + "learning_rate": 0.002532780649074014, + "loss": 7.7098, + "step": 804400 + }, + { + "epoch": 3.277356198103859, + "grad_norm": 10.476592063903809, + "learning_rate": 0.002532284586011038, + "loss": 7.6975, + "step": 804500 + }, + { + "epoch": 3.2777635761272403, + "grad_norm": 5.804891586303711, + "learning_rate": 0.0025317885218693564, + "loss": 7.6933, + "step": 804600 + }, + { + "epoch": 3.278170954150622, + "grad_norm": 4.553762912750244, + "learning_rate": 0.0025312924566685824, + "loss": 7.6826, + "step": 804700 + }, + { + "epoch": 3.2785783321740034, + "grad_norm": 4.334897041320801, + "learning_rate": 0.002530796390428322, + "loss": 7.6761, + "step": 804800 + }, + { + "epoch": 3.278985710197385, + "grad_norm": 3.2402517795562744, + "learning_rate": 0.002530300323168194, + "loss": 7.698, + "step": 804900 + }, + { + "epoch": 3.2793930882207665, + "grad_norm": 5.841893672943115, + "learning_rate": 0.002529804254907811, + "loss": 7.6657, + "step": 805000 + }, + { + "epoch": 3.2793930882207665, + "eval_MaskedAccuracy": 0.5069957634032773, + "eval_loss": 1.621890664100647, + "eval_runtime": 158.4293, + "eval_samples_per_second": 400.658, + "eval_steps_per_second": 1.565, + "step": 805000 + }, + { + "epoch": 3.279800466244148, + "grad_norm": 3.275071382522583, + "learning_rate": 0.002529308185666784, + "loss": 7.6573, + "step": 805100 + }, + { + "epoch": 3.280207844267529, + "grad_norm": 8.552112579345703, + "learning_rate": 0.0025288121154647216, + "loss": 7.672, + "step": 805200 + }, + { + "epoch": 3.2806152222909106, + "grad_norm": 6.308941841125488, + "learning_rate": 0.0025283160443212403, + "loss": 7.6898, + "step": 805300 + }, + { + "epoch": 3.281022600314292, + "grad_norm": 5.6279215812683105, + "learning_rate": 0.0025278199722559526, + "loss": 7.6964, + "step": 805400 + }, + { + "epoch": 3.2814299783376737, + "grad_norm": 9.12334156036377, + "learning_rate": 0.0025273238992884694, + "loss": 7.6976, + "step": 805500 + }, + { + "epoch": 3.2818373563610552, + "grad_norm": 4.141939640045166, + "learning_rate": 0.002526827825438402, + "loss": 7.6785, + "step": 805600 + }, + { + "epoch": 3.2822447343844363, + "grad_norm": 4.386415481567383, + "learning_rate": 0.0025263317507253627, + "loss": 7.6561, + "step": 805700 + }, + { + "epoch": 3.282652112407818, + "grad_norm": 12.528108596801758, + "learning_rate": 0.0025258356751689673, + "loss": 7.6917, + "step": 805800 + }, + { + "epoch": 3.2830594904311994, + "grad_norm": 4.788679599761963, + "learning_rate": 0.002525339598788825, + "loss": 7.6804, + "step": 805900 + }, + { + "epoch": 3.283466868454581, + "grad_norm": 4.137840747833252, + "learning_rate": 0.0025248435216045483, + "loss": 7.6733, + "step": 806000 + }, + { + "epoch": 3.283466868454581, + "eval_MaskedAccuracy": 0.5072434898924681, + "eval_loss": 1.6060513257980347, + "eval_runtime": 168.0009, + "eval_samples_per_second": 377.831, + "eval_steps_per_second": 1.476, + "step": 806000 + }, + { + "epoch": 3.2838742464779624, + "grad_norm": 4.594651699066162, + "learning_rate": 0.0025243474436357507, + "loss": 7.6882, + "step": 806100 + }, + { + "epoch": 3.284281624501344, + "grad_norm": 3.3456647396087646, + "learning_rate": 0.0025238513649020475, + "loss": 7.6801, + "step": 806200 + }, + { + "epoch": 3.2846890025247255, + "grad_norm": 7.094583034515381, + "learning_rate": 0.002523355285423046, + "loss": 7.6871, + "step": 806300 + }, + { + "epoch": 3.2850963805481066, + "grad_norm": 15.75999927520752, + "learning_rate": 0.0025228592052183654, + "loss": 7.6768, + "step": 806400 + }, + { + "epoch": 3.285503758571488, + "grad_norm": 6.918394565582275, + "learning_rate": 0.002522363124307616, + "loss": 7.6739, + "step": 806500 + }, + { + "epoch": 3.2859111365948697, + "grad_norm": 11.002787590026855, + "learning_rate": 0.002521867042710408, + "loss": 7.6724, + "step": 806600 + }, + { + "epoch": 3.286318514618251, + "grad_norm": 4.709118366241455, + "learning_rate": 0.0025213709604463516, + "loss": 7.6999, + "step": 806700 + }, + { + "epoch": 3.2867258926416327, + "grad_norm": 5.836373329162598, + "learning_rate": 0.0025208748775350665, + "loss": 7.6598, + "step": 806800 + }, + { + "epoch": 3.2871332706650143, + "grad_norm": 3.9379138946533203, + "learning_rate": 0.0025203787939961633, + "loss": 7.7014, + "step": 806900 + }, + { + "epoch": 3.2875406486883954, + "grad_norm": 7.974735736846924, + "learning_rate": 0.002519882709849256, + "loss": 7.6726, + "step": 807000 + }, + { + "epoch": 3.2875406486883954, + "eval_MaskedAccuracy": 0.5066687168272626, + "eval_loss": 1.6104329824447632, + "eval_runtime": 156.0427, + "eval_samples_per_second": 406.786, + "eval_steps_per_second": 1.589, + "step": 807000 + }, + { + "epoch": 3.287948026711777, + "grad_norm": 9.561921119689941, + "learning_rate": 0.002519386625113955, + "loss": 7.6568, + "step": 807100 + }, + { + "epoch": 3.2883554047351584, + "grad_norm": 2.4830827713012695, + "learning_rate": 0.0025188905398098745, + "loss": 7.6945, + "step": 807200 + }, + { + "epoch": 3.28876278275854, + "grad_norm": 8.926530838012695, + "learning_rate": 0.002518394453956622, + "loss": 7.6425, + "step": 807300 + }, + { + "epoch": 3.2891701607819215, + "grad_norm": 6.503739833831787, + "learning_rate": 0.0025178983675738145, + "loss": 7.6919, + "step": 807400 + }, + { + "epoch": 3.289577538805303, + "grad_norm": 5.133355617523193, + "learning_rate": 0.0025174022806810662, + "loss": 7.6513, + "step": 807500 + }, + { + "epoch": 3.2899849168286845, + "grad_norm": 3.24489164352417, + "learning_rate": 0.002516906193297992, + "loss": 7.6742, + "step": 807600 + }, + { + "epoch": 3.2903922948520656, + "grad_norm": 4.072437286376953, + "learning_rate": 0.0025164101054442035, + "loss": 7.6676, + "step": 807700 + }, + { + "epoch": 3.290799672875447, + "grad_norm": 4.990175724029541, + "learning_rate": 0.0025159140171393114, + "loss": 7.6899, + "step": 807800 + }, + { + "epoch": 3.2912070508988287, + "grad_norm": 4.706521034240723, + "learning_rate": 0.0025154179284029334, + "loss": 7.6741, + "step": 807900 + }, + { + "epoch": 3.2916144289222102, + "grad_norm": 3.965909719467163, + "learning_rate": 0.002514921839254675, + "loss": 7.6574, + "step": 808000 + }, + { + "epoch": 3.2916144289222102, + "eval_MaskedAccuracy": 0.5066463820582249, + "eval_loss": 1.6170014142990112, + "eval_runtime": 155.9467, + "eval_samples_per_second": 407.037, + "eval_steps_per_second": 1.59, + "step": 808000 + }, + { + "epoch": 3.2920218069455918, + "grad_norm": 4.205779075622559, + "learning_rate": 0.002514425749714154, + "loss": 7.6996, + "step": 808100 + }, + { + "epoch": 3.292429184968973, + "grad_norm": 3.5748183727264404, + "learning_rate": 0.002513929659800981, + "loss": 7.6637, + "step": 808200 + }, + { + "epoch": 3.2928365629923544, + "grad_norm": 3.411655902862549, + "learning_rate": 0.002513433569534771, + "loss": 7.667, + "step": 808300 + }, + { + "epoch": 3.293243941015736, + "grad_norm": 5.77153205871582, + "learning_rate": 0.002512937478935137, + "loss": 7.6738, + "step": 808400 + }, + { + "epoch": 3.2936513190391175, + "grad_norm": 8.544811248779297, + "learning_rate": 0.0025124413880216936, + "loss": 7.6931, + "step": 808500 + }, + { + "epoch": 3.294058697062499, + "grad_norm": 9.314810752868652, + "learning_rate": 0.0025119452968140493, + "loss": 7.6602, + "step": 808600 + }, + { + "epoch": 3.2944660750858805, + "grad_norm": 4.716907978057861, + "learning_rate": 0.002511449205331821, + "loss": 7.6472, + "step": 808700 + }, + { + "epoch": 3.294873453109262, + "grad_norm": 6.9617743492126465, + "learning_rate": 0.0025109531135946235, + "loss": 7.6572, + "step": 808800 + }, + { + "epoch": 3.295280831132643, + "grad_norm": 4.736232280731201, + "learning_rate": 0.002510457021622067, + "loss": 7.6661, + "step": 808900 + }, + { + "epoch": 3.2956882091560247, + "grad_norm": 3.1326653957366943, + "learning_rate": 0.002509960929433766, + "loss": 7.6567, + "step": 809000 + }, + { + "epoch": 3.2956882091560247, + "eval_MaskedAccuracy": 0.5064585278057439, + "eval_loss": 1.6185801029205322, + "eval_runtime": 154.763, + "eval_samples_per_second": 410.15, + "eval_steps_per_second": 1.602, + "step": 809000 + }, + { + "epoch": 3.296095587179406, + "grad_norm": 4.165744304656982, + "learning_rate": 0.002509464837049331, + "loss": 7.7108, + "step": 809100 + }, + { + "epoch": 3.2965029652027877, + "grad_norm": 5.649426460266113, + "learning_rate": 0.0025089687444883745, + "loss": 7.6691, + "step": 809200 + }, + { + "epoch": 3.2969103432261693, + "grad_norm": 4.543972969055176, + "learning_rate": 0.0025084726517705137, + "loss": 7.6689, + "step": 809300 + }, + { + "epoch": 3.297317721249551, + "grad_norm": 2.8531885147094727, + "learning_rate": 0.002507976558915362, + "loss": 7.6529, + "step": 809400 + }, + { + "epoch": 3.297725099272932, + "grad_norm": 6.886300086975098, + "learning_rate": 0.0025074804659425314, + "loss": 7.6462, + "step": 809500 + }, + { + "epoch": 3.2981324772963134, + "grad_norm": 3.5927135944366455, + "learning_rate": 0.0025069843728716353, + "loss": 7.666, + "step": 809600 + }, + { + "epoch": 3.298539855319695, + "grad_norm": 9.71517276763916, + "learning_rate": 0.0025064882797222895, + "loss": 7.6824, + "step": 809700 + }, + { + "epoch": 3.2989472333430765, + "grad_norm": 8.23390007019043, + "learning_rate": 0.0025059921865141018, + "loss": 7.677, + "step": 809800 + }, + { + "epoch": 3.299354611366458, + "grad_norm": 2.9719061851501465, + "learning_rate": 0.002505496093266686, + "loss": 7.6958, + "step": 809900 + }, + { + "epoch": 3.2997619893898396, + "grad_norm": 6.405104160308838, + "learning_rate": 0.002504999999999657, + "loss": 7.6959, + "step": 810000 + }, + { + "epoch": 3.2997619893898396, + "eval_MaskedAccuracy": 0.5065891772807434, + "eval_loss": 1.6186394691467285, + "eval_runtime": 159.5458, + "eval_samples_per_second": 397.854, + "eval_steps_per_second": 1.554, + "step": 810000 + }, + { + "epoch": 3.300169367413221, + "grad_norm": 5.008472442626953, + "learning_rate": 0.0025045039067326273, + "loss": 7.7108, + "step": 810100 + }, + { + "epoch": 3.300576745436602, + "grad_norm": 6.079156875610352, + "learning_rate": 0.002504007813485209, + "loss": 7.6702, + "step": 810200 + }, + { + "epoch": 3.3009841234599837, + "grad_norm": 3.707399845123291, + "learning_rate": 0.0025035117202770163, + "loss": 7.6839, + "step": 810300 + }, + { + "epoch": 3.3013915014833652, + "grad_norm": 4.559576988220215, + "learning_rate": 0.002503015627127667, + "loss": 7.6854, + "step": 810400 + }, + { + "epoch": 3.3017988795067468, + "grad_norm": 6.961183071136475, + "learning_rate": 0.0025025195340567714, + "loss": 7.681, + "step": 810500 + }, + { + "epoch": 3.3022062575301283, + "grad_norm": 4.960519313812256, + "learning_rate": 0.002502023441083942, + "loss": 7.6743, + "step": 810600 + }, + { + "epoch": 3.3026136355535094, + "grad_norm": 4.832784175872803, + "learning_rate": 0.0025015273482287895, + "loss": 7.6567, + "step": 810700 + }, + { + "epoch": 3.303021013576891, + "grad_norm": 3.261326313018799, + "learning_rate": 0.0025010312555109305, + "loss": 7.7116, + "step": 810800 + }, + { + "epoch": 3.3034283916002725, + "grad_norm": 8.754937171936035, + "learning_rate": 0.002500535162949978, + "loss": 7.6876, + "step": 810900 + }, + { + "epoch": 3.303835769623654, + "grad_norm": 4.600485324859619, + "learning_rate": 0.0025000390705655438, + "loss": 7.674, + "step": 811000 + }, + { + "epoch": 3.303835769623654, + "eval_MaskedAccuracy": 0.5071697466006893, + "eval_loss": 1.604413628578186, + "eval_runtime": 156.5897, + "eval_samples_per_second": 405.365, + "eval_steps_per_second": 1.584, + "step": 811000 + }, + { + "epoch": 3.3042431476470355, + "grad_norm": 10.910696983337402, + "learning_rate": 0.002499542978377241, + "loss": 7.6794, + "step": 811100 + }, + { + "epoch": 3.304650525670417, + "grad_norm": 5.202807426452637, + "learning_rate": 0.0024990468864046853, + "loss": 7.6865, + "step": 811200 + }, + { + "epoch": 3.3050579036937986, + "grad_norm": 2.4385552406311035, + "learning_rate": 0.002498550794667481, + "loss": 7.6832, + "step": 811300 + }, + { + "epoch": 3.3054652817171797, + "grad_norm": 3.8294498920440674, + "learning_rate": 0.002498054703185255, + "loss": 7.6796, + "step": 811400 + }, + { + "epoch": 3.305872659740561, + "grad_norm": 4.572168350219727, + "learning_rate": 0.0024975586119776122, + "loss": 7.6867, + "step": 811500 + }, + { + "epoch": 3.3062800377639427, + "grad_norm": 4.635795593261719, + "learning_rate": 0.0024970625210641666, + "loss": 7.6873, + "step": 811600 + }, + { + "epoch": 3.3066874157873243, + "grad_norm": 4.829721450805664, + "learning_rate": 0.0024965664304645342, + "loss": 7.6672, + "step": 811700 + }, + { + "epoch": 3.307094793810706, + "grad_norm": 5.453036308288574, + "learning_rate": 0.0024960703401983248, + "loss": 7.6808, + "step": 811800 + }, + { + "epoch": 3.3075021718340873, + "grad_norm": 10.133064270019531, + "learning_rate": 0.0024955742502851524, + "loss": 7.6761, + "step": 811900 + }, + { + "epoch": 3.3079095498574684, + "grad_norm": 4.497378826141357, + "learning_rate": 0.0024950781607446302, + "loss": 7.6589, + "step": 812000 + }, + { + "epoch": 3.3079095498574684, + "eval_MaskedAccuracy": 0.5067306555130758, + "eval_loss": 1.6124553680419922, + "eval_runtime": 157.0336, + "eval_samples_per_second": 404.219, + "eval_steps_per_second": 1.579, + "step": 812000 + }, + { + "epoch": 3.30831692788085, + "grad_norm": 4.651226997375488, + "learning_rate": 0.0024945820715963694, + "loss": 7.6896, + "step": 812100 + }, + { + "epoch": 3.3087243059042315, + "grad_norm": 5.181884765625, + "learning_rate": 0.002494085982859984, + "loss": 7.6753, + "step": 812200 + }, + { + "epoch": 3.309131683927613, + "grad_norm": 6.102396011352539, + "learning_rate": 0.002493589894555091, + "loss": 7.6916, + "step": 812300 + }, + { + "epoch": 3.3095390619509946, + "grad_norm": 5.432448387145996, + "learning_rate": 0.0024930938067013004, + "loss": 7.7008, + "step": 812400 + }, + { + "epoch": 3.309946439974376, + "grad_norm": 5.259257793426514, + "learning_rate": 0.0024925977193182235, + "loss": 7.6815, + "step": 812500 + }, + { + "epoch": 3.3103538179977576, + "grad_norm": 4.742974758148193, + "learning_rate": 0.0024921016324254788, + "loss": 7.6563, + "step": 812600 + }, + { + "epoch": 3.3107611960211387, + "grad_norm": 3.7323997020721436, + "learning_rate": 0.002491605546042675, + "loss": 7.6616, + "step": 812700 + }, + { + "epoch": 3.3111685740445203, + "grad_norm": 4.109499454498291, + "learning_rate": 0.0024911094601894235, + "loss": 7.6636, + "step": 812800 + }, + { + "epoch": 3.311575952067902, + "grad_norm": 4.057988166809082, + "learning_rate": 0.0024906133748853407, + "loss": 7.6849, + "step": 812900 + }, + { + "epoch": 3.3119833300912833, + "grad_norm": 5.340011119842529, + "learning_rate": 0.0024901172901500407, + "loss": 7.6987, + "step": 813000 + }, + { + "epoch": 3.3119833300912833, + "eval_MaskedAccuracy": 0.5066784227780029, + "eval_loss": 1.6174936294555664, + "eval_runtime": 157.179, + "eval_samples_per_second": 403.845, + "eval_steps_per_second": 1.578, + "step": 813000 + }, + { + "epoch": 3.312390708114665, + "grad_norm": 7.912527084350586, + "learning_rate": 0.0024896212060031325, + "loss": 7.6837, + "step": 813100 + }, + { + "epoch": 3.312798086138046, + "grad_norm": 6.485486030578613, + "learning_rate": 0.0024891251224642284, + "loss": 7.6961, + "step": 813200 + }, + { + "epoch": 3.3132054641614275, + "grad_norm": 5.897899627685547, + "learning_rate": 0.002488629039552944, + "loss": 7.6642, + "step": 813300 + }, + { + "epoch": 3.313612842184809, + "grad_norm": 3.0667595863342285, + "learning_rate": 0.002488132957288891, + "loss": 7.6893, + "step": 813400 + }, + { + "epoch": 3.3140202202081905, + "grad_norm": 3.066885232925415, + "learning_rate": 0.002487636875691679, + "loss": 7.6936, + "step": 813500 + }, + { + "epoch": 3.314427598231572, + "grad_norm": 4.320520877838135, + "learning_rate": 0.00248714079478093, + "loss": 7.6324, + "step": 813600 + }, + { + "epoch": 3.3148349762549536, + "grad_norm": 9.33206844329834, + "learning_rate": 0.0024866447145762472, + "loss": 7.6475, + "step": 813700 + }, + { + "epoch": 3.315242354278335, + "grad_norm": 5.835254192352295, + "learning_rate": 0.002486148635097248, + "loss": 7.6736, + "step": 813800 + }, + { + "epoch": 3.3156497323017162, + "grad_norm": 5.379861354827881, + "learning_rate": 0.0024856525563635465, + "loss": 7.6713, + "step": 813900 + }, + { + "epoch": 3.3160571103250978, + "grad_norm": 4.814055919647217, + "learning_rate": 0.0024851564783947497, + "loss": 7.6467, + "step": 814000 + }, + { + "epoch": 3.3160571103250978, + "eval_MaskedAccuracy": 0.5070498232020509, + "eval_loss": 1.6167371273040771, + "eval_runtime": 156.0631, + "eval_samples_per_second": 406.733, + "eval_steps_per_second": 1.589, + "step": 814000 + }, + { + "epoch": 3.3164644883484793, + "grad_norm": 7.058439254760742, + "learning_rate": 0.0024846604012104746, + "loss": 7.6669, + "step": 814100 + }, + { + "epoch": 3.316871866371861, + "grad_norm": 9.72732925415039, + "learning_rate": 0.002484164324830333, + "loss": 7.6409, + "step": 814200 + }, + { + "epoch": 3.3172792443952424, + "grad_norm": 7.237351417541504, + "learning_rate": 0.0024836682492739353, + "loss": 7.6698, + "step": 814300 + }, + { + "epoch": 3.317686622418624, + "grad_norm": 4.3387064933776855, + "learning_rate": 0.002483172174560898, + "loss": 7.6407, + "step": 814400 + }, + { + "epoch": 3.318094000442005, + "grad_norm": 5.568198204040527, + "learning_rate": 0.0024826761007108295, + "loss": 7.6651, + "step": 814500 + }, + { + "epoch": 3.3185013784653865, + "grad_norm": 11.06213665008545, + "learning_rate": 0.002482180027743345, + "loss": 7.6739, + "step": 814600 + }, + { + "epoch": 3.318908756488768, + "grad_norm": 3.0172247886657715, + "learning_rate": 0.0024816839556780546, + "loss": 7.67, + "step": 814700 + }, + { + "epoch": 3.3193161345121496, + "grad_norm": 5.155795574188232, + "learning_rate": 0.002481187884534573, + "loss": 7.6611, + "step": 814800 + }, + { + "epoch": 3.319723512535531, + "grad_norm": 8.746994018554688, + "learning_rate": 0.0024806918143325143, + "loss": 7.6505, + "step": 814900 + }, + { + "epoch": 3.3201308905589126, + "grad_norm": 8.797126770019531, + "learning_rate": 0.0024801957450914895, + "loss": 7.6684, + "step": 815000 + }, + { + "epoch": 3.3201308905589126, + "eval_MaskedAccuracy": 0.506765295166268, + "eval_loss": 1.6302165985107422, + "eval_runtime": 158.1441, + "eval_samples_per_second": 401.381, + "eval_steps_per_second": 1.568, + "step": 815000 + }, + { + "epoch": 3.320538268582294, + "grad_norm": 4.325685024261475, + "learning_rate": 0.0024796996768311048, + "loss": 7.6537, + "step": 815100 + }, + { + "epoch": 3.3209456466056753, + "grad_norm": 3.9383885860443115, + "learning_rate": 0.0024792036095709776, + "loss": 7.6676, + "step": 815200 + }, + { + "epoch": 3.321353024629057, + "grad_norm": 4.64644193649292, + "learning_rate": 0.002478707543330718, + "loss": 7.6647, + "step": 815300 + }, + { + "epoch": 3.3217604026524383, + "grad_norm": 4.420475482940674, + "learning_rate": 0.002478211478129945, + "loss": 7.6666, + "step": 815400 + }, + { + "epoch": 3.32216778067582, + "grad_norm": 16.778709411621094, + "learning_rate": 0.0024777154139882635, + "loss": 7.6451, + "step": 815500 + }, + { + "epoch": 3.3225751586992014, + "grad_norm": 3.701763153076172, + "learning_rate": 0.0024772193509252837, + "loss": 7.6381, + "step": 815600 + }, + { + "epoch": 3.3229825367225825, + "grad_norm": 8.478386878967285, + "learning_rate": 0.002476723288960625, + "loss": 7.6689, + "step": 815700 + }, + { + "epoch": 3.323389914745964, + "grad_norm": 8.00002670288086, + "learning_rate": 0.002476227228113897, + "loss": 7.6785, + "step": 815800 + }, + { + "epoch": 3.3237972927693455, + "grad_norm": 6.67539119720459, + "learning_rate": 0.0024757311684047104, + "loss": 7.6529, + "step": 815900 + }, + { + "epoch": 3.324204670792727, + "grad_norm": 9.989371299743652, + "learning_rate": 0.002475235109852678, + "loss": 7.6829, + "step": 816000 + }, + { + "epoch": 3.324204670792727, + "eval_MaskedAccuracy": 0.506909330141037, + "eval_loss": 1.6146438121795654, + "eval_runtime": 160.3621, + "eval_samples_per_second": 395.829, + "eval_steps_per_second": 1.547, + "step": 816000 + }, + { + "epoch": 3.3246120488161086, + "grad_norm": 3.530759811401367, + "learning_rate": 0.0024747390524774084, + "loss": 7.6373, + "step": 816100 + }, + { + "epoch": 3.32501942683949, + "grad_norm": 4.5824198722839355, + "learning_rate": 0.002474242996298517, + "loss": 7.6693, + "step": 816200 + }, + { + "epoch": 3.3254268048628717, + "grad_norm": 5.958522319793701, + "learning_rate": 0.0024737469413356147, + "loss": 7.6494, + "step": 816300 + }, + { + "epoch": 3.3258341828862528, + "grad_norm": 13.90539836883545, + "learning_rate": 0.002473250887608314, + "loss": 7.645, + "step": 816400 + }, + { + "epoch": 3.3262415609096343, + "grad_norm": 3.1944069862365723, + "learning_rate": 0.0024727548351362225, + "loss": 7.6785, + "step": 816500 + }, + { + "epoch": 3.326648938933016, + "grad_norm": 7.595718860626221, + "learning_rate": 0.00247225878393896, + "loss": 7.6689, + "step": 816600 + }, + { + "epoch": 3.3270563169563974, + "grad_norm": 8.94078540802002, + "learning_rate": 0.002471762734036131, + "loss": 7.6623, + "step": 816700 + }, + { + "epoch": 3.327463694979779, + "grad_norm": 4.060114860534668, + "learning_rate": 0.0024712666854473475, + "loss": 7.6815, + "step": 816800 + }, + { + "epoch": 3.3278710730031604, + "grad_norm": 5.923506259918213, + "learning_rate": 0.0024707706381922228, + "loss": 7.6493, + "step": 816900 + }, + { + "epoch": 3.3282784510265415, + "grad_norm": 10.673074722290039, + "learning_rate": 0.0024702745922903693, + "loss": 7.6818, + "step": 817000 + }, + { + "epoch": 3.3282784510265415, + "eval_MaskedAccuracy": 0.506995028131734, + "eval_loss": 1.6104918718338013, + "eval_runtime": 161.0248, + "eval_samples_per_second": 394.2, + "eval_steps_per_second": 1.54, + "step": 817000 + }, + { + "epoch": 3.328685829049923, + "grad_norm": 5.100268363952637, + "learning_rate": 0.002469778547761395, + "loss": 7.6891, + "step": 817100 + }, + { + "epoch": 3.3290932070733046, + "grad_norm": 5.348010540008545, + "learning_rate": 0.002469282504624912, + "loss": 7.6533, + "step": 817200 + }, + { + "epoch": 3.329500585096686, + "grad_norm": 8.126382827758789, + "learning_rate": 0.002468786462900533, + "loss": 7.6506, + "step": 817300 + }, + { + "epoch": 3.3299079631200676, + "grad_norm": 5.103185653686523, + "learning_rate": 0.002468290422607871, + "loss": 7.6558, + "step": 817400 + }, + { + "epoch": 3.330315341143449, + "grad_norm": 9.422660827636719, + "learning_rate": 0.0024677943837665356, + "loss": 7.6619, + "step": 817500 + }, + { + "epoch": 3.3307227191668307, + "grad_norm": 6.416852951049805, + "learning_rate": 0.0024672983463961346, + "loss": 7.6366, + "step": 817600 + }, + { + "epoch": 3.331130097190212, + "grad_norm": 3.414886951446533, + "learning_rate": 0.0024668023105162812, + "loss": 7.6791, + "step": 817700 + }, + { + "epoch": 3.3315374752135933, + "grad_norm": 5.236874580383301, + "learning_rate": 0.0024663062761465884, + "loss": 7.6712, + "step": 817800 + }, + { + "epoch": 3.331944853236975, + "grad_norm": 4.077887058258057, + "learning_rate": 0.002465810243306666, + "loss": 7.6438, + "step": 817900 + }, + { + "epoch": 3.3323522312603564, + "grad_norm": 8.175515174865723, + "learning_rate": 0.0024653142120161235, + "loss": 7.6625, + "step": 818000 + }, + { + "epoch": 3.3323522312603564, + "eval_MaskedAccuracy": 0.5075873129937887, + "eval_loss": 1.612989902496338, + "eval_runtime": 160.8331, + "eval_samples_per_second": 394.67, + "eval_steps_per_second": 1.542, + "step": 818000 + }, + { + "epoch": 3.332759609283738, + "grad_norm": 9.09115219116211, + "learning_rate": 0.002464818182294573, + "loss": 7.6806, + "step": 818100 + }, + { + "epoch": 3.333166987307119, + "grad_norm": 9.988933563232422, + "learning_rate": 0.0024643221541616215, + "loss": 7.6469, + "step": 818200 + }, + { + "epoch": 3.3335743653305006, + "grad_norm": 7.367795467376709, + "learning_rate": 0.002463826127636883, + "loss": 7.6759, + "step": 818300 + }, + { + "epoch": 3.333981743353882, + "grad_norm": 7.297520637512207, + "learning_rate": 0.0024633301027399703, + "loss": 7.6325, + "step": 818400 + }, + { + "epoch": 3.3343891213772636, + "grad_norm": 4.209428787231445, + "learning_rate": 0.0024628340794904894, + "loss": 7.6408, + "step": 818500 + }, + { + "epoch": 3.334796499400645, + "grad_norm": 7.046091556549072, + "learning_rate": 0.0024623380579080535, + "loss": 7.6797, + "step": 818600 + }, + { + "epoch": 3.3352038774240267, + "grad_norm": 7.12550163269043, + "learning_rate": 0.0024618420380122736, + "loss": 7.6509, + "step": 818700 + }, + { + "epoch": 3.335611255447408, + "grad_norm": 4.62762975692749, + "learning_rate": 0.002461346019822758, + "loss": 7.6946, + "step": 818800 + }, + { + "epoch": 3.3360186334707893, + "grad_norm": 7.47962760925293, + "learning_rate": 0.002460850003359117, + "loss": 7.6563, + "step": 818900 + }, + { + "epoch": 3.336426011494171, + "grad_norm": 9.683699607849121, + "learning_rate": 0.0024603539886409614, + "loss": 7.6782, + "step": 819000 + }, + { + "epoch": 3.336426011494171, + "eval_MaskedAccuracy": 0.5057671322914739, + "eval_loss": 1.620146632194519, + "eval_runtime": 161.8696, + "eval_samples_per_second": 392.143, + "eval_steps_per_second": 1.532, + "step": 819000 + }, + { + "epoch": 3.3368333895175524, + "grad_norm": 3.1025774478912354, + "learning_rate": 0.0024598579756879035, + "loss": 7.6678, + "step": 819100 + }, + { + "epoch": 3.337240767540934, + "grad_norm": 5.008169174194336, + "learning_rate": 0.002459361964519551, + "loss": 7.6418, + "step": 819200 + }, + { + "epoch": 3.3376481455643154, + "grad_norm": 4.815037727355957, + "learning_rate": 0.0024588659551555126, + "loss": 7.6788, + "step": 819300 + }, + { + "epoch": 3.3380555235876965, + "grad_norm": 5.289239406585693, + "learning_rate": 0.002458369947615401, + "loss": 7.6675, + "step": 819400 + }, + { + "epoch": 3.338462901611078, + "grad_norm": 9.67807674407959, + "learning_rate": 0.0024578739419188252, + "loss": 7.6561, + "step": 819500 + }, + { + "epoch": 3.3388702796344596, + "grad_norm": 6.424745082855225, + "learning_rate": 0.0024573779380853934, + "loss": 7.6818, + "step": 819600 + }, + { + "epoch": 3.339277657657841, + "grad_norm": 10.432646751403809, + "learning_rate": 0.002456881936134717, + "loss": 7.6853, + "step": 819700 + }, + { + "epoch": 3.3396850356812227, + "grad_norm": 3.7950234413146973, + "learning_rate": 0.0024563859360864047, + "loss": 7.6897, + "step": 819800 + }, + { + "epoch": 3.340092413704604, + "grad_norm": 8.507339477539062, + "learning_rate": 0.0024558899379600657, + "loss": 7.6618, + "step": 819900 + }, + { + "epoch": 3.3404997917279857, + "grad_norm": 5.6887617111206055, + "learning_rate": 0.0024553939417753094, + "loss": 7.6526, + "step": 820000 + }, + { + "epoch": 3.3404997917279857, + "eval_MaskedAccuracy": 0.5076526077855775, + "eval_loss": 1.6125506162643433, + "eval_runtime": 157.1165, + "eval_samples_per_second": 404.006, + "eval_steps_per_second": 1.578, + "step": 820000 + }, + { + "epoch": 3.3409071697513673, + "grad_norm": 4.954686164855957, + "learning_rate": 0.0024548979475517456, + "loss": 7.6473, + "step": 820100 + }, + { + "epoch": 3.3413145477747483, + "grad_norm": 3.6541824340820312, + "learning_rate": 0.0024544019553089853, + "loss": 7.6493, + "step": 820200 + }, + { + "epoch": 3.34172192579813, + "grad_norm": 8.365074157714844, + "learning_rate": 0.0024539059650666413, + "loss": 7.671, + "step": 820300 + }, + { + "epoch": 3.3421293038215114, + "grad_norm": 4.78241491317749, + "learning_rate": 0.0024534099768443157, + "loss": 7.6516, + "step": 820400 + }, + { + "epoch": 3.342536681844893, + "grad_norm": 5.602843284606934, + "learning_rate": 0.0024529139906616167, + "loss": 7.667, + "step": 820500 + }, + { + "epoch": 3.3429440598682745, + "grad_norm": 9.948620796203613, + "learning_rate": 0.0024524180065381576, + "loss": 7.6627, + "step": 820600 + }, + { + "epoch": 3.3433514378916556, + "grad_norm": 13.063933372497559, + "learning_rate": 0.0024519220244935444, + "loss": 7.669, + "step": 820700 + }, + { + "epoch": 3.343758815915037, + "grad_norm": 9.975808143615723, + "learning_rate": 0.0024514260445473867, + "loss": 7.6707, + "step": 820800 + }, + { + "epoch": 3.3441661939384186, + "grad_norm": 3.7139270305633545, + "learning_rate": 0.0024509300667192977, + "loss": 7.6485, + "step": 820900 + }, + { + "epoch": 3.3445735719618, + "grad_norm": 7.64076566696167, + "learning_rate": 0.0024504340910288832, + "loss": 7.6765, + "step": 821000 + }, + { + "epoch": 3.3445735719618, + "eval_MaskedAccuracy": 0.5068409945564115, + "eval_loss": 1.608394980430603, + "eval_runtime": 154.527, + "eval_samples_per_second": 410.776, + "eval_steps_per_second": 1.605, + "step": 821000 + }, + { + "epoch": 3.3449809499851817, + "grad_norm": 4.352261066436768, + "learning_rate": 0.0024499381174957462, + "loss": 7.6776, + "step": 821100 + }, + { + "epoch": 3.3453883280085632, + "grad_norm": 3.314389228820801, + "learning_rate": 0.0024494421461395024, + "loss": 7.6637, + "step": 821200 + }, + { + "epoch": 3.3457957060319448, + "grad_norm": 4.812958240509033, + "learning_rate": 0.0024489461769797573, + "loss": 7.7008, + "step": 821300 + }, + { + "epoch": 3.346203084055326, + "grad_norm": 9.614579200744629, + "learning_rate": 0.0024484502100361166, + "loss": 7.6563, + "step": 821400 + }, + { + "epoch": 3.3466104620787074, + "grad_norm": 9.866796493530273, + "learning_rate": 0.002447954245328194, + "loss": 7.6485, + "step": 821500 + }, + { + "epoch": 3.347017840102089, + "grad_norm": 6.312223434448242, + "learning_rate": 0.0024474582828755984, + "loss": 7.6668, + "step": 821600 + }, + { + "epoch": 3.3474252181254704, + "grad_norm": 3.8034420013427734, + "learning_rate": 0.002446962322697934, + "loss": 7.6654, + "step": 821700 + }, + { + "epoch": 3.347832596148852, + "grad_norm": 5.688786506652832, + "learning_rate": 0.002446466364814806, + "loss": 7.6635, + "step": 821800 + }, + { + "epoch": 3.348239974172233, + "grad_norm": 4.1226677894592285, + "learning_rate": 0.0024459704092458265, + "loss": 7.6707, + "step": 821900 + }, + { + "epoch": 3.3486473521956146, + "grad_norm": 5.9946417808532715, + "learning_rate": 0.002445474456010603, + "loss": 7.679, + "step": 822000 + }, + { + "epoch": 3.3486473521956146, + "eval_MaskedAccuracy": 0.5075059830427925, + "eval_loss": 1.6148271560668945, + "eval_runtime": 156.096, + "eval_samples_per_second": 406.647, + "eval_steps_per_second": 1.589, + "step": 822000 + }, + { + "epoch": 3.349054730218996, + "grad_norm": 5.2468390464782715, + "learning_rate": 0.00244497850512874, + "loss": 7.6851, + "step": 822100 + }, + { + "epoch": 3.3494621082423777, + "grad_norm": 10.354859352111816, + "learning_rate": 0.0024444825566198487, + "loss": 7.6781, + "step": 822200 + }, + { + "epoch": 3.349869486265759, + "grad_norm": 5.5955424308776855, + "learning_rate": 0.002443986610503536, + "loss": 7.6694, + "step": 822300 + }, + { + "epoch": 3.3502768642891407, + "grad_norm": 6.757050514221191, + "learning_rate": 0.0024434906667994095, + "loss": 7.6669, + "step": 822400 + }, + { + "epoch": 3.3506842423125223, + "grad_norm": 4.649733066558838, + "learning_rate": 0.0024429947255270763, + "loss": 7.6732, + "step": 822500 + }, + { + "epoch": 3.351091620335904, + "grad_norm": 5.6255998611450195, + "learning_rate": 0.00244249878670614, + "loss": 7.6712, + "step": 822600 + }, + { + "epoch": 3.351498998359285, + "grad_norm": 3.62913179397583, + "learning_rate": 0.002442002850356214, + "loss": 7.6585, + "step": 822700 + }, + { + "epoch": 3.3519063763826664, + "grad_norm": 4.7510833740234375, + "learning_rate": 0.0024415069164969005, + "loss": 7.6666, + "step": 822800 + }, + { + "epoch": 3.352313754406048, + "grad_norm": 8.73514175415039, + "learning_rate": 0.002441010985147808, + "loss": 7.6782, + "step": 822900 + }, + { + "epoch": 3.3527211324294295, + "grad_norm": 8.673792839050293, + "learning_rate": 0.002440515056328547, + "loss": 7.6822, + "step": 823000 + }, + { + "epoch": 3.3527211324294295, + "eval_MaskedAccuracy": 0.5070311418367376, + "eval_loss": 1.6214524507522583, + "eval_runtime": 167.6911, + "eval_samples_per_second": 378.529, + "eval_steps_per_second": 1.479, + "step": 823000 + }, + { + "epoch": 3.353128510452811, + "grad_norm": 9.47994327545166, + "learning_rate": 0.0024400191300587167, + "loss": 7.6653, + "step": 823100 + }, + { + "epoch": 3.353535888476192, + "grad_norm": 3.8950607776641846, + "learning_rate": 0.002439523206357927, + "loss": 7.6919, + "step": 823200 + }, + { + "epoch": 3.3539432664995736, + "grad_norm": 8.29250717163086, + "learning_rate": 0.0024390272852457874, + "loss": 7.6693, + "step": 823300 + }, + { + "epoch": 3.354350644522955, + "grad_norm": 3.7033770084381104, + "learning_rate": 0.002438531366741902, + "loss": 7.6653, + "step": 823400 + }, + { + "epoch": 3.3547580225463367, + "grad_norm": 8.865307807922363, + "learning_rate": 0.0024380354508658783, + "loss": 7.6741, + "step": 823500 + }, + { + "epoch": 3.3551654005697182, + "grad_norm": 2.245955467224121, + "learning_rate": 0.0024375395376373195, + "loss": 7.6803, + "step": 823600 + }, + { + "epoch": 3.3555727785930998, + "grad_norm": 3.0160439014434814, + "learning_rate": 0.002437043627075836, + "loss": 7.6555, + "step": 823700 + }, + { + "epoch": 3.3559801566164813, + "grad_norm": 6.213773727416992, + "learning_rate": 0.00243654771920103, + "loss": 7.6513, + "step": 823800 + }, + { + "epoch": 3.3563875346398624, + "grad_norm": 5.2603983879089355, + "learning_rate": 0.0024360518140325083, + "loss": 7.6741, + "step": 823900 + }, + { + "epoch": 3.356794912663244, + "grad_norm": 5.670897483825684, + "learning_rate": 0.0024355559115898763, + "loss": 7.6521, + "step": 824000 + }, + { + "epoch": 3.356794912663244, + "eval_MaskedAccuracy": 0.5072930216411269, + "eval_loss": 1.6101815700531006, + "eval_runtime": 151.6168, + "eval_samples_per_second": 418.661, + "eval_steps_per_second": 1.636, + "step": 824000 + }, + { + "epoch": 3.3572022906866255, + "grad_norm": 7.838451862335205, + "learning_rate": 0.0024350600118927417, + "loss": 7.6388, + "step": 824100 + }, + { + "epoch": 3.357609668710007, + "grad_norm": 8.521012306213379, + "learning_rate": 0.0024345641149607095, + "loss": 7.6269, + "step": 824200 + }, + { + "epoch": 3.3580170467333885, + "grad_norm": 4.4888081550598145, + "learning_rate": 0.0024340682208133847, + "loss": 7.6575, + "step": 824300 + }, + { + "epoch": 3.3584244247567696, + "grad_norm": 12.87949275970459, + "learning_rate": 0.0024335723294703735, + "loss": 7.6461, + "step": 824400 + }, + { + "epoch": 3.358831802780151, + "grad_norm": 8.240680694580078, + "learning_rate": 0.0024330764409512783, + "loss": 7.6768, + "step": 824500 + }, + { + "epoch": 3.3592391808035327, + "grad_norm": 6.6971917152404785, + "learning_rate": 0.0024325805552757053, + "loss": 7.6795, + "step": 824600 + }, + { + "epoch": 3.359646558826914, + "grad_norm": 5.107844829559326, + "learning_rate": 0.0024320846724632627, + "loss": 7.6604, + "step": 824700 + }, + { + "epoch": 3.3600539368502957, + "grad_norm": 3.7403807640075684, + "learning_rate": 0.0024315887925335537, + "loss": 7.6643, + "step": 824800 + }, + { + "epoch": 3.3604613148736773, + "grad_norm": 3.1838297843933105, + "learning_rate": 0.0024310929155061793, + "loss": 7.6528, + "step": 824900 + }, + { + "epoch": 3.360868692897059, + "grad_norm": 4.040480613708496, + "learning_rate": 0.002430597041400746, + "loss": 7.636, + "step": 825000 + }, + { + "epoch": 3.360868692897059, + "eval_MaskedAccuracy": 0.5077692151325165, + "eval_loss": 1.606154441833496, + "eval_runtime": 167.0439, + "eval_samples_per_second": 379.996, + "eval_steps_per_second": 1.485, + "step": 825000 + }, + { + "epoch": 3.3612760709204403, + "grad_norm": 5.605712413787842, + "learning_rate": 0.0024301011702368607, + "loss": 7.6593, + "step": 825100 + }, + { + "epoch": 3.3616834489438214, + "grad_norm": 6.239404678344727, + "learning_rate": 0.002429605302034129, + "loss": 7.6526, + "step": 825200 + }, + { + "epoch": 3.362090826967203, + "grad_norm": 4.008897304534912, + "learning_rate": 0.002429109436812153, + "loss": 7.6671, + "step": 825300 + }, + { + "epoch": 3.3624982049905845, + "grad_norm": 5.22152853012085, + "learning_rate": 0.0024286135745905397, + "loss": 7.6774, + "step": 825400 + }, + { + "epoch": 3.362905583013966, + "grad_norm": 5.3362555503845215, + "learning_rate": 0.0024281177153888866, + "loss": 7.671, + "step": 825500 + }, + { + "epoch": 3.3633129610373476, + "grad_norm": 3.5259885787963867, + "learning_rate": 0.0024276218592268, + "loss": 7.6432, + "step": 825600 + }, + { + "epoch": 3.3637203390607286, + "grad_norm": 4.645276069641113, + "learning_rate": 0.0024271260061238874, + "loss": 7.6706, + "step": 825700 + }, + { + "epoch": 3.36412771708411, + "grad_norm": 5.165298938751221, + "learning_rate": 0.002426630156099749, + "loss": 7.6634, + "step": 825800 + }, + { + "epoch": 3.3645350951074917, + "grad_norm": 3.0694339275360107, + "learning_rate": 0.0024261343091739906, + "loss": 7.6427, + "step": 825900 + }, + { + "epoch": 3.3649424731308732, + "grad_norm": 6.479582786560059, + "learning_rate": 0.002425638465366213, + "loss": 7.6484, + "step": 826000 + }, + { + "epoch": 3.3649424731308732, + "eval_MaskedAccuracy": 0.507014539951461, + "eval_loss": 1.6130748987197876, + "eval_runtime": 162.9599, + "eval_samples_per_second": 389.519, + "eval_steps_per_second": 1.522, + "step": 826000 + }, + { + "epoch": 3.365349851154255, + "grad_norm": 6.702629089355469, + "learning_rate": 0.002425142624696022, + "loss": 7.657, + "step": 826100 + }, + { + "epoch": 3.3657572291776363, + "grad_norm": 4.354644775390625, + "learning_rate": 0.002424646787183022, + "loss": 7.6614, + "step": 826200 + }, + { + "epoch": 3.366164607201018, + "grad_norm": 6.56011962890625, + "learning_rate": 0.0024241509528468127, + "loss": 7.6674, + "step": 826300 + }, + { + "epoch": 3.366571985224399, + "grad_norm": 7.575295448303223, + "learning_rate": 0.002423655121707001, + "loss": 7.6662, + "step": 826400 + }, + { + "epoch": 3.3669793632477805, + "grad_norm": 5.027916431427002, + "learning_rate": 0.0024231592937831847, + "loss": 7.6646, + "step": 826500 + }, + { + "epoch": 3.367386741271162, + "grad_norm": 2.8268332481384277, + "learning_rate": 0.00242266346909497, + "loss": 7.6551, + "step": 826600 + }, + { + "epoch": 3.3677941192945435, + "grad_norm": 12.005270957946777, + "learning_rate": 0.0024221676476619616, + "loss": 7.6723, + "step": 826700 + }, + { + "epoch": 3.368201497317925, + "grad_norm": 4.354057312011719, + "learning_rate": 0.0024216718295037546, + "loss": 7.6652, + "step": 826800 + }, + { + "epoch": 3.368608875341306, + "grad_norm": 4.566364288330078, + "learning_rate": 0.0024211760146399574, + "loss": 7.657, + "step": 826900 + }, + { + "epoch": 3.3690162533646877, + "grad_norm": 3.726116180419922, + "learning_rate": 0.0024206802030901707, + "loss": 7.6665, + "step": 827000 + }, + { + "epoch": 3.3690162533646877, + "eval_MaskedAccuracy": 0.5072880942238474, + "eval_loss": 1.610743522644043, + "eval_runtime": 156.0728, + "eval_samples_per_second": 406.708, + "eval_steps_per_second": 1.589, + "step": 827000 + }, + { + "epoch": 3.369423631388069, + "grad_norm": 3.6981687545776367, + "learning_rate": 0.0024201843948739974, + "loss": 7.6521, + "step": 827100 + }, + { + "epoch": 3.3698310094114508, + "grad_norm": 2.9989585876464844, + "learning_rate": 0.0024196885900110397, + "loss": 7.6803, + "step": 827200 + }, + { + "epoch": 3.3702383874348323, + "grad_norm": 10.081335067749023, + "learning_rate": 0.0024191927885208993, + "loss": 7.6505, + "step": 827300 + }, + { + "epoch": 3.370645765458214, + "grad_norm": 10.75118637084961, + "learning_rate": 0.0024186969904231755, + "loss": 7.6719, + "step": 827400 + }, + { + "epoch": 3.3710531434815953, + "grad_norm": 4.597393035888672, + "learning_rate": 0.002418201195737471, + "loss": 7.6763, + "step": 827500 + }, + { + "epoch": 3.3714605215049764, + "grad_norm": 7.283711910247803, + "learning_rate": 0.0024177054044833856, + "loss": 7.6566, + "step": 827600 + }, + { + "epoch": 3.371867899528358, + "grad_norm": 4.523886203765869, + "learning_rate": 0.002417209616680525, + "loss": 7.6649, + "step": 827700 + }, + { + "epoch": 3.3722752775517395, + "grad_norm": 4.122554779052734, + "learning_rate": 0.002416713832348482, + "loss": 7.6693, + "step": 827800 + }, + { + "epoch": 3.372682655575121, + "grad_norm": 5.343982219696045, + "learning_rate": 0.0024162180515068655, + "loss": 7.6575, + "step": 827900 + }, + { + "epoch": 3.3730900335985026, + "grad_norm": 6.65548849105835, + "learning_rate": 0.0024157222741752757, + "loss": 7.6535, + "step": 828000 + }, + { + "epoch": 3.3730900335985026, + "eval_MaskedAccuracy": 0.5065662796216804, + "eval_loss": 1.6128860712051392, + "eval_runtime": 156.5143, + "eval_samples_per_second": 405.561, + "eval_steps_per_second": 1.585, + "step": 828000 + }, + { + "epoch": 3.373497411621884, + "grad_norm": 3.8162009716033936, + "learning_rate": 0.002415226500373313, + "loss": 7.6711, + "step": 828100 + }, + { + "epoch": 3.373904789645265, + "grad_norm": 12.830845832824707, + "learning_rate": 0.0024147307301205755, + "loss": 7.6657, + "step": 828200 + }, + { + "epoch": 3.3743121676686467, + "grad_norm": 8.961577415466309, + "learning_rate": 0.002414234963436664, + "loss": 7.6956, + "step": 828300 + }, + { + "epoch": 3.3747195456920283, + "grad_norm": 6.989332675933838, + "learning_rate": 0.0024137392003411814, + "loss": 7.6912, + "step": 828400 + }, + { + "epoch": 3.37512692371541, + "grad_norm": 4.718576431274414, + "learning_rate": 0.002413243440853725, + "loss": 7.6849, + "step": 828500 + }, + { + "epoch": 3.3755343017387913, + "grad_norm": 5.740972518920898, + "learning_rate": 0.002412747684993897, + "loss": 7.6422, + "step": 828600 + }, + { + "epoch": 3.375941679762173, + "grad_norm": 8.048383712768555, + "learning_rate": 0.0024122519327812956, + "loss": 7.6806, + "step": 828700 + }, + { + "epoch": 3.3763490577855544, + "grad_norm": 7.133415699005127, + "learning_rate": 0.0024117561842355215, + "loss": 7.6453, + "step": 828800 + }, + { + "epoch": 3.3767564358089355, + "grad_norm": 4.826624393463135, + "learning_rate": 0.002411260439376174, + "loss": 7.6548, + "step": 828900 + }, + { + "epoch": 3.377163813832317, + "grad_norm": 10.053572654724121, + "learning_rate": 0.002410764698222855, + "loss": 7.6543, + "step": 829000 + }, + { + "epoch": 3.377163813832317, + "eval_MaskedAccuracy": 0.5070204764525635, + "eval_loss": 1.6115893125534058, + "eval_runtime": 154.8877, + "eval_samples_per_second": 409.819, + "eval_steps_per_second": 1.601, + "step": 829000 + }, + { + "epoch": 3.3775711918556985, + "grad_norm": 4.761382579803467, + "learning_rate": 0.0024102689607951597, + "loss": 7.644, + "step": 829100 + }, + { + "epoch": 3.37797856987908, + "grad_norm": 9.084874153137207, + "learning_rate": 0.00240977322711269, + "loss": 7.6406, + "step": 829200 + }, + { + "epoch": 3.3783859479024616, + "grad_norm": 3.393242835998535, + "learning_rate": 0.0024092774971950473, + "loss": 7.6674, + "step": 829300 + }, + { + "epoch": 3.3787933259258427, + "grad_norm": 4.38031530380249, + "learning_rate": 0.0024087817710618242, + "loss": 7.663, + "step": 829400 + }, + { + "epoch": 3.3792007039492242, + "grad_norm": 4.793169975280762, + "learning_rate": 0.0024082860487326216, + "loss": 7.6583, + "step": 829500 + }, + { + "epoch": 3.3796080819726058, + "grad_norm": 6.607907772064209, + "learning_rate": 0.0024077903302270366, + "loss": 7.6713, + "step": 829600 + }, + { + "epoch": 3.3800154599959873, + "grad_norm": 3.7557523250579834, + "learning_rate": 0.002407294615564673, + "loss": 7.657, + "step": 829700 + }, + { + "epoch": 3.380422838019369, + "grad_norm": 6.895918369293213, + "learning_rate": 0.002406798904765123, + "loss": 7.6658, + "step": 829800 + }, + { + "epoch": 3.3808302160427504, + "grad_norm": 14.377764701843262, + "learning_rate": 0.0024063031978479887, + "loss": 7.6702, + "step": 829900 + }, + { + "epoch": 3.381237594066132, + "grad_norm": 7.503129005432129, + "learning_rate": 0.0024058074948328676, + "loss": 7.6571, + "step": 830000 + }, + { + "epoch": 3.381237594066132, + "eval_MaskedAccuracy": 0.5073381341409644, + "eval_loss": 1.6124346256256104, + "eval_runtime": 189.1157, + "eval_samples_per_second": 335.646, + "eval_steps_per_second": 1.311, + "step": 830000 + }, + { + "epoch": 3.381644972089513, + "grad_norm": 6.230504989624023, + "learning_rate": 0.002405311795739352, + "loss": 7.6695, + "step": 830100 + }, + { + "epoch": 3.3820523501128945, + "grad_norm": 5.507158279418945, + "learning_rate": 0.0024048161005870506, + "loss": 7.7014, + "step": 830200 + }, + { + "epoch": 3.382459728136276, + "grad_norm": 3.3367087841033936, + "learning_rate": 0.0024043204093955536, + "loss": 7.6919, + "step": 830300 + }, + { + "epoch": 3.3828671061596576, + "grad_norm": 4.103170871734619, + "learning_rate": 0.0024038247221844565, + "loss": 7.646, + "step": 830400 + }, + { + "epoch": 3.383274484183039, + "grad_norm": 2.787203550338745, + "learning_rate": 0.002403329038973361, + "loss": 7.6722, + "step": 830500 + }, + { + "epoch": 3.3836818622064206, + "grad_norm": 7.393489837646484, + "learning_rate": 0.002402833359781865, + "loss": 7.6633, + "step": 830600 + }, + { + "epoch": 3.3840892402298017, + "grad_norm": 7.408133506774902, + "learning_rate": 0.0024023376846295597, + "loss": 7.7092, + "step": 830700 + }, + { + "epoch": 3.3844966182531833, + "grad_norm": 6.954722881317139, + "learning_rate": 0.002401842013536045, + "loss": 7.6782, + "step": 830800 + }, + { + "epoch": 3.384903996276565, + "grad_norm": 7.104769229888916, + "learning_rate": 0.002401346346520915, + "loss": 7.6704, + "step": 830900 + }, + { + "epoch": 3.3853113742999463, + "grad_norm": 6.4443535804748535, + "learning_rate": 0.0024008506836037725, + "loss": 7.6697, + "step": 831000 + }, + { + "epoch": 3.3853113742999463, + "eval_MaskedAccuracy": 0.5072718987680596, + "eval_loss": 1.6166130304336548, + "eval_runtime": 152.8159, + "eval_samples_per_second": 415.376, + "eval_steps_per_second": 1.623, + "step": 831000 + }, + { + "epoch": 3.385718752323328, + "grad_norm": 7.106754779815674, + "learning_rate": 0.0024003550248042063, + "loss": 7.6626, + "step": 831100 + }, + { + "epoch": 3.3861261303467094, + "grad_norm": 4.369802474975586, + "learning_rate": 0.0023998593701418185, + "loss": 7.6737, + "step": 831200 + }, + { + "epoch": 3.386533508370091, + "grad_norm": 3.6945748329162598, + "learning_rate": 0.0023993637196362003, + "loss": 7.6837, + "step": 831300 + }, + { + "epoch": 3.386940886393472, + "grad_norm": 4.411739826202393, + "learning_rate": 0.002398868073306949, + "loss": 7.6952, + "step": 831400 + }, + { + "epoch": 3.3873482644168535, + "grad_norm": 3.0309081077575684, + "learning_rate": 0.0023983724311736634, + "loss": 7.674, + "step": 831500 + }, + { + "epoch": 3.387755642440235, + "grad_norm": 4.751334190368652, + "learning_rate": 0.002397876793255933, + "loss": 7.6716, + "step": 831600 + }, + { + "epoch": 3.3881630204636166, + "grad_norm": 5.458463668823242, + "learning_rate": 0.0023973811595733555, + "loss": 7.699, + "step": 831700 + }, + { + "epoch": 3.388570398486998, + "grad_norm": 3.3413469791412354, + "learning_rate": 0.0023968855301455282, + "loss": 7.7036, + "step": 831800 + }, + { + "epoch": 3.3889777765103792, + "grad_norm": 3.039830207824707, + "learning_rate": 0.0023963899049920384, + "loss": 7.674, + "step": 831900 + }, + { + "epoch": 3.3893851545337608, + "grad_norm": 3.8685097694396973, + "learning_rate": 0.002395894284132492, + "loss": 7.6793, + "step": 832000 + }, + { + "epoch": 3.3893851545337608, + "eval_MaskedAccuracy": 0.5073406821303406, + "eval_loss": 1.6137722730636597, + "eval_runtime": 161.1481, + "eval_samples_per_second": 393.899, + "eval_steps_per_second": 1.539, + "step": 832000 + }, + { + "epoch": 3.3897925325571423, + "grad_norm": 3.665583848953247, + "learning_rate": 0.002395398667586476, + "loss": 7.6717, + "step": 832100 + }, + { + "epoch": 3.390199910580524, + "grad_norm": 4.173010349273682, + "learning_rate": 0.002394903055373587, + "loss": 7.659, + "step": 832200 + }, + { + "epoch": 3.3906072886039054, + "grad_norm": 3.1028964519500732, + "learning_rate": 0.0023944074475134203, + "loss": 7.6563, + "step": 832300 + }, + { + "epoch": 3.391014666627287, + "grad_norm": 5.7776288986206055, + "learning_rate": 0.0023939118440255694, + "loss": 7.667, + "step": 832400 + }, + { + "epoch": 3.3914220446506684, + "grad_norm": 10.680218696594238, + "learning_rate": 0.0023934162449296297, + "loss": 7.6756, + "step": 832500 + }, + { + "epoch": 3.3918294226740495, + "grad_norm": 4.816702842712402, + "learning_rate": 0.002392920650245192, + "loss": 7.6148, + "step": 832600 + }, + { + "epoch": 3.392236800697431, + "grad_norm": 4.031116962432861, + "learning_rate": 0.0023924250599918484, + "loss": 7.6459, + "step": 832700 + }, + { + "epoch": 3.3926441787208126, + "grad_norm": 4.229380130767822, + "learning_rate": 0.0023919294741891945, + "loss": 7.6725, + "step": 832800 + }, + { + "epoch": 3.393051556744194, + "grad_norm": 5.243058681488037, + "learning_rate": 0.0023914338928568237, + "loss": 7.6495, + "step": 832900 + }, + { + "epoch": 3.3934589347675757, + "grad_norm": 7.159852981567383, + "learning_rate": 0.0023909383160143284, + "loss": 7.6504, + "step": 833000 + }, + { + "epoch": 3.3934589347675757, + "eval_MaskedAccuracy": 0.50725899062841, + "eval_loss": 1.6201882362365723, + "eval_runtime": 163.8534, + "eval_samples_per_second": 387.395, + "eval_steps_per_second": 1.514, + "step": 833000 + }, + { + "epoch": 3.393866312790957, + "grad_norm": 5.065793991088867, + "learning_rate": 0.002390442743681302, + "loss": 7.6309, + "step": 833100 + }, + { + "epoch": 3.3942736908143383, + "grad_norm": 6.484888553619385, + "learning_rate": 0.0023899471758773356, + "loss": 7.6971, + "step": 833200 + }, + { + "epoch": 3.39468106883772, + "grad_norm": 3.7068583965301514, + "learning_rate": 0.002389451612622021, + "loss": 7.661, + "step": 833300 + }, + { + "epoch": 3.3950884468611013, + "grad_norm": 9.850645065307617, + "learning_rate": 0.0023889560539349544, + "loss": 7.6384, + "step": 833400 + }, + { + "epoch": 3.395495824884483, + "grad_norm": 8.616034507751465, + "learning_rate": 0.002388460499835725, + "loss": 7.6634, + "step": 833500 + }, + { + "epoch": 3.3959032029078644, + "grad_norm": 5.236738204956055, + "learning_rate": 0.002387964950343926, + "loss": 7.6906, + "step": 833600 + }, + { + "epoch": 3.396310580931246, + "grad_norm": 4.034549713134766, + "learning_rate": 0.0023874694054791482, + "loss": 7.6662, + "step": 833700 + }, + { + "epoch": 3.3967179589546275, + "grad_norm": 3.472949981689453, + "learning_rate": 0.0023869738652609832, + "loss": 7.6667, + "step": 833800 + }, + { + "epoch": 3.3971253369780086, + "grad_norm": 3.4258108139038086, + "learning_rate": 0.0023864783297090197, + "loss": 7.6713, + "step": 833900 + }, + { + "epoch": 3.39753271500139, + "grad_norm": 5.242177486419678, + "learning_rate": 0.002385982798842854, + "loss": 7.6666, + "step": 834000 + }, + { + "epoch": 3.39753271500139, + "eval_MaskedAccuracy": 0.5080079469272007, + "eval_loss": 1.6206566095352173, + "eval_runtime": 159.6813, + "eval_samples_per_second": 397.517, + "eval_steps_per_second": 1.553, + "step": 834000 + }, + { + "epoch": 3.3979400930247716, + "grad_norm": 8.542133331298828, + "learning_rate": 0.0023854872726820782, + "loss": 7.6469, + "step": 834100 + }, + { + "epoch": 3.398347471048153, + "grad_norm": 8.990893363952637, + "learning_rate": 0.002384991751246276, + "loss": 7.6571, + "step": 834200 + }, + { + "epoch": 3.3987548490715347, + "grad_norm": 6.003313064575195, + "learning_rate": 0.0023844962345550428, + "loss": 7.6614, + "step": 834300 + }, + { + "epoch": 3.399162227094916, + "grad_norm": 7.175158500671387, + "learning_rate": 0.0023840007226279664, + "loss": 7.6823, + "step": 834400 + }, + { + "epoch": 3.3995696051182973, + "grad_norm": 7.723865985870361, + "learning_rate": 0.0023835052154846407, + "loss": 7.6675, + "step": 834500 + }, + { + "epoch": 3.399976983141679, + "grad_norm": 8.568977355957031, + "learning_rate": 0.002383009713144654, + "loss": 7.6294, + "step": 834600 + }, + { + "epoch": 3.4003843611650604, + "grad_norm": 4.583963394165039, + "learning_rate": 0.0023825142156275957, + "loss": 7.6542, + "step": 834700 + }, + { + "epoch": 3.400791739188442, + "grad_norm": 9.71217155456543, + "learning_rate": 0.002382018722953053, + "loss": 7.6478, + "step": 834800 + }, + { + "epoch": 3.4011991172118234, + "grad_norm": 6.550137996673584, + "learning_rate": 0.002381523235140619, + "loss": 7.6678, + "step": 834900 + }, + { + "epoch": 3.401606495235205, + "grad_norm": 2.844496488571167, + "learning_rate": 0.0023810277522098825, + "loss": 7.6329, + "step": 835000 + }, + { + "epoch": 3.401606495235205, + "eval_MaskedAccuracy": 0.5079399934763913, + "eval_loss": 1.6060205698013306, + "eval_runtime": 156.158, + "eval_samples_per_second": 406.486, + "eval_steps_per_second": 1.588, + "step": 835000 + }, + { + "epoch": 3.402013873258586, + "grad_norm": 5.937469005584717, + "learning_rate": 0.002380532274180431, + "loss": 7.676, + "step": 835100 + }, + { + "epoch": 3.4024212512819676, + "grad_norm": 5.086205005645752, + "learning_rate": 0.0023800368010718533, + "loss": 7.644, + "step": 835200 + }, + { + "epoch": 3.402828629305349, + "grad_norm": 3.5045113563537598, + "learning_rate": 0.002379541332903739, + "loss": 7.6603, + "step": 835300 + }, + { + "epoch": 3.4032360073287307, + "grad_norm": 7.176479339599609, + "learning_rate": 0.0023790458696956772, + "loss": 7.6436, + "step": 835400 + }, + { + "epoch": 3.403643385352112, + "grad_norm": 4.983194351196289, + "learning_rate": 0.0023785504114672566, + "loss": 7.6698, + "step": 835500 + }, + { + "epoch": 3.4040507633754937, + "grad_norm": 3.059422731399536, + "learning_rate": 0.002378054958238063, + "loss": 7.6595, + "step": 835600 + }, + { + "epoch": 3.404458141398875, + "grad_norm": 8.528168678283691, + "learning_rate": 0.0023775595100276863, + "loss": 7.6538, + "step": 835700 + }, + { + "epoch": 3.4048655194222563, + "grad_norm": 5.731561183929443, + "learning_rate": 0.002377064066855718, + "loss": 7.6842, + "step": 835800 + }, + { + "epoch": 3.405272897445638, + "grad_norm": 10.47243881225586, + "learning_rate": 0.002376568628741738, + "loss": 7.6407, + "step": 835900 + }, + { + "epoch": 3.4056802754690194, + "grad_norm": 5.699708461761475, + "learning_rate": 0.0023760731957053367, + "loss": 7.6225, + "step": 836000 + }, + { + "epoch": 3.4056802754690194, + "eval_MaskedAccuracy": 0.5075000617329241, + "eval_loss": 1.609775185585022, + "eval_runtime": 155.485, + "eval_samples_per_second": 408.245, + "eval_steps_per_second": 1.595, + "step": 836000 + }, + { + "epoch": 3.406087653492401, + "grad_norm": 4.290980815887451, + "learning_rate": 0.002375577767766104, + "loss": 7.6752, + "step": 836100 + }, + { + "epoch": 3.4064950315157825, + "grad_norm": 5.101619720458984, + "learning_rate": 0.002375082344943621, + "loss": 7.6888, + "step": 836200 + }, + { + "epoch": 3.406902409539164, + "grad_norm": 4.72704553604126, + "learning_rate": 0.002374586927257481, + "loss": 7.6807, + "step": 836300 + }, + { + "epoch": 3.407309787562545, + "grad_norm": 5.734152793884277, + "learning_rate": 0.0023740915147272687, + "loss": 7.6606, + "step": 836400 + }, + { + "epoch": 3.4077171655859266, + "grad_norm": 12.271175384521484, + "learning_rate": 0.002373596107372566, + "loss": 7.6721, + "step": 836500 + }, + { + "epoch": 3.408124543609308, + "grad_norm": 9.951292991638184, + "learning_rate": 0.0023731007052129613, + "loss": 7.6859, + "step": 836600 + }, + { + "epoch": 3.4085319216326897, + "grad_norm": 4.7487030029296875, + "learning_rate": 0.0023726053082680433, + "loss": 7.6775, + "step": 836700 + }, + { + "epoch": 3.4089392996560712, + "grad_norm": 7.660290241241455, + "learning_rate": 0.0023721099165573962, + "loss": 7.6731, + "step": 836800 + }, + { + "epoch": 3.4093466776794523, + "grad_norm": 3.7929229736328125, + "learning_rate": 0.002371614530100602, + "loss": 7.6713, + "step": 836900 + }, + { + "epoch": 3.409754055702834, + "grad_norm": 4.572562217712402, + "learning_rate": 0.0023711191489172484, + "loss": 7.6696, + "step": 837000 + }, + { + "epoch": 3.409754055702834, + "eval_MaskedAccuracy": 0.507034410269781, + "eval_loss": 1.612809181213379, + "eval_runtime": 164.4911, + "eval_samples_per_second": 385.893, + "eval_steps_per_second": 1.508, + "step": 837000 + }, + { + "epoch": 3.4101614337262154, + "grad_norm": 11.561025619506836, + "learning_rate": 0.0023706237730269255, + "loss": 7.652, + "step": 837100 + }, + { + "epoch": 3.410568811749597, + "grad_norm": 5.122629165649414, + "learning_rate": 0.002370128402449212, + "loss": 7.6614, + "step": 837200 + }, + { + "epoch": 3.4109761897729785, + "grad_norm": 6.234327793121338, + "learning_rate": 0.0023696330372036936, + "loss": 7.6058, + "step": 837300 + }, + { + "epoch": 3.41138356779636, + "grad_norm": 10.977176666259766, + "learning_rate": 0.002369137677309954, + "loss": 7.6961, + "step": 837400 + }, + { + "epoch": 3.4117909458197415, + "grad_norm": 4.57222843170166, + "learning_rate": 0.0023686423227875794, + "loss": 7.6631, + "step": 837500 + }, + { + "epoch": 3.4121983238431226, + "grad_norm": 4.3072991371154785, + "learning_rate": 0.0023681469736561507, + "loss": 7.6266, + "step": 837600 + }, + { + "epoch": 3.412605701866504, + "grad_norm": 4.29278564453125, + "learning_rate": 0.0023676516299352547, + "loss": 7.6838, + "step": 837700 + }, + { + "epoch": 3.4130130798898857, + "grad_norm": 5.988570213317871, + "learning_rate": 0.0023671562916444764, + "loss": 7.6315, + "step": 837800 + }, + { + "epoch": 3.413420457913267, + "grad_norm": 3.544550895690918, + "learning_rate": 0.0023666609588033935, + "loss": 7.6627, + "step": 837900 + }, + { + "epoch": 3.4138278359366487, + "grad_norm": 5.6219329833984375, + "learning_rate": 0.0023661656314315953, + "loss": 7.6553, + "step": 838000 + }, + { + "epoch": 3.4138278359366487, + "eval_MaskedAccuracy": 0.5070233921709371, + "eval_loss": 1.610743522644043, + "eval_runtime": 173.3502, + "eval_samples_per_second": 366.172, + "eval_steps_per_second": 1.431, + "step": 838000 + }, + { + "epoch": 3.4142352139600303, + "grad_norm": 9.236532211303711, + "learning_rate": 0.002365670309548661, + "loss": 7.6769, + "step": 838100 + }, + { + "epoch": 3.4146425919834114, + "grad_norm": 5.578697681427002, + "learning_rate": 0.0023651749931741742, + "loss": 7.66, + "step": 838200 + }, + { + "epoch": 3.415049970006793, + "grad_norm": 9.232965469360352, + "learning_rate": 0.002364679682327721, + "loss": 7.6472, + "step": 838300 + }, + { + "epoch": 3.4154573480301744, + "grad_norm": 4.654336929321289, + "learning_rate": 0.0023641843770288774, + "loss": 7.6592, + "step": 838400 + }, + { + "epoch": 3.415864726053556, + "grad_norm": 4.323183059692383, + "learning_rate": 0.0023636890772972315, + "loss": 7.654, + "step": 838500 + }, + { + "epoch": 3.4162721040769375, + "grad_norm": 4.7549848556518555, + "learning_rate": 0.002363193783152361, + "loss": 7.6212, + "step": 838600 + }, + { + "epoch": 3.416679482100319, + "grad_norm": 8.74051284790039, + "learning_rate": 0.0023626984946138463, + "loss": 7.6539, + "step": 838700 + }, + { + "epoch": 3.4170868601237006, + "grad_norm": 7.818248271942139, + "learning_rate": 0.0023622032117012735, + "loss": 7.6563, + "step": 838800 + }, + { + "epoch": 3.4174942381470816, + "grad_norm": 4.790457248687744, + "learning_rate": 0.00236170793443422, + "loss": 7.6859, + "step": 838900 + }, + { + "epoch": 3.417901616170463, + "grad_norm": 4.331161022186279, + "learning_rate": 0.0023612126628322714, + "loss": 7.6578, + "step": 839000 + }, + { + "epoch": 3.417901616170463, + "eval_MaskedAccuracy": 0.5078824194169587, + "eval_loss": 1.6002832651138306, + "eval_runtime": 161.4853, + "eval_samples_per_second": 393.076, + "eval_steps_per_second": 1.536, + "step": 839000 + }, + { + "epoch": 3.4183089941938447, + "grad_norm": 5.092179775238037, + "learning_rate": 0.0023607173969150035, + "loss": 7.6521, + "step": 839100 + }, + { + "epoch": 3.4187163722172262, + "grad_norm": 10.025911331176758, + "learning_rate": 0.0023602221367020003, + "loss": 7.6779, + "step": 839200 + }, + { + "epoch": 3.4191237502406078, + "grad_norm": 5.616159439086914, + "learning_rate": 0.00235972688221284, + "loss": 7.6613, + "step": 839300 + }, + { + "epoch": 3.419531128263989, + "grad_norm": 3.7099969387054443, + "learning_rate": 0.0023592316334671002, + "loss": 7.666, + "step": 839400 + }, + { + "epoch": 3.4199385062873704, + "grad_norm": 9.61128044128418, + "learning_rate": 0.0023587363904843633, + "loss": 7.6497, + "step": 839500 + }, + { + "epoch": 3.420345884310752, + "grad_norm": 3.531198024749756, + "learning_rate": 0.002358241153284211, + "loss": 7.6365, + "step": 839600 + }, + { + "epoch": 3.4207532623341335, + "grad_norm": 3.091862440109253, + "learning_rate": 0.002357745921886218, + "loss": 7.6708, + "step": 839700 + }, + { + "epoch": 3.421160640357515, + "grad_norm": 9.81743049621582, + "learning_rate": 0.0023572506963099706, + "loss": 7.6823, + "step": 839800 + }, + { + "epoch": 3.4215680183808965, + "grad_norm": 7.8127055168151855, + "learning_rate": 0.0023567554765750423, + "loss": 7.6884, + "step": 839900 + }, + { + "epoch": 3.421975396404278, + "grad_norm": 12.666727066040039, + "learning_rate": 0.002356260262701012, + "loss": 7.6404, + "step": 840000 + }, + { + "epoch": 3.421975396404278, + "eval_MaskedAccuracy": 0.5075237537295956, + "eval_loss": 1.6168293952941895, + "eval_runtime": 160.0079, + "eval_samples_per_second": 396.705, + "eval_steps_per_second": 1.55, + "step": 840000 + }, + { + "epoch": 3.422382774427659, + "grad_norm": 5.36226749420166, + "learning_rate": 0.0023557650547074605, + "loss": 7.6518, + "step": 840100 + }, + { + "epoch": 3.4227901524510407, + "grad_norm": 3.917013645172119, + "learning_rate": 0.0023552698526139635, + "loss": 7.6621, + "step": 840200 + }, + { + "epoch": 3.423197530474422, + "grad_norm": 6.045384883880615, + "learning_rate": 0.0023547746564400976, + "loss": 7.6472, + "step": 840300 + }, + { + "epoch": 3.4236049084978037, + "grad_norm": 3.1460657119750977, + "learning_rate": 0.002354279466205448, + "loss": 7.6519, + "step": 840400 + }, + { + "epoch": 3.4240122865211853, + "grad_norm": 6.753716945648193, + "learning_rate": 0.0023537842819295837, + "loss": 7.6562, + "step": 840500 + }, + { + "epoch": 3.424419664544567, + "grad_norm": 6.56367301940918, + "learning_rate": 0.0023532891036320857, + "loss": 7.6255, + "step": 840600 + }, + { + "epoch": 3.424827042567948, + "grad_norm": 3.510305881500244, + "learning_rate": 0.0023527939313325304, + "loss": 7.6579, + "step": 840700 + }, + { + "epoch": 3.4252344205913294, + "grad_norm": 4.690621376037598, + "learning_rate": 0.0023522987650504935, + "loss": 7.6697, + "step": 840800 + }, + { + "epoch": 3.425641798614711, + "grad_norm": 2.993744134902954, + "learning_rate": 0.0023518036048055563, + "loss": 7.6767, + "step": 840900 + }, + { + "epoch": 3.4260491766380925, + "grad_norm": 7.15242862701416, + "learning_rate": 0.00235130845061729, + "loss": 7.6736, + "step": 841000 + }, + { + "epoch": 3.4260491766380925, + "eval_MaskedAccuracy": 0.5068737466172768, + "eval_loss": 1.6180052757263184, + "eval_runtime": 176.1771, + "eval_samples_per_second": 360.297, + "eval_steps_per_second": 1.408, + "step": 841000 + }, + { + "epoch": 3.426456554661474, + "grad_norm": 4.285866737365723, + "learning_rate": 0.0023508133025052736, + "loss": 7.6334, + "step": 841100 + }, + { + "epoch": 3.4268639326848556, + "grad_norm": 3.8874728679656982, + "learning_rate": 0.0023503181604890835, + "loss": 7.6595, + "step": 841200 + }, + { + "epoch": 3.427271310708237, + "grad_norm": 8.724327087402344, + "learning_rate": 0.00234982302458829, + "loss": 7.7037, + "step": 841300 + }, + { + "epoch": 3.427678688731618, + "grad_norm": 3.802838087081909, + "learning_rate": 0.0023493278948224744, + "loss": 7.6617, + "step": 841400 + }, + { + "epoch": 3.4280860667549997, + "grad_norm": 5.62872838973999, + "learning_rate": 0.002348832771211211, + "loss": 7.6423, + "step": 841500 + }, + { + "epoch": 3.4284934447783812, + "grad_norm": 4.328555107116699, + "learning_rate": 0.0023483376537740703, + "loss": 7.6562, + "step": 841600 + }, + { + "epoch": 3.428900822801763, + "grad_norm": 3.9430062770843506, + "learning_rate": 0.0023478425425306326, + "loss": 7.6444, + "step": 841700 + }, + { + "epoch": 3.4293082008251443, + "grad_norm": 10.714035987854004, + "learning_rate": 0.002347347437500468, + "loss": 7.6648, + "step": 841800 + }, + { + "epoch": 3.4297155788485254, + "grad_norm": 4.730955600738525, + "learning_rate": 0.0023468523387031527, + "loss": 7.644, + "step": 841900 + }, + { + "epoch": 3.430122956871907, + "grad_norm": 3.557774066925049, + "learning_rate": 0.0023463572461582603, + "loss": 7.6728, + "step": 842000 + }, + { + "epoch": 3.430122956871907, + "eval_MaskedAccuracy": 0.5075078193670638, + "eval_loss": 1.6078718900680542, + "eval_runtime": 162.7347, + "eval_samples_per_second": 390.058, + "eval_steps_per_second": 1.524, + "step": 842000 + }, + { + "epoch": 3.4305303348952885, + "grad_norm": 6.733087539672852, + "learning_rate": 0.0023458621598853656, + "loss": 7.6328, + "step": 842100 + }, + { + "epoch": 3.43093771291867, + "grad_norm": 7.9630303382873535, + "learning_rate": 0.0023453670799040427, + "loss": 7.6389, + "step": 842200 + }, + { + "epoch": 3.4313450909420515, + "grad_norm": 6.708206653594971, + "learning_rate": 0.0023448720062338624, + "loss": 7.6758, + "step": 842300 + }, + { + "epoch": 3.431752468965433, + "grad_norm": 8.816062927246094, + "learning_rate": 0.0023443769388943984, + "loss": 7.6319, + "step": 842400 + }, + { + "epoch": 3.4321598469888146, + "grad_norm": 4.445310592651367, + "learning_rate": 0.0023438818779052233, + "loss": 7.661, + "step": 842500 + }, + { + "epoch": 3.4325672250121957, + "grad_norm": 7.010202884674072, + "learning_rate": 0.0023433868232859076, + "loss": 7.6547, + "step": 842600 + }, + { + "epoch": 3.432974603035577, + "grad_norm": 4.143481254577637, + "learning_rate": 0.002342891775056026, + "loss": 7.699, + "step": 842700 + }, + { + "epoch": 3.4333819810589588, + "grad_norm": 7.141845703125, + "learning_rate": 0.002342396733235149, + "loss": 7.6717, + "step": 842800 + }, + { + "epoch": 3.4337893590823403, + "grad_norm": 4.089468479156494, + "learning_rate": 0.002341901697842847, + "loss": 7.661, + "step": 842900 + }, + { + "epoch": 3.434196737105722, + "grad_norm": 5.759546756744385, + "learning_rate": 0.0023414066688986954, + "loss": 7.6137, + "step": 843000 + }, + { + "epoch": 3.434196737105722, + "eval_MaskedAccuracy": 0.5077780087292555, + "eval_loss": 1.6099501848220825, + "eval_runtime": 163.3517, + "eval_samples_per_second": 388.585, + "eval_steps_per_second": 1.518, + "step": 843000 + }, + { + "epoch": 3.4346041151291034, + "grad_norm": 4.585111141204834, + "learning_rate": 0.0023409116464222653, + "loss": 7.6545, + "step": 843100 + }, + { + "epoch": 3.4350114931524844, + "grad_norm": 10.037331581115723, + "learning_rate": 0.002340416630433123, + "loss": 7.6688, + "step": 843200 + }, + { + "epoch": 3.435418871175866, + "grad_norm": 6.103098392486572, + "learning_rate": 0.0023399216209508424, + "loss": 7.6411, + "step": 843300 + }, + { + "epoch": 3.4358262491992475, + "grad_norm": 6.380743026733398, + "learning_rate": 0.0023394266179949936, + "loss": 7.6444, + "step": 843400 + }, + { + "epoch": 3.436233627222629, + "grad_norm": 12.773555755615234, + "learning_rate": 0.0023389316215851434, + "loss": 7.6456, + "step": 843500 + }, + { + "epoch": 3.4366410052460106, + "grad_norm": 5.444146156311035, + "learning_rate": 0.0023384366317408677, + "loss": 7.6484, + "step": 843600 + }, + { + "epoch": 3.437048383269392, + "grad_norm": 4.911456108093262, + "learning_rate": 0.0023379416484817298, + "loss": 7.6505, + "step": 843700 + }, + { + "epoch": 3.4374557612927736, + "grad_norm": 10.463674545288086, + "learning_rate": 0.002337446671827304, + "loss": 7.6468, + "step": 843800 + }, + { + "epoch": 3.4378631393161547, + "grad_norm": 4.498503684997559, + "learning_rate": 0.002336951701797155, + "loss": 7.6749, + "step": 843900 + }, + { + "epoch": 3.4382705173395363, + "grad_norm": 5.607254981994629, + "learning_rate": 0.0023364567384108557, + "loss": 7.6593, + "step": 844000 + }, + { + "epoch": 3.4382705173395363, + "eval_MaskedAccuracy": 0.5075430470323052, + "eval_loss": 1.6166020631790161, + "eval_runtime": 161.8542, + "eval_samples_per_second": 392.18, + "eval_steps_per_second": 1.532, + "step": 844000 + }, + { + "epoch": 3.438677895362918, + "grad_norm": 4.140470027923584, + "learning_rate": 0.002335961781687971, + "loss": 7.6626, + "step": 844100 + }, + { + "epoch": 3.4390852733862993, + "grad_norm": 3.1368746757507324, + "learning_rate": 0.002335466831648073, + "loss": 7.6365, + "step": 844200 + }, + { + "epoch": 3.439492651409681, + "grad_norm": 6.783154010772705, + "learning_rate": 0.002334971888310729, + "loss": 7.63, + "step": 844300 + }, + { + "epoch": 3.439900029433062, + "grad_norm": 2.900268077850342, + "learning_rate": 0.0023344769516955043, + "loss": 7.6636, + "step": 844400 + }, + { + "epoch": 3.4403074074564435, + "grad_norm": 5.9655442237854, + "learning_rate": 0.002333982021821967, + "loss": 7.6346, + "step": 844500 + }, + { + "epoch": 3.440714785479825, + "grad_norm": 5.214728832244873, + "learning_rate": 0.0023334870987096836, + "loss": 7.6563, + "step": 844600 + }, + { + "epoch": 3.4411221635032065, + "grad_norm": 5.482659816741943, + "learning_rate": 0.002332992182378224, + "loss": 7.6706, + "step": 844700 + }, + { + "epoch": 3.441529541526588, + "grad_norm": 6.027997016906738, + "learning_rate": 0.0023324972728471513, + "loss": 7.6664, + "step": 844800 + }, + { + "epoch": 3.4419369195499696, + "grad_norm": 5.681214809417725, + "learning_rate": 0.002332002370136035, + "loss": 7.6654, + "step": 844900 + }, + { + "epoch": 3.442344297573351, + "grad_norm": 6.694900035858154, + "learning_rate": 0.002331507474264443, + "loss": 7.6208, + "step": 845000 + }, + { + "epoch": 3.442344297573351, + "eval_MaskedAccuracy": 0.5075787665132473, + "eval_loss": 1.6100114583969116, + "eval_runtime": 166.2451, + "eval_samples_per_second": 381.822, + "eval_steps_per_second": 1.492, + "step": 845000 + }, + { + "epoch": 3.4427516755967322, + "grad_norm": 4.156238555908203, + "learning_rate": 0.002331012585251938, + "loss": 7.669, + "step": 845100 + }, + { + "epoch": 3.4431590536201138, + "grad_norm": 5.38286018371582, + "learning_rate": 0.0023305177031180836, + "loss": 7.6487, + "step": 845200 + }, + { + "epoch": 3.4435664316434953, + "grad_norm": 3.2154221534729004, + "learning_rate": 0.0023300228278824485, + "loss": 7.6623, + "step": 845300 + }, + { + "epoch": 3.443973809666877, + "grad_norm": 7.410704612731934, + "learning_rate": 0.0023295279595645967, + "loss": 7.6404, + "step": 845400 + }, + { + "epoch": 3.4443811876902584, + "grad_norm": 3.97457218170166, + "learning_rate": 0.0023290330981840936, + "loss": 7.675, + "step": 845500 + }, + { + "epoch": 3.44478856571364, + "grad_norm": 3.8987467288970947, + "learning_rate": 0.0023285382437605038, + "loss": 7.654, + "step": 845600 + }, + { + "epoch": 3.445195943737021, + "grad_norm": 4.117824554443359, + "learning_rate": 0.0023280433963133894, + "loss": 7.6211, + "step": 845700 + }, + { + "epoch": 3.4456033217604025, + "grad_norm": 3.741800546646118, + "learning_rate": 0.002327548555862319, + "loss": 7.6609, + "step": 845800 + }, + { + "epoch": 3.446010699783784, + "grad_norm": 5.278165817260742, + "learning_rate": 0.0023270537224268505, + "loss": 7.6582, + "step": 845900 + }, + { + "epoch": 3.4464180778071656, + "grad_norm": 5.979162693023682, + "learning_rate": 0.00232655889602655, + "loss": 7.6605, + "step": 846000 + }, + { + "epoch": 3.4464180778071656, + "eval_MaskedAccuracy": 0.5075461690063723, + "eval_loss": 1.609590768814087, + "eval_runtime": 164.4847, + "eval_samples_per_second": 385.908, + "eval_steps_per_second": 1.508, + "step": 846000 + }, + { + "epoch": 3.446825455830547, + "grad_norm": 3.0768141746520996, + "learning_rate": 0.0023260640766809815, + "loss": 7.6496, + "step": 846100 + }, + { + "epoch": 3.4472328338539286, + "grad_norm": 4.9974284172058105, + "learning_rate": 0.0023255692644097085, + "loss": 7.6899, + "step": 846200 + }, + { + "epoch": 3.44764021187731, + "grad_norm": 5.592404842376709, + "learning_rate": 0.0023250744592322914, + "loss": 7.6687, + "step": 846300 + }, + { + "epoch": 3.4480475899006913, + "grad_norm": 6.906416416168213, + "learning_rate": 0.002324579661168292, + "loss": 7.6625, + "step": 846400 + }, + { + "epoch": 3.448454967924073, + "grad_norm": 11.000245094299316, + "learning_rate": 0.0023240848702372717, + "loss": 7.6962, + "step": 846500 + }, + { + "epoch": 3.4488623459474543, + "grad_norm": 4.422186374664307, + "learning_rate": 0.0023235900864587975, + "loss": 7.6695, + "step": 846600 + }, + { + "epoch": 3.449269723970836, + "grad_norm": 3.7603700160980225, + "learning_rate": 0.002323095309852429, + "loss": 7.6794, + "step": 846700 + }, + { + "epoch": 3.4496771019942174, + "grad_norm": 4.572369575500488, + "learning_rate": 0.002322600540437724, + "loss": 7.6602, + "step": 846800 + }, + { + "epoch": 3.4500844800175985, + "grad_norm": 8.38378620147705, + "learning_rate": 0.0023221057782342436, + "loss": 7.65, + "step": 846900 + }, + { + "epoch": 3.45049185804098, + "grad_norm": 4.574950695037842, + "learning_rate": 0.002321611023261551, + "loss": 7.6548, + "step": 847000 + }, + { + "epoch": 3.45049185804098, + "eval_MaskedAccuracy": 0.5073147574991682, + "eval_loss": 1.611008644104004, + "eval_runtime": 155.2146, + "eval_samples_per_second": 408.956, + "eval_steps_per_second": 1.598, + "step": 847000 + }, + { + "epoch": 3.4508992360643616, + "grad_norm": 11.892849922180176, + "learning_rate": 0.002321116275539205, + "loss": 7.6373, + "step": 847100 + }, + { + "epoch": 3.451306614087743, + "grad_norm": 7.027309417724609, + "learning_rate": 0.002320621535086765, + "loss": 7.6589, + "step": 847200 + }, + { + "epoch": 3.4517139921111246, + "grad_norm": 6.043745994567871, + "learning_rate": 0.002320126801923793, + "loss": 7.6598, + "step": 847300 + }, + { + "epoch": 3.452121370134506, + "grad_norm": 8.969237327575684, + "learning_rate": 0.002319632076069847, + "loss": 7.6371, + "step": 847400 + }, + { + "epoch": 3.4525287481578877, + "grad_norm": 3.8428471088409424, + "learning_rate": 0.002319137357544488, + "loss": 7.6735, + "step": 847500 + }, + { + "epoch": 3.4529361261812688, + "grad_norm": 5.334589958190918, + "learning_rate": 0.0023186426463672736, + "loss": 7.6582, + "step": 847600 + }, + { + "epoch": 3.4533435042046503, + "grad_norm": 8.992326736450195, + "learning_rate": 0.0023181479425577606, + "loss": 7.6213, + "step": 847700 + }, + { + "epoch": 3.453750882228032, + "grad_norm": 4.794075012207031, + "learning_rate": 0.0023176532461355063, + "loss": 7.643, + "step": 847800 + }, + { + "epoch": 3.4541582602514134, + "grad_norm": 4.427097797393799, + "learning_rate": 0.002317158557120074, + "loss": 7.6314, + "step": 847900 + }, + { + "epoch": 3.454565638274795, + "grad_norm": 7.120694160461426, + "learning_rate": 0.002316663875531021, + "loss": 7.6218, + "step": 848000 + }, + { + "epoch": 3.454565638274795, + "eval_MaskedAccuracy": 0.5076196713802682, + "eval_loss": 1.6033300161361694, + "eval_runtime": 153.0214, + "eval_samples_per_second": 414.818, + "eval_steps_per_second": 1.621, + "step": 848000 + }, + { + "epoch": 3.4549730162981764, + "grad_norm": 8.416783332824707, + "learning_rate": 0.0023161692013878997, + "loss": 7.6466, + "step": 848100 + }, + { + "epoch": 3.4553803943215575, + "grad_norm": 6.999683380126953, + "learning_rate": 0.0023156745347102685, + "loss": 7.6664, + "step": 848200 + }, + { + "epoch": 3.455787772344939, + "grad_norm": 2.674804449081421, + "learning_rate": 0.002315179875517685, + "loss": 7.6796, + "step": 848300 + }, + { + "epoch": 3.4561951503683206, + "grad_norm": 11.891407012939453, + "learning_rate": 0.00231468522382971, + "loss": 7.6673, + "step": 848400 + }, + { + "epoch": 3.456602528391702, + "grad_norm": 11.699591636657715, + "learning_rate": 0.002314190579665894, + "loss": 7.6564, + "step": 848500 + }, + { + "epoch": 3.4570099064150837, + "grad_norm": 2.579500436782837, + "learning_rate": 0.002313695943045797, + "loss": 7.651, + "step": 848600 + }, + { + "epoch": 3.457417284438465, + "grad_norm": 4.95082426071167, + "learning_rate": 0.002313201313988971, + "loss": 7.6678, + "step": 848700 + }, + { + "epoch": 3.4578246624618467, + "grad_norm": 5.20321798324585, + "learning_rate": 0.002312706692514975, + "loss": 7.6261, + "step": 848800 + }, + { + "epoch": 3.458232040485228, + "grad_norm": 3.087709426879883, + "learning_rate": 0.0023122120786433593, + "loss": 7.6493, + "step": 848900 + }, + { + "epoch": 3.4586394185086093, + "grad_norm": 6.596898555755615, + "learning_rate": 0.0023117174723936815, + "loss": 7.671, + "step": 849000 + }, + { + "epoch": 3.4586394185086093, + "eval_MaskedAccuracy": 0.5074166381801796, + "eval_loss": 1.610337257385254, + "eval_runtime": 169.3394, + "eval_samples_per_second": 374.845, + "eval_steps_per_second": 1.465, + "step": 849000 + }, + { + "epoch": 3.459046796531991, + "grad_norm": 4.9657368659973145, + "learning_rate": 0.0023112228737854944, + "loss": 7.6404, + "step": 849100 + }, + { + "epoch": 3.4594541745553724, + "grad_norm": 9.766314506530762, + "learning_rate": 0.0023107282828383553, + "loss": 7.674, + "step": 849200 + }, + { + "epoch": 3.459861552578754, + "grad_norm": 9.312552452087402, + "learning_rate": 0.0023102336995718164, + "loss": 7.6506, + "step": 849300 + }, + { + "epoch": 3.460268930602135, + "grad_norm": 5.444794178009033, + "learning_rate": 0.0023097391240054312, + "loss": 7.6577, + "step": 849400 + }, + { + "epoch": 3.4606763086255166, + "grad_norm": 5.837581634521484, + "learning_rate": 0.002309244556158751, + "loss": 7.6578, + "step": 849500 + }, + { + "epoch": 3.461083686648898, + "grad_norm": 3.6645426750183105, + "learning_rate": 0.002308749996051329, + "loss": 7.6411, + "step": 849600 + }, + { + "epoch": 3.4614910646722796, + "grad_norm": 3.197244644165039, + "learning_rate": 0.0023082554437027234, + "loss": 7.6716, + "step": 849700 + }, + { + "epoch": 3.461898442695661, + "grad_norm": 5.347982406616211, + "learning_rate": 0.002307760899132483, + "loss": 7.6504, + "step": 849800 + }, + { + "epoch": 3.4623058207190427, + "grad_norm": 4.38731050491333, + "learning_rate": 0.0023072663623601567, + "loss": 7.6677, + "step": 849900 + }, + { + "epoch": 3.4627131987424242, + "grad_norm": 4.275853633880615, + "learning_rate": 0.0023067718334052988, + "loss": 7.6603, + "step": 850000 + }, + { + "epoch": 3.4627131987424242, + "eval_MaskedAccuracy": 0.5076754871832684, + "eval_loss": 1.6124376058578491, + "eval_runtime": 162.2568, + "eval_samples_per_second": 391.207, + "eval_steps_per_second": 1.528, + "step": 850000 + }, + { + "epoch": 3.4631205767658053, + "grad_norm": 3.056399345397949, + "learning_rate": 0.0023062773122874613, + "loss": 7.6485, + "step": 850100 + }, + { + "epoch": 3.463527954789187, + "grad_norm": 7.190247535705566, + "learning_rate": 0.0023057827990261934, + "loss": 7.6689, + "step": 850200 + }, + { + "epoch": 3.4639353328125684, + "grad_norm": 5.2235612869262695, + "learning_rate": 0.00230528829364105, + "loss": 7.6253, + "step": 850300 + }, + { + "epoch": 3.46434271083595, + "grad_norm": 6.351413726806641, + "learning_rate": 0.0023047937961515746, + "loss": 7.6658, + "step": 850400 + }, + { + "epoch": 3.4647500888593314, + "grad_norm": 6.8891191482543945, + "learning_rate": 0.0023042993065773214, + "loss": 7.6694, + "step": 850500 + }, + { + "epoch": 3.465157466882713, + "grad_norm": 3.4621753692626953, + "learning_rate": 0.0023038048249378407, + "loss": 7.6528, + "step": 850600 + }, + { + "epoch": 3.465564844906094, + "grad_norm": 9.703103065490723, + "learning_rate": 0.0023033103512526826, + "loss": 7.6309, + "step": 850700 + }, + { + "epoch": 3.4659722229294756, + "grad_norm": 3.3330769538879395, + "learning_rate": 0.002302815885541394, + "loss": 7.6724, + "step": 850800 + }, + { + "epoch": 3.466379600952857, + "grad_norm": 6.550228595733643, + "learning_rate": 0.0023023214278235244, + "loss": 7.6785, + "step": 850900 + }, + { + "epoch": 3.4667869789762387, + "grad_norm": 9.059885025024414, + "learning_rate": 0.002301826978118625, + "loss": 7.6422, + "step": 851000 + }, + { + "epoch": 3.4667869789762387, + "eval_MaskedAccuracy": 0.5075013760835131, + "eval_loss": 1.6093096733093262, + "eval_runtime": 162.2557, + "eval_samples_per_second": 391.21, + "eval_steps_per_second": 1.528, + "step": 851000 + }, + { + "epoch": 3.46719435699962, + "grad_norm": 11.803396224975586, + "learning_rate": 0.002301332536446239, + "loss": 7.6661, + "step": 851100 + }, + { + "epoch": 3.4676017350230017, + "grad_norm": 7.435437202453613, + "learning_rate": 0.0023008381028259196, + "loss": 7.6549, + "step": 851200 + }, + { + "epoch": 3.4680091130463833, + "grad_norm": 3.796633005142212, + "learning_rate": 0.0023003436772772116, + "loss": 7.6632, + "step": 851300 + }, + { + "epoch": 3.4684164910697644, + "grad_norm": 8.798277854919434, + "learning_rate": 0.0022998492598196585, + "loss": 7.6673, + "step": 851400 + }, + { + "epoch": 3.468823869093146, + "grad_norm": 6.033581733703613, + "learning_rate": 0.0022993548504728132, + "loss": 7.6103, + "step": 851500 + }, + { + "epoch": 3.4692312471165274, + "grad_norm": 3.235607147216797, + "learning_rate": 0.0022988604492562223, + "loss": 7.656, + "step": 851600 + }, + { + "epoch": 3.469638625139909, + "grad_norm": 8.098994255065918, + "learning_rate": 0.00229836605618943, + "loss": 7.6414, + "step": 851700 + }, + { + "epoch": 3.4700460031632905, + "grad_norm": 3.687758445739746, + "learning_rate": 0.0022978716712919807, + "loss": 7.6537, + "step": 851800 + }, + { + "epoch": 3.4704533811866716, + "grad_norm": 3.4156992435455322, + "learning_rate": 0.0022973772945834208, + "loss": 7.6415, + "step": 851900 + }, + { + "epoch": 3.470860759210053, + "grad_norm": 6.617143630981445, + "learning_rate": 0.0022968829260833026, + "loss": 7.6185, + "step": 852000 + }, + { + "epoch": 3.470860759210053, + "eval_MaskedAccuracy": 0.5079360642413758, + "eval_loss": 1.6091926097869873, + "eval_runtime": 157.0559, + "eval_samples_per_second": 404.162, + "eval_steps_per_second": 1.579, + "step": 852000 + }, + { + "epoch": 3.4712681372334346, + "grad_norm": 10.953904151916504, + "learning_rate": 0.002296388565811161, + "loss": 7.6753, + "step": 852100 + }, + { + "epoch": 3.471675515256816, + "grad_norm": 4.454187870025635, + "learning_rate": 0.0022958942137865468, + "loss": 7.7009, + "step": 852200 + }, + { + "epoch": 3.4720828932801977, + "grad_norm": 13.108904838562012, + "learning_rate": 0.0022953998700290046, + "loss": 7.6521, + "step": 852300 + }, + { + "epoch": 3.4724902713035792, + "grad_norm": 7.302590370178223, + "learning_rate": 0.0022949055345580774, + "loss": 7.6702, + "step": 852400 + }, + { + "epoch": 3.4728976493269608, + "grad_norm": 8.430106163024902, + "learning_rate": 0.0022944112073933066, + "loss": 7.6754, + "step": 852500 + }, + { + "epoch": 3.473305027350342, + "grad_norm": 3.9765546321868896, + "learning_rate": 0.002293916888554235, + "loss": 7.6396, + "step": 852600 + }, + { + "epoch": 3.4737124053737234, + "grad_norm": 7.331282615661621, + "learning_rate": 0.0022934225780604106, + "loss": 7.6808, + "step": 852700 + }, + { + "epoch": 3.474119783397105, + "grad_norm": 3.967650890350342, + "learning_rate": 0.0022929282759313707, + "loss": 7.6518, + "step": 852800 + }, + { + "epoch": 3.4745271614204865, + "grad_norm": 3.893359899520874, + "learning_rate": 0.002292433982186661, + "loss": 7.6548, + "step": 852900 + }, + { + "epoch": 3.474934539443868, + "grad_norm": 5.097898483276367, + "learning_rate": 0.0022919396968458217, + "loss": 7.6644, + "step": 853000 + }, + { + "epoch": 3.474934539443868, + "eval_MaskedAccuracy": 0.5079424612083103, + "eval_loss": 1.614734411239624, + "eval_runtime": 163.4256, + "eval_samples_per_second": 388.409, + "eval_steps_per_second": 1.518, + "step": 853000 + }, + { + "epoch": 3.4753419174672495, + "grad_norm": 11.478796005249023, + "learning_rate": 0.0022914454199283985, + "loss": 7.6821, + "step": 853100 + }, + { + "epoch": 3.4757492954906306, + "grad_norm": 6.6236395835876465, + "learning_rate": 0.0022909511514539295, + "loss": 7.6885, + "step": 853200 + }, + { + "epoch": 3.476156673514012, + "grad_norm": 7.352938175201416, + "learning_rate": 0.0022904568914419547, + "loss": 7.6725, + "step": 853300 + }, + { + "epoch": 3.4765640515373937, + "grad_norm": 14.414018630981445, + "learning_rate": 0.0022899626399120175, + "loss": 7.6521, + "step": 853400 + }, + { + "epoch": 3.476971429560775, + "grad_norm": 5.734878063201904, + "learning_rate": 0.002289468396883654, + "loss": 7.6881, + "step": 853500 + }, + { + "epoch": 3.4773788075841567, + "grad_norm": 4.813170909881592, + "learning_rate": 0.0022889741623764123, + "loss": 7.6756, + "step": 853600 + }, + { + "epoch": 3.4777861856075383, + "grad_norm": 7.029141902923584, + "learning_rate": 0.0022884799364098255, + "loss": 7.6432, + "step": 853700 + }, + { + "epoch": 3.47819356363092, + "grad_norm": 3.3507001399993896, + "learning_rate": 0.0022879857190034342, + "loss": 7.6259, + "step": 853800 + }, + { + "epoch": 3.478600941654301, + "grad_norm": 4.767160892486572, + "learning_rate": 0.002287491510176778, + "loss": 7.6345, + "step": 853900 + }, + { + "epoch": 3.4790083196776824, + "grad_norm": 3.7685225009918213, + "learning_rate": 0.0022869973099493984, + "loss": 7.6477, + "step": 854000 + }, + { + "epoch": 3.4790083196776824, + "eval_MaskedAccuracy": 0.507706179515219, + "eval_loss": 1.5977897644042969, + "eval_runtime": 173.9633, + "eval_samples_per_second": 364.882, + "eval_steps_per_second": 1.426, + "step": 854000 + }, + { + "epoch": 3.479415697701064, + "grad_norm": 6.97108793258667, + "learning_rate": 0.0022865031183408268, + "loss": 7.6404, + "step": 854100 + }, + { + "epoch": 3.4798230757244455, + "grad_norm": 9.563529014587402, + "learning_rate": 0.0022860089353706043, + "loss": 7.65, + "step": 854200 + }, + { + "epoch": 3.480230453747827, + "grad_norm": 4.536949634552002, + "learning_rate": 0.0022855147610582726, + "loss": 7.6417, + "step": 854300 + }, + { + "epoch": 3.480637831771208, + "grad_norm": 10.869682312011719, + "learning_rate": 0.0022850205954233664, + "loss": 7.6466, + "step": 854400 + }, + { + "epoch": 3.4810452097945896, + "grad_norm": 7.783188819885254, + "learning_rate": 0.002284526438485423, + "loss": 7.657, + "step": 854500 + }, + { + "epoch": 3.481452587817971, + "grad_norm": 5.100917339324951, + "learning_rate": 0.0022840322902639818, + "loss": 7.6232, + "step": 854600 + }, + { + "epoch": 3.4818599658413527, + "grad_norm": 17.816843032836914, + "learning_rate": 0.0022835381507785735, + "loss": 7.6513, + "step": 854700 + }, + { + "epoch": 3.4822673438647342, + "grad_norm": 7.975155830383301, + "learning_rate": 0.002283044020048735, + "loss": 7.6461, + "step": 854800 + }, + { + "epoch": 3.4826747218881158, + "grad_norm": 4.69168758392334, + "learning_rate": 0.002282549898094005, + "loss": 7.6544, + "step": 854900 + }, + { + "epoch": 3.4830820999114973, + "grad_norm": 4.364626884460449, + "learning_rate": 0.002282055784933918, + "loss": 7.6323, + "step": 855000 + }, + { + "epoch": 3.4830820999114973, + "eval_MaskedAccuracy": 0.5072453696647916, + "eval_loss": 1.6087863445281982, + "eval_runtime": 157.7357, + "eval_samples_per_second": 402.42, + "eval_steps_per_second": 1.572, + "step": 855000 + }, + { + "epoch": 3.4834894779348784, + "grad_norm": 3.039024829864502, + "learning_rate": 0.00228156168058801, + "loss": 7.638, + "step": 855100 + }, + { + "epoch": 3.48389685595826, + "grad_norm": 3.5653367042541504, + "learning_rate": 0.002281067585075812, + "loss": 7.6424, + "step": 855200 + }, + { + "epoch": 3.4843042339816415, + "grad_norm": 6.967785358428955, + "learning_rate": 0.0022805734984168587, + "loss": 7.669, + "step": 855300 + }, + { + "epoch": 3.484711612005023, + "grad_norm": 4.891968250274658, + "learning_rate": 0.002280079420630686, + "loss": 7.6531, + "step": 855400 + }, + { + "epoch": 3.4851189900284045, + "grad_norm": 4.905412673950195, + "learning_rate": 0.0022795853517368248, + "loss": 7.6608, + "step": 855500 + }, + { + "epoch": 3.485526368051786, + "grad_norm": 3.145766019821167, + "learning_rate": 0.0022790912917548117, + "loss": 7.6707, + "step": 855600 + }, + { + "epoch": 3.485933746075167, + "grad_norm": 9.804694175720215, + "learning_rate": 0.0022785972407041795, + "loss": 7.6604, + "step": 855700 + }, + { + "epoch": 3.4863411240985487, + "grad_norm": 5.194327354431152, + "learning_rate": 0.0022781031986044554, + "loss": 7.6586, + "step": 855800 + }, + { + "epoch": 3.48674850212193, + "grad_norm": 11.771062850952148, + "learning_rate": 0.0022776091654751777, + "loss": 7.6579, + "step": 855900 + }, + { + "epoch": 3.4871558801453117, + "grad_norm": 10.029210090637207, + "learning_rate": 0.002277115141335875, + "loss": 7.6355, + "step": 856000 + }, + { + "epoch": 3.4871558801453117, + "eval_MaskedAccuracy": 0.5075777652786438, + "eval_loss": 1.608164668083191, + "eval_runtime": 167.0189, + "eval_samples_per_second": 380.053, + "eval_steps_per_second": 1.485, + "step": 856000 + }, + { + "epoch": 3.4875632581686933, + "grad_norm": 4.199847221374512, + "learning_rate": 0.002276621126206079, + "loss": 7.6806, + "step": 856100 + }, + { + "epoch": 3.487970636192075, + "grad_norm": 4.938039779663086, + "learning_rate": 0.002276127120105323, + "loss": 7.6347, + "step": 856200 + }, + { + "epoch": 3.4883780142154563, + "grad_norm": 4.0086350440979, + "learning_rate": 0.002275633123053136, + "loss": 7.662, + "step": 856300 + }, + { + "epoch": 3.4887853922388374, + "grad_norm": 11.219549179077148, + "learning_rate": 0.0022751391350690486, + "loss": 7.6609, + "step": 856400 + }, + { + "epoch": 3.489192770262219, + "grad_norm": 2.854553461074829, + "learning_rate": 0.002274645156172589, + "loss": 7.6425, + "step": 856500 + }, + { + "epoch": 3.4896001482856005, + "grad_norm": 11.257872581481934, + "learning_rate": 0.0022741511863832876, + "loss": 7.6496, + "step": 856600 + }, + { + "epoch": 3.490007526308982, + "grad_norm": 6.427947044372559, + "learning_rate": 0.002273657225720677, + "loss": 7.6464, + "step": 856700 + }, + { + "epoch": 3.4904149043323636, + "grad_norm": 5.208683013916016, + "learning_rate": 0.0022731632742042838, + "loss": 7.6603, + "step": 856800 + }, + { + "epoch": 3.4908222823557447, + "grad_norm": 10.774505615234375, + "learning_rate": 0.0022726693318536346, + "loss": 7.649, + "step": 856900 + }, + { + "epoch": 3.491229660379126, + "grad_norm": 6.411166191101074, + "learning_rate": 0.00227217539868826, + "loss": 7.6436, + "step": 857000 + }, + { + "epoch": 3.491229660379126, + "eval_MaskedAccuracy": 0.5076728692143582, + "eval_loss": 1.6141939163208008, + "eval_runtime": 199.506, + "eval_samples_per_second": 318.166, + "eval_steps_per_second": 1.243, + "step": 857000 + }, + { + "epoch": 3.4916370384025077, + "grad_norm": 7.696373462677002, + "learning_rate": 0.002271681474727689, + "loss": 7.6009, + "step": 857100 + }, + { + "epoch": 3.4920444164258893, + "grad_norm": 3.2003326416015625, + "learning_rate": 0.0022711875599914447, + "loss": 7.6482, + "step": 857200 + }, + { + "epoch": 3.492451794449271, + "grad_norm": 3.957310914993286, + "learning_rate": 0.0022706936544990572, + "loss": 7.66, + "step": 857300 + }, + { + "epoch": 3.4928591724726523, + "grad_norm": 19.03056526184082, + "learning_rate": 0.0022701997582700523, + "loss": 7.6458, + "step": 857400 + }, + { + "epoch": 3.493266550496034, + "grad_norm": 2.931204080581665, + "learning_rate": 0.0022697058713239578, + "loss": 7.6854, + "step": 857500 + }, + { + "epoch": 3.493673928519415, + "grad_norm": 3.9756855964660645, + "learning_rate": 0.0022692119936802975, + "loss": 7.657, + "step": 857600 + }, + { + "epoch": 3.4940813065427965, + "grad_norm": 5.919060230255127, + "learning_rate": 0.0022687181253586, + "loss": 7.6684, + "step": 857700 + }, + { + "epoch": 3.494488684566178, + "grad_norm": 7.748708248138428, + "learning_rate": 0.0022682242663783865, + "loss": 7.6627, + "step": 857800 + }, + { + "epoch": 3.4948960625895595, + "grad_norm": 7.620152473449707, + "learning_rate": 0.0022677304167591857, + "loss": 7.6573, + "step": 857900 + }, + { + "epoch": 3.495303440612941, + "grad_norm": 10.640727996826172, + "learning_rate": 0.0022672365765205194, + "loss": 7.658, + "step": 858000 + }, + { + "epoch": 3.495303440612941, + "eval_MaskedAccuracy": 0.5079203208738784, + "eval_loss": 1.6093806028366089, + "eval_runtime": 155.7792, + "eval_samples_per_second": 407.474, + "eval_steps_per_second": 1.592, + "step": 858000 + }, + { + "epoch": 3.4957108186363226, + "grad_norm": 5.147162914276123, + "learning_rate": 0.002266742745681915, + "loss": 7.6528, + "step": 858100 + }, + { + "epoch": 3.4961181966597037, + "grad_norm": 5.744800567626953, + "learning_rate": 0.0022662489242628913, + "loss": 7.6804, + "step": 858200 + }, + { + "epoch": 3.4965255746830852, + "grad_norm": 4.321141242980957, + "learning_rate": 0.002265755112282977, + "loss": 7.6806, + "step": 858300 + }, + { + "epoch": 3.4969329527064668, + "grad_norm": 8.63270378112793, + "learning_rate": 0.002265261309761691, + "loss": 7.6707, + "step": 858400 + }, + { + "epoch": 3.4973403307298483, + "grad_norm": 5.3142900466918945, + "learning_rate": 0.0022647675167185577, + "loss": 7.6382, + "step": 858500 + }, + { + "epoch": 3.49774770875323, + "grad_norm": 5.011209011077881, + "learning_rate": 0.0022642737331730993, + "loss": 7.6731, + "step": 858600 + }, + { + "epoch": 3.4981550867766114, + "grad_norm": 9.25820255279541, + "learning_rate": 0.0022637799591448383, + "loss": 7.6675, + "step": 858700 + }, + { + "epoch": 3.498562464799993, + "grad_norm": 4.86635684967041, + "learning_rate": 0.002263286194653295, + "loss": 7.6337, + "step": 858800 + }, + { + "epoch": 3.498969842823374, + "grad_norm": 5.551971912384033, + "learning_rate": 0.002262792439717993, + "loss": 7.6365, + "step": 858900 + }, + { + "epoch": 3.4993772208467555, + "grad_norm": 16.170394897460938, + "learning_rate": 0.0022622986943584534, + "loss": 7.668, + "step": 859000 + }, + { + "epoch": 3.4993772208467555, + "eval_MaskedAccuracy": 0.507519777077441, + "eval_loss": 1.6039706468582153, + "eval_runtime": 156.5045, + "eval_samples_per_second": 405.586, + "eval_steps_per_second": 1.585, + "step": 859000 + }, + { + "epoch": 3.499784598870137, + "grad_norm": 17.170879364013672, + "learning_rate": 0.0022618049585941906, + "loss": 7.635, + "step": 859100 + }, + { + "epoch": 3.5001919768935186, + "grad_norm": 6.251809597015381, + "learning_rate": 0.002261311232444728, + "loss": 7.6465, + "step": 859200 + }, + { + "epoch": 3.5005993549169, + "grad_norm": 6.715796947479248, + "learning_rate": 0.002260817515929586, + "loss": 7.6451, + "step": 859300 + }, + { + "epoch": 3.501006732940281, + "grad_norm": 4.212535381317139, + "learning_rate": 0.002260323809068286, + "loss": 7.6352, + "step": 859400 + }, + { + "epoch": 3.5014141109636627, + "grad_norm": 5.789183139801025, + "learning_rate": 0.0022598301118803435, + "loss": 7.6419, + "step": 859500 + }, + { + "epoch": 3.5018214889870443, + "grad_norm": 7.619360446929932, + "learning_rate": 0.0022593364243852767, + "loss": 7.6433, + "step": 859600 + }, + { + "epoch": 3.502228867010426, + "grad_norm": 10.62928581237793, + "learning_rate": 0.0022588427466026036, + "loss": 7.6414, + "step": 859700 + }, + { + "epoch": 3.5026362450338073, + "grad_norm": 6.69666862487793, + "learning_rate": 0.002258349078551843, + "loss": 7.6481, + "step": 859800 + }, + { + "epoch": 3.503043623057189, + "grad_norm": 10.35774040222168, + "learning_rate": 0.002257855420252513, + "loss": 7.6396, + "step": 859900 + }, + { + "epoch": 3.5034510010805704, + "grad_norm": 9.715471267700195, + "learning_rate": 0.002257361771724131, + "loss": 7.6518, + "step": 860000 + }, + { + "epoch": 3.5034510010805704, + "eval_MaskedAccuracy": 0.5082721296773883, + "eval_loss": 1.6053332090377808, + "eval_runtime": 156.5663, + "eval_samples_per_second": 405.426, + "eval_steps_per_second": 1.584, + "step": 860000 + }, + { + "epoch": 3.503858379103952, + "grad_norm": 6.737760066986084, + "learning_rate": 0.0022568681329862136, + "loss": 7.6393, + "step": 860100 + }, + { + "epoch": 3.504265757127333, + "grad_norm": 12.537190437316895, + "learning_rate": 0.002256374504058277, + "loss": 7.6236, + "step": 860200 + }, + { + "epoch": 3.5046731351507145, + "grad_norm": 6.036975860595703, + "learning_rate": 0.0022558808849598324, + "loss": 7.6448, + "step": 860300 + }, + { + "epoch": 3.505080513174096, + "grad_norm": 7.80363130569458, + "learning_rate": 0.0022553872757104, + "loss": 7.6271, + "step": 860400 + }, + { + "epoch": 3.5054878911974776, + "grad_norm": 4.068698406219482, + "learning_rate": 0.002254893676329494, + "loss": 7.6543, + "step": 860500 + }, + { + "epoch": 3.5058952692208587, + "grad_norm": 10.014059066772461, + "learning_rate": 0.002254400086836629, + "loss": 7.6429, + "step": 860600 + }, + { + "epoch": 3.5063026472442402, + "grad_norm": 8.289796829223633, + "learning_rate": 0.0022539065072513173, + "loss": 7.6515, + "step": 860700 + }, + { + "epoch": 3.5067100252676218, + "grad_norm": 5.379649639129639, + "learning_rate": 0.002253412937593076, + "loss": 7.6373, + "step": 860800 + }, + { + "epoch": 3.5071174032910033, + "grad_norm": 11.413267135620117, + "learning_rate": 0.002252919377881416, + "loss": 7.6663, + "step": 860900 + }, + { + "epoch": 3.507524781314385, + "grad_norm": 8.432605743408203, + "learning_rate": 0.0022524258281358556, + "loss": 7.6316, + "step": 861000 + }, + { + "epoch": 3.507524781314385, + "eval_MaskedAccuracy": 0.5077200990247405, + "eval_loss": 1.6118197441101074, + "eval_runtime": 153.7307, + "eval_samples_per_second": 412.904, + "eval_steps_per_second": 1.613, + "step": 861000 + }, + { + "epoch": 3.5079321593377664, + "grad_norm": 6.178477764129639, + "learning_rate": 0.0022519322883759003, + "loss": 7.6285, + "step": 861100 + }, + { + "epoch": 3.508339537361148, + "grad_norm": 6.329875469207764, + "learning_rate": 0.0022514387586210637, + "loss": 7.6426, + "step": 861200 + }, + { + "epoch": 3.5087469153845294, + "grad_norm": 7.15531587600708, + "learning_rate": 0.002250945238890861, + "loss": 7.6393, + "step": 861300 + }, + { + "epoch": 3.5091542934079105, + "grad_norm": 4.224843502044678, + "learning_rate": 0.0022504517292048025, + "loss": 7.6266, + "step": 861400 + }, + { + "epoch": 3.509561671431292, + "grad_norm": 6.41402006149292, + "learning_rate": 0.002249958229582398, + "loss": 7.6397, + "step": 861500 + }, + { + "epoch": 3.5099690494546736, + "grad_norm": 8.511131286621094, + "learning_rate": 0.002249464740043158, + "loss": 7.665, + "step": 861600 + }, + { + "epoch": 3.510376427478055, + "grad_norm": 9.402780532836914, + "learning_rate": 0.0022489712606065916, + "loss": 7.6536, + "step": 861700 + }, + { + "epoch": 3.5107838055014366, + "grad_norm": 6.63273811340332, + "learning_rate": 0.0022484777912922106, + "loss": 7.6683, + "step": 861800 + }, + { + "epoch": 3.5111911835248177, + "grad_norm": 12.446701049804688, + "learning_rate": 0.0022479843321195265, + "loss": 7.6191, + "step": 861900 + }, + { + "epoch": 3.5115985615481993, + "grad_norm": 4.364391803741455, + "learning_rate": 0.002247490883108045, + "loss": 7.6422, + "step": 862000 + }, + { + "epoch": 3.5115985615481993, + "eval_MaskedAccuracy": 0.5081787838428641, + "eval_loss": 1.6055957078933716, + "eval_runtime": 160.9716, + "eval_samples_per_second": 394.33, + "eval_steps_per_second": 1.541, + "step": 862000 + }, + { + "epoch": 3.512005939571581, + "grad_norm": 4.078834056854248, + "learning_rate": 0.0022469974442772753, + "loss": 7.6737, + "step": 862100 + }, + { + "epoch": 3.5124133175949623, + "grad_norm": 2.7827649116516113, + "learning_rate": 0.0022465040156467273, + "loss": 7.6371, + "step": 862200 + }, + { + "epoch": 3.512820695618344, + "grad_norm": 9.272007942199707, + "learning_rate": 0.0022460105972359093, + "loss": 7.6791, + "step": 862300 + }, + { + "epoch": 3.5132280736417254, + "grad_norm": 11.089544296264648, + "learning_rate": 0.0022455171890643252, + "loss": 7.6573, + "step": 862400 + }, + { + "epoch": 3.513635451665107, + "grad_norm": 4.901332855224609, + "learning_rate": 0.0022450237911514847, + "loss": 7.6553, + "step": 862500 + }, + { + "epoch": 3.5140428296884885, + "grad_norm": 6.1682891845703125, + "learning_rate": 0.0022445304035168976, + "loss": 7.637, + "step": 862600 + }, + { + "epoch": 3.5144502077118696, + "grad_norm": 7.853944778442383, + "learning_rate": 0.0022440370261800685, + "loss": 7.6601, + "step": 862700 + }, + { + "epoch": 3.514857585735251, + "grad_norm": 11.519634246826172, + "learning_rate": 0.002243543659160498, + "loss": 7.6357, + "step": 862800 + }, + { + "epoch": 3.5152649637586326, + "grad_norm": 2.628192186355591, + "learning_rate": 0.0022430503024776956, + "loss": 7.6635, + "step": 862900 + }, + { + "epoch": 3.515672341782014, + "grad_norm": 10.383161544799805, + "learning_rate": 0.002242556956151167, + "loss": 7.6626, + "step": 863000 + }, + { + "epoch": 3.515672341782014, + "eval_MaskedAccuracy": 0.5071665735026636, + "eval_loss": 1.6087853908538818, + "eval_runtime": 160.8476, + "eval_samples_per_second": 394.634, + "eval_steps_per_second": 1.542, + "step": 863000 + }, + { + "epoch": 3.5160797198053952, + "grad_norm": 7.387240886688232, + "learning_rate": 0.0022420636202004148, + "loss": 7.6364, + "step": 863100 + }, + { + "epoch": 3.5164870978287768, + "grad_norm": 13.875762939453125, + "learning_rate": 0.0022415702946449443, + "loss": 7.6802, + "step": 863200 + }, + { + "epoch": 3.5168944758521583, + "grad_norm": 2.7607388496398926, + "learning_rate": 0.0022410769795042592, + "loss": 7.6344, + "step": 863300 + }, + { + "epoch": 3.51730185387554, + "grad_norm": 9.745854377746582, + "learning_rate": 0.002240583674797862, + "loss": 7.6235, + "step": 863400 + }, + { + "epoch": 3.5177092318989214, + "grad_norm": 3.2563323974609375, + "learning_rate": 0.002240090380545254, + "loss": 7.6139, + "step": 863500 + }, + { + "epoch": 3.518116609922303, + "grad_norm": 3.583744764328003, + "learning_rate": 0.002239597096765943, + "loss": 7.6296, + "step": 863600 + }, + { + "epoch": 3.5185239879456844, + "grad_norm": 7.513589859008789, + "learning_rate": 0.002239103823479425, + "loss": 7.6522, + "step": 863700 + }, + { + "epoch": 3.518931365969066, + "grad_norm": 7.896854877471924, + "learning_rate": 0.0022386105607052078, + "loss": 7.6664, + "step": 863800 + }, + { + "epoch": 3.519338743992447, + "grad_norm": 6.768326282501221, + "learning_rate": 0.0022381173084627883, + "loss": 7.6193, + "step": 863900 + }, + { + "epoch": 3.5197461220158286, + "grad_norm": 7.823085784912109, + "learning_rate": 0.0022376240667716673, + "loss": 7.6272, + "step": 864000 + }, + { + "epoch": 3.5197461220158286, + "eval_MaskedAccuracy": 0.5073980986652228, + "eval_loss": 1.6157567501068115, + "eval_runtime": 156.696, + "eval_samples_per_second": 405.09, + "eval_steps_per_second": 1.583, + "step": 864000 + }, + { + "epoch": 3.52015350003921, + "grad_norm": 10.738860130310059, + "learning_rate": 0.0022371308356513527, + "loss": 7.6474, + "step": 864100 + }, + { + "epoch": 3.5205608780625917, + "grad_norm": 5.488829612731934, + "learning_rate": 0.0022366376151213346, + "loss": 7.6362, + "step": 864200 + }, + { + "epoch": 3.520968256085973, + "grad_norm": 9.830921173095703, + "learning_rate": 0.0022361444052011186, + "loss": 7.6252, + "step": 864300 + }, + { + "epoch": 3.5213756341093543, + "grad_norm": 4.068374156951904, + "learning_rate": 0.0022356512059102025, + "loss": 7.6508, + "step": 864400 + }, + { + "epoch": 3.521783012132736, + "grad_norm": 8.725272178649902, + "learning_rate": 0.002235158017268082, + "loss": 7.6446, + "step": 864500 + }, + { + "epoch": 3.5221903901561173, + "grad_norm": 5.265230178833008, + "learning_rate": 0.0022346648392942603, + "loss": 7.6805, + "step": 864600 + }, + { + "epoch": 3.522597768179499, + "grad_norm": 10.594454765319824, + "learning_rate": 0.0022341716720082334, + "loss": 7.6503, + "step": 864700 + }, + { + "epoch": 3.5230051462028804, + "grad_norm": 4.110137462615967, + "learning_rate": 0.0022336785154294953, + "loss": 7.5997, + "step": 864800 + }, + { + "epoch": 3.523412524226262, + "grad_norm": 7.475287437438965, + "learning_rate": 0.0022331853695775504, + "loss": 7.6551, + "step": 864900 + }, + { + "epoch": 3.5238199022496435, + "grad_norm": 10.34378433227539, + "learning_rate": 0.00223269223447189, + "loss": 7.6492, + "step": 865000 + }, + { + "epoch": 3.5238199022496435, + "eval_MaskedAccuracy": 0.5075955693325948, + "eval_loss": 1.6088305711746216, + "eval_runtime": 156.3533, + "eval_samples_per_second": 405.978, + "eval_steps_per_second": 1.586, + "step": 865000 + }, + { + "epoch": 3.524227280273025, + "grad_norm": 4.775081634521484, + "learning_rate": 0.0022321991101320166, + "loss": 7.6302, + "step": 865100 + }, + { + "epoch": 3.524634658296406, + "grad_norm": 3.5852935314178467, + "learning_rate": 0.002231705996577416, + "loss": 7.6434, + "step": 865200 + }, + { + "epoch": 3.5250420363197876, + "grad_norm": 7.204189777374268, + "learning_rate": 0.0022312128938275907, + "loss": 7.6231, + "step": 865300 + }, + { + "epoch": 3.525449414343169, + "grad_norm": 3.662602424621582, + "learning_rate": 0.0022307198019020313, + "loss": 7.6321, + "step": 865400 + }, + { + "epoch": 3.5258567923665507, + "grad_norm": 12.092575073242188, + "learning_rate": 0.002230226720820236, + "loss": 7.6488, + "step": 865500 + }, + { + "epoch": 3.526264170389932, + "grad_norm": 5.39505672454834, + "learning_rate": 0.0022297336506016986, + "loss": 7.6587, + "step": 865600 + }, + { + "epoch": 3.5266715484133133, + "grad_norm": 3.886173725128174, + "learning_rate": 0.0022292405912659116, + "loss": 7.6305, + "step": 865700 + }, + { + "epoch": 3.527078926436695, + "grad_norm": 7.400486469268799, + "learning_rate": 0.002228747542832371, + "loss": 7.6279, + "step": 865800 + }, + { + "epoch": 3.5274863044600764, + "grad_norm": 3.2266361713409424, + "learning_rate": 0.0022282545053205647, + "loss": 7.6445, + "step": 865900 + }, + { + "epoch": 3.527893682483458, + "grad_norm": 6.4646406173706055, + "learning_rate": 0.0022277614787499906, + "loss": 7.6614, + "step": 866000 + }, + { + "epoch": 3.527893682483458, + "eval_MaskedAccuracy": 0.508327164619293, + "eval_loss": 1.6072479486465454, + "eval_runtime": 178.343, + "eval_samples_per_second": 355.921, + "eval_steps_per_second": 1.391, + "step": 866000 + }, + { + "epoch": 3.5283010605068394, + "grad_norm": 5.047513008117676, + "learning_rate": 0.0022272684631401383, + "loss": 7.6268, + "step": 866100 + }, + { + "epoch": 3.528708438530221, + "grad_norm": 3.4761483669281006, + "learning_rate": 0.002226775458510497, + "loss": 7.6494, + "step": 866200 + }, + { + "epoch": 3.5291158165536025, + "grad_norm": 9.057031631469727, + "learning_rate": 0.0022262824648805607, + "loss": 7.6314, + "step": 866300 + }, + { + "epoch": 3.5295231945769836, + "grad_norm": 5.275428771972656, + "learning_rate": 0.0022257894822698206, + "loss": 7.6535, + "step": 866400 + }, + { + "epoch": 3.529930572600365, + "grad_norm": 8.792091369628906, + "learning_rate": 0.002225296510697767, + "loss": 7.6516, + "step": 866500 + }, + { + "epoch": 3.5303379506237467, + "grad_norm": 10.201578140258789, + "learning_rate": 0.0022248035501838837, + "loss": 7.6221, + "step": 866600 + }, + { + "epoch": 3.530745328647128, + "grad_norm": 10.079939842224121, + "learning_rate": 0.0022243106007476658, + "loss": 7.6352, + "step": 866700 + }, + { + "epoch": 3.5311527066705097, + "grad_norm": 3.5065391063690186, + "learning_rate": 0.0022238176624086023, + "loss": 7.637, + "step": 866800 + }, + { + "epoch": 3.531560084693891, + "grad_norm": 3.66342830657959, + "learning_rate": 0.002223324735186181, + "loss": 7.6384, + "step": 866900 + }, + { + "epoch": 3.5319674627172724, + "grad_norm": 4.13564395904541, + "learning_rate": 0.0022228318190998897, + "loss": 7.6674, + "step": 867000 + }, + { + "epoch": 3.5319674627172724, + "eval_MaskedAccuracy": 0.5078339605061698, + "eval_loss": 1.6016442775726318, + "eval_runtime": 167.2502, + "eval_samples_per_second": 379.527, + "eval_steps_per_second": 1.483, + "step": 867000 + }, + { + "epoch": 3.532374840740654, + "grad_norm": 9.793412208557129, + "learning_rate": 0.002222338914169216, + "loss": 7.6937, + "step": 867100 + }, + { + "epoch": 3.5327822187640354, + "grad_norm": 6.829715728759766, + "learning_rate": 0.002221846020413645, + "loss": 7.6395, + "step": 867200 + }, + { + "epoch": 3.533189596787417, + "grad_norm": 4.2996506690979, + "learning_rate": 0.0022213531378526656, + "loss": 7.6595, + "step": 867300 + }, + { + "epoch": 3.5335969748107985, + "grad_norm": 4.399179458618164, + "learning_rate": 0.0022208602665057652, + "loss": 7.6601, + "step": 867400 + }, + { + "epoch": 3.53400435283418, + "grad_norm": 9.048361778259277, + "learning_rate": 0.0022203674063924284, + "loss": 7.6578, + "step": 867500 + }, + { + "epoch": 3.5344117308575616, + "grad_norm": 5.1573567390441895, + "learning_rate": 0.0022198745575321395, + "loss": 7.646, + "step": 867600 + }, + { + "epoch": 3.5348191088809426, + "grad_norm": 10.374008178710938, + "learning_rate": 0.002219381719944383, + "loss": 7.6632, + "step": 867700 + }, + { + "epoch": 3.535226486904324, + "grad_norm": 4.753935813903809, + "learning_rate": 0.002218888893648646, + "loss": 7.6326, + "step": 867800 + }, + { + "epoch": 3.5356338649277057, + "grad_norm": 3.602890968322754, + "learning_rate": 0.002218396078664412, + "loss": 7.655, + "step": 867900 + }, + { + "epoch": 3.5360412429510872, + "grad_norm": 4.311542510986328, + "learning_rate": 0.002217903275011164, + "loss": 7.6387, + "step": 868000 + }, + { + "epoch": 3.5360412429510872, + "eval_MaskedAccuracy": 0.5078075448271019, + "eval_loss": 1.6154001951217651, + "eval_runtime": 193.8666, + "eval_samples_per_second": 327.421, + "eval_steps_per_second": 1.279, + "step": 868000 + }, + { + "epoch": 3.5364486209744683, + "grad_norm": 9.806257247924805, + "learning_rate": 0.002217410482708384, + "loss": 7.6441, + "step": 868100 + }, + { + "epoch": 3.53685599899785, + "grad_norm": 7.351921081542969, + "learning_rate": 0.002216917701775556, + "loss": 7.6496, + "step": 868200 + }, + { + "epoch": 3.5372633770212314, + "grad_norm": 3.7131845951080322, + "learning_rate": 0.00221642493223216, + "loss": 7.6651, + "step": 868300 + }, + { + "epoch": 3.537670755044613, + "grad_norm": 8.0667724609375, + "learning_rate": 0.0022159321740976828, + "loss": 7.6045, + "step": 868400 + }, + { + "epoch": 3.5380781330679945, + "grad_norm": 6.803450584411621, + "learning_rate": 0.0022154394273916004, + "loss": 7.6298, + "step": 868500 + }, + { + "epoch": 3.538485511091376, + "grad_norm": 6.105456829071045, + "learning_rate": 0.002214946692133398, + "loss": 7.6561, + "step": 868600 + }, + { + "epoch": 3.5388928891147575, + "grad_norm": 5.054286956787109, + "learning_rate": 0.002214453968342553, + "loss": 7.6309, + "step": 868700 + }, + { + "epoch": 3.539300267138139, + "grad_norm": 5.653079032897949, + "learning_rate": 0.0022139612560385484, + "loss": 7.6118, + "step": 868800 + }, + { + "epoch": 3.53970764516152, + "grad_norm": 8.30525016784668, + "learning_rate": 0.0022134685552408612, + "loss": 7.6569, + "step": 868900 + }, + { + "epoch": 3.5401150231849017, + "grad_norm": 7.925002098083496, + "learning_rate": 0.002212975865968968, + "loss": 7.6269, + "step": 869000 + }, + { + "epoch": 3.5401150231849017, + "eval_MaskedAccuracy": 0.5070655657190831, + "eval_loss": 1.6203315258026123, + "eval_runtime": 407.0913, + "eval_samples_per_second": 155.926, + "eval_steps_per_second": 0.609, + "step": 869000 + }, + { + "epoch": 3.540522401208283, + "grad_norm": 5.564241886138916, + "learning_rate": 0.0022124831882423535, + "loss": 7.6465, + "step": 869100 + }, + { + "epoch": 3.5409297792316647, + "grad_norm": 8.257840156555176, + "learning_rate": 0.0022119905220804935, + "loss": 7.6475, + "step": 869200 + }, + { + "epoch": 3.5413371572550463, + "grad_norm": 6.186727046966553, + "learning_rate": 0.0022114978675028636, + "loss": 7.6533, + "step": 869300 + }, + { + "epoch": 3.5417445352784274, + "grad_norm": 8.564448356628418, + "learning_rate": 0.0022110052245289434, + "loss": 7.6514, + "step": 869400 + }, + { + "epoch": 3.542151913301809, + "grad_norm": 4.951117992401123, + "learning_rate": 0.002210512593178211, + "loss": 7.6313, + "step": 869500 + }, + { + "epoch": 3.5425592913251904, + "grad_norm": 6.333032131195068, + "learning_rate": 0.00221001997347014, + "loss": 7.6275, + "step": 869600 + }, + { + "epoch": 3.542966669348572, + "grad_norm": 5.469081878662109, + "learning_rate": 0.0022095273654242063, + "loss": 7.6638, + "step": 869700 + }, + { + "epoch": 3.5433740473719535, + "grad_norm": 2.768857717514038, + "learning_rate": 0.0022090347690598856, + "loss": 7.647, + "step": 869800 + }, + { + "epoch": 3.543781425395335, + "grad_norm": 11.990005493164062, + "learning_rate": 0.0022085421843966533, + "loss": 7.6299, + "step": 869900 + }, + { + "epoch": 3.5441888034187166, + "grad_norm": 5.885626316070557, + "learning_rate": 0.0022080496114539865, + "loss": 7.6763, + "step": 870000 + }, + { + "epoch": 3.5441888034187166, + "eval_MaskedAccuracy": 0.5077186503853933, + "eval_loss": 1.6030515432357788, + "eval_runtime": 186.6037, + "eval_samples_per_second": 340.165, + "eval_steps_per_second": 1.329, + "step": 870000 + }, + { + "epoch": 3.544596181442098, + "grad_norm": 4.5661115646362305, + "learning_rate": 0.0022075570502513553, + "loss": 7.6813, + "step": 870100 + }, + { + "epoch": 3.545003559465479, + "grad_norm": 9.719332695007324, + "learning_rate": 0.002207064500808236, + "loss": 7.6504, + "step": 870200 + }, + { + "epoch": 3.5454109374888607, + "grad_norm": 2.993973731994629, + "learning_rate": 0.0022065719631440975, + "loss": 7.6145, + "step": 870300 + }, + { + "epoch": 3.5458183155122422, + "grad_norm": 8.430723190307617, + "learning_rate": 0.0022060794372784145, + "loss": 7.6143, + "step": 870400 + }, + { + "epoch": 3.546225693535624, + "grad_norm": 11.184769630432129, + "learning_rate": 0.0022055869232306628, + "loss": 7.6144, + "step": 870500 + }, + { + "epoch": 3.546633071559005, + "grad_norm": 2.9587326049804688, + "learning_rate": 0.002205094421020311, + "loss": 7.6254, + "step": 870600 + }, + { + "epoch": 3.5470404495823864, + "grad_norm": 8.092852592468262, + "learning_rate": 0.002204601930666831, + "loss": 7.6361, + "step": 870700 + }, + { + "epoch": 3.547447827605768, + "grad_norm": 9.773558616638184, + "learning_rate": 0.0022041094521896933, + "loss": 7.6525, + "step": 870800 + }, + { + "epoch": 3.5478552056291495, + "grad_norm": 5.076406478881836, + "learning_rate": 0.0022036169856083695, + "loss": 7.6596, + "step": 870900 + }, + { + "epoch": 3.548262583652531, + "grad_norm": 5.9360785484313965, + "learning_rate": 0.002203124530942327, + "loss": 7.6561, + "step": 871000 + }, + { + "epoch": 3.548262583652531, + "eval_MaskedAccuracy": 0.5077936875617609, + "eval_loss": 1.6093236207962036, + "eval_runtime": 213.3314, + "eval_samples_per_second": 297.547, + "eval_steps_per_second": 1.163, + "step": 871000 + }, + { + "epoch": 3.5486699616759125, + "grad_norm": 5.750765323638916, + "learning_rate": 0.0022026320882110348, + "loss": 7.6483, + "step": 871100 + }, + { + "epoch": 3.549077339699294, + "grad_norm": 6.0726470947265625, + "learning_rate": 0.0022021396574339646, + "loss": 7.6499, + "step": 871200 + }, + { + "epoch": 3.5494847177226756, + "grad_norm": 5.044371128082275, + "learning_rate": 0.0022016472386305825, + "loss": 7.6338, + "step": 871300 + }, + { + "epoch": 3.5498920957460567, + "grad_norm": 7.558262825012207, + "learning_rate": 0.0022011548318203616, + "loss": 7.6243, + "step": 871400 + }, + { + "epoch": 3.550299473769438, + "grad_norm": 4.824985980987549, + "learning_rate": 0.0022006624370227612, + "loss": 7.6515, + "step": 871500 + }, + { + "epoch": 3.5507068517928198, + "grad_norm": 3.010861396789551, + "learning_rate": 0.002200170054257255, + "loss": 7.6246, + "step": 871600 + }, + { + "epoch": 3.5511142298162013, + "grad_norm": 4.284688949584961, + "learning_rate": 0.002199677683543306, + "loss": 7.6511, + "step": 871700 + }, + { + "epoch": 3.551521607839583, + "grad_norm": 2.767360210418701, + "learning_rate": 0.0021991853249003807, + "loss": 7.6695, + "step": 871800 + }, + { + "epoch": 3.551928985862964, + "grad_norm": 7.54105806350708, + "learning_rate": 0.0021986929783479455, + "loss": 7.6632, + "step": 871900 + }, + { + "epoch": 3.5523363638863454, + "grad_norm": 6.911463737487793, + "learning_rate": 0.002198200643905462, + "loss": 7.654, + "step": 872000 + }, + { + "epoch": 3.5523363638863454, + "eval_MaskedAccuracy": 0.5081919099463476, + "eval_loss": 1.605948805809021, + "eval_runtime": 322.012, + "eval_samples_per_second": 197.123, + "eval_steps_per_second": 0.77, + "step": 872000 + }, + { + "epoch": 3.552743741909727, + "grad_norm": 4.053170204162598, + "learning_rate": 0.0021977083215924008, + "loss": 7.657, + "step": 872100 + }, + { + "epoch": 3.5531511199331085, + "grad_norm": 11.83296012878418, + "learning_rate": 0.002197216011428222, + "loss": 7.6329, + "step": 872200 + }, + { + "epoch": 3.55355849795649, + "grad_norm": 4.449741840362549, + "learning_rate": 0.002196723713432391, + "loss": 7.6774, + "step": 872300 + }, + { + "epoch": 3.5539658759798716, + "grad_norm": 6.159327507019043, + "learning_rate": 0.0021962314276243694, + "loss": 7.6271, + "step": 872400 + }, + { + "epoch": 3.554373254003253, + "grad_norm": 9.53366470336914, + "learning_rate": 0.0021957391540236232, + "loss": 7.6257, + "step": 872500 + }, + { + "epoch": 3.5547806320266346, + "grad_norm": 4.169156551361084, + "learning_rate": 0.002195246892649613, + "loss": 7.6404, + "step": 872600 + }, + { + "epoch": 3.5551880100500157, + "grad_norm": 6.204394340515137, + "learning_rate": 0.0021947546435217968, + "loss": 7.65, + "step": 872700 + }, + { + "epoch": 3.5555953880733973, + "grad_norm": 4.08635950088501, + "learning_rate": 0.002194262406659641, + "loss": 7.6369, + "step": 872800 + }, + { + "epoch": 3.556002766096779, + "grad_norm": 6.075716018676758, + "learning_rate": 0.002193770182082606, + "loss": 7.636, + "step": 872900 + }, + { + "epoch": 3.5564101441201603, + "grad_norm": 8.030529975891113, + "learning_rate": 0.002193277969810146, + "loss": 7.6425, + "step": 873000 + }, + { + "epoch": 3.5564101441201603, + "eval_MaskedAccuracy": 0.507682783853077, + "eval_loss": 1.6131370067596436, + "eval_runtime": 265.9635, + "eval_samples_per_second": 238.664, + "eval_steps_per_second": 0.932, + "step": 873000 + }, + { + "epoch": 3.5568175221435414, + "grad_norm": 5.904312610626221, + "learning_rate": 0.0021927857698617265, + "loss": 7.6403, + "step": 873100 + }, + { + "epoch": 3.557224900166923, + "grad_norm": 5.031678199768066, + "learning_rate": 0.002192293582256804, + "loss": 7.6359, + "step": 873200 + }, + { + "epoch": 3.5576322781903045, + "grad_norm": 5.363577842712402, + "learning_rate": 0.0021918014070148413, + "loss": 7.6331, + "step": 873300 + }, + { + "epoch": 3.558039656213686, + "grad_norm": 7.076882362365723, + "learning_rate": 0.002191309244155291, + "loss": 7.6238, + "step": 873400 + }, + { + "epoch": 3.5584470342370675, + "grad_norm": 7.54783821105957, + "learning_rate": 0.0021908170936976164, + "loss": 7.6256, + "step": 873500 + }, + { + "epoch": 3.558854412260449, + "grad_norm": 4.612677574157715, + "learning_rate": 0.0021903249556612706, + "loss": 7.6591, + "step": 873600 + }, + { + "epoch": 3.5592617902838306, + "grad_norm": 2.7689554691314697, + "learning_rate": 0.0021898328300657102, + "loss": 7.6651, + "step": 873700 + }, + { + "epoch": 3.559669168307212, + "grad_norm": 8.302124977111816, + "learning_rate": 0.002189340716930396, + "loss": 7.6497, + "step": 873800 + }, + { + "epoch": 3.5600765463305932, + "grad_norm": 4.4748125076293945, + "learning_rate": 0.0021888486162747836, + "loss": 7.6327, + "step": 873900 + }, + { + "epoch": 3.5604839243539748, + "grad_norm": 7.404576778411865, + "learning_rate": 0.0021883565281183236, + "loss": 7.6806, + "step": 874000 + }, + { + "epoch": 3.5604839243539748, + "eval_MaskedAccuracy": 0.5083195085567322, + "eval_loss": 1.6122350692749023, + "eval_runtime": 171.3922, + "eval_samples_per_second": 370.355, + "eval_steps_per_second": 1.447, + "step": 874000 + }, + { + "epoch": 3.5608913023773563, + "grad_norm": 4.407498359680176, + "learning_rate": 0.0021878644524804727, + "loss": 7.6571, + "step": 874100 + }, + { + "epoch": 3.561298680400738, + "grad_norm": 14.454693794250488, + "learning_rate": 0.0021873723893806878, + "loss": 7.6546, + "step": 874200 + }, + { + "epoch": 3.5617060584241194, + "grad_norm": 3.9932785034179688, + "learning_rate": 0.002186880338838423, + "loss": 7.6362, + "step": 874300 + }, + { + "epoch": 3.5621134364475004, + "grad_norm": 7.53162145614624, + "learning_rate": 0.002186388300873129, + "loss": 7.6297, + "step": 874400 + }, + { + "epoch": 3.562520814470882, + "grad_norm": 4.9030890464782715, + "learning_rate": 0.0021858962755042614, + "loss": 7.645, + "step": 874500 + }, + { + "epoch": 3.5629281924942635, + "grad_norm": 4.759460926055908, + "learning_rate": 0.0021854042627512697, + "loss": 7.6694, + "step": 874600 + }, + { + "epoch": 3.563335570517645, + "grad_norm": 5.22857141494751, + "learning_rate": 0.0021849122626336043, + "loss": 7.6052, + "step": 874700 + }, + { + "epoch": 3.5637429485410266, + "grad_norm": 4.2129807472229, + "learning_rate": 0.0021844202751707207, + "loss": 7.6232, + "step": 874800 + }, + { + "epoch": 3.564150326564408, + "grad_norm": 4.902995586395264, + "learning_rate": 0.002183928300382066, + "loss": 7.6013, + "step": 874900 + }, + { + "epoch": 3.5645577045877896, + "grad_norm": 11.46718692779541, + "learning_rate": 0.0021834363382870945, + "loss": 7.6285, + "step": 875000 + }, + { + "epoch": 3.5645577045877896, + "eval_MaskedAccuracy": 0.5080486656609293, + "eval_loss": 1.608930230140686, + "eval_runtime": 181.7519, + "eval_samples_per_second": 349.245, + "eval_steps_per_second": 1.364, + "step": 875000 + }, + { + "epoch": 3.564965082611171, + "grad_norm": 5.476961612701416, + "learning_rate": 0.0021829443889052535, + "loss": 7.6636, + "step": 875100 + }, + { + "epoch": 3.5653724606345523, + "grad_norm": 5.607533931732178, + "learning_rate": 0.002182452452255995, + "loss": 7.6408, + "step": 875200 + }, + { + "epoch": 3.565779838657934, + "grad_norm": 4.789079666137695, + "learning_rate": 0.0021819605283587653, + "loss": 7.6229, + "step": 875300 + }, + { + "epoch": 3.5661872166813153, + "grad_norm": 8.760357856750488, + "learning_rate": 0.0021814686172330136, + "loss": 7.6536, + "step": 875400 + }, + { + "epoch": 3.566594594704697, + "grad_norm": 8.119916915893555, + "learning_rate": 0.0021809767188981865, + "loss": 7.6421, + "step": 875500 + }, + { + "epoch": 3.567001972728078, + "grad_norm": 7.076836585998535, + "learning_rate": 0.0021804848333737367, + "loss": 7.6652, + "step": 875600 + }, + { + "epoch": 3.5674093507514595, + "grad_norm": 6.827029705047607, + "learning_rate": 0.0021799929606791044, + "loss": 7.6398, + "step": 875700 + }, + { + "epoch": 3.567816728774841, + "grad_norm": 5.244140625, + "learning_rate": 0.0021795011008337393, + "loss": 7.6074, + "step": 875800 + }, + { + "epoch": 3.5682241067982226, + "grad_norm": 4.503063201904297, + "learning_rate": 0.0021790092538570864, + "loss": 7.6649, + "step": 875900 + }, + { + "epoch": 3.568631484821604, + "grad_norm": 4.182742118835449, + "learning_rate": 0.002178517419768591, + "loss": 7.6101, + "step": 876000 + }, + { + "epoch": 3.568631484821604, + "eval_MaskedAccuracy": 0.5082813732347208, + "eval_loss": 1.6111879348754883, + "eval_runtime": 170.1251, + "eval_samples_per_second": 373.114, + "eval_steps_per_second": 1.458, + "step": 876000 + }, + { + "epoch": 3.5690388628449856, + "grad_norm": 8.675579071044922, + "learning_rate": 0.0021780255985876995, + "loss": 7.6186, + "step": 876100 + }, + { + "epoch": 3.569446240868367, + "grad_norm": 2.9475040435791016, + "learning_rate": 0.0021775337903338545, + "loss": 7.6093, + "step": 876200 + }, + { + "epoch": 3.5698536188917487, + "grad_norm": 10.103228569030762, + "learning_rate": 0.0021770419950265, + "loss": 7.6757, + "step": 876300 + }, + { + "epoch": 3.5702609969151298, + "grad_norm": 6.754882335662842, + "learning_rate": 0.002176550212685081, + "loss": 7.6479, + "step": 876400 + }, + { + "epoch": 3.5706683749385113, + "grad_norm": 5.378737926483154, + "learning_rate": 0.002176058443329037, + "loss": 7.6296, + "step": 876500 + }, + { + "epoch": 3.571075752961893, + "grad_norm": 4.21753454208374, + "learning_rate": 0.0021755666869778113, + "loss": 7.6436, + "step": 876600 + }, + { + "epoch": 3.5714831309852744, + "grad_norm": 7.464591979980469, + "learning_rate": 0.0021750749436508453, + "loss": 7.6198, + "step": 876700 + }, + { + "epoch": 3.571890509008656, + "grad_norm": 3.5655879974365234, + "learning_rate": 0.0021745832133675843, + "loss": 7.6177, + "step": 876800 + }, + { + "epoch": 3.572297887032037, + "grad_norm": 3.804499864578247, + "learning_rate": 0.002174091496147467, + "loss": 7.6452, + "step": 876900 + }, + { + "epoch": 3.5727052650554185, + "grad_norm": 3.8965039253234863, + "learning_rate": 0.002173599792009929, + "loss": 7.6433, + "step": 877000 + }, + { + "epoch": 3.5727052650554185, + "eval_MaskedAccuracy": 0.5077628099448271, + "eval_loss": 1.615176796913147, + "eval_runtime": 171.6107, + "eval_samples_per_second": 369.884, + "eval_steps_per_second": 1.445, + "step": 877000 + }, + { + "epoch": 3.5731126430788, + "grad_norm": 7.956528663635254, + "learning_rate": 0.002173108100974414, + "loss": 7.6361, + "step": 877100 + }, + { + "epoch": 3.5735200211021816, + "grad_norm": 7.382539749145508, + "learning_rate": 0.002172616423060361, + "loss": 7.604, + "step": 877200 + }, + { + "epoch": 3.573927399125563, + "grad_norm": 12.62801742553711, + "learning_rate": 0.0021721247582872096, + "loss": 7.6364, + "step": 877300 + }, + { + "epoch": 3.5743347771489447, + "grad_norm": 6.42581033706665, + "learning_rate": 0.0021716331066743964, + "loss": 7.6594, + "step": 877400 + }, + { + "epoch": 3.574742155172326, + "grad_norm": 10.753722190856934, + "learning_rate": 0.002171141468241361, + "loss": 7.6234, + "step": 877500 + }, + { + "epoch": 3.5751495331957077, + "grad_norm": 8.056370735168457, + "learning_rate": 0.0021706498430075346, + "loss": 7.6749, + "step": 877600 + }, + { + "epoch": 3.575556911219089, + "grad_norm": 13.071642875671387, + "learning_rate": 0.0021701582309923604, + "loss": 7.6608, + "step": 877700 + }, + { + "epoch": 3.5759642892424703, + "grad_norm": 7.352053165435791, + "learning_rate": 0.002169666632215273, + "loss": 7.6386, + "step": 877800 + }, + { + "epoch": 3.576371667265852, + "grad_norm": 7.554142951965332, + "learning_rate": 0.0021691750466957056, + "loss": 7.6175, + "step": 877900 + }, + { + "epoch": 3.5767790452892334, + "grad_norm": 6.251834392547607, + "learning_rate": 0.0021686834744530925, + "loss": 7.6542, + "step": 878000 + }, + { + "epoch": 3.5767790452892334, + "eval_MaskedAccuracy": 0.5077756948399319, + "eval_loss": 1.6095921993255615, + "eval_runtime": 439.2866, + "eval_samples_per_second": 144.498, + "eval_steps_per_second": 0.565, + "step": 878000 + }, + { + "epoch": 3.5771864233126145, + "grad_norm": 6.588141441345215, + "learning_rate": 0.0021681919155068733, + "loss": 7.647, + "step": 878100 + }, + { + "epoch": 3.577593801335996, + "grad_norm": 6.322723388671875, + "learning_rate": 0.002167700369876481, + "loss": 7.6746, + "step": 878200 + }, + { + "epoch": 3.5780011793593776, + "grad_norm": 6.316498756408691, + "learning_rate": 0.002167208837581346, + "loss": 7.6256, + "step": 878300 + }, + { + "epoch": 3.578408557382759, + "grad_norm": 5.972914695739746, + "learning_rate": 0.002166717318640904, + "loss": 7.6369, + "step": 878400 + }, + { + "epoch": 3.5788159354061406, + "grad_norm": 8.659676551818848, + "learning_rate": 0.0021662258130745854, + "loss": 7.5971, + "step": 878500 + }, + { + "epoch": 3.579223313429522, + "grad_norm": 10.309601783752441, + "learning_rate": 0.002165734320901821, + "loss": 7.6087, + "step": 878600 + }, + { + "epoch": 3.5796306914529037, + "grad_norm": 8.766733169555664, + "learning_rate": 0.002165242842142042, + "loss": 7.628, + "step": 878700 + }, + { + "epoch": 3.5800380694762852, + "grad_norm": 6.971269607543945, + "learning_rate": 0.002164751376814684, + "loss": 7.6364, + "step": 878800 + }, + { + "epoch": 3.5804454474996663, + "grad_norm": 17.327312469482422, + "learning_rate": 0.002164259924939174, + "loss": 7.6125, + "step": 878900 + }, + { + "epoch": 3.580852825523048, + "grad_norm": 3.636362075805664, + "learning_rate": 0.0021637684865349394, + "loss": 7.6401, + "step": 879000 + }, + { + "epoch": 3.580852825523048, + "eval_MaskedAccuracy": 0.5087040819822556, + "eval_loss": 1.605638861656189, + "eval_runtime": 171.5638, + "eval_samples_per_second": 369.985, + "eval_steps_per_second": 1.446, + "step": 879000 + }, + { + "epoch": 3.5812602035464294, + "grad_norm": 3.375300168991089, + "learning_rate": 0.002163277061621411, + "loss": 7.644, + "step": 879100 + }, + { + "epoch": 3.581667581569811, + "grad_norm": 7.568451881408691, + "learning_rate": 0.0021627856502180174, + "loss": 7.6559, + "step": 879200 + }, + { + "epoch": 3.5820749595931924, + "grad_norm": 6.051039218902588, + "learning_rate": 0.00216229425234419, + "loss": 7.6583, + "step": 879300 + }, + { + "epoch": 3.5824823376165735, + "grad_norm": 3.77250599861145, + "learning_rate": 0.002161802868019353, + "loss": 7.6349, + "step": 879400 + }, + { + "epoch": 3.582889715639955, + "grad_norm": 7.995368003845215, + "learning_rate": 0.0021613114972629343, + "loss": 7.643, + "step": 879500 + }, + { + "epoch": 3.5832970936633366, + "grad_norm": 9.187032699584961, + "learning_rate": 0.0021608201400943597, + "loss": 7.6183, + "step": 879600 + }, + { + "epoch": 3.583704471686718, + "grad_norm": 4.510272026062012, + "learning_rate": 0.002160328796533055, + "loss": 7.6557, + "step": 879700 + }, + { + "epoch": 3.5841118497100997, + "grad_norm": 5.87296724319458, + "learning_rate": 0.0021598374665984465, + "loss": 7.6431, + "step": 879800 + }, + { + "epoch": 3.584519227733481, + "grad_norm": 15.391018867492676, + "learning_rate": 0.002159346150309961, + "loss": 7.6412, + "step": 879900 + }, + { + "epoch": 3.5849266057568627, + "grad_norm": 5.5155110359191895, + "learning_rate": 0.0021588548476870182, + "loss": 7.6294, + "step": 880000 + }, + { + "epoch": 3.5849266057568627, + "eval_MaskedAccuracy": 0.5079862422568046, + "eval_loss": 1.6146249771118164, + "eval_runtime": 170.5121, + "eval_samples_per_second": 372.267, + "eval_steps_per_second": 1.454, + "step": 880000 + }, + { + "epoch": 3.5853339837802443, + "grad_norm": 16.883499145507812, + "learning_rate": 0.002158363558749045, + "loss": 7.6364, + "step": 880100 + }, + { + "epoch": 3.5857413618036253, + "grad_norm": 6.272616386413574, + "learning_rate": 0.0021578722835154654, + "loss": 7.6152, + "step": 880200 + }, + { + "epoch": 3.586148739827007, + "grad_norm": 10.381025314331055, + "learning_rate": 0.002157381022005699, + "loss": 7.6318, + "step": 880300 + }, + { + "epoch": 3.5865561178503884, + "grad_norm": 11.820782661437988, + "learning_rate": 0.002156889774239174, + "loss": 7.6454, + "step": 880400 + }, + { + "epoch": 3.58696349587377, + "grad_norm": 6.760218143463135, + "learning_rate": 0.002156398540235307, + "loss": 7.6618, + "step": 880500 + }, + { + "epoch": 3.587370873897151, + "grad_norm": 4.550642967224121, + "learning_rate": 0.0021559073200135195, + "loss": 7.6232, + "step": 880600 + }, + { + "epoch": 3.5877782519205326, + "grad_norm": 4.033792495727539, + "learning_rate": 0.00215541611359323, + "loss": 7.6453, + "step": 880700 + }, + { + "epoch": 3.588185629943914, + "grad_norm": 4.415927410125732, + "learning_rate": 0.002154924920993864, + "loss": 7.6513, + "step": 880800 + }, + { + "epoch": 3.5885930079672956, + "grad_norm": 8.303081512451172, + "learning_rate": 0.0021544337422348373, + "loss": 7.6258, + "step": 880900 + }, + { + "epoch": 3.589000385990677, + "grad_norm": 7.362762451171875, + "learning_rate": 0.0021539425773355676, + "loss": 7.643, + "step": 881000 + }, + { + "epoch": 3.589000385990677, + "eval_MaskedAccuracy": 0.5081290455190871, + "eval_loss": 1.6080726385116577, + "eval_runtime": 179.4765, + "eval_samples_per_second": 353.673, + "eval_steps_per_second": 1.382, + "step": 881000 + }, + { + "epoch": 3.5894077640140587, + "grad_norm": 12.051417350769043, + "learning_rate": 0.0021534514263154766, + "loss": 7.6196, + "step": 881100 + }, + { + "epoch": 3.5898151420374402, + "grad_norm": 8.640392303466797, + "learning_rate": 0.002152960289193983, + "loss": 7.6462, + "step": 881200 + }, + { + "epoch": 3.5902225200608218, + "grad_norm": 5.608726501464844, + "learning_rate": 0.0021524691659905, + "loss": 7.6224, + "step": 881300 + }, + { + "epoch": 3.590629898084203, + "grad_norm": 5.6090497970581055, + "learning_rate": 0.0021519780567244465, + "loss": 7.6475, + "step": 881400 + }, + { + "epoch": 3.5910372761075844, + "grad_norm": 7.759212493896484, + "learning_rate": 0.002151486961415241, + "loss": 7.6184, + "step": 881500 + }, + { + "epoch": 3.591444654130966, + "grad_norm": 10.882990837097168, + "learning_rate": 0.0021509958800822945, + "loss": 7.6673, + "step": 881600 + }, + { + "epoch": 3.5918520321543475, + "grad_norm": 6.636781215667725, + "learning_rate": 0.002150504812745023, + "loss": 7.6569, + "step": 881700 + }, + { + "epoch": 3.592259410177729, + "grad_norm": 6.783194065093994, + "learning_rate": 0.0021500137594228446, + "loss": 7.6366, + "step": 881800 + }, + { + "epoch": 3.59266678820111, + "grad_norm": 6.8873515129089355, + "learning_rate": 0.002149522720135168, + "loss": 7.6504, + "step": 881900 + }, + { + "epoch": 3.5930741662244916, + "grad_norm": 9.64719295501709, + "learning_rate": 0.0021490316949014108, + "loss": 7.6463, + "step": 882000 + }, + { + "epoch": 3.5930741662244916, + "eval_MaskedAccuracy": 0.507845655953523, + "eval_loss": 1.6132745742797852, + "eval_runtime": 195.9464, + "eval_samples_per_second": 323.946, + "eval_steps_per_second": 1.266, + "step": 882000 + }, + { + "epoch": 3.593481544247873, + "grad_norm": 3.836606979370117, + "learning_rate": 0.0021485406837409827, + "loss": 7.6709, + "step": 882100 + }, + { + "epoch": 3.5938889222712547, + "grad_norm": 8.204010963439941, + "learning_rate": 0.0021480496866733, + "loss": 7.6298, + "step": 882200 + }, + { + "epoch": 3.594296300294636, + "grad_norm": 3.425236701965332, + "learning_rate": 0.0021475587037177732, + "loss": 7.6375, + "step": 882300 + }, + { + "epoch": 3.5947036783180177, + "grad_norm": 5.571497440338135, + "learning_rate": 0.002147067734893812, + "loss": 7.6211, + "step": 882400 + }, + { + "epoch": 3.5951110563413993, + "grad_norm": 8.662858963012695, + "learning_rate": 0.002146576780220827, + "loss": 7.644, + "step": 882500 + }, + { + "epoch": 3.595518434364781, + "grad_norm": 12.79269027709961, + "learning_rate": 0.002146085839718229, + "loss": 7.6186, + "step": 882600 + }, + { + "epoch": 3.595925812388162, + "grad_norm": 8.456405639648438, + "learning_rate": 0.002145594913405426, + "loss": 7.6344, + "step": 882700 + }, + { + "epoch": 3.5963331904115434, + "grad_norm": 2.7036311626434326, + "learning_rate": 0.0021451040013018307, + "loss": 7.6566, + "step": 882800 + }, + { + "epoch": 3.596740568434925, + "grad_norm": 10.803698539733887, + "learning_rate": 0.0021446131034268496, + "loss": 7.6465, + "step": 882900 + }, + { + "epoch": 3.5971479464583065, + "grad_norm": 13.076610565185547, + "learning_rate": 0.002144122219799888, + "loss": 7.6376, + "step": 883000 + }, + { + "epoch": 3.5971479464583065, + "eval_MaskedAccuracy": 0.5076123440642883, + "eval_loss": 1.6148388385772705, + "eval_runtime": 252.0209, + "eval_samples_per_second": 251.868, + "eval_steps_per_second": 0.984, + "step": 883000 + }, + { + "epoch": 3.5975553244816876, + "grad_norm": 10.464448928833008, + "learning_rate": 0.0021436313504403525, + "loss": 7.612, + "step": 883100 + }, + { + "epoch": 3.597962702505069, + "grad_norm": 4.088959217071533, + "learning_rate": 0.002143140495367657, + "loss": 7.6582, + "step": 883200 + }, + { + "epoch": 3.5983700805284506, + "grad_norm": 6.57127046585083, + "learning_rate": 0.0021426496546012, + "loss": 7.6182, + "step": 883300 + }, + { + "epoch": 3.598777458551832, + "grad_norm": 3.783757448196411, + "learning_rate": 0.0021421588281603913, + "loss": 7.6258, + "step": 883400 + }, + { + "epoch": 3.5991848365752137, + "grad_norm": 6.03716516494751, + "learning_rate": 0.0021416680160646355, + "loss": 7.6571, + "step": 883500 + }, + { + "epoch": 3.5995922145985952, + "grad_norm": 10.619534492492676, + "learning_rate": 0.0021411772183333347, + "loss": 7.6549, + "step": 883600 + }, + { + "epoch": 3.5999995926219768, + "grad_norm": 6.190208911895752, + "learning_rate": 0.002140686434985898, + "loss": 7.6275, + "step": 883700 + }, + { + "epoch": 3.6004069706453583, + "grad_norm": 4.570251941680908, + "learning_rate": 0.002140195666041722, + "loss": 7.6603, + "step": 883800 + }, + { + "epoch": 3.6008143486687394, + "grad_norm": 9.952339172363281, + "learning_rate": 0.0021397049115202136, + "loss": 7.6427, + "step": 883900 + }, + { + "epoch": 3.601221726692121, + "grad_norm": 4.950263977050781, + "learning_rate": 0.002139214171440775, + "loss": 7.6463, + "step": 884000 + }, + { + "epoch": 3.601221726692121, + "eval_MaskedAccuracy": 0.5078028810447769, + "eval_loss": 1.6191045045852661, + "eval_runtime": 180.2412, + "eval_samples_per_second": 352.173, + "eval_steps_per_second": 1.376, + "step": 884000 + }, + { + "epoch": 3.6016291047155025, + "grad_norm": 6.89466667175293, + "learning_rate": 0.002138723445822806, + "loss": 7.6315, + "step": 884100 + }, + { + "epoch": 3.602036482738884, + "grad_norm": 4.768282413482666, + "learning_rate": 0.0021382327346857084, + "loss": 7.6457, + "step": 884200 + }, + { + "epoch": 3.6024438607622655, + "grad_norm": 11.934220314025879, + "learning_rate": 0.0021377420380488807, + "loss": 7.6237, + "step": 884300 + }, + { + "epoch": 3.6028512387856466, + "grad_norm": 5.59706974029541, + "learning_rate": 0.002137251355931724, + "loss": 7.6372, + "step": 884400 + }, + { + "epoch": 3.603258616809028, + "grad_norm": 6.063025951385498, + "learning_rate": 0.002136760688353638, + "loss": 7.6692, + "step": 884500 + }, + { + "epoch": 3.6036659948324097, + "grad_norm": 5.742177486419678, + "learning_rate": 0.002136270035334024, + "loss": 7.6454, + "step": 884600 + }, + { + "epoch": 3.604073372855791, + "grad_norm": 3.289231538772583, + "learning_rate": 0.002135779396892275, + "loss": 7.6529, + "step": 884700 + }, + { + "epoch": 3.6044807508791727, + "grad_norm": 4.300974369049072, + "learning_rate": 0.002135288773047792, + "loss": 7.6624, + "step": 884800 + }, + { + "epoch": 3.6048881289025543, + "grad_norm": 7.5341997146606445, + "learning_rate": 0.002134798163819972, + "loss": 7.6501, + "step": 884900 + }, + { + "epoch": 3.605295506925936, + "grad_norm": 12.429261207580566, + "learning_rate": 0.002134307569228211, + "loss": 7.6669, + "step": 885000 + }, + { + "epoch": 3.605295506925936, + "eval_MaskedAccuracy": 0.5080789949313501, + "eval_loss": 1.614434003829956, + "eval_runtime": 208.2465, + "eval_samples_per_second": 304.812, + "eval_steps_per_second": 1.191, + "step": 885000 + }, + { + "epoch": 3.6057028849493173, + "grad_norm": 7.60191011428833, + "learning_rate": 0.0021338169892919064, + "loss": 7.6408, + "step": 885100 + }, + { + "epoch": 3.6061102629726984, + "grad_norm": 2.9621386528015137, + "learning_rate": 0.002133326424030451, + "loss": 7.6484, + "step": 885200 + }, + { + "epoch": 3.60651764099608, + "grad_norm": 11.868209838867188, + "learning_rate": 0.002132835873463239, + "loss": 7.6349, + "step": 885300 + }, + { + "epoch": 3.6069250190194615, + "grad_norm": 10.728883743286133, + "learning_rate": 0.0021323453376096636, + "loss": 7.6487, + "step": 885400 + }, + { + "epoch": 3.607332397042843, + "grad_norm": 5.587571620941162, + "learning_rate": 0.002131854816489121, + "loss": 7.6376, + "step": 885500 + }, + { + "epoch": 3.607739775066224, + "grad_norm": 6.433836460113525, + "learning_rate": 0.0021313643101210015, + "loss": 7.6492, + "step": 885600 + }, + { + "epoch": 3.6081471530896057, + "grad_norm": 10.285565376281738, + "learning_rate": 0.0021308738185247017, + "loss": 7.6476, + "step": 885700 + }, + { + "epoch": 3.608554531112987, + "grad_norm": 3.460230588912964, + "learning_rate": 0.00213038334171961, + "loss": 7.6237, + "step": 885800 + }, + { + "epoch": 3.6089619091363687, + "grad_norm": 4.975857734680176, + "learning_rate": 0.002129892879725118, + "loss": 7.646, + "step": 885900 + }, + { + "epoch": 3.6093692871597503, + "grad_norm": 8.449679374694824, + "learning_rate": 0.0021294024325606144, + "loss": 7.6341, + "step": 886000 + }, + { + "epoch": 3.6093692871597503, + "eval_MaskedAccuracy": 0.5083332423401736, + "eval_loss": 1.607765555381775, + "eval_runtime": 186.2194, + "eval_samples_per_second": 340.867, + "eval_steps_per_second": 1.332, + "step": 886000 + }, + { + "epoch": 3.609776665183132, + "grad_norm": 3.533583641052246, + "learning_rate": 0.0021289120002454948, + "loss": 7.6316, + "step": 886100 + }, + { + "epoch": 3.6101840432065133, + "grad_norm": 7.7914838790893555, + "learning_rate": 0.002128421582799146, + "loss": 7.628, + "step": 886200 + }, + { + "epoch": 3.610591421229895, + "grad_norm": 3.6636064052581787, + "learning_rate": 0.0021279311802409567, + "loss": 7.6392, + "step": 886300 + }, + { + "epoch": 3.610998799253276, + "grad_norm": 7.064916610717773, + "learning_rate": 0.0021274407925903124, + "loss": 7.6339, + "step": 886400 + }, + { + "epoch": 3.6114061772766575, + "grad_norm": 6.368926525115967, + "learning_rate": 0.0021269504198666014, + "loss": 7.6003, + "step": 886500 + }, + { + "epoch": 3.611813555300039, + "grad_norm": 9.411510467529297, + "learning_rate": 0.0021264600620892115, + "loss": 7.6517, + "step": 886600 + }, + { + "epoch": 3.6122209333234205, + "grad_norm": 2.905181407928467, + "learning_rate": 0.002125969719277534, + "loss": 7.6408, + "step": 886700 + }, + { + "epoch": 3.612628311346802, + "grad_norm": 2.163015604019165, + "learning_rate": 0.002125479391450947, + "loss": 7.6243, + "step": 886800 + }, + { + "epoch": 3.613035689370183, + "grad_norm": 13.100394248962402, + "learning_rate": 0.00212498907862884, + "loss": 7.649, + "step": 886900 + }, + { + "epoch": 3.6134430673935647, + "grad_norm": 8.736556053161621, + "learning_rate": 0.002124498780830599, + "loss": 7.6183, + "step": 887000 + }, + { + "epoch": 3.6134430673935647, + "eval_MaskedAccuracy": 0.5080628830843521, + "eval_loss": 1.6166939735412598, + "eval_runtime": 172.8935, + "eval_samples_per_second": 367.139, + "eval_steps_per_second": 1.434, + "step": 887000 + }, + { + "epoch": 3.6138504454169462, + "grad_norm": 3.8619651794433594, + "learning_rate": 0.002124008498075608, + "loss": 7.6219, + "step": 887100 + }, + { + "epoch": 3.6142578234403278, + "grad_norm": 3.6366708278656006, + "learning_rate": 0.002123518230383244, + "loss": 7.6517, + "step": 887200 + }, + { + "epoch": 3.6146652014637093, + "grad_norm": 7.061435699462891, + "learning_rate": 0.0021230279777728974, + "loss": 7.6591, + "step": 887300 + }, + { + "epoch": 3.615072579487091, + "grad_norm": 3.008315086364746, + "learning_rate": 0.002122537740263949, + "loss": 7.6186, + "step": 887400 + }, + { + "epoch": 3.6154799575104724, + "grad_norm": 5.90070104598999, + "learning_rate": 0.002122047517875774, + "loss": 7.6343, + "step": 887500 + }, + { + "epoch": 3.615887335533854, + "grad_norm": 4.847671031951904, + "learning_rate": 0.002121557310627762, + "loss": 7.6518, + "step": 887600 + }, + { + "epoch": 3.616294713557235, + "grad_norm": 3.052391767501831, + "learning_rate": 0.002121067118539291, + "loss": 7.602, + "step": 887700 + }, + { + "epoch": 3.6167020915806165, + "grad_norm": 6.374086856842041, + "learning_rate": 0.0021205769416297393, + "loss": 7.6295, + "step": 887800 + }, + { + "epoch": 3.617109469603998, + "grad_norm": 10.769808769226074, + "learning_rate": 0.002120086779918488, + "loss": 7.6628, + "step": 887900 + }, + { + "epoch": 3.6175168476273796, + "grad_norm": 6.5341620445251465, + "learning_rate": 0.0021195966334249163, + "loss": 7.6452, + "step": 888000 + }, + { + "epoch": 3.6175168476273796, + "eval_MaskedAccuracy": 0.5081228197925867, + "eval_loss": 1.6121056079864502, + "eval_runtime": 182.3327, + "eval_samples_per_second": 348.133, + "eval_steps_per_second": 1.36, + "step": 888000 + }, + { + "epoch": 3.6179242256507607, + "grad_norm": 10.292481422424316, + "learning_rate": 0.0021191065021683984, + "loss": 7.657, + "step": 888100 + }, + { + "epoch": 3.618331603674142, + "grad_norm": 7.091249465942383, + "learning_rate": 0.0021186163861683155, + "loss": 7.6772, + "step": 888200 + }, + { + "epoch": 3.6187389816975237, + "grad_norm": 3.879603385925293, + "learning_rate": 0.0021181262854440453, + "loss": 7.6197, + "step": 888300 + }, + { + "epoch": 3.6191463597209053, + "grad_norm": 2.8298897743225098, + "learning_rate": 0.0021176362000149577, + "loss": 7.6453, + "step": 888400 + }, + { + "epoch": 3.619553737744287, + "grad_norm": 6.767982006072998, + "learning_rate": 0.0021171461299004353, + "loss": 7.6315, + "step": 888500 + }, + { + "epoch": 3.6199611157676683, + "grad_norm": 9.25776195526123, + "learning_rate": 0.0021166560751198524, + "loss": 7.6741, + "step": 888600 + }, + { + "epoch": 3.62036849379105, + "grad_norm": 5.514349460601807, + "learning_rate": 0.002116166035692577, + "loss": 7.6259, + "step": 888700 + }, + { + "epoch": 3.6207758718144314, + "grad_norm": 5.164654731750488, + "learning_rate": 0.0021156760116379926, + "loss": 7.6129, + "step": 888800 + }, + { + "epoch": 3.6211832498378125, + "grad_norm": 11.57204818725586, + "learning_rate": 0.0021151860029754647, + "loss": 7.6765, + "step": 888900 + }, + { + "epoch": 3.621590627861194, + "grad_norm": 4.421428680419922, + "learning_rate": 0.002114696009724373, + "loss": 7.651, + "step": 889000 + }, + { + "epoch": 3.621590627861194, + "eval_MaskedAccuracy": 0.5081642887177764, + "eval_loss": 1.6025340557098389, + "eval_runtime": 175.4212, + "eval_samples_per_second": 361.849, + "eval_steps_per_second": 1.414, + "step": 889000 + }, + { + "epoch": 3.6219980058845755, + "grad_norm": 11.134932518005371, + "learning_rate": 0.0021142060319040814, + "loss": 7.6201, + "step": 889100 + }, + { + "epoch": 3.622405383907957, + "grad_norm": 10.667376518249512, + "learning_rate": 0.002113716069533967, + "loss": 7.6492, + "step": 889200 + }, + { + "epoch": 3.6228127619313386, + "grad_norm": 12.43516731262207, + "learning_rate": 0.002113226122633397, + "loss": 7.6487, + "step": 889300 + }, + { + "epoch": 3.6232201399547197, + "grad_norm": 5.9131646156311035, + "learning_rate": 0.0021127361912217425, + "loss": 7.6384, + "step": 889400 + }, + { + "epoch": 3.6236275179781012, + "grad_norm": 6.579653739929199, + "learning_rate": 0.0021122462753183796, + "loss": 7.6262, + "step": 889500 + }, + { + "epoch": 3.6240348960014828, + "grad_norm": 4.793057918548584, + "learning_rate": 0.0021117563749426726, + "loss": 7.6143, + "step": 889600 + }, + { + "epoch": 3.6244422740248643, + "grad_norm": 5.972531795501709, + "learning_rate": 0.0021112664901139864, + "loss": 7.6014, + "step": 889700 + }, + { + "epoch": 3.624849652048246, + "grad_norm": 5.3787522315979, + "learning_rate": 0.0021107766208516903, + "loss": 7.6263, + "step": 889800 + }, + { + "epoch": 3.6252570300716274, + "grad_norm": 8.72839069366455, + "learning_rate": 0.0021102867671751527, + "loss": 7.6563, + "step": 889900 + }, + { + "epoch": 3.625664408095009, + "grad_norm": 5.1460723876953125, + "learning_rate": 0.002109796929103743, + "loss": 7.6243, + "step": 890000 + }, + { + "epoch": 3.625664408095009, + "eval_MaskedAccuracy": 0.5083265363894516, + "eval_loss": 1.6125589609146118, + "eval_runtime": 157.2142, + "eval_samples_per_second": 403.755, + "eval_steps_per_second": 1.577, + "step": 890000 + }, + { + "epoch": 3.6260717861183904, + "grad_norm": 4.11286735534668, + "learning_rate": 0.002109307106656823, + "loss": 7.6554, + "step": 890100 + }, + { + "epoch": 3.6264791641417715, + "grad_norm": 5.135982513427734, + "learning_rate": 0.0021088172998537597, + "loss": 7.6006, + "step": 890200 + }, + { + "epoch": 3.626886542165153, + "grad_norm": 9.693115234375, + "learning_rate": 0.0021083275087139175, + "loss": 7.6465, + "step": 890300 + }, + { + "epoch": 3.6272939201885346, + "grad_norm": 8.499496459960938, + "learning_rate": 0.002107837733256662, + "loss": 7.6339, + "step": 890400 + }, + { + "epoch": 3.627701298211916, + "grad_norm": 5.727963924407959, + "learning_rate": 0.0021073479735013523, + "loss": 7.6518, + "step": 890500 + }, + { + "epoch": 3.628108676235297, + "grad_norm": 3.8457725048065186, + "learning_rate": 0.002106858229467355, + "loss": 7.6473, + "step": 890600 + }, + { + "epoch": 3.6285160542586787, + "grad_norm": 8.028230667114258, + "learning_rate": 0.002106368501174033, + "loss": 7.6095, + "step": 890700 + }, + { + "epoch": 3.6289234322820603, + "grad_norm": 7.135133266448975, + "learning_rate": 0.002105878788640746, + "loss": 7.6446, + "step": 890800 + }, + { + "epoch": 3.629330810305442, + "grad_norm": 7.200752258300781, + "learning_rate": 0.0021053890918868562, + "loss": 7.6468, + "step": 890900 + }, + { + "epoch": 3.6297381883288233, + "grad_norm": 7.582151889801025, + "learning_rate": 0.0021048994109317196, + "loss": 7.6434, + "step": 891000 + }, + { + "epoch": 3.6297381883288233, + "eval_MaskedAccuracy": 0.5084321665863265, + "eval_loss": 1.6051677465438843, + "eval_runtime": 157.0626, + "eval_samples_per_second": 404.145, + "eval_steps_per_second": 1.579, + "step": 891000 + }, + { + "epoch": 3.630145566352205, + "grad_norm": 7.139379024505615, + "learning_rate": 0.0021044097457946997, + "loss": 7.6365, + "step": 891100 + }, + { + "epoch": 3.6305529443755864, + "grad_norm": 2.8056225776672363, + "learning_rate": 0.002103920096495156, + "loss": 7.6527, + "step": 891200 + }, + { + "epoch": 3.630960322398968, + "grad_norm": 3.0227692127227783, + "learning_rate": 0.002103430463052445, + "loss": 7.6285, + "step": 891300 + }, + { + "epoch": 3.631367700422349, + "grad_norm": 5.516915798187256, + "learning_rate": 0.002102940845485924, + "loss": 7.6399, + "step": 891400 + }, + { + "epoch": 3.6317750784457306, + "grad_norm": 6.106595993041992, + "learning_rate": 0.0021024512438149543, + "loss": 7.6298, + "step": 891500 + }, + { + "epoch": 3.632182456469112, + "grad_norm": 5.04689884185791, + "learning_rate": 0.0021019616580588898, + "loss": 7.6452, + "step": 891600 + }, + { + "epoch": 3.6325898344924936, + "grad_norm": 5.227834224700928, + "learning_rate": 0.0021014720882370857, + "loss": 7.6337, + "step": 891700 + }, + { + "epoch": 3.632997212515875, + "grad_norm": 3.651257276535034, + "learning_rate": 0.0021009825343688985, + "loss": 7.6431, + "step": 891800 + }, + { + "epoch": 3.6334045905392562, + "grad_norm": 4.403896331787109, + "learning_rate": 0.0021004929964736833, + "loss": 7.6151, + "step": 891900 + }, + { + "epoch": 3.6338119685626378, + "grad_norm": 6.865539073944092, + "learning_rate": 0.0021000034745707963, + "loss": 7.6336, + "step": 892000 + }, + { + "epoch": 3.6338119685626378, + "eval_MaskedAccuracy": 0.5085419425763867, + "eval_loss": 1.6030364036560059, + "eval_runtime": 164.8446, + "eval_samples_per_second": 385.066, + "eval_steps_per_second": 1.504, + "step": 892000 + }, + { + "epoch": 3.6342193465860193, + "grad_norm": 6.314415454864502, + "learning_rate": 0.002099513968679585, + "loss": 7.6495, + "step": 892100 + }, + { + "epoch": 3.634626724609401, + "grad_norm": 5.714036464691162, + "learning_rate": 0.002099024478819405, + "loss": 7.6556, + "step": 892200 + }, + { + "epoch": 3.6350341026327824, + "grad_norm": 3.957120418548584, + "learning_rate": 0.0020985350050096076, + "loss": 7.6168, + "step": 892300 + }, + { + "epoch": 3.635441480656164, + "grad_norm": 5.000965118408203, + "learning_rate": 0.0020980455472695446, + "loss": 7.5973, + "step": 892400 + }, + { + "epoch": 3.6358488586795454, + "grad_norm": 4.517456531524658, + "learning_rate": 0.0020975561056185686, + "loss": 7.6422, + "step": 892500 + }, + { + "epoch": 3.636256236702927, + "grad_norm": 12.821765899658203, + "learning_rate": 0.0020970666800760288, + "loss": 7.6485, + "step": 892600 + }, + { + "epoch": 3.636663614726308, + "grad_norm": 4.305283069610596, + "learning_rate": 0.002096577270661276, + "loss": 7.6234, + "step": 892700 + }, + { + "epoch": 3.6370709927496896, + "grad_norm": 10.463276863098145, + "learning_rate": 0.0020960878773936572, + "loss": 7.6539, + "step": 892800 + }, + { + "epoch": 3.637478370773071, + "grad_norm": 6.03818941116333, + "learning_rate": 0.0020955985002925216, + "loss": 7.6635, + "step": 892900 + }, + { + "epoch": 3.6378857487964527, + "grad_norm": 7.45972204208374, + "learning_rate": 0.002095109139377217, + "loss": 7.5967, + "step": 893000 + }, + { + "epoch": 3.6378857487964527, + "eval_MaskedAccuracy": 0.508194595348992, + "eval_loss": 1.6029776334762573, + "eval_runtime": 193.1379, + "eval_samples_per_second": 328.656, + "eval_steps_per_second": 1.284, + "step": 893000 + }, + { + "epoch": 3.6382931268198337, + "grad_norm": 9.505080223083496, + "learning_rate": 0.0020946197946670898, + "loss": 7.6727, + "step": 893100 + }, + { + "epoch": 3.6387005048432153, + "grad_norm": 14.210636138916016, + "learning_rate": 0.0020941304661814862, + "loss": 7.6625, + "step": 893200 + }, + { + "epoch": 3.639107882866597, + "grad_norm": 5.206877708435059, + "learning_rate": 0.002093641153939754, + "loss": 7.6222, + "step": 893300 + }, + { + "epoch": 3.6395152608899783, + "grad_norm": 7.7267913818359375, + "learning_rate": 0.002093151857961241, + "loss": 7.6441, + "step": 893400 + }, + { + "epoch": 3.63992263891336, + "grad_norm": 3.843454599380493, + "learning_rate": 0.002092662578265287, + "loss": 7.6285, + "step": 893500 + }, + { + "epoch": 3.6403300169367414, + "grad_norm": 6.689214706420898, + "learning_rate": 0.0020921733148712355, + "loss": 7.6272, + "step": 893600 + }, + { + "epoch": 3.640737394960123, + "grad_norm": 7.946575164794922, + "learning_rate": 0.0020916840677984304, + "loss": 7.6374, + "step": 893700 + }, + { + "epoch": 3.6411447729835045, + "grad_norm": 3.621073007583618, + "learning_rate": 0.0020911948370662173, + "loss": 7.6289, + "step": 893800 + }, + { + "epoch": 3.6415521510068856, + "grad_norm": 10.197724342346191, + "learning_rate": 0.0020907056226939316, + "loss": 7.638, + "step": 893900 + }, + { + "epoch": 3.641959529030267, + "grad_norm": 9.06722640991211, + "learning_rate": 0.0020902164247009196, + "loss": 7.6382, + "step": 894000 + }, + { + "epoch": 3.641959529030267, + "eval_MaskedAccuracy": 0.507824996847747, + "eval_loss": 1.6094565391540527, + "eval_runtime": 225.8666, + "eval_samples_per_second": 281.033, + "eval_steps_per_second": 1.098, + "step": 894000 + }, + { + "epoch": 3.6423669070536486, + "grad_norm": 6.535556793212891, + "learning_rate": 0.00208972724310652, + "loss": 7.6525, + "step": 894100 + }, + { + "epoch": 3.64277428507703, + "grad_norm": 11.369897842407227, + "learning_rate": 0.0020892380779300715, + "loss": 7.6301, + "step": 894200 + }, + { + "epoch": 3.6431816631004117, + "grad_norm": 11.29706859588623, + "learning_rate": 0.002088748929190916, + "loss": 7.6612, + "step": 894300 + }, + { + "epoch": 3.643589041123793, + "grad_norm": 3.8788299560546875, + "learning_rate": 0.002088259796908394, + "loss": 7.6186, + "step": 894400 + }, + { + "epoch": 3.6439964191471743, + "grad_norm": 3.9642136096954346, + "learning_rate": 0.002087770681101839, + "loss": 7.6408, + "step": 894500 + }, + { + "epoch": 3.644403797170556, + "grad_norm": 11.986189842224121, + "learning_rate": 0.002087281581790592, + "loss": 7.6151, + "step": 894600 + }, + { + "epoch": 3.6448111751939374, + "grad_norm": 4.163311958312988, + "learning_rate": 0.0020867924989939886, + "loss": 7.6424, + "step": 894700 + }, + { + "epoch": 3.645218553217319, + "grad_norm": 7.488025188446045, + "learning_rate": 0.002086303432731363, + "loss": 7.627, + "step": 894800 + }, + { + "epoch": 3.6456259312407004, + "grad_norm": 6.97831392288208, + "learning_rate": 0.002085814383022053, + "loss": 7.6482, + "step": 894900 + }, + { + "epoch": 3.646033309264082, + "grad_norm": 7.675668239593506, + "learning_rate": 0.002085325349885391, + "loss": 7.6348, + "step": 895000 + }, + { + "epoch": 3.646033309264082, + "eval_MaskedAccuracy": 0.5079352136965306, + "eval_loss": 1.6075338125228882, + "eval_runtime": 184.7545, + "eval_samples_per_second": 343.569, + "eval_steps_per_second": 1.342, + "step": 895000 + }, + { + "epoch": 3.6464406872874635, + "grad_norm": 5.817262649536133, + "learning_rate": 0.002084836333340711, + "loss": 7.6289, + "step": 895100 + }, + { + "epoch": 3.6468480653108446, + "grad_norm": 4.482575416564941, + "learning_rate": 0.002084347333407352, + "loss": 7.6222, + "step": 895200 + }, + { + "epoch": 3.647255443334226, + "grad_norm": 8.638537406921387, + "learning_rate": 0.00208385835010464, + "loss": 7.6217, + "step": 895300 + }, + { + "epoch": 3.6476628213576077, + "grad_norm": 4.3328375816345215, + "learning_rate": 0.0020833693834519107, + "loss": 7.6416, + "step": 895400 + }, + { + "epoch": 3.648070199380989, + "grad_norm": 6.512923240661621, + "learning_rate": 0.0020828804334684903, + "loss": 7.6581, + "step": 895500 + }, + { + "epoch": 3.6484775774043703, + "grad_norm": 9.623422622680664, + "learning_rate": 0.00208239150017372, + "loss": 7.6086, + "step": 895600 + }, + { + "epoch": 3.648884955427752, + "grad_norm": 9.830459594726562, + "learning_rate": 0.0020819025835869197, + "loss": 7.6891, + "step": 895700 + }, + { + "epoch": 3.6492923334511334, + "grad_norm": 4.8809099197387695, + "learning_rate": 0.0020814136837274265, + "loss": 7.6553, + "step": 895800 + }, + { + "epoch": 3.649699711474515, + "grad_norm": 4.494979381561279, + "learning_rate": 0.0020809248006145656, + "loss": 7.6237, + "step": 895900 + }, + { + "epoch": 3.6501070894978964, + "grad_norm": 11.90749454498291, + "learning_rate": 0.0020804359342676665, + "loss": 7.627, + "step": 896000 + }, + { + "epoch": 3.6501070894978964, + "eval_MaskedAccuracy": 0.5086819885680661, + "eval_loss": 1.601949691772461, + "eval_runtime": 156.8831, + "eval_samples_per_second": 404.607, + "eval_steps_per_second": 1.581, + "step": 896000 + }, + { + "epoch": 3.650514467521278, + "grad_norm": 9.674173355102539, + "learning_rate": 0.0020799470847060572, + "loss": 7.626, + "step": 896100 + }, + { + "epoch": 3.6509218455446595, + "grad_norm": 3.3969898223876953, + "learning_rate": 0.0020794582519490594, + "loss": 7.6072, + "step": 896200 + }, + { + "epoch": 3.651329223568041, + "grad_norm": 10.800909042358398, + "learning_rate": 0.0020789694360160045, + "loss": 7.5998, + "step": 896300 + }, + { + "epoch": 3.651736601591422, + "grad_norm": 4.3343353271484375, + "learning_rate": 0.0020784806369262138, + "loss": 7.6265, + "step": 896400 + }, + { + "epoch": 3.6521439796148036, + "grad_norm": 6.152104377746582, + "learning_rate": 0.0020779918546990203, + "loss": 7.6163, + "step": 896500 + }, + { + "epoch": 3.652551357638185, + "grad_norm": 4.405981540679932, + "learning_rate": 0.0020775030893537367, + "loss": 7.6275, + "step": 896600 + }, + { + "epoch": 3.6529587356615667, + "grad_norm": 9.420879364013672, + "learning_rate": 0.0020770143409096957, + "loss": 7.6232, + "step": 896700 + }, + { + "epoch": 3.6533661136849482, + "grad_norm": 7.606795310974121, + "learning_rate": 0.0020765256093862147, + "loss": 7.6283, + "step": 896800 + }, + { + "epoch": 3.6537734917083293, + "grad_norm": 4.768980979919434, + "learning_rate": 0.002076036894802622, + "loss": 7.6383, + "step": 896900 + }, + { + "epoch": 3.654180869731711, + "grad_norm": 4.798346996307373, + "learning_rate": 0.0020755481971782334, + "loss": 7.622, + "step": 897000 + }, + { + "epoch": 3.654180869731711, + "eval_MaskedAccuracy": 0.5083818313553284, + "eval_loss": 1.6076077222824097, + "eval_runtime": 161.0187, + "eval_samples_per_second": 394.215, + "eval_steps_per_second": 1.54, + "step": 897000 + }, + { + "epoch": 3.6545882477550924, + "grad_norm": 11.385787010192871, + "learning_rate": 0.002075059516532371, + "loss": 7.6523, + "step": 897100 + }, + { + "epoch": 3.654995625778474, + "grad_norm": 7.296698570251465, + "learning_rate": 0.0020745708528843525, + "loss": 7.6199, + "step": 897200 + }, + { + "epoch": 3.6554030038018555, + "grad_norm": 10.062533378601074, + "learning_rate": 0.0020740822062535014, + "loss": 7.6328, + "step": 897300 + }, + { + "epoch": 3.655810381825237, + "grad_norm": 4.156154632568359, + "learning_rate": 0.0020735935766591353, + "loss": 7.6397, + "step": 897400 + }, + { + "epoch": 3.6562177598486185, + "grad_norm": 5.237282752990723, + "learning_rate": 0.0020731049641205773, + "loss": 7.6019, + "step": 897500 + }, + { + "epoch": 3.656625137872, + "grad_norm": 11.06998062133789, + "learning_rate": 0.0020726163686571385, + "loss": 7.6242, + "step": 897600 + }, + { + "epoch": 3.657032515895381, + "grad_norm": 5.797039985656738, + "learning_rate": 0.002072127790288137, + "loss": 7.6159, + "step": 897700 + }, + { + "epoch": 3.6574398939187627, + "grad_norm": 5.237504482269287, + "learning_rate": 0.0020716392290328883, + "loss": 7.6375, + "step": 897800 + }, + { + "epoch": 3.657847271942144, + "grad_norm": 7.14192533493042, + "learning_rate": 0.002071150684910709, + "loss": 7.6388, + "step": 897900 + }, + { + "epoch": 3.6582546499655257, + "grad_norm": 4.7378106117248535, + "learning_rate": 0.0020706621579409112, + "loss": 7.5909, + "step": 898000 + }, + { + "epoch": 3.6582546499655257, + "eval_MaskedAccuracy": 0.5089759042056345, + "eval_loss": 1.5967822074890137, + "eval_runtime": 171.3986, + "eval_samples_per_second": 370.341, + "eval_steps_per_second": 1.447, + "step": 898000 + }, + { + "epoch": 3.658662027988907, + "grad_norm": 3.3009800910949707, + "learning_rate": 0.002070173648142811, + "loss": 7.6107, + "step": 898100 + }, + { + "epoch": 3.6590694060122884, + "grad_norm": 3.3634684085845947, + "learning_rate": 0.002069685155535723, + "loss": 7.6128, + "step": 898200 + }, + { + "epoch": 3.65947678403567, + "grad_norm": 3.283738136291504, + "learning_rate": 0.00206919668013896, + "loss": 7.6455, + "step": 898300 + }, + { + "epoch": 3.6598841620590514, + "grad_norm": 5.325892925262451, + "learning_rate": 0.002068708221971831, + "loss": 7.6271, + "step": 898400 + }, + { + "epoch": 3.660291540082433, + "grad_norm": 3.7771248817443848, + "learning_rate": 0.002068219781053649, + "loss": 7.6476, + "step": 898500 + }, + { + "epoch": 3.6606989181058145, + "grad_norm": 11.584171295166016, + "learning_rate": 0.0020677313574037274, + "loss": 7.6508, + "step": 898600 + }, + { + "epoch": 3.661106296129196, + "grad_norm": 5.788618564605713, + "learning_rate": 0.0020672429510413723, + "loss": 7.641, + "step": 898700 + }, + { + "epoch": 3.6615136741525776, + "grad_norm": 5.707223892211914, + "learning_rate": 0.0020667545619858924, + "loss": 7.623, + "step": 898800 + }, + { + "epoch": 3.6619210521759586, + "grad_norm": 5.797310829162598, + "learning_rate": 0.0020662661902566028, + "loss": 7.6452, + "step": 898900 + }, + { + "epoch": 3.66232843019934, + "grad_norm": 8.31090259552002, + "learning_rate": 0.0020657778358728067, + "loss": 7.6483, + "step": 899000 + }, + { + "epoch": 3.66232843019934, + "eval_MaskedAccuracy": 0.5084633935316983, + "eval_loss": 1.61117684841156, + "eval_runtime": 166.658, + "eval_samples_per_second": 380.876, + "eval_steps_per_second": 1.488, + "step": 899000 + }, + { + "epoch": 3.6627358082227217, + "grad_norm": 9.342093467712402, + "learning_rate": 0.002065289498853814, + "loss": 7.6256, + "step": 899100 + }, + { + "epoch": 3.6631431862461032, + "grad_norm": 4.870360374450684, + "learning_rate": 0.0020648011792189272, + "loss": 7.6325, + "step": 899200 + }, + { + "epoch": 3.6635505642694848, + "grad_norm": 4.592978477478027, + "learning_rate": 0.002064312876987453, + "loss": 7.6417, + "step": 899300 + }, + { + "epoch": 3.663957942292866, + "grad_norm": 6.682163715362549, + "learning_rate": 0.002063824592178697, + "loss": 7.6283, + "step": 899400 + }, + { + "epoch": 3.6643653203162474, + "grad_norm": 11.536015510559082, + "learning_rate": 0.002063336324811965, + "loss": 7.6433, + "step": 899500 + }, + { + "epoch": 3.664772698339629, + "grad_norm": 6.760446071624756, + "learning_rate": 0.00206284807490656, + "loss": 7.6333, + "step": 899600 + }, + { + "epoch": 3.6651800763630105, + "grad_norm": 2.6141417026519775, + "learning_rate": 0.0020623598424817863, + "loss": 7.6164, + "step": 899700 + }, + { + "epoch": 3.665587454386392, + "grad_norm": 4.794512748718262, + "learning_rate": 0.0020618716275569455, + "loss": 7.6386, + "step": 899800 + }, + { + "epoch": 3.6659948324097735, + "grad_norm": 11.944540023803711, + "learning_rate": 0.00206138343015134, + "loss": 7.6386, + "step": 899900 + }, + { + "epoch": 3.666402210433155, + "grad_norm": 2.631260871887207, + "learning_rate": 0.0020608952502842713, + "loss": 7.6714, + "step": 900000 + }, + { + "epoch": 3.666402210433155, + "eval_MaskedAccuracy": 0.5085446445840142, + "eval_loss": 1.612879991531372, + "eval_runtime": 178.0649, + "eval_samples_per_second": 356.477, + "eval_steps_per_second": 1.393, + "step": 900000 + }, + { + "epoch": 3.6668095884565366, + "grad_norm": 4.964359283447266, + "learning_rate": 0.0020604070879750368, + "loss": 7.6399, + "step": 900100 + }, + { + "epoch": 3.6672169664799177, + "grad_norm": 11.734545707702637, + "learning_rate": 0.0020599189432429364, + "loss": 7.6603, + "step": 900200 + }, + { + "epoch": 3.667624344503299, + "grad_norm": 4.237592697143555, + "learning_rate": 0.002059430816107272, + "loss": 7.6365, + "step": 900300 + }, + { + "epoch": 3.6680317225266807, + "grad_norm": 5.3469743728637695, + "learning_rate": 0.0020589427065873405, + "loss": 7.6163, + "step": 900400 + }, + { + "epoch": 3.6684391005500623, + "grad_norm": 10.180450439453125, + "learning_rate": 0.002058454614702438, + "loss": 7.6176, + "step": 900500 + }, + { + "epoch": 3.6688464785734434, + "grad_norm": 5.476974010467529, + "learning_rate": 0.0020579665404718635, + "loss": 7.642, + "step": 900600 + }, + { + "epoch": 3.669253856596825, + "grad_norm": 3.240492105484009, + "learning_rate": 0.002057478483914913, + "loss": 7.6244, + "step": 900700 + }, + { + "epoch": 3.6696612346202064, + "grad_norm": 4.910428047180176, + "learning_rate": 0.0020569904450508836, + "loss": 7.6391, + "step": 900800 + }, + { + "epoch": 3.670068612643588, + "grad_norm": 8.302295684814453, + "learning_rate": 0.002056502423899065, + "loss": 7.6495, + "step": 900900 + }, + { + "epoch": 3.6704759906669695, + "grad_norm": 3.451448917388916, + "learning_rate": 0.0020560144204787552, + "loss": 7.6515, + "step": 901000 + }, + { + "epoch": 3.6704759906669695, + "eval_MaskedAccuracy": 0.5089203770788138, + "eval_loss": 1.603334665298462, + "eval_runtime": 180.2357, + "eval_samples_per_second": 352.183, + "eval_steps_per_second": 1.376, + "step": 901000 + }, + { + "epoch": 3.670883368690351, + "grad_norm": 3.943767547607422, + "learning_rate": 0.002055526434809248, + "loss": 7.6194, + "step": 901100 + }, + { + "epoch": 3.6712907467137326, + "grad_norm": 9.712082862854004, + "learning_rate": 0.0020550384669098336, + "loss": 7.643, + "step": 901200 + }, + { + "epoch": 3.671698124737114, + "grad_norm": 8.442924499511719, + "learning_rate": 0.002054550516799806, + "loss": 7.6393, + "step": 901300 + }, + { + "epoch": 3.672105502760495, + "grad_norm": 8.293615341186523, + "learning_rate": 0.0020540625844984546, + "loss": 7.6541, + "step": 901400 + }, + { + "epoch": 3.6725128807838767, + "grad_norm": 10.332056999206543, + "learning_rate": 0.002053574670025072, + "loss": 7.6333, + "step": 901500 + }, + { + "epoch": 3.6729202588072583, + "grad_norm": 3.7203426361083984, + "learning_rate": 0.0020530867733989456, + "loss": 7.6398, + "step": 901600 + }, + { + "epoch": 3.67332763683064, + "grad_norm": 3.9798550605773926, + "learning_rate": 0.002052598894639367, + "loss": 7.6438, + "step": 901700 + }, + { + "epoch": 3.6737350148540213, + "grad_norm": 3.9761064052581787, + "learning_rate": 0.0020521110337656237, + "loss": 7.596, + "step": 901800 + }, + { + "epoch": 3.6741423928774024, + "grad_norm": 2.844905138015747, + "learning_rate": 0.0020516231907970026, + "loss": 7.6495, + "step": 901900 + }, + { + "epoch": 3.674549770900784, + "grad_norm": 3.3876631259918213, + "learning_rate": 0.0020511353657527926, + "loss": 7.6413, + "step": 902000 + }, + { + "epoch": 3.674549770900784, + "eval_MaskedAccuracy": 0.5079336132759298, + "eval_loss": 1.6143743991851807, + "eval_runtime": 191.2568, + "eval_samples_per_second": 331.889, + "eval_steps_per_second": 1.297, + "step": 902000 + }, + { + "epoch": 3.6749571489241655, + "grad_norm": 6.008819580078125, + "learning_rate": 0.002050647558652281, + "loss": 7.6347, + "step": 902100 + }, + { + "epoch": 3.675364526947547, + "grad_norm": 3.743192672729492, + "learning_rate": 0.0020501597695147472, + "loss": 7.6377, + "step": 902200 + }, + { + "epoch": 3.6757719049709285, + "grad_norm": 4.820095062255859, + "learning_rate": 0.002049671998359482, + "loss": 7.6569, + "step": 902300 + }, + { + "epoch": 3.67617928299431, + "grad_norm": 3.514479875564575, + "learning_rate": 0.0020491842452057644, + "loss": 7.627, + "step": 902400 + }, + { + "epoch": 3.6765866610176916, + "grad_norm": 5.559212684631348, + "learning_rate": 0.002048696510072885, + "loss": 7.6581, + "step": 902500 + }, + { + "epoch": 3.676994039041073, + "grad_norm": 3.9598398208618164, + "learning_rate": 0.0020482087929801253, + "loss": 7.6323, + "step": 902600 + }, + { + "epoch": 3.6774014170644542, + "grad_norm": 4.086262226104736, + "learning_rate": 0.0020477210939467633, + "loss": 7.6391, + "step": 902700 + }, + { + "epoch": 3.6778087950878358, + "grad_norm": 9.071392059326172, + "learning_rate": 0.002047233412992082, + "loss": 7.6273, + "step": 902800 + }, + { + "epoch": 3.6782161731112173, + "grad_norm": 7.524474143981934, + "learning_rate": 0.0020467457501353622, + "loss": 7.6524, + "step": 902900 + }, + { + "epoch": 3.678623551134599, + "grad_norm": 3.225539445877075, + "learning_rate": 0.0020462581053958865, + "loss": 7.6391, + "step": 903000 + }, + { + "epoch": 3.678623551134599, + "eval_MaskedAccuracy": 0.5081074179945364, + "eval_loss": 1.6174676418304443, + "eval_runtime": 170.825, + "eval_samples_per_second": 371.585, + "eval_steps_per_second": 1.452, + "step": 903000 + }, + { + "epoch": 3.67903092915798, + "grad_norm": 4.473433017730713, + "learning_rate": 0.002045770478792931, + "loss": 7.6309, + "step": 903100 + }, + { + "epoch": 3.6794383071813614, + "grad_norm": 8.447395324707031, + "learning_rate": 0.002045282870345772, + "loss": 7.5943, + "step": 903200 + }, + { + "epoch": 3.679845685204743, + "grad_norm": 10.553620338439941, + "learning_rate": 0.00204479528007369, + "loss": 7.6199, + "step": 903300 + }, + { + "epoch": 3.6802530632281245, + "grad_norm": 7.491590976715088, + "learning_rate": 0.002044307707995961, + "loss": 7.6273, + "step": 903400 + }, + { + "epoch": 3.680660441251506, + "grad_norm": 6.983431339263916, + "learning_rate": 0.002043820154131864, + "loss": 7.5983, + "step": 903500 + }, + { + "epoch": 3.6810678192748876, + "grad_norm": 7.161558628082275, + "learning_rate": 0.002043332618500672, + "loss": 7.6203, + "step": 903600 + }, + { + "epoch": 3.681475197298269, + "grad_norm": 6.19473934173584, + "learning_rate": 0.0020428451011216607, + "loss": 7.6518, + "step": 903700 + }, + { + "epoch": 3.6818825753216506, + "grad_norm": 9.952789306640625, + "learning_rate": 0.0020423576020141055, + "loss": 7.6285, + "step": 903800 + }, + { + "epoch": 3.6822899533450317, + "grad_norm": 6.193504810333252, + "learning_rate": 0.0020418701211972804, + "loss": 7.6272, + "step": 903900 + }, + { + "epoch": 3.6826973313684133, + "grad_norm": 4.269301891326904, + "learning_rate": 0.0020413826586904533, + "loss": 7.6176, + "step": 904000 + }, + { + "epoch": 3.6826973313684133, + "eval_MaskedAccuracy": 0.5083830301982519, + "eval_loss": 1.61100172996521, + "eval_runtime": 193.3793, + "eval_samples_per_second": 328.246, + "eval_steps_per_second": 1.282, + "step": 904000 + }, + { + "epoch": 3.683104709391795, + "grad_norm": 5.101426601409912, + "learning_rate": 0.0020408952145129, + "loss": 7.6138, + "step": 904100 + }, + { + "epoch": 3.6835120874151763, + "grad_norm": 4.785317420959473, + "learning_rate": 0.002040407788683889, + "loss": 7.6337, + "step": 904200 + }, + { + "epoch": 3.683919465438558, + "grad_norm": 4.20262336730957, + "learning_rate": 0.002039920381222697, + "loss": 7.6427, + "step": 904300 + }, + { + "epoch": 3.684326843461939, + "grad_norm": 3.919015407562256, + "learning_rate": 0.0020394329921485854, + "loss": 7.6244, + "step": 904400 + }, + { + "epoch": 3.6847342214853205, + "grad_norm": 7.638157844543457, + "learning_rate": 0.00203894562148083, + "loss": 7.6385, + "step": 904500 + }, + { + "epoch": 3.685141599508702, + "grad_norm": 7.017099380493164, + "learning_rate": 0.002038458269238695, + "loss": 7.6287, + "step": 904600 + }, + { + "epoch": 3.6855489775320835, + "grad_norm": 4.562326431274414, + "learning_rate": 0.0020379709354414514, + "loss": 7.6273, + "step": 904700 + }, + { + "epoch": 3.685956355555465, + "grad_norm": 5.913040637969971, + "learning_rate": 0.0020374836201083623, + "loss": 7.6478, + "step": 904800 + }, + { + "epoch": 3.6863637335788466, + "grad_norm": 6.3806891441345215, + "learning_rate": 0.0020369963232587007, + "loss": 7.6429, + "step": 904900 + }, + { + "epoch": 3.686771111602228, + "grad_norm": 6.356684684753418, + "learning_rate": 0.002036509044911725, + "loss": 7.6437, + "step": 905000 + }, + { + "epoch": 3.686771111602228, + "eval_MaskedAccuracy": 0.5082058590471668, + "eval_loss": 1.6076563596725464, + "eval_runtime": 163.38, + "eval_samples_per_second": 388.518, + "eval_steps_per_second": 1.518, + "step": 905000 + }, + { + "epoch": 3.6871784896256097, + "grad_norm": 6.108409881591797, + "learning_rate": 0.002036021785086704, + "loss": 7.6024, + "step": 905100 + }, + { + "epoch": 3.6875858676489908, + "grad_norm": 4.241765975952148, + "learning_rate": 0.002035534543802898, + "loss": 7.6181, + "step": 905200 + }, + { + "epoch": 3.6879932456723723, + "grad_norm": 7.218456268310547, + "learning_rate": 0.0020350473210795747, + "loss": 7.6158, + "step": 905300 + }, + { + "epoch": 3.688400623695754, + "grad_norm": 2.6576573848724365, + "learning_rate": 0.002034560116935994, + "loss": 7.6536, + "step": 905400 + }, + { + "epoch": 3.6888080017191354, + "grad_norm": 4.5723700523376465, + "learning_rate": 0.002034072931391417, + "loss": 7.6368, + "step": 905500 + }, + { + "epoch": 3.6892153797425165, + "grad_norm": 6.250267028808594, + "learning_rate": 0.002033585764465104, + "loss": 7.6665, + "step": 905600 + }, + { + "epoch": 3.689622757765898, + "grad_norm": 7.656291484832764, + "learning_rate": 0.0020330986161763206, + "loss": 7.639, + "step": 905700 + }, + { + "epoch": 3.6900301357892795, + "grad_norm": 4.797623634338379, + "learning_rate": 0.0020326114865443206, + "loss": 7.6211, + "step": 905800 + }, + { + "epoch": 3.690437513812661, + "grad_norm": 9.249707221984863, + "learning_rate": 0.0020321243755883674, + "loss": 7.6396, + "step": 905900 + }, + { + "epoch": 3.6908448918360426, + "grad_norm": 6.6858439445495605, + "learning_rate": 0.0020316372833277144, + "loss": 7.6265, + "step": 906000 + }, + { + "epoch": 3.6908448918360426, + "eval_MaskedAccuracy": 0.5085401463231789, + "eval_loss": 1.6020891666412354, + "eval_runtime": 182.5086, + "eval_samples_per_second": 347.797, + "eval_steps_per_second": 1.359, + "step": 906000 + }, + { + "epoch": 3.691252269859424, + "grad_norm": 6.077550888061523, + "learning_rate": 0.002031150209781623, + "loss": 7.6321, + "step": 906100 + }, + { + "epoch": 3.6916596478828057, + "grad_norm": 3.146226644515991, + "learning_rate": 0.0020306631549693453, + "loss": 7.6213, + "step": 906200 + }, + { + "epoch": 3.692067025906187, + "grad_norm": 7.1037211418151855, + "learning_rate": 0.002030176118910142, + "loss": 7.6714, + "step": 906300 + }, + { + "epoch": 3.6924744039295683, + "grad_norm": 5.205930709838867, + "learning_rate": 0.0020296891016232657, + "loss": 7.6276, + "step": 906400 + }, + { + "epoch": 3.69288178195295, + "grad_norm": 3.720398187637329, + "learning_rate": 0.0020292021031279696, + "loss": 7.6167, + "step": 906500 + }, + { + "epoch": 3.6932891599763313, + "grad_norm": 6.338487148284912, + "learning_rate": 0.0020287151234435093, + "loss": 7.5683, + "step": 906600 + }, + { + "epoch": 3.693696537999713, + "grad_norm": 9.687365531921387, + "learning_rate": 0.00202822816258914, + "loss": 7.6301, + "step": 906700 + }, + { + "epoch": 3.6941039160230944, + "grad_norm": 4.659130096435547, + "learning_rate": 0.00202774122058411, + "loss": 7.6488, + "step": 906800 + }, + { + "epoch": 3.6945112940464755, + "grad_norm": 6.577041149139404, + "learning_rate": 0.0020272542974476737, + "loss": 7.6651, + "step": 906900 + }, + { + "epoch": 3.694918672069857, + "grad_norm": 4.362553596496582, + "learning_rate": 0.002026767393199078, + "loss": 7.645, + "step": 907000 + }, + { + "epoch": 3.694918672069857, + "eval_MaskedAccuracy": 0.5082458465978787, + "eval_loss": 1.6149495840072632, + "eval_runtime": 176.4897, + "eval_samples_per_second": 359.658, + "eval_steps_per_second": 1.405, + "step": 907000 + }, + { + "epoch": 3.6953260500932386, + "grad_norm": 9.533172607421875, + "learning_rate": 0.0020262805078575764, + "loss": 7.6072, + "step": 907100 + }, + { + "epoch": 3.69573342811662, + "grad_norm": 8.81629753112793, + "learning_rate": 0.0020257936414424136, + "loss": 7.6248, + "step": 907200 + }, + { + "epoch": 3.6961408061400016, + "grad_norm": 4.230309009552002, + "learning_rate": 0.0020253067939728455, + "loss": 7.6214, + "step": 907300 + }, + { + "epoch": 3.696548184163383, + "grad_norm": 7.531736373901367, + "learning_rate": 0.002024819965468115, + "loss": 7.5999, + "step": 907400 + }, + { + "epoch": 3.6969555621867647, + "grad_norm": 7.610778331756592, + "learning_rate": 0.0020243331559474686, + "loss": 7.6341, + "step": 907500 + }, + { + "epoch": 3.697362940210146, + "grad_norm": 3.27508807182312, + "learning_rate": 0.0020238463654301555, + "loss": 7.6104, + "step": 907600 + }, + { + "epoch": 3.6977703182335273, + "grad_norm": 10.008770942687988, + "learning_rate": 0.0020233595939354145, + "loss": 7.6273, + "step": 907700 + }, + { + "epoch": 3.698177696256909, + "grad_norm": 5.908839702606201, + "learning_rate": 0.002022872841482497, + "loss": 7.6468, + "step": 907800 + }, + { + "epoch": 3.6985850742802904, + "grad_norm": 4.560897350311279, + "learning_rate": 0.0020223861080906442, + "loss": 7.6604, + "step": 907900 + }, + { + "epoch": 3.698992452303672, + "grad_norm": 6.814455032348633, + "learning_rate": 0.0020218993937791033, + "loss": 7.6636, + "step": 908000 + }, + { + "epoch": 3.698992452303672, + "eval_MaskedAccuracy": 0.5087985280176558, + "eval_loss": 1.6035388708114624, + "eval_runtime": 170.1775, + "eval_samples_per_second": 372.999, + "eval_steps_per_second": 1.457, + "step": 908000 + }, + { + "epoch": 3.699399830327053, + "grad_norm": 4.749573707580566, + "learning_rate": 0.0020214126985671106, + "loss": 7.6349, + "step": 908100 + }, + { + "epoch": 3.6998072083504345, + "grad_norm": 8.210793495178223, + "learning_rate": 0.0020209260224739096, + "loss": 7.6234, + "step": 908200 + }, + { + "epoch": 3.700214586373816, + "grad_norm": 4.4578657150268555, + "learning_rate": 0.0020204393655187405, + "loss": 7.6381, + "step": 908300 + }, + { + "epoch": 3.7006219643971976, + "grad_norm": 4.6508612632751465, + "learning_rate": 0.0020199527277208473, + "loss": 7.626, + "step": 908400 + }, + { + "epoch": 3.701029342420579, + "grad_norm": 5.958534240722656, + "learning_rate": 0.0020194661090994683, + "loss": 7.6338, + "step": 908500 + }, + { + "epoch": 3.7014367204439607, + "grad_norm": 4.354109764099121, + "learning_rate": 0.0020189795096738376, + "loss": 7.618, + "step": 908600 + }, + { + "epoch": 3.701844098467342, + "grad_norm": 4.007742404937744, + "learning_rate": 0.002018492929463198, + "loss": 7.6354, + "step": 908700 + }, + { + "epoch": 3.7022514764907237, + "grad_norm": 4.210424423217773, + "learning_rate": 0.0020180063684867855, + "loss": 7.6146, + "step": 908800 + }, + { + "epoch": 3.702658854514105, + "grad_norm": 7.69437837600708, + "learning_rate": 0.002017519826763835, + "loss": 7.624, + "step": 908900 + }, + { + "epoch": 3.7030662325374863, + "grad_norm": 7.199598789215088, + "learning_rate": 0.0020170333043135853, + "loss": 7.615, + "step": 909000 + }, + { + "epoch": 3.7030662325374863, + "eval_MaskedAccuracy": 0.5089749605240288, + "eval_loss": 1.6038508415222168, + "eval_runtime": 166.3254, + "eval_samples_per_second": 381.638, + "eval_steps_per_second": 1.491, + "step": 909000 + }, + { + "epoch": 3.703473610560868, + "grad_norm": 8.40762710571289, + "learning_rate": 0.0020165468011552647, + "loss": 7.6256, + "step": 909100 + }, + { + "epoch": 3.7038809885842494, + "grad_norm": 3.7668421268463135, + "learning_rate": 0.0020160603173081135, + "loss": 7.6101, + "step": 909200 + }, + { + "epoch": 3.704288366607631, + "grad_norm": 10.608351707458496, + "learning_rate": 0.0020155738527913624, + "loss": 7.6247, + "step": 909300 + }, + { + "epoch": 3.704695744631012, + "grad_norm": 5.143110752105713, + "learning_rate": 0.002015087407624243, + "loss": 7.631, + "step": 909400 + }, + { + "epoch": 3.7051031226543936, + "grad_norm": 12.672056198120117, + "learning_rate": 0.0020146009818259904, + "loss": 7.624, + "step": 909500 + }, + { + "epoch": 3.705510500677775, + "grad_norm": 8.395350456237793, + "learning_rate": 0.0020141145754158348, + "loss": 7.6635, + "step": 909600 + }, + { + "epoch": 3.7059178787011566, + "grad_norm": 13.078659057617188, + "learning_rate": 0.002013628188413004, + "loss": 7.6088, + "step": 909700 + }, + { + "epoch": 3.706325256724538, + "grad_norm": 5.733210563659668, + "learning_rate": 0.0020131418208367306, + "loss": 7.6195, + "step": 909800 + }, + { + "epoch": 3.7067326347479197, + "grad_norm": 5.2784857749938965, + "learning_rate": 0.0020126554727062434, + "loss": 7.6087, + "step": 909900 + }, + { + "epoch": 3.7071400127713012, + "grad_norm": 4.753120422363281, + "learning_rate": 0.002012169144040761, + "loss": 7.6253, + "step": 910000 + }, + { + "epoch": 3.7071400127713012, + "eval_MaskedAccuracy": 0.5084015905511936, + "eval_loss": 1.610862135887146, + "eval_runtime": 168.2194, + "eval_samples_per_second": 377.34, + "eval_steps_per_second": 1.474, + "step": 910000 + }, + { + "epoch": 3.7075473907946828, + "grad_norm": 6.685218334197998, + "learning_rate": 0.0020116828348595204, + "loss": 7.6265, + "step": 910100 + }, + { + "epoch": 3.707954768818064, + "grad_norm": 8.341331481933594, + "learning_rate": 0.0020111965451817494, + "loss": 7.629, + "step": 910200 + }, + { + "epoch": 3.7083621468414454, + "grad_norm": 7.240558624267578, + "learning_rate": 0.002010710275026669, + "loss": 7.6323, + "step": 910300 + }, + { + "epoch": 3.708769524864827, + "grad_norm": 4.858267307281494, + "learning_rate": 0.0020102240244135082, + "loss": 7.6126, + "step": 910400 + }, + { + "epoch": 3.7091769028882084, + "grad_norm": 7.6505560874938965, + "learning_rate": 0.0020097377933614856, + "loss": 7.6153, + "step": 910500 + }, + { + "epoch": 3.7095842809115895, + "grad_norm": 7.287432670593262, + "learning_rate": 0.0020092515818898276, + "loss": 7.6285, + "step": 910600 + }, + { + "epoch": 3.709991658934971, + "grad_norm": 5.595905303955078, + "learning_rate": 0.0020087653900177542, + "loss": 7.66, + "step": 910700 + }, + { + "epoch": 3.7103990369583526, + "grad_norm": 5.411009788513184, + "learning_rate": 0.0020082792177644865, + "loss": 7.635, + "step": 910800 + }, + { + "epoch": 3.710806414981734, + "grad_norm": 12.026354789733887, + "learning_rate": 0.002007793065149252, + "loss": 7.6275, + "step": 910900 + }, + { + "epoch": 3.7112137930051157, + "grad_norm": 3.368460178375244, + "learning_rate": 0.002007306932191267, + "loss": 7.6035, + "step": 911000 + }, + { + "epoch": 3.7112137930051157, + "eval_MaskedAccuracy": 0.5090320116095475, + "eval_loss": 1.6031590700149536, + "eval_runtime": 170.3081, + "eval_samples_per_second": 372.713, + "eval_steps_per_second": 1.456, + "step": 911000 + }, + { + "epoch": 3.711621171028497, + "grad_norm": 7.255367755889893, + "learning_rate": 0.002006820818909748, + "loss": 7.6008, + "step": 911100 + }, + { + "epoch": 3.7120285490518787, + "grad_norm": 4.47907829284668, + "learning_rate": 0.0020063347253239158, + "loss": 7.6551, + "step": 911200 + }, + { + "epoch": 3.7124359270752603, + "grad_norm": 6.953063011169434, + "learning_rate": 0.002005848651452989, + "loss": 7.6289, + "step": 911300 + }, + { + "epoch": 3.7128433050986414, + "grad_norm": 5.317480564117432, + "learning_rate": 0.0020053625973161875, + "loss": 7.6479, + "step": 911400 + }, + { + "epoch": 3.713250683122023, + "grad_norm": 3.5525434017181396, + "learning_rate": 0.0020048765629327213, + "loss": 7.6311, + "step": 911500 + }, + { + "epoch": 3.7136580611454044, + "grad_norm": 5.438886642456055, + "learning_rate": 0.002004390548321811, + "loss": 7.6205, + "step": 911600 + }, + { + "epoch": 3.714065439168786, + "grad_norm": 3.7494962215423584, + "learning_rate": 0.0020039045535026737, + "loss": 7.6396, + "step": 911700 + }, + { + "epoch": 3.7144728171921675, + "grad_norm": 5.122317314147949, + "learning_rate": 0.0020034185784945178, + "loss": 7.6494, + "step": 911800 + }, + { + "epoch": 3.7148801952155486, + "grad_norm": 5.092377662658691, + "learning_rate": 0.002002932623316553, + "loss": 7.6101, + "step": 911900 + }, + { + "epoch": 3.71528757323893, + "grad_norm": 3.388908863067627, + "learning_rate": 0.0020024466879880014, + "loss": 7.6088, + "step": 912000 + }, + { + "epoch": 3.71528757323893, + "eval_MaskedAccuracy": 0.5082819329059137, + "eval_loss": 1.6137927770614624, + "eval_runtime": 168.6591, + "eval_samples_per_second": 376.357, + "eval_steps_per_second": 1.47, + "step": 912000 + }, + { + "epoch": 3.7156949512623116, + "grad_norm": 9.515368461608887, + "learning_rate": 0.0020019607725280707, + "loss": 7.6392, + "step": 912100 + }, + { + "epoch": 3.716102329285693, + "grad_norm": 5.843565940856934, + "learning_rate": 0.0020014748769559693, + "loss": 7.629, + "step": 912200 + }, + { + "epoch": 3.7165097073090747, + "grad_norm": 5.855798244476318, + "learning_rate": 0.0020009890012909087, + "loss": 7.6139, + "step": 912300 + }, + { + "epoch": 3.7169170853324562, + "grad_norm": 8.065990447998047, + "learning_rate": 0.0020005031455521015, + "loss": 7.6508, + "step": 912400 + }, + { + "epoch": 3.7173244633558378, + "grad_norm": 6.999570369720459, + "learning_rate": 0.0020000173097587505, + "loss": 7.6084, + "step": 912500 + }, + { + "epoch": 3.7177318413792193, + "grad_norm": 6.957723140716553, + "learning_rate": 0.0019995314939300645, + "loss": 7.6511, + "step": 912600 + }, + { + "epoch": 3.7181392194026004, + "grad_norm": 5.770177364349365, + "learning_rate": 0.0019990456980852546, + "loss": 7.6507, + "step": 912700 + }, + { + "epoch": 3.718546597425982, + "grad_norm": 11.67274284362793, + "learning_rate": 0.001998559922243523, + "loss": 7.6312, + "step": 912800 + }, + { + "epoch": 3.7189539754493635, + "grad_norm": 8.449727058410645, + "learning_rate": 0.0019980741664240784, + "loss": 7.5975, + "step": 912900 + }, + { + "epoch": 3.719361353472745, + "grad_norm": 5.722390651702881, + "learning_rate": 0.001997588430646121, + "loss": 7.6141, + "step": 913000 + }, + { + "epoch": 3.719361353472745, + "eval_MaskedAccuracy": 0.5086555214553905, + "eval_loss": 1.6092941761016846, + "eval_runtime": 211.3674, + "eval_samples_per_second": 300.311, + "eval_steps_per_second": 1.173, + "step": 913000 + }, + { + "epoch": 3.719768731496126, + "grad_norm": 7.868215560913086, + "learning_rate": 0.0019971027149288576, + "loss": 7.6129, + "step": 913100 + }, + { + "epoch": 3.7201761095195076, + "grad_norm": 8.220807075500488, + "learning_rate": 0.0019966170192914907, + "loss": 7.635, + "step": 913200 + }, + { + "epoch": 3.720583487542889, + "grad_norm": 4.928590297698975, + "learning_rate": 0.0019961313437532203, + "loss": 7.6159, + "step": 913300 + }, + { + "epoch": 3.7209908655662707, + "grad_norm": 8.91843318939209, + "learning_rate": 0.00199564568833325, + "loss": 7.5896, + "step": 913400 + }, + { + "epoch": 3.721398243589652, + "grad_norm": 7.5331501960754395, + "learning_rate": 0.00199516005305078, + "loss": 7.6472, + "step": 913500 + }, + { + "epoch": 3.7218056216130337, + "grad_norm": 5.197667598724365, + "learning_rate": 0.0019946744379250115, + "loss": 7.6312, + "step": 913600 + }, + { + "epoch": 3.7222129996364153, + "grad_norm": 5.760436058044434, + "learning_rate": 0.001994188842975141, + "loss": 7.6081, + "step": 913700 + }, + { + "epoch": 3.722620377659797, + "grad_norm": 4.419961452484131, + "learning_rate": 0.0019937032682203664, + "loss": 7.5963, + "step": 913800 + }, + { + "epoch": 3.723027755683178, + "grad_norm": 4.65272331237793, + "learning_rate": 0.001993217713679888, + "loss": 7.6263, + "step": 913900 + }, + { + "epoch": 3.7234351337065594, + "grad_norm": 9.407983779907227, + "learning_rate": 0.0019927321793728973, + "loss": 7.6246, + "step": 914000 + }, + { + "epoch": 3.7234351337065594, + "eval_MaskedAccuracy": 0.5081790232648262, + "eval_loss": 1.6192362308502197, + "eval_runtime": 164.5577, + "eval_samples_per_second": 385.737, + "eval_steps_per_second": 1.507, + "step": 914000 + }, + { + "epoch": 3.723842511729941, + "grad_norm": 8.16222858428955, + "learning_rate": 0.0019922466653185945, + "loss": 7.6237, + "step": 914100 + }, + { + "epoch": 3.7242498897533225, + "grad_norm": 4.593684673309326, + "learning_rate": 0.0019917611715361745, + "loss": 7.59, + "step": 914200 + }, + { + "epoch": 3.724657267776704, + "grad_norm": 14.930048942565918, + "learning_rate": 0.0019912756980448297, + "loss": 7.586, + "step": 914300 + }, + { + "epoch": 3.725064645800085, + "grad_norm": 4.083841323852539, + "learning_rate": 0.0019907902448637557, + "loss": 7.6586, + "step": 914400 + }, + { + "epoch": 3.7254720238234666, + "grad_norm": 4.637545108795166, + "learning_rate": 0.001990304812012143, + "loss": 7.6282, + "step": 914500 + }, + { + "epoch": 3.725879401846848, + "grad_norm": 4.426086902618408, + "learning_rate": 0.001989819399509184, + "loss": 7.6296, + "step": 914600 + }, + { + "epoch": 3.7262867798702297, + "grad_norm": 4.19540548324585, + "learning_rate": 0.0019893340073740703, + "loss": 7.6132, + "step": 914700 + }, + { + "epoch": 3.7266941578936112, + "grad_norm": 4.008831977844238, + "learning_rate": 0.001988848635625989, + "loss": 7.6167, + "step": 914800 + }, + { + "epoch": 3.727101535916993, + "grad_norm": 4.061708927154541, + "learning_rate": 0.001988363284284133, + "loss": 7.6498, + "step": 914900 + }, + { + "epoch": 3.7275089139403743, + "grad_norm": 3.444599151611328, + "learning_rate": 0.0019878779533676894, + "loss": 7.6172, + "step": 915000 + }, + { + "epoch": 3.7275089139403743, + "eval_MaskedAccuracy": 0.508393301990947, + "eval_loss": 1.6040995121002197, + "eval_runtime": 211.9712, + "eval_samples_per_second": 299.456, + "eval_steps_per_second": 1.17, + "step": 915000 + }, + { + "epoch": 3.727916291963756, + "grad_norm": 5.284646034240723, + "learning_rate": 0.001987392642895846, + "loss": 7.6335, + "step": 915100 + }, + { + "epoch": 3.728323669987137, + "grad_norm": 6.965787410736084, + "learning_rate": 0.0019869073528877915, + "loss": 7.6545, + "step": 915200 + }, + { + "epoch": 3.7287310480105185, + "grad_norm": 10.287710189819336, + "learning_rate": 0.0019864220833627085, + "loss": 7.6184, + "step": 915300 + }, + { + "epoch": 3.7291384260339, + "grad_norm": 10.985379219055176, + "learning_rate": 0.001985936834339787, + "loss": 7.6207, + "step": 915400 + }, + { + "epoch": 3.7295458040572815, + "grad_norm": 6.770969390869141, + "learning_rate": 0.0019854516058382078, + "loss": 7.6202, + "step": 915500 + }, + { + "epoch": 3.7299531820806626, + "grad_norm": 5.606983184814453, + "learning_rate": 0.001984966397877154, + "loss": 7.6091, + "step": 915600 + }, + { + "epoch": 3.730360560104044, + "grad_norm": 7.817966938018799, + "learning_rate": 0.0019844812104758123, + "loss": 7.6162, + "step": 915700 + }, + { + "epoch": 3.7307679381274257, + "grad_norm": 5.647502899169922, + "learning_rate": 0.0019839960436533587, + "loss": 7.637, + "step": 915800 + }, + { + "epoch": 3.731175316150807, + "grad_norm": 7.4646525382995605, + "learning_rate": 0.001983510897428981, + "loss": 7.609, + "step": 915900 + }, + { + "epoch": 3.7315826941741888, + "grad_norm": 6.90963888168335, + "learning_rate": 0.0019830257718218528, + "loss": 7.6275, + "step": 916000 + }, + { + "epoch": 3.7315826941741888, + "eval_MaskedAccuracy": 0.5085301177683921, + "eval_loss": 1.616068720817566, + "eval_runtime": 184.7032, + "eval_samples_per_second": 343.665, + "eval_steps_per_second": 1.343, + "step": 916000 + }, + { + "epoch": 3.7319900721975703, + "grad_norm": 4.30002498626709, + "learning_rate": 0.001982540666851161, + "loss": 7.6591, + "step": 916100 + }, + { + "epoch": 3.732397450220952, + "grad_norm": 6.250236988067627, + "learning_rate": 0.001982055582536079, + "loss": 7.633, + "step": 916200 + }, + { + "epoch": 3.7328048282443334, + "grad_norm": 8.453675270080566, + "learning_rate": 0.001981570518895786, + "loss": 7.6295, + "step": 916300 + }, + { + "epoch": 3.7332122062677144, + "grad_norm": 4.36338996887207, + "learning_rate": 0.0019810854759494596, + "loss": 7.6035, + "step": 916400 + }, + { + "epoch": 3.733619584291096, + "grad_norm": 10.802000999450684, + "learning_rate": 0.0019806004537162767, + "loss": 7.6431, + "step": 916500 + }, + { + "epoch": 3.7340269623144775, + "grad_norm": 5.39288330078125, + "learning_rate": 0.001980115452215415, + "loss": 7.6356, + "step": 916600 + }, + { + "epoch": 3.734434340337859, + "grad_norm": 13.934906005859375, + "learning_rate": 0.001979630471466044, + "loss": 7.6191, + "step": 916700 + }, + { + "epoch": 3.7348417183612406, + "grad_norm": 4.2907233238220215, + "learning_rate": 0.0019791455114873398, + "loss": 7.6207, + "step": 916800 + }, + { + "epoch": 3.7352490963846217, + "grad_norm": 3.705734968185425, + "learning_rate": 0.001978660572298477, + "loss": 7.6012, + "step": 916900 + }, + { + "epoch": 3.735656474408003, + "grad_norm": 3.0304861068725586, + "learning_rate": 0.001978175653918627, + "loss": 7.6497, + "step": 917000 + }, + { + "epoch": 3.735656474408003, + "eval_MaskedAccuracy": 0.5084901686969557, + "eval_loss": 1.614431619644165, + "eval_runtime": 174.333, + "eval_samples_per_second": 364.108, + "eval_steps_per_second": 1.423, + "step": 917000 + }, + { + "epoch": 3.7360638524313847, + "grad_norm": 6.816333293914795, + "learning_rate": 0.0019776907563669624, + "loss": 7.6248, + "step": 917100 + }, + { + "epoch": 3.7364712304547663, + "grad_norm": 8.567267417907715, + "learning_rate": 0.001977205879662651, + "loss": 7.6239, + "step": 917200 + }, + { + "epoch": 3.736878608478148, + "grad_norm": 4.037398815155029, + "learning_rate": 0.001976721023824865, + "loss": 7.6, + "step": 917300 + }, + { + "epoch": 3.7372859865015293, + "grad_norm": 3.024914264678955, + "learning_rate": 0.00197623618887277, + "loss": 7.6173, + "step": 917400 + }, + { + "epoch": 3.737693364524911, + "grad_norm": 5.302032947540283, + "learning_rate": 0.001975751374825533, + "loss": 7.6221, + "step": 917500 + }, + { + "epoch": 3.7381007425482924, + "grad_norm": 5.088874816894531, + "learning_rate": 0.0019752665817023307, + "loss": 7.6246, + "step": 917600 + }, + { + "epoch": 3.7385081205716735, + "grad_norm": 2.968637704849243, + "learning_rate": 0.001974781809522325, + "loss": 7.6405, + "step": 917700 + }, + { + "epoch": 3.738915498595055, + "grad_norm": 13.242080688476562, + "learning_rate": 0.0019742970583046765, + "loss": 7.6194, + "step": 917800 + }, + { + "epoch": 3.7393228766184365, + "grad_norm": 5.979050636291504, + "learning_rate": 0.0019738123280685564, + "loss": 7.6502, + "step": 917900 + }, + { + "epoch": 3.739730254641818, + "grad_norm": 2.8038132190704346, + "learning_rate": 0.0019733276188331253, + "loss": 7.6651, + "step": 918000 + }, + { + "epoch": 3.739730254641818, + "eval_MaskedAccuracy": 0.5081496086599453, + "eval_loss": 1.61286199092865, + "eval_runtime": 174.8365, + "eval_samples_per_second": 363.059, + "eval_steps_per_second": 1.418, + "step": 918000 + }, + { + "epoch": 3.740137632665199, + "grad_norm": 12.330081939697266, + "learning_rate": 0.001972842930617548, + "loss": 7.6246, + "step": 918100 + }, + { + "epoch": 3.7405450106885807, + "grad_norm": 6.4346537590026855, + "learning_rate": 0.0019723582634409826, + "loss": 7.6453, + "step": 918200 + }, + { + "epoch": 3.7409523887119622, + "grad_norm": 12.415046691894531, + "learning_rate": 0.0019718736173225957, + "loss": 7.6195, + "step": 918300 + }, + { + "epoch": 3.7413597667353438, + "grad_norm": 4.888115882873535, + "learning_rate": 0.001971388992281549, + "loss": 7.637, + "step": 918400 + }, + { + "epoch": 3.7417671447587253, + "grad_norm": 7.780838966369629, + "learning_rate": 0.001970904388336996, + "loss": 7.6309, + "step": 918500 + }, + { + "epoch": 3.742174522782107, + "grad_norm": 11.20776653289795, + "learning_rate": 0.0019704198055080988, + "loss": 7.6622, + "step": 918600 + }, + { + "epoch": 3.7425819008054884, + "grad_norm": 9.625692367553711, + "learning_rate": 0.001969935243814019, + "loss": 7.6533, + "step": 918700 + }, + { + "epoch": 3.74298927882887, + "grad_norm": 7.409010410308838, + "learning_rate": 0.001969450703273912, + "loss": 7.5809, + "step": 918800 + }, + { + "epoch": 3.743396656852251, + "grad_norm": 6.391397476196289, + "learning_rate": 0.0019689661839069296, + "loss": 7.6396, + "step": 918900 + }, + { + "epoch": 3.7438040348756325, + "grad_norm": 5.268857479095459, + "learning_rate": 0.0019684816857322295, + "loss": 7.6487, + "step": 919000 + }, + { + "epoch": 3.7438040348756325, + "eval_MaskedAccuracy": 0.5092185881805066, + "eval_loss": 1.6022710800170898, + "eval_runtime": 167.9693, + "eval_samples_per_second": 377.902, + "eval_steps_per_second": 1.476, + "step": 919000 + }, + { + "epoch": 3.744211412899014, + "grad_norm": 5.5644659996032715, + "learning_rate": 0.00196799720876897, + "loss": 7.6263, + "step": 919100 + }, + { + "epoch": 3.7446187909223956, + "grad_norm": 5.995559215545654, + "learning_rate": 0.001967512753036302, + "loss": 7.649, + "step": 919200 + }, + { + "epoch": 3.745026168945777, + "grad_norm": 8.925673484802246, + "learning_rate": 0.0019670283185533794, + "loss": 7.6201, + "step": 919300 + }, + { + "epoch": 3.745433546969158, + "grad_norm": 4.775676250457764, + "learning_rate": 0.0019665439053393567, + "loss": 7.6222, + "step": 919400 + }, + { + "epoch": 3.7458409249925397, + "grad_norm": 9.755720138549805, + "learning_rate": 0.0019660595134133813, + "loss": 7.6237, + "step": 919500 + }, + { + "epoch": 3.7462483030159213, + "grad_norm": 3.6146299839019775, + "learning_rate": 0.0019655751427946062, + "loss": 7.5949, + "step": 919600 + }, + { + "epoch": 3.746655681039303, + "grad_norm": 6.875545978546143, + "learning_rate": 0.0019650907935021803, + "loss": 7.6283, + "step": 919700 + }, + { + "epoch": 3.7470630590626843, + "grad_norm": 5.4195356369018555, + "learning_rate": 0.0019646064655552543, + "loss": 7.6282, + "step": 919800 + }, + { + "epoch": 3.747470437086066, + "grad_norm": 16.771759033203125, + "learning_rate": 0.001964122158972976, + "loss": 7.6528, + "step": 919900 + }, + { + "epoch": 3.7478778151094474, + "grad_norm": 11.797569274902344, + "learning_rate": 0.00196363787377449, + "loss": 7.6391, + "step": 920000 + }, + { + "epoch": 3.7478778151094474, + "eval_MaskedAccuracy": 0.5082259376828405, + "eval_loss": 1.6109910011291504, + "eval_runtime": 176.7264, + "eval_samples_per_second": 359.177, + "eval_steps_per_second": 1.403, + "step": 920000 + }, + { + "epoch": 3.748285193132829, + "grad_norm": 4.627211570739746, + "learning_rate": 0.001963153609978944, + "loss": 7.5959, + "step": 920100 + }, + { + "epoch": 3.74869257115621, + "grad_norm": 4.405156135559082, + "learning_rate": 0.001962669367605476, + "loss": 7.6312, + "step": 920200 + }, + { + "epoch": 3.7490999491795916, + "grad_norm": 6.155109882354736, + "learning_rate": 0.00196218514667323, + "loss": 7.6155, + "step": 920300 + }, + { + "epoch": 3.749507327202973, + "grad_norm": 8.350790023803711, + "learning_rate": 0.001961700947201364, + "loss": 7.6211, + "step": 920400 + }, + { + "epoch": 3.7499147052263546, + "grad_norm": 4.750356197357178, + "learning_rate": 0.0019612167692090164, + "loss": 7.6295, + "step": 920500 + }, + { + "epoch": 3.7503220832497357, + "grad_norm": 4.070127487182617, + "learning_rate": 0.0019607326127153266, + "loss": 7.59, + "step": 920600 + }, + { + "epoch": 3.7507294612731172, + "grad_norm": 6.613310813903809, + "learning_rate": 0.0019602484777394382, + "loss": 7.6204, + "step": 920700 + }, + { + "epoch": 3.7511368392964988, + "grad_norm": 4.784940242767334, + "learning_rate": 0.001959764364300491, + "loss": 7.6265, + "step": 920800 + }, + { + "epoch": 3.7515442173198803, + "grad_norm": 4.124524116516113, + "learning_rate": 0.001959280272417624, + "loss": 7.5986, + "step": 920900 + }, + { + "epoch": 3.751951595343262, + "grad_norm": 6.608432292938232, + "learning_rate": 0.0019587962021099764, + "loss": 7.6198, + "step": 921000 + }, + { + "epoch": 3.751951595343262, + "eval_MaskedAccuracy": 0.5098709177786279, + "eval_loss": 1.603236436843872, + "eval_runtime": 207.7534, + "eval_samples_per_second": 305.535, + "eval_steps_per_second": 1.194, + "step": 921000 + }, + { + "epoch": 3.7523589733666434, + "grad_norm": 15.3954439163208, + "learning_rate": 0.001958312153396688, + "loss": 7.6055, + "step": 921100 + }, + { + "epoch": 3.752766351390025, + "grad_norm": 8.494688987731934, + "learning_rate": 0.001957828126296891, + "loss": 7.6394, + "step": 921200 + }, + { + "epoch": 3.7531737294134064, + "grad_norm": 6.292644023895264, + "learning_rate": 0.001957344120829729, + "loss": 7.6064, + "step": 921300 + }, + { + "epoch": 3.7535811074367875, + "grad_norm": 7.271989345550537, + "learning_rate": 0.001956860137014331, + "loss": 7.5715, + "step": 921400 + }, + { + "epoch": 3.753988485460169, + "grad_norm": 5.526138782501221, + "learning_rate": 0.0019563761748698323, + "loss": 7.6033, + "step": 921500 + }, + { + "epoch": 3.7543958634835506, + "grad_norm": 4.571513652801514, + "learning_rate": 0.001955892234415367, + "loss": 7.6333, + "step": 921600 + }, + { + "epoch": 3.754803241506932, + "grad_norm": 4.379602432250977, + "learning_rate": 0.00195540831567007, + "loss": 7.6146, + "step": 921700 + }, + { + "epoch": 3.7552106195303137, + "grad_norm": 5.573449611663818, + "learning_rate": 0.0019549244186530712, + "loss": 7.6089, + "step": 921800 + }, + { + "epoch": 3.7556179975536947, + "grad_norm": 4.742440223693848, + "learning_rate": 0.0019544405433835014, + "loss": 7.6135, + "step": 921900 + }, + { + "epoch": 3.7560253755770763, + "grad_norm": 7.930488109588623, + "learning_rate": 0.001953956689880494, + "loss": 7.6125, + "step": 922000 + }, + { + "epoch": 3.7560253755770763, + "eval_MaskedAccuracy": 0.508604559251422, + "eval_loss": 1.6128079891204834, + "eval_runtime": 178.4683, + "eval_samples_per_second": 355.671, + "eval_steps_per_second": 1.39, + "step": 922000 + }, + { + "epoch": 3.756432753600458, + "grad_norm": 5.972048759460449, + "learning_rate": 0.001953472858163176, + "loss": 7.6117, + "step": 922100 + }, + { + "epoch": 3.7568401316238393, + "grad_norm": 12.296833038330078, + "learning_rate": 0.0019529890482506767, + "loss": 7.614, + "step": 922200 + }, + { + "epoch": 3.757247509647221, + "grad_norm": 14.029672622680664, + "learning_rate": 0.001952505260162131, + "loss": 7.5928, + "step": 922300 + }, + { + "epoch": 3.7576548876706024, + "grad_norm": 5.605931758880615, + "learning_rate": 0.0019520214939166565, + "loss": 7.6218, + "step": 922400 + }, + { + "epoch": 3.758062265693984, + "grad_norm": 3.330333709716797, + "learning_rate": 0.0019515377495333808, + "loss": 7.5931, + "step": 922500 + }, + { + "epoch": 3.7584696437173655, + "grad_norm": 13.505358695983887, + "learning_rate": 0.0019510540270314277, + "loss": 7.6312, + "step": 922600 + }, + { + "epoch": 3.7588770217407466, + "grad_norm": 6.706369876861572, + "learning_rate": 0.0019505703264299232, + "loss": 7.6074, + "step": 922700 + }, + { + "epoch": 3.759284399764128, + "grad_norm": 5.232789039611816, + "learning_rate": 0.0019500866477479882, + "loss": 7.5991, + "step": 922800 + }, + { + "epoch": 3.7596917777875096, + "grad_norm": 7.2715044021606445, + "learning_rate": 0.0019496029910047465, + "loss": 7.6205, + "step": 922900 + }, + { + "epoch": 3.760099155810891, + "grad_norm": 12.410377502441406, + "learning_rate": 0.001949119356219318, + "loss": 7.6358, + "step": 923000 + }, + { + "epoch": 3.760099155810891, + "eval_MaskedAccuracy": 0.5088255505899758, + "eval_loss": 1.6051766872406006, + "eval_runtime": 167.2217, + "eval_samples_per_second": 379.592, + "eval_steps_per_second": 1.483, + "step": 923000 + }, + { + "epoch": 3.7605065338342722, + "grad_norm": 3.711641788482666, + "learning_rate": 0.0019486357434108253, + "loss": 7.6025, + "step": 923100 + }, + { + "epoch": 3.760913911857654, + "grad_norm": 8.183929443359375, + "learning_rate": 0.001948152152598386, + "loss": 7.6324, + "step": 923200 + }, + { + "epoch": 3.7613212898810353, + "grad_norm": 7.197895526885986, + "learning_rate": 0.0019476685838011201, + "loss": 7.6022, + "step": 923300 + }, + { + "epoch": 3.761728667904417, + "grad_norm": 10.465385437011719, + "learning_rate": 0.001947185037038145, + "loss": 7.6294, + "step": 923400 + }, + { + "epoch": 3.7621360459277984, + "grad_norm": 3.7687559127807617, + "learning_rate": 0.0019467015123285816, + "loss": 7.6236, + "step": 923500 + }, + { + "epoch": 3.76254342395118, + "grad_norm": 3.5964677333831787, + "learning_rate": 0.0019462180096915425, + "loss": 7.6261, + "step": 923600 + }, + { + "epoch": 3.7629508019745614, + "grad_norm": 7.2088117599487305, + "learning_rate": 0.0019457345291461424, + "loss": 7.6201, + "step": 923700 + }, + { + "epoch": 3.763358179997943, + "grad_norm": 2.728327512741089, + "learning_rate": 0.0019452510707114983, + "loss": 7.6378, + "step": 923800 + }, + { + "epoch": 3.763765558021324, + "grad_norm": 4.735380172729492, + "learning_rate": 0.0019447676344067238, + "loss": 7.6563, + "step": 923900 + }, + { + "epoch": 3.7641729360447056, + "grad_norm": 8.870068550109863, + "learning_rate": 0.0019442842202509302, + "loss": 7.6321, + "step": 924000 + }, + { + "epoch": 3.7641729360447056, + "eval_MaskedAccuracy": 0.5082377429610162, + "eval_loss": 1.6108441352844238, + "eval_runtime": 169.2683, + "eval_samples_per_second": 375.002, + "eval_steps_per_second": 1.465, + "step": 924000 + }, + { + "epoch": 3.764580314068087, + "grad_norm": 6.801137924194336, + "learning_rate": 0.0019438008282632287, + "loss": 7.6365, + "step": 924100 + }, + { + "epoch": 3.7649876920914687, + "grad_norm": 11.852954864501953, + "learning_rate": 0.0019433174584627332, + "loss": 7.6249, + "step": 924200 + }, + { + "epoch": 3.76539507011485, + "grad_norm": 4.879085063934326, + "learning_rate": 0.0019428341108685529, + "loss": 7.6219, + "step": 924300 + }, + { + "epoch": 3.7658024481382313, + "grad_norm": 8.953462600708008, + "learning_rate": 0.0019423507854997965, + "loss": 7.6171, + "step": 924400 + }, + { + "epoch": 3.766209826161613, + "grad_norm": 6.114043712615967, + "learning_rate": 0.0019418674823755747, + "loss": 7.6225, + "step": 924500 + }, + { + "epoch": 3.7666172041849943, + "grad_norm": 4.322595119476318, + "learning_rate": 0.001941384201514992, + "loss": 7.6156, + "step": 924600 + }, + { + "epoch": 3.767024582208376, + "grad_norm": 9.810853958129883, + "learning_rate": 0.0019409009429371554, + "loss": 7.6142, + "step": 924700 + }, + { + "epoch": 3.7674319602317574, + "grad_norm": 4.716188907623291, + "learning_rate": 0.0019404177066611708, + "loss": 7.6106, + "step": 924800 + }, + { + "epoch": 3.767839338255139, + "grad_norm": 5.210825443267822, + "learning_rate": 0.001939934492706145, + "loss": 7.6208, + "step": 924900 + }, + { + "epoch": 3.7682467162785205, + "grad_norm": 6.435670375823975, + "learning_rate": 0.0019394513010911813, + "loss": 7.6332, + "step": 925000 + }, + { + "epoch": 3.7682467162785205, + "eval_MaskedAccuracy": 0.5089074662516333, + "eval_loss": 1.6012661457061768, + "eval_runtime": 172.1616, + "eval_samples_per_second": 368.7, + "eval_steps_per_second": 1.441, + "step": 925000 + }, + { + "epoch": 3.768654094301902, + "grad_norm": 3.518427848815918, + "learning_rate": 0.0019389681318353832, + "loss": 7.6217, + "step": 925100 + }, + { + "epoch": 3.769061472325283, + "grad_norm": 4.321313858032227, + "learning_rate": 0.0019384849849578495, + "loss": 7.6026, + "step": 925200 + }, + { + "epoch": 3.7694688503486646, + "grad_norm": 5.672753810882568, + "learning_rate": 0.0019380018604776858, + "loss": 7.6097, + "step": 925300 + }, + { + "epoch": 3.769876228372046, + "grad_norm": 7.020403861999512, + "learning_rate": 0.001937518758413992, + "loss": 7.6288, + "step": 925400 + }, + { + "epoch": 3.7702836063954277, + "grad_norm": 14.594400405883789, + "learning_rate": 0.0019370356787858662, + "loss": 7.6228, + "step": 925500 + }, + { + "epoch": 3.770690984418809, + "grad_norm": 5.069800853729248, + "learning_rate": 0.0019365526216124085, + "loss": 7.6339, + "step": 925600 + }, + { + "epoch": 3.7710983624421903, + "grad_norm": 3.8607585430145264, + "learning_rate": 0.0019360695869127139, + "loss": 7.5989, + "step": 925700 + }, + { + "epoch": 3.771505740465572, + "grad_norm": 4.882491588592529, + "learning_rate": 0.0019355865747058842, + "loss": 7.6535, + "step": 925800 + }, + { + "epoch": 3.7719131184889534, + "grad_norm": 5.538855075836182, + "learning_rate": 0.0019351035850110126, + "loss": 7.6216, + "step": 925900 + }, + { + "epoch": 3.772320496512335, + "grad_norm": 18.590662002563477, + "learning_rate": 0.001934620617847193, + "loss": 7.6136, + "step": 926000 + }, + { + "epoch": 3.772320496512335, + "eval_MaskedAccuracy": 0.5085267091249369, + "eval_loss": 1.6080429553985596, + "eval_runtime": 173.6734, + "eval_samples_per_second": 365.491, + "eval_steps_per_second": 1.428, + "step": 926000 + }, + { + "epoch": 3.7727278745357165, + "grad_norm": 4.088473796844482, + "learning_rate": 0.0019341376732335217, + "loss": 7.614, + "step": 926100 + }, + { + "epoch": 3.773135252559098, + "grad_norm": 8.415093421936035, + "learning_rate": 0.0019336547511890896, + "loss": 7.6151, + "step": 926200 + }, + { + "epoch": 3.7735426305824795, + "grad_norm": 5.7778120040893555, + "learning_rate": 0.001933171851732991, + "loss": 7.5784, + "step": 926300 + }, + { + "epoch": 3.7739500086058606, + "grad_norm": 10.873912811279297, + "learning_rate": 0.0019326889748843164, + "loss": 7.6529, + "step": 926400 + }, + { + "epoch": 3.774357386629242, + "grad_norm": 10.82746696472168, + "learning_rate": 0.00193220612066216, + "loss": 7.6368, + "step": 926500 + }, + { + "epoch": 3.7747647646526237, + "grad_norm": 6.487828731536865, + "learning_rate": 0.0019317232890856083, + "loss": 7.6463, + "step": 926600 + }, + { + "epoch": 3.775172142676005, + "grad_norm": 3.0424578189849854, + "learning_rate": 0.0019312404801737518, + "loss": 7.6184, + "step": 926700 + }, + { + "epoch": 3.7755795206993867, + "grad_norm": 3.9002697467803955, + "learning_rate": 0.001930757693945678, + "loss": 7.6292, + "step": 926800 + }, + { + "epoch": 3.775986898722768, + "grad_norm": 7.217895984649658, + "learning_rate": 0.0019302749304204731, + "loss": 7.6211, + "step": 926900 + }, + { + "epoch": 3.7763942767461494, + "grad_norm": 9.165586471557617, + "learning_rate": 0.0019297921896172245, + "loss": 7.6304, + "step": 927000 + }, + { + "epoch": 3.7763942767461494, + "eval_MaskedAccuracy": 0.5088129069204577, + "eval_loss": 1.6102675199508667, + "eval_runtime": 184.8197, + "eval_samples_per_second": 343.448, + "eval_steps_per_second": 1.342, + "step": 927000 + }, + { + "epoch": 3.776801654769531, + "grad_norm": 7.244925022125244, + "learning_rate": 0.0019293094715550159, + "loss": 7.6093, + "step": 927100 + }, + { + "epoch": 3.7772090327929124, + "grad_norm": 10.581029891967773, + "learning_rate": 0.0019288267762529328, + "loss": 7.612, + "step": 927200 + }, + { + "epoch": 3.777616410816294, + "grad_norm": 3.60493540763855, + "learning_rate": 0.0019283441037300602, + "loss": 7.6197, + "step": 927300 + }, + { + "epoch": 3.7780237888396755, + "grad_norm": 10.525854110717773, + "learning_rate": 0.001927861454005478, + "loss": 7.6008, + "step": 927400 + }, + { + "epoch": 3.778431166863057, + "grad_norm": 11.118393898010254, + "learning_rate": 0.0019273788270982703, + "loss": 7.631, + "step": 927500 + }, + { + "epoch": 3.7788385448864386, + "grad_norm": 4.733307361602783, + "learning_rate": 0.001926896223027518, + "loss": 7.6265, + "step": 927600 + }, + { + "epoch": 3.7792459229098196, + "grad_norm": 3.1409990787506104, + "learning_rate": 0.0019264136418122996, + "loss": 7.6193, + "step": 927700 + }, + { + "epoch": 3.779653300933201, + "grad_norm": 7.09230899810791, + "learning_rate": 0.001925931083471695, + "loss": 7.6131, + "step": 927800 + }, + { + "epoch": 3.7800606789565827, + "grad_norm": 3.0488314628601074, + "learning_rate": 0.0019254485480247794, + "loss": 7.6366, + "step": 927900 + }, + { + "epoch": 3.7804680569799642, + "grad_norm": 5.295105457305908, + "learning_rate": 0.0019249660354906334, + "loss": 7.6328, + "step": 928000 + }, + { + "epoch": 3.7804680569799642, + "eval_MaskedAccuracy": 0.5088230856977412, + "eval_loss": 1.6120420694351196, + "eval_runtime": 177.4072, + "eval_samples_per_second": 357.798, + "eval_steps_per_second": 1.398, + "step": 928000 + }, + { + "epoch": 3.7808754350033453, + "grad_norm": 4.067548751831055, + "learning_rate": 0.0019244835458883335, + "loss": 7.5919, + "step": 928100 + }, + { + "epoch": 3.781282813026727, + "grad_norm": 6.178030014038086, + "learning_rate": 0.001924001079236952, + "loss": 7.6033, + "step": 928200 + }, + { + "epoch": 3.7816901910501084, + "grad_norm": 4.132225036621094, + "learning_rate": 0.0019235186355555647, + "loss": 7.6217, + "step": 928300 + }, + { + "epoch": 3.78209756907349, + "grad_norm": 3.7145018577575684, + "learning_rate": 0.0019230362148632463, + "loss": 7.6341, + "step": 928400 + }, + { + "epoch": 3.7825049470968715, + "grad_norm": 6.242832660675049, + "learning_rate": 0.001922553817179067, + "loss": 7.6216, + "step": 928500 + }, + { + "epoch": 3.782912325120253, + "grad_norm": 10.651412963867188, + "learning_rate": 0.0019220714425221036, + "loss": 7.6105, + "step": 928600 + }, + { + "epoch": 3.7833197031436345, + "grad_norm": 3.809922695159912, + "learning_rate": 0.0019215890909114223, + "loss": 7.6333, + "step": 928700 + }, + { + "epoch": 3.783727081167016, + "grad_norm": 4.2344255447387695, + "learning_rate": 0.001921106762366095, + "loss": 7.6358, + "step": 928800 + }, + { + "epoch": 3.784134459190397, + "grad_norm": 5.805576324462891, + "learning_rate": 0.00192062445690519, + "loss": 7.6337, + "step": 928900 + }, + { + "epoch": 3.7845418372137787, + "grad_norm": 3.8189687728881836, + "learning_rate": 0.0019201421745477757, + "loss": 7.63, + "step": 929000 + }, + { + "epoch": 3.7845418372137787, + "eval_MaskedAccuracy": 0.5085324869720714, + "eval_loss": 1.6022095680236816, + "eval_runtime": 218.4841, + "eval_samples_per_second": 290.529, + "eval_steps_per_second": 1.135, + "step": 929000 + }, + { + "epoch": 3.78494921523716, + "grad_norm": 4.850886344909668, + "learning_rate": 0.0019196599153129196, + "loss": 7.6288, + "step": 929100 + }, + { + "epoch": 3.7853565932605417, + "grad_norm": 4.773021697998047, + "learning_rate": 0.0019191776792196885, + "loss": 7.6495, + "step": 929200 + }, + { + "epoch": 3.7857639712839233, + "grad_norm": 9.408202171325684, + "learning_rate": 0.0019186954662871458, + "loss": 7.6224, + "step": 929300 + }, + { + "epoch": 3.7861713493073044, + "grad_norm": 9.539839744567871, + "learning_rate": 0.0019182132765343577, + "loss": 7.6402, + "step": 929400 + }, + { + "epoch": 3.786578727330686, + "grad_norm": 7.151216983795166, + "learning_rate": 0.0019177311099803872, + "loss": 7.601, + "step": 929500 + }, + { + "epoch": 3.7869861053540674, + "grad_norm": 5.781540870666504, + "learning_rate": 0.0019172489666442968, + "loss": 7.6295, + "step": 929600 + }, + { + "epoch": 3.787393483377449, + "grad_norm": 8.596144676208496, + "learning_rate": 0.0019167668465451477, + "loss": 7.6083, + "step": 929700 + }, + { + "epoch": 3.7878008614008305, + "grad_norm": 3.7301230430603027, + "learning_rate": 0.0019162847497020026, + "loss": 7.6516, + "step": 929800 + }, + { + "epoch": 3.788208239424212, + "grad_norm": 10.695462226867676, + "learning_rate": 0.0019158026761339204, + "loss": 7.5908, + "step": 929900 + }, + { + "epoch": 3.7886156174475936, + "grad_norm": 5.438191890716553, + "learning_rate": 0.001915320625859957, + "loss": 7.6035, + "step": 930000 + }, + { + "epoch": 3.7886156174475936, + "eval_MaskedAccuracy": 0.5088793106898304, + "eval_loss": 1.6012581586837769, + "eval_runtime": 202.1123, + "eval_samples_per_second": 314.063, + "eval_steps_per_second": 1.227, + "step": 930000 + }, + { + "epoch": 3.789022995470975, + "grad_norm": 8.394279479980469, + "learning_rate": 0.0019148385988991758, + "loss": 7.6457, + "step": 930100 + }, + { + "epoch": 3.789430373494356, + "grad_norm": 3.487502336502075, + "learning_rate": 0.001914356595270628, + "loss": 7.6192, + "step": 930200 + }, + { + "epoch": 3.7898377515177377, + "grad_norm": 8.857083320617676, + "learning_rate": 0.0019138746149933753, + "loss": 7.601, + "step": 930300 + }, + { + "epoch": 3.7902451295411193, + "grad_norm": 6.209765911102295, + "learning_rate": 0.0019133926580864706, + "loss": 7.6184, + "step": 930400 + }, + { + "epoch": 3.790652507564501, + "grad_norm": 12.463996887207031, + "learning_rate": 0.0019129107245689683, + "loss": 7.6209, + "step": 930500 + }, + { + "epoch": 3.791059885587882, + "grad_norm": 9.588804244995117, + "learning_rate": 0.001912428814459921, + "loss": 7.5902, + "step": 930600 + }, + { + "epoch": 3.7914672636112634, + "grad_norm": 9.467856407165527, + "learning_rate": 0.001911946927778382, + "loss": 7.6354, + "step": 930700 + }, + { + "epoch": 3.791874641634645, + "grad_norm": 9.193414688110352, + "learning_rate": 0.0019114650645434032, + "loss": 7.6607, + "step": 930800 + }, + { + "epoch": 3.7922820196580265, + "grad_norm": 10.822127342224121, + "learning_rate": 0.001910983224774035, + "loss": 7.6242, + "step": 930900 + }, + { + "epoch": 3.792689397681408, + "grad_norm": 5.29072380065918, + "learning_rate": 0.001910501408489326, + "loss": 7.6329, + "step": 931000 + }, + { + "epoch": 3.792689397681408, + "eval_MaskedAccuracy": 0.5088497738630215, + "eval_loss": 1.6049249172210693, + "eval_runtime": 173.2161, + "eval_samples_per_second": 366.456, + "eval_steps_per_second": 1.432, + "step": 931000 + }, + { + "epoch": 3.7930967757047895, + "grad_norm": 6.270878314971924, + "learning_rate": 0.0019100196157083252, + "loss": 7.6315, + "step": 931100 + }, + { + "epoch": 3.793504153728171, + "grad_norm": 8.242561340332031, + "learning_rate": 0.0019095378464500808, + "loss": 7.6053, + "step": 931200 + }, + { + "epoch": 3.7939115317515526, + "grad_norm": 3.960211992263794, + "learning_rate": 0.001909056100733643, + "loss": 7.6133, + "step": 931300 + }, + { + "epoch": 3.7943189097749337, + "grad_norm": 6.9959235191345215, + "learning_rate": 0.0019085743785780516, + "loss": 7.6454, + "step": 931400 + }, + { + "epoch": 3.7947262877983152, + "grad_norm": 4.7930192947387695, + "learning_rate": 0.0019080926800023575, + "loss": 7.6258, + "step": 931500 + }, + { + "epoch": 3.7951336658216968, + "grad_norm": 5.788893699645996, + "learning_rate": 0.0019076110050256006, + "loss": 7.6056, + "step": 931600 + }, + { + "epoch": 3.7955410438450783, + "grad_norm": 10.426204681396484, + "learning_rate": 0.0019071293536668263, + "loss": 7.6422, + "step": 931700 + }, + { + "epoch": 3.79594842186846, + "grad_norm": 7.6468682289123535, + "learning_rate": 0.001906647725945077, + "loss": 7.6086, + "step": 931800 + }, + { + "epoch": 3.796355799891841, + "grad_norm": 4.9427266120910645, + "learning_rate": 0.0019061661218793935, + "loss": 7.6002, + "step": 931900 + }, + { + "epoch": 3.7967631779152224, + "grad_norm": 9.223947525024414, + "learning_rate": 0.0019056845414888162, + "loss": 7.63, + "step": 932000 + }, + { + "epoch": 3.7967631779152224, + "eval_MaskedAccuracy": 0.5096634473074941, + "eval_loss": 1.6023601293563843, + "eval_runtime": 183.5559, + "eval_samples_per_second": 345.813, + "eval_steps_per_second": 1.351, + "step": 932000 + }, + { + "epoch": 3.797170555938604, + "grad_norm": 6.308687210083008, + "learning_rate": 0.0019052029847923845, + "loss": 7.5935, + "step": 932100 + }, + { + "epoch": 3.7975779339619855, + "grad_norm": 8.173504829406738, + "learning_rate": 0.001904721451809135, + "loss": 7.6145, + "step": 932200 + }, + { + "epoch": 3.797985311985367, + "grad_norm": 13.783915519714355, + "learning_rate": 0.0019042399425581089, + "loss": 7.5895, + "step": 932300 + }, + { + "epoch": 3.7983926900087486, + "grad_norm": 6.4456987380981445, + "learning_rate": 0.0019037584570583406, + "loss": 7.6284, + "step": 932400 + }, + { + "epoch": 3.79880006803213, + "grad_norm": 3.5840206146240234, + "learning_rate": 0.0019032769953288655, + "loss": 7.6604, + "step": 932500 + }, + { + "epoch": 3.7992074460555116, + "grad_norm": 11.0537109375, + "learning_rate": 0.00190279555738872, + "loss": 7.6327, + "step": 932600 + }, + { + "epoch": 3.7996148240788927, + "grad_norm": 4.1343913078308105, + "learning_rate": 0.0019023141432569365, + "loss": 7.6355, + "step": 932700 + }, + { + "epoch": 3.8000222021022743, + "grad_norm": 8.910087585449219, + "learning_rate": 0.0019018327529525486, + "loss": 7.5976, + "step": 932800 + }, + { + "epoch": 3.800429580125656, + "grad_norm": 4.534204959869385, + "learning_rate": 0.0019013513864945871, + "loss": 7.6299, + "step": 932900 + }, + { + "epoch": 3.8008369581490373, + "grad_norm": 3.7702343463897705, + "learning_rate": 0.001900870043902081, + "loss": 7.6218, + "step": 933000 + }, + { + "epoch": 3.8008369581490373, + "eval_MaskedAccuracy": 0.5087608229897715, + "eval_loss": 1.6011755466461182, + "eval_runtime": 168.4595, + "eval_samples_per_second": 376.803, + "eval_steps_per_second": 1.472, + "step": 933000 + }, + { + "epoch": 3.8012443361724184, + "grad_norm": 20.06410789489746, + "learning_rate": 0.0019003887251940656, + "loss": 7.6295, + "step": 933100 + }, + { + "epoch": 3.8016517141958, + "grad_norm": 7.299319267272949, + "learning_rate": 0.001899907430389567, + "loss": 7.6214, + "step": 933200 + }, + { + "epoch": 3.8020590922191815, + "grad_norm": 9.940072059631348, + "learning_rate": 0.0018994261595076155, + "loss": 7.6014, + "step": 933300 + }, + { + "epoch": 3.802466470242563, + "grad_norm": 9.4749755859375, + "learning_rate": 0.001898944912567237, + "loss": 7.6262, + "step": 933400 + }, + { + "epoch": 3.8028738482659445, + "grad_norm": 8.810791969299316, + "learning_rate": 0.0018984636895874555, + "loss": 7.6003, + "step": 933500 + }, + { + "epoch": 3.803281226289326, + "grad_norm": 5.451295852661133, + "learning_rate": 0.0018979824905872994, + "loss": 7.6203, + "step": 933600 + }, + { + "epoch": 3.8036886043127076, + "grad_norm": 9.41309642791748, + "learning_rate": 0.0018975013155857938, + "loss": 7.628, + "step": 933700 + }, + { + "epoch": 3.804095982336089, + "grad_norm": 3.6500532627105713, + "learning_rate": 0.0018970201646019583, + "loss": 7.6062, + "step": 933800 + }, + { + "epoch": 3.8045033603594702, + "grad_norm": 3.9726932048797607, + "learning_rate": 0.0018965390376548175, + "loss": 7.6384, + "step": 933900 + }, + { + "epoch": 3.8049107383828518, + "grad_norm": 18.39219856262207, + "learning_rate": 0.0018960579347633922, + "loss": 7.6166, + "step": 934000 + }, + { + "epoch": 3.8049107383828518, + "eval_MaskedAccuracy": 0.5078392971486618, + "eval_loss": 1.615269660949707, + "eval_runtime": 165.284, + "eval_samples_per_second": 384.042, + "eval_steps_per_second": 1.5, + "step": 934000 + }, + { + "epoch": 3.8053181164062333, + "grad_norm": 7.437594890594482, + "learning_rate": 0.001895576855946705, + "loss": 7.6228, + "step": 934100 + }, + { + "epoch": 3.805725494429615, + "grad_norm": 5.421675205230713, + "learning_rate": 0.001895095801223774, + "loss": 7.6216, + "step": 934200 + }, + { + "epoch": 3.8061328724529964, + "grad_norm": 4.602786540985107, + "learning_rate": 0.001894614770613619, + "loss": 7.6374, + "step": 934300 + }, + { + "epoch": 3.8065402504763775, + "grad_norm": 3.846876621246338, + "learning_rate": 0.001894133764135255, + "loss": 7.5964, + "step": 934400 + }, + { + "epoch": 3.806947628499759, + "grad_norm": 3.219874143600464, + "learning_rate": 0.0018936527818077004, + "loss": 7.5776, + "step": 934500 + }, + { + "epoch": 3.8073550065231405, + "grad_norm": 6.049060344696045, + "learning_rate": 0.0018931718236499724, + "loss": 7.6029, + "step": 934600 + }, + { + "epoch": 3.807762384546522, + "grad_norm": 8.037505149841309, + "learning_rate": 0.0018926908896810847, + "loss": 7.5972, + "step": 934700 + }, + { + "epoch": 3.8081697625699036, + "grad_norm": 6.002288341522217, + "learning_rate": 0.0018922099799200507, + "loss": 7.6172, + "step": 934800 + }, + { + "epoch": 3.808577140593285, + "grad_norm": 8.529500961303711, + "learning_rate": 0.0018917290943858815, + "loss": 7.648, + "step": 934900 + }, + { + "epoch": 3.8089845186166666, + "grad_norm": 4.888249397277832, + "learning_rate": 0.0018912482330975921, + "loss": 7.6155, + "step": 935000 + }, + { + "epoch": 3.8089845186166666, + "eval_MaskedAccuracy": 0.5092839951477184, + "eval_loss": 1.6106300354003906, + "eval_runtime": 175.5796, + "eval_samples_per_second": 361.523, + "eval_steps_per_second": 1.412, + "step": 935000 + }, + { + "epoch": 3.809391896640048, + "grad_norm": 12.686517715454102, + "learning_rate": 0.0018907673960741923, + "loss": 7.6283, + "step": 935100 + }, + { + "epoch": 3.8097992746634293, + "grad_norm": 5.5246734619140625, + "learning_rate": 0.001890286583334692, + "loss": 7.6263, + "step": 935200 + }, + { + "epoch": 3.810206652686811, + "grad_norm": 11.302197456359863, + "learning_rate": 0.0018898057948981016, + "loss": 7.626, + "step": 935300 + }, + { + "epoch": 3.8106140307101923, + "grad_norm": 10.458709716796875, + "learning_rate": 0.001889325030783428, + "loss": 7.6231, + "step": 935400 + }, + { + "epoch": 3.811021408733574, + "grad_norm": 5.100559711456299, + "learning_rate": 0.0018888442910096797, + "loss": 7.5966, + "step": 935500 + }, + { + "epoch": 3.811428786756955, + "grad_norm": 5.678092956542969, + "learning_rate": 0.0018883635755958617, + "loss": 7.5932, + "step": 935600 + }, + { + "epoch": 3.8118361647803365, + "grad_norm": 10.808277130126953, + "learning_rate": 0.0018878828845609794, + "loss": 7.5963, + "step": 935700 + }, + { + "epoch": 3.812243542803718, + "grad_norm": 6.407227993011475, + "learning_rate": 0.0018874022179240372, + "loss": 7.5891, + "step": 935800 + }, + { + "epoch": 3.8126509208270996, + "grad_norm": 5.734281063079834, + "learning_rate": 0.0018869215757040378, + "loss": 7.6034, + "step": 935900 + }, + { + "epoch": 3.813058298850481, + "grad_norm": 4.35515022277832, + "learning_rate": 0.001886440957919985, + "loss": 7.613, + "step": 936000 + }, + { + "epoch": 3.813058298850481, + "eval_MaskedAccuracy": 0.5086288339335939, + "eval_loss": 1.6063237190246582, + "eval_runtime": 168.5456, + "eval_samples_per_second": 376.61, + "eval_steps_per_second": 1.471, + "step": 936000 + }, + { + "epoch": 3.8134656768738626, + "grad_norm": 3.6834418773651123, + "learning_rate": 0.0018859603645908793, + "loss": 7.6224, + "step": 936100 + }, + { + "epoch": 3.813873054897244, + "grad_norm": 12.3207368850708, + "learning_rate": 0.001885479795735722, + "loss": 7.634, + "step": 936200 + }, + { + "epoch": 3.8142804329206257, + "grad_norm": 10.722724914550781, + "learning_rate": 0.0018849992513735108, + "loss": 7.6177, + "step": 936300 + }, + { + "epoch": 3.8146878109440068, + "grad_norm": 3.2028284072875977, + "learning_rate": 0.001884518731523246, + "loss": 7.6264, + "step": 936400 + }, + { + "epoch": 3.8150951889673883, + "grad_norm": 7.487059116363525, + "learning_rate": 0.0018840382362039253, + "loss": 7.6002, + "step": 936500 + }, + { + "epoch": 3.81550256699077, + "grad_norm": 6.927460193634033, + "learning_rate": 0.0018835577654345444, + "loss": 7.6204, + "step": 936600 + }, + { + "epoch": 3.8159099450141514, + "grad_norm": 5.701291561126709, + "learning_rate": 0.0018830773192340979, + "loss": 7.613, + "step": 936700 + }, + { + "epoch": 3.816317323037533, + "grad_norm": 4.743677139282227, + "learning_rate": 0.00188259689762158, + "loss": 7.6305, + "step": 936800 + }, + { + "epoch": 3.816724701060914, + "grad_norm": 6.083077430725098, + "learning_rate": 0.0018821165006159862, + "loss": 7.6235, + "step": 936900 + }, + { + "epoch": 3.8171320790842955, + "grad_norm": 7.612229347229004, + "learning_rate": 0.0018816361282363078, + "loss": 7.6209, + "step": 937000 + }, + { + "epoch": 3.8171320790842955, + "eval_MaskedAccuracy": 0.5091577492180929, + "eval_loss": 1.593906283378601, + "eval_runtime": 185.9105, + "eval_samples_per_second": 341.433, + "eval_steps_per_second": 1.334, + "step": 937000 + }, + { + "epoch": 3.817539457107677, + "grad_norm": 6.0063796043396, + "learning_rate": 0.001881155780501538, + "loss": 7.5972, + "step": 937100 + }, + { + "epoch": 3.8179468351310586, + "grad_norm": 3.575380802154541, + "learning_rate": 0.0018806754574306674, + "loss": 7.6217, + "step": 937200 + }, + { + "epoch": 3.81835421315444, + "grad_norm": 3.4223945140838623, + "learning_rate": 0.0018801951590426843, + "loss": 7.6167, + "step": 937300 + }, + { + "epoch": 3.8187615911778217, + "grad_norm": 8.924440383911133, + "learning_rate": 0.0018797148853565793, + "loss": 7.6246, + "step": 937400 + }, + { + "epoch": 3.819168969201203, + "grad_norm": 6.881430625915527, + "learning_rate": 0.0018792346363913372, + "loss": 7.6216, + "step": 937500 + }, + { + "epoch": 3.8195763472245847, + "grad_norm": 5.718722343444824, + "learning_rate": 0.001878754412165949, + "loss": 7.5987, + "step": 937600 + }, + { + "epoch": 3.819983725247966, + "grad_norm": 14.568428993225098, + "learning_rate": 0.0018782742126993966, + "loss": 7.6669, + "step": 937700 + }, + { + "epoch": 3.8203911032713473, + "grad_norm": 8.684883117675781, + "learning_rate": 0.0018777940380106677, + "loss": 7.6132, + "step": 937800 + }, + { + "epoch": 3.820798481294729, + "grad_norm": 6.8858561515808105, + "learning_rate": 0.0018773138881187448, + "loss": 7.6092, + "step": 937900 + }, + { + "epoch": 3.8212058593181104, + "grad_norm": 8.121291160583496, + "learning_rate": 0.0018768337630426127, + "loss": 7.5881, + "step": 938000 + }, + { + "epoch": 3.8212058593181104, + "eval_MaskedAccuracy": 0.5095008639566333, + "eval_loss": 1.6043816804885864, + "eval_runtime": 176.2299, + "eval_samples_per_second": 360.189, + "eval_steps_per_second": 1.407, + "step": 938000 + }, + { + "epoch": 3.8216132373414915, + "grad_norm": 5.040372371673584, + "learning_rate": 0.0018763536628012478, + "loss": 7.5973, + "step": 938100 + }, + { + "epoch": 3.822020615364873, + "grad_norm": 4.880376815795898, + "learning_rate": 0.0018758735874136376, + "loss": 7.5983, + "step": 938200 + }, + { + "epoch": 3.8224279933882546, + "grad_norm": 7.10554838180542, + "learning_rate": 0.0018753935368987585, + "loss": 7.617, + "step": 938300 + }, + { + "epoch": 3.822835371411636, + "grad_norm": 4.163276672363281, + "learning_rate": 0.001874913511275591, + "loss": 7.6003, + "step": 938400 + }, + { + "epoch": 3.8232427494350176, + "grad_norm": 10.246548652648926, + "learning_rate": 0.0018744335105631128, + "loss": 7.5941, + "step": 938500 + }, + { + "epoch": 3.823650127458399, + "grad_norm": 5.069756984710693, + "learning_rate": 0.0018739535347802989, + "loss": 7.6244, + "step": 938600 + }, + { + "epoch": 3.8240575054817807, + "grad_norm": 5.792740345001221, + "learning_rate": 0.001873473583946127, + "loss": 7.624, + "step": 938700 + }, + { + "epoch": 3.8244648835051622, + "grad_norm": 3.303527355194092, + "learning_rate": 0.0018729936580795734, + "loss": 7.5887, + "step": 938800 + }, + { + "epoch": 3.8248722615285433, + "grad_norm": 4.251362323760986, + "learning_rate": 0.0018725137571996093, + "loss": 7.6391, + "step": 938900 + }, + { + "epoch": 3.825279639551925, + "grad_norm": 8.193767547607422, + "learning_rate": 0.0018720338813252085, + "loss": 7.6252, + "step": 939000 + }, + { + "epoch": 3.825279639551925, + "eval_MaskedAccuracy": 0.509100322426978, + "eval_loss": 1.6003364324569702, + "eval_runtime": 176.6721, + "eval_samples_per_second": 359.287, + "eval_steps_per_second": 1.404, + "step": 939000 + }, + { + "epoch": 3.8256870175753064, + "grad_norm": 4.346183776855469, + "learning_rate": 0.0018715540304753435, + "loss": 7.6372, + "step": 939100 + }, + { + "epoch": 3.826094395598688, + "grad_norm": 5.751767635345459, + "learning_rate": 0.0018710742046689859, + "loss": 7.6037, + "step": 939200 + }, + { + "epoch": 3.8265017736220694, + "grad_norm": 6.1118669509887695, + "learning_rate": 0.0018705944039251064, + "loss": 7.5879, + "step": 939300 + }, + { + "epoch": 3.8269091516454505, + "grad_norm": 4.788643836975098, + "learning_rate": 0.0018701146282626734, + "loss": 7.6114, + "step": 939400 + }, + { + "epoch": 3.827316529668832, + "grad_norm": 8.829459190368652, + "learning_rate": 0.0018696348777006541, + "loss": 7.6234, + "step": 939500 + }, + { + "epoch": 3.8277239076922136, + "grad_norm": 4.282959938049316, + "learning_rate": 0.0018691551522580156, + "loss": 7.6345, + "step": 939600 + }, + { + "epoch": 3.828131285715595, + "grad_norm": 5.3322367668151855, + "learning_rate": 0.001868675451953727, + "loss": 7.5969, + "step": 939700 + }, + { + "epoch": 3.8285386637389767, + "grad_norm": 10.194397926330566, + "learning_rate": 0.0018681957768067491, + "loss": 7.6247, + "step": 939800 + }, + { + "epoch": 3.828946041762358, + "grad_norm": 7.6213698387146, + "learning_rate": 0.0018677161268360488, + "loss": 7.6347, + "step": 939900 + }, + { + "epoch": 3.8293534197857397, + "grad_norm": 3.7974536418914795, + "learning_rate": 0.0018672365020605875, + "loss": 7.6137, + "step": 940000 + }, + { + "epoch": 3.8293534197857397, + "eval_MaskedAccuracy": 0.5102094498043321, + "eval_loss": 1.6037489175796509, + "eval_runtime": 232.3575, + "eval_samples_per_second": 273.183, + "eval_steps_per_second": 1.067, + "step": 940000 + }, + { + "epoch": 3.8297607978091213, + "grad_norm": 9.21261978149414, + "learning_rate": 0.0018667569024993279, + "loss": 7.6084, + "step": 940100 + }, + { + "epoch": 3.8301681758325024, + "grad_norm": 3.3972275257110596, + "learning_rate": 0.0018662773281712317, + "loss": 7.6157, + "step": 940200 + }, + { + "epoch": 3.830575553855884, + "grad_norm": 9.21441650390625, + "learning_rate": 0.0018657977790952607, + "loss": 7.6034, + "step": 940300 + }, + { + "epoch": 3.8309829318792654, + "grad_norm": 4.649506092071533, + "learning_rate": 0.0018653182552903706, + "loss": 7.6163, + "step": 940400 + }, + { + "epoch": 3.831390309902647, + "grad_norm": 6.620223045349121, + "learning_rate": 0.001864838756775523, + "loss": 7.6052, + "step": 940500 + }, + { + "epoch": 3.831797687926028, + "grad_norm": 11.1571683883667, + "learning_rate": 0.0018643592835696726, + "loss": 7.6362, + "step": 940600 + }, + { + "epoch": 3.8322050659494096, + "grad_norm": 4.182370185852051, + "learning_rate": 0.0018638798356917763, + "loss": 7.6197, + "step": 940700 + }, + { + "epoch": 3.832612443972791, + "grad_norm": 3.2624902725219727, + "learning_rate": 0.0018634004131607887, + "loss": 7.6173, + "step": 940800 + }, + { + "epoch": 3.8330198219961726, + "grad_norm": 6.115269660949707, + "learning_rate": 0.0018629210159956644, + "loss": 7.609, + "step": 940900 + }, + { + "epoch": 3.833427200019554, + "grad_norm": 6.403346061706543, + "learning_rate": 0.0018624416442153534, + "loss": 7.6072, + "step": 941000 + }, + { + "epoch": 3.833427200019554, + "eval_MaskedAccuracy": 0.5094567683345336, + "eval_loss": 1.5979567766189575, + "eval_runtime": 185.0722, + "eval_samples_per_second": 342.98, + "eval_steps_per_second": 1.34, + "step": 941000 + }, + { + "epoch": 3.8338345780429357, + "grad_norm": 7.553493022918701, + "learning_rate": 0.0018619622978388137, + "loss": 7.6242, + "step": 941100 + }, + { + "epoch": 3.8342419560663172, + "grad_norm": 6.307436943054199, + "learning_rate": 0.0018614829768849937, + "loss": 7.6189, + "step": 941200 + }, + { + "epoch": 3.8346493340896988, + "grad_norm": 7.843674659729004, + "learning_rate": 0.001861003681372844, + "loss": 7.5906, + "step": 941300 + }, + { + "epoch": 3.83505671211308, + "grad_norm": 5.269289016723633, + "learning_rate": 0.0018605244113213118, + "loss": 7.6151, + "step": 941400 + }, + { + "epoch": 3.8354640901364614, + "grad_norm": 8.50742244720459, + "learning_rate": 0.0018600451667493468, + "loss": 7.5925, + "step": 941500 + }, + { + "epoch": 3.835871468159843, + "grad_norm": 2.3496034145355225, + "learning_rate": 0.0018595659476758944, + "loss": 7.605, + "step": 941600 + }, + { + "epoch": 3.8362788461832245, + "grad_norm": 7.684658050537109, + "learning_rate": 0.0018590867541199022, + "loss": 7.6516, + "step": 941700 + }, + { + "epoch": 3.836686224206606, + "grad_norm": 9.886786460876465, + "learning_rate": 0.001858607586100315, + "loss": 7.6137, + "step": 941800 + }, + { + "epoch": 3.837093602229987, + "grad_norm": 3.5811352729797363, + "learning_rate": 0.0018581284436360777, + "loss": 7.5893, + "step": 941900 + }, + { + "epoch": 3.8375009802533686, + "grad_norm": 5.257310390472412, + "learning_rate": 0.0018576493267461313, + "loss": 7.5956, + "step": 942000 + }, + { + "epoch": 3.8375009802533686, + "eval_MaskedAccuracy": 0.5094653795671995, + "eval_loss": 1.5970203876495361, + "eval_runtime": 192.4947, + "eval_samples_per_second": 329.755, + "eval_steps_per_second": 1.288, + "step": 942000 + }, + { + "epoch": 3.83790835827675, + "grad_norm": 3.2904205322265625, + "learning_rate": 0.0018571702354494217, + "loss": 7.6574, + "step": 942100 + }, + { + "epoch": 3.8383157363001317, + "grad_norm": 4.732083797454834, + "learning_rate": 0.0018566911697648872, + "loss": 7.6345, + "step": 942200 + }, + { + "epoch": 3.838723114323513, + "grad_norm": 3.3520989418029785, + "learning_rate": 0.0018562121297114693, + "loss": 7.6397, + "step": 942300 + }, + { + "epoch": 3.8391304923468947, + "grad_norm": 8.889657020568848, + "learning_rate": 0.001855733115308104, + "loss": 7.615, + "step": 942400 + }, + { + "epoch": 3.8395378703702763, + "grad_norm": 13.190738677978516, + "learning_rate": 0.0018552541265737315, + "loss": 7.616, + "step": 942500 + }, + { + "epoch": 3.839945248393658, + "grad_norm": 13.407934188842773, + "learning_rate": 0.0018547751635272886, + "loss": 7.6206, + "step": 942600 + }, + { + "epoch": 3.840352626417039, + "grad_norm": 11.850515365600586, + "learning_rate": 0.0018542962261877106, + "loss": 7.6271, + "step": 942700 + }, + { + "epoch": 3.8407600044404204, + "grad_norm": 10.754595756530762, + "learning_rate": 0.0018538173145739353, + "loss": 7.605, + "step": 942800 + }, + { + "epoch": 3.841167382463802, + "grad_norm": 6.437925338745117, + "learning_rate": 0.001853338428704893, + "loss": 7.6474, + "step": 942900 + }, + { + "epoch": 3.8415747604871835, + "grad_norm": 14.888747215270996, + "learning_rate": 0.0018528595685995176, + "loss": 7.6063, + "step": 943000 + }, + { + "epoch": 3.8415747604871835, + "eval_MaskedAccuracy": 0.5082340440738872, + "eval_loss": 1.611876368522644, + "eval_runtime": 187.8381, + "eval_samples_per_second": 337.929, + "eval_steps_per_second": 1.32, + "step": 943000 + }, + { + "epoch": 3.8419821385105646, + "grad_norm": 4.881290435791016, + "learning_rate": 0.0018523807342767406, + "loss": 7.5982, + "step": 943100 + }, + { + "epoch": 3.842389516533946, + "grad_norm": 5.884538173675537, + "learning_rate": 0.0018519019257554944, + "loss": 7.6097, + "step": 943200 + }, + { + "epoch": 3.8427968945573276, + "grad_norm": 9.731324195861816, + "learning_rate": 0.0018514231430547066, + "loss": 7.6141, + "step": 943300 + }, + { + "epoch": 3.843204272580709, + "grad_norm": 6.106199264526367, + "learning_rate": 0.0018509443861933084, + "loss": 7.6104, + "step": 943400 + }, + { + "epoch": 3.8436116506040907, + "grad_norm": 5.420655250549316, + "learning_rate": 0.0018504656551902273, + "loss": 7.6209, + "step": 943500 + }, + { + "epoch": 3.8440190286274722, + "grad_norm": 12.647811889648438, + "learning_rate": 0.0018499869500643897, + "loss": 7.6155, + "step": 943600 + }, + { + "epoch": 3.844426406650854, + "grad_norm": 8.811073303222656, + "learning_rate": 0.001849508270834721, + "loss": 7.6172, + "step": 943700 + }, + { + "epoch": 3.8448337846742353, + "grad_norm": 7.697712421417236, + "learning_rate": 0.0018490296175201456, + "loss": 7.615, + "step": 943800 + }, + { + "epoch": 3.8452411626976164, + "grad_norm": 4.827374458312988, + "learning_rate": 0.0018485509901395893, + "loss": 7.5803, + "step": 943900 + }, + { + "epoch": 3.845648540720998, + "grad_norm": 5.738444805145264, + "learning_rate": 0.0018480723887119724, + "loss": 7.6146, + "step": 944000 + }, + { + "epoch": 3.845648540720998, + "eval_MaskedAccuracy": 0.5092260782136705, + "eval_loss": 1.6010421514511108, + "eval_runtime": 243.1795, + "eval_samples_per_second": 261.025, + "eval_steps_per_second": 1.02, + "step": 944000 + }, + { + "epoch": 3.8460559187443795, + "grad_norm": 4.160953998565674, + "learning_rate": 0.0018475938132562178, + "loss": 7.5895, + "step": 944100 + }, + { + "epoch": 3.846463296767761, + "grad_norm": 3.4514048099517822, + "learning_rate": 0.001847115263791246, + "loss": 7.6234, + "step": 944200 + }, + { + "epoch": 3.8468706747911425, + "grad_norm": 3.569908618927002, + "learning_rate": 0.001846636740335977, + "loss": 7.602, + "step": 944300 + }, + { + "epoch": 3.8472780528145236, + "grad_norm": 6.771123886108398, + "learning_rate": 0.0018461582429093262, + "loss": 7.6438, + "step": 944400 + }, + { + "epoch": 3.847685430837905, + "grad_norm": 9.543065071105957, + "learning_rate": 0.0018456797715302152, + "loss": 7.5958, + "step": 944500 + }, + { + "epoch": 3.8480928088612867, + "grad_norm": 5.088683605194092, + "learning_rate": 0.001845201326217559, + "loss": 7.6053, + "step": 944600 + }, + { + "epoch": 3.848500186884668, + "grad_norm": 10.062211036682129, + "learning_rate": 0.0018447229069902728, + "loss": 7.5971, + "step": 944700 + }, + { + "epoch": 3.8489075649080497, + "grad_norm": 7.096020221710205, + "learning_rate": 0.0018442445138672728, + "loss": 7.5848, + "step": 944800 + }, + { + "epoch": 3.8493149429314313, + "grad_norm": 7.388985633850098, + "learning_rate": 0.0018437661468674706, + "loss": 7.6223, + "step": 944900 + }, + { + "epoch": 3.849722320954813, + "grad_norm": 4.046175956726074, + "learning_rate": 0.001843287806009778, + "loss": 7.6378, + "step": 945000 + }, + { + "epoch": 3.849722320954813, + "eval_MaskedAccuracy": 0.5084699091181181, + "eval_loss": 1.6078386306762695, + "eval_runtime": 164.7419, + "eval_samples_per_second": 385.306, + "eval_steps_per_second": 1.505, + "step": 945000 + }, + { + "epoch": 3.8501296989781943, + "grad_norm": 9.305213928222656, + "learning_rate": 0.0018428094913131078, + "loss": 7.6307, + "step": 945100 + }, + { + "epoch": 3.8505370770015754, + "grad_norm": 2.908653497695923, + "learning_rate": 0.0018423312027963708, + "loss": 7.5821, + "step": 945200 + }, + { + "epoch": 3.850944455024957, + "grad_norm": 5.208691596984863, + "learning_rate": 0.0018418529404784748, + "loss": 7.6321, + "step": 945300 + }, + { + "epoch": 3.8513518330483385, + "grad_norm": 4.687861919403076, + "learning_rate": 0.0018413747043783295, + "loss": 7.6145, + "step": 945400 + }, + { + "epoch": 3.85175921107172, + "grad_norm": 14.521341323852539, + "learning_rate": 0.001840896494514841, + "loss": 7.6239, + "step": 945500 + }, + { + "epoch": 3.852166589095101, + "grad_norm": 6.697803497314453, + "learning_rate": 0.001840418310906916, + "loss": 7.6066, + "step": 945600 + }, + { + "epoch": 3.8525739671184827, + "grad_norm": 6.27771520614624, + "learning_rate": 0.0018399401535734604, + "loss": 7.5982, + "step": 945700 + }, + { + "epoch": 3.852981345141864, + "grad_norm": 6.302993297576904, + "learning_rate": 0.0018394620225333782, + "loss": 7.6163, + "step": 945800 + }, + { + "epoch": 3.8533887231652457, + "grad_norm": 3.5316030979156494, + "learning_rate": 0.0018389839178055702, + "loss": 7.6322, + "step": 945900 + }, + { + "epoch": 3.8537961011886273, + "grad_norm": 6.291867733001709, + "learning_rate": 0.0018385058394089402, + "loss": 7.6303, + "step": 946000 + }, + { + "epoch": 3.8537961011886273, + "eval_MaskedAccuracy": 0.5088446917423474, + "eval_loss": 1.6043281555175781, + "eval_runtime": 246.4298, + "eval_samples_per_second": 257.582, + "eval_steps_per_second": 1.006, + "step": 946000 + }, + { + "epoch": 3.854203479212009, + "grad_norm": 13.41540813446045, + "learning_rate": 0.001838027787362389, + "loss": 7.6073, + "step": 946100 + }, + { + "epoch": 3.8546108572353903, + "grad_norm": 3.8155667781829834, + "learning_rate": 0.001837549761684816, + "loss": 7.6011, + "step": 946200 + }, + { + "epoch": 3.855018235258772, + "grad_norm": 5.04111385345459, + "learning_rate": 0.0018370717623951212, + "loss": 7.5932, + "step": 946300 + }, + { + "epoch": 3.855425613282153, + "grad_norm": 9.936125755310059, + "learning_rate": 0.001836593789512203, + "loss": 7.6087, + "step": 946400 + }, + { + "epoch": 3.8558329913055345, + "grad_norm": 9.701949119567871, + "learning_rate": 0.0018361158430549548, + "loss": 7.6082, + "step": 946500 + }, + { + "epoch": 3.856240369328916, + "grad_norm": 6.736760139465332, + "learning_rate": 0.0018356379230422763, + "loss": 7.6148, + "step": 946600 + }, + { + "epoch": 3.8566477473522975, + "grad_norm": 4.913790225982666, + "learning_rate": 0.0018351600294930627, + "loss": 7.5924, + "step": 946700 + }, + { + "epoch": 3.857055125375679, + "grad_norm": 4.850794792175293, + "learning_rate": 0.001834682162426204, + "loss": 7.5985, + "step": 946800 + }, + { + "epoch": 3.85746250339906, + "grad_norm": 3.34824800491333, + "learning_rate": 0.0018342043218605947, + "loss": 7.6191, + "step": 946900 + }, + { + "epoch": 3.8578698814224417, + "grad_norm": 6.440149307250977, + "learning_rate": 0.0018337265078151254, + "loss": 7.6166, + "step": 947000 + }, + { + "epoch": 3.8578698814224417, + "eval_MaskedAccuracy": 0.5092632863592347, + "eval_loss": 1.6087101697921753, + "eval_runtime": 231.7468, + "eval_samples_per_second": 273.902, + "eval_steps_per_second": 1.07, + "step": 947000 + }, + { + "epoch": 3.8582772594458232, + "grad_norm": 4.203551769256592, + "learning_rate": 0.0018332487203086883, + "loss": 7.6093, + "step": 947100 + }, + { + "epoch": 3.8586846374692048, + "grad_norm": 10.298797607421875, + "learning_rate": 0.0018327709593601706, + "loss": 7.5878, + "step": 947200 + }, + { + "epoch": 3.8590920154925863, + "grad_norm": 9.265373229980469, + "learning_rate": 0.0018322932249884632, + "loss": 7.6051, + "step": 947300 + }, + { + "epoch": 3.859499393515968, + "grad_norm": 5.258175849914551, + "learning_rate": 0.0018318155172124529, + "loss": 7.5989, + "step": 947400 + }, + { + "epoch": 3.8599067715393494, + "grad_norm": 4.714539527893066, + "learning_rate": 0.0018313378360510244, + "loss": 7.6084, + "step": 947500 + }, + { + "epoch": 3.860314149562731, + "grad_norm": 5.320790767669678, + "learning_rate": 0.0018308601815230647, + "loss": 7.5995, + "step": 947600 + }, + { + "epoch": 3.860721527586112, + "grad_norm": 2.9611928462982178, + "learning_rate": 0.001830382553647459, + "loss": 7.6135, + "step": 947700 + }, + { + "epoch": 3.8611289056094935, + "grad_norm": 6.9255828857421875, + "learning_rate": 0.0018299049524430893, + "loss": 7.6082, + "step": 947800 + }, + { + "epoch": 3.861536283632875, + "grad_norm": 3.260333776473999, + "learning_rate": 0.001829427377928837, + "loss": 7.5901, + "step": 947900 + }, + { + "epoch": 3.8619436616562566, + "grad_norm": 3.394360065460205, + "learning_rate": 0.0018289498301235828, + "loss": 7.6032, + "step": 948000 + }, + { + "epoch": 3.8619436616562566, + "eval_MaskedAccuracy": 0.5090682639138114, + "eval_loss": 1.602862000465393, + "eval_runtime": 208.2255, + "eval_samples_per_second": 304.843, + "eval_steps_per_second": 1.191, + "step": 948000 + }, + { + "epoch": 3.8623510396796377, + "grad_norm": 8.246703147888184, + "learning_rate": 0.001828472309046208, + "loss": 7.6268, + "step": 948100 + }, + { + "epoch": 3.862758417703019, + "grad_norm": 6.750222206115723, + "learning_rate": 0.0018279948147155913, + "loss": 7.5911, + "step": 948200 + }, + { + "epoch": 3.8631657957264007, + "grad_norm": 11.483348846435547, + "learning_rate": 0.0018275173471506106, + "loss": 7.6327, + "step": 948300 + }, + { + "epoch": 3.8635731737497823, + "grad_norm": 8.000398635864258, + "learning_rate": 0.0018270399063701423, + "loss": 7.6494, + "step": 948400 + }, + { + "epoch": 3.863980551773164, + "grad_norm": 4.537847995758057, + "learning_rate": 0.0018265624923930628, + "loss": 7.6297, + "step": 948500 + }, + { + "epoch": 3.8643879297965453, + "grad_norm": 3.8453502655029297, + "learning_rate": 0.0018260851052382477, + "loss": 7.5948, + "step": 948600 + }, + { + "epoch": 3.864795307819927, + "grad_norm": 4.933693885803223, + "learning_rate": 0.0018256077449245681, + "loss": 7.5849, + "step": 948700 + }, + { + "epoch": 3.8652026858433084, + "grad_norm": 7.7320780754089355, + "learning_rate": 0.0018251304114708976, + "loss": 7.6149, + "step": 948800 + }, + { + "epoch": 3.8656100638666895, + "grad_norm": 11.603869438171387, + "learning_rate": 0.0018246531048961095, + "loss": 7.5858, + "step": 948900 + }, + { + "epoch": 3.866017441890071, + "grad_norm": 4.933516025543213, + "learning_rate": 0.0018241758252190703, + "loss": 7.6174, + "step": 949000 + }, + { + "epoch": 3.866017441890071, + "eval_MaskedAccuracy": 0.5089156285212543, + "eval_loss": 1.6060400009155273, + "eval_runtime": 167.3093, + "eval_samples_per_second": 379.393, + "eval_steps_per_second": 1.482, + "step": 949000 + }, + { + "epoch": 3.8664248199134525, + "grad_norm": 11.33521842956543, + "learning_rate": 0.0018236985724586544, + "loss": 7.6226, + "step": 949100 + }, + { + "epoch": 3.866832197936834, + "grad_norm": 4.204806804656982, + "learning_rate": 0.001823221346633727, + "loss": 7.627, + "step": 949200 + }, + { + "epoch": 3.8672395759602156, + "grad_norm": 4.625539302825928, + "learning_rate": 0.0018227441477631562, + "loss": 7.6229, + "step": 949300 + }, + { + "epoch": 3.8676469539835967, + "grad_norm": 4.905984878540039, + "learning_rate": 0.0018222669758658076, + "loss": 7.6129, + "step": 949400 + }, + { + "epoch": 3.8680543320069782, + "grad_norm": 13.30921459197998, + "learning_rate": 0.0018217898309605485, + "loss": 7.6132, + "step": 949500 + }, + { + "epoch": 3.8684617100303598, + "grad_norm": 4.7313666343688965, + "learning_rate": 0.0018213127130662413, + "loss": 7.6394, + "step": 949600 + }, + { + "epoch": 3.8688690880537413, + "grad_norm": 5.9433088302612305, + "learning_rate": 0.0018208356222017474, + "loss": 7.6041, + "step": 949700 + }, + { + "epoch": 3.869276466077123, + "grad_norm": 8.721516609191895, + "learning_rate": 0.0018203585583859312, + "loss": 7.6002, + "step": 949800 + }, + { + "epoch": 3.8696838441005044, + "grad_norm": 14.288046836853027, + "learning_rate": 0.0018198815216376514, + "loss": 7.6247, + "step": 949900 + }, + { + "epoch": 3.870091222123886, + "grad_norm": 4.396093368530273, + "learning_rate": 0.0018194045119757716, + "loss": 7.6175, + "step": 950000 + }, + { + "epoch": 3.870091222123886, + "eval_MaskedAccuracy": 0.5089442366642627, + "eval_loss": 1.6116803884506226, + "eval_runtime": 178.9291, + "eval_samples_per_second": 354.755, + "eval_steps_per_second": 1.386, + "step": 950000 + }, + { + "epoch": 3.8704986001472674, + "grad_norm": 5.026941776275635, + "learning_rate": 0.001818927529419148, + "loss": 7.6204, + "step": 950100 + }, + { + "epoch": 3.8709059781706485, + "grad_norm": 10.700847625732422, + "learning_rate": 0.0018184505739866378, + "loss": 7.6121, + "step": 950200 + }, + { + "epoch": 3.87131335619403, + "grad_norm": 3.735995292663574, + "learning_rate": 0.001817973645697099, + "loss": 7.6079, + "step": 950300 + }, + { + "epoch": 3.8717207342174116, + "grad_norm": 5.800289154052734, + "learning_rate": 0.001817496744569386, + "loss": 7.6376, + "step": 950400 + }, + { + "epoch": 3.872128112240793, + "grad_norm": 5.358453750610352, + "learning_rate": 0.0018170198706223535, + "loss": 7.6084, + "step": 950500 + }, + { + "epoch": 3.872535490264174, + "grad_norm": 3.3876280784606934, + "learning_rate": 0.0018165430238748564, + "loss": 7.5943, + "step": 950600 + }, + { + "epoch": 3.8729428682875557, + "grad_norm": 5.961701393127441, + "learning_rate": 0.0018160662043457461, + "loss": 7.5772, + "step": 950700 + }, + { + "epoch": 3.8733502463109373, + "grad_norm": 4.473485469818115, + "learning_rate": 0.001815589412053871, + "loss": 7.6057, + "step": 950800 + }, + { + "epoch": 3.873757624334319, + "grad_norm": 4.792442798614502, + "learning_rate": 0.0018151126470180847, + "loss": 7.6274, + "step": 950900 + }, + { + "epoch": 3.8741650023577003, + "grad_norm": 3.3934457302093506, + "learning_rate": 0.0018146359092572346, + "loss": 7.6034, + "step": 951000 + }, + { + "epoch": 3.8741650023577003, + "eval_MaskedAccuracy": 0.5088081165592627, + "eval_loss": 1.6002564430236816, + "eval_runtime": 177.1429, + "eval_samples_per_second": 358.332, + "eval_steps_per_second": 1.4, + "step": 951000 + }, + { + "epoch": 3.874572380381082, + "grad_norm": 7.944741249084473, + "learning_rate": 0.0018141591987901704, + "loss": 7.5975, + "step": 951100 + }, + { + "epoch": 3.8749797584044634, + "grad_norm": 4.314601421356201, + "learning_rate": 0.001813682515635739, + "loss": 7.5904, + "step": 951200 + }, + { + "epoch": 3.875387136427845, + "grad_norm": 5.166213035583496, + "learning_rate": 0.0018132058598127844, + "loss": 7.6441, + "step": 951300 + }, + { + "epoch": 3.875794514451226, + "grad_norm": 13.966766357421875, + "learning_rate": 0.0018127292313401516, + "loss": 7.6071, + "step": 951400 + }, + { + "epoch": 3.8762018924746076, + "grad_norm": 6.3037285804748535, + "learning_rate": 0.0018122526302366841, + "loss": 7.6163, + "step": 951500 + }, + { + "epoch": 3.876609270497989, + "grad_norm": 8.044075012207031, + "learning_rate": 0.0018117760565212254, + "loss": 7.6163, + "step": 951600 + }, + { + "epoch": 3.8770166485213706, + "grad_norm": 5.0798773765563965, + "learning_rate": 0.0018112995102126184, + "loss": 7.6001, + "step": 951700 + }, + { + "epoch": 3.877424026544752, + "grad_norm": 4.548254489898682, + "learning_rate": 0.0018108229913297, + "loss": 7.6032, + "step": 951800 + }, + { + "epoch": 3.8778314045681332, + "grad_norm": 6.899433612823486, + "learning_rate": 0.0018103464998913126, + "loss": 7.6263, + "step": 951900 + }, + { + "epoch": 3.8782387825915148, + "grad_norm": 5.320175647735596, + "learning_rate": 0.0018098700359162933, + "loss": 7.6256, + "step": 952000 + }, + { + "epoch": 3.8782387825915148, + "eval_MaskedAccuracy": 0.5090562569753042, + "eval_loss": 1.6056095361709595, + "eval_runtime": 163.5442, + "eval_samples_per_second": 388.127, + "eval_steps_per_second": 1.516, + "step": 952000 + }, + { + "epoch": 3.8786461606148963, + "grad_norm": 4.4534592628479, + "learning_rate": 0.0018093935994234792, + "loss": 7.6215, + "step": 952100 + }, + { + "epoch": 3.879053538638278, + "grad_norm": 3.465465545654297, + "learning_rate": 0.0018089171904317083, + "loss": 7.6213, + "step": 952200 + }, + { + "epoch": 3.8794609166616594, + "grad_norm": 5.833391189575195, + "learning_rate": 0.0018084408089598103, + "loss": 7.6305, + "step": 952300 + }, + { + "epoch": 3.879868294685041, + "grad_norm": 11.478015899658203, + "learning_rate": 0.0018079644550266227, + "loss": 7.6171, + "step": 952400 + }, + { + "epoch": 3.8802756727084224, + "grad_norm": 8.241008758544922, + "learning_rate": 0.00180748812865098, + "loss": 7.6236, + "step": 952500 + }, + { + "epoch": 3.880683050731804, + "grad_norm": 4.015230655670166, + "learning_rate": 0.0018070118298517106, + "loss": 7.6139, + "step": 952600 + }, + { + "epoch": 3.881090428755185, + "grad_norm": 9.021292686462402, + "learning_rate": 0.0018065355586476476, + "loss": 7.6198, + "step": 952700 + }, + { + "epoch": 3.8814978067785666, + "grad_norm": 4.2154107093811035, + "learning_rate": 0.0018060593150576196, + "loss": 7.6215, + "step": 952800 + }, + { + "epoch": 3.881905184801948, + "grad_norm": 9.103936195373535, + "learning_rate": 0.0018055830991004554, + "loss": 7.5944, + "step": 952900 + }, + { + "epoch": 3.8823125628253297, + "grad_norm": 3.8361706733703613, + "learning_rate": 0.0018051069107949814, + "loss": 7.6266, + "step": 953000 + }, + { + "epoch": 3.8823125628253297, + "eval_MaskedAccuracy": 0.5090985913748143, + "eval_loss": 1.6088173389434814, + "eval_runtime": 171.0992, + "eval_samples_per_second": 370.989, + "eval_steps_per_second": 1.449, + "step": 953000 + }, + { + "epoch": 3.8827199408487107, + "grad_norm": 5.715306758880615, + "learning_rate": 0.0018046307501600268, + "loss": 7.6275, + "step": 953100 + }, + { + "epoch": 3.8831273188720923, + "grad_norm": 9.456602096557617, + "learning_rate": 0.0018041546172144127, + "loss": 7.6222, + "step": 953200 + }, + { + "epoch": 3.883534696895474, + "grad_norm": 5.366934776306152, + "learning_rate": 0.0018036785119769636, + "loss": 7.6055, + "step": 953300 + }, + { + "epoch": 3.8839420749188553, + "grad_norm": 8.507433891296387, + "learning_rate": 0.0018032024344665062, + "loss": 7.619, + "step": 953400 + }, + { + "epoch": 3.884349452942237, + "grad_norm": 4.057890892028809, + "learning_rate": 0.0018027263847018601, + "loss": 7.6024, + "step": 953500 + }, + { + "epoch": 3.8847568309656184, + "grad_norm": 4.30263614654541, + "learning_rate": 0.0018022503627018458, + "loss": 7.596, + "step": 953600 + }, + { + "epoch": 3.885164208989, + "grad_norm": 6.657095909118652, + "learning_rate": 0.001801774368485282, + "loss": 7.6062, + "step": 953700 + }, + { + "epoch": 3.8855715870123815, + "grad_norm": 7.9841718673706055, + "learning_rate": 0.0018012984020709894, + "loss": 7.6091, + "step": 953800 + }, + { + "epoch": 3.8859789650357626, + "grad_norm": 6.192052841186523, + "learning_rate": 0.0018008224634777819, + "loss": 7.5954, + "step": 953900 + }, + { + "epoch": 3.886386343059144, + "grad_norm": 4.977832317352295, + "learning_rate": 0.0018003465527244812, + "loss": 7.6245, + "step": 954000 + }, + { + "epoch": 3.886386343059144, + "eval_MaskedAccuracy": 0.5091509910645146, + "eval_loss": 1.6058160066604614, + "eval_runtime": 242.3496, + "eval_samples_per_second": 261.919, + "eval_steps_per_second": 1.023, + "step": 954000 + }, + { + "epoch": 3.8867937210825256, + "grad_norm": 4.110084056854248, + "learning_rate": 0.0017998706698299, + "loss": 7.631, + "step": 954100 + }, + { + "epoch": 3.887201099105907, + "grad_norm": 8.557034492492676, + "learning_rate": 0.0017993948148128517, + "loss": 7.6177, + "step": 954200 + }, + { + "epoch": 3.8876084771292887, + "grad_norm": 4.660292625427246, + "learning_rate": 0.0017989189876921508, + "loss": 7.6417, + "step": 954300 + }, + { + "epoch": 3.88801585515267, + "grad_norm": 3.670198917388916, + "learning_rate": 0.0017984431884866085, + "loss": 7.6058, + "step": 954400 + }, + { + "epoch": 3.8884232331760513, + "grad_norm": 6.274688243865967, + "learning_rate": 0.0017979674172150357, + "loss": 7.5926, + "step": 954500 + }, + { + "epoch": 3.888830611199433, + "grad_norm": 4.719417572021484, + "learning_rate": 0.0017974916738962435, + "loss": 7.6303, + "step": 954600 + }, + { + "epoch": 3.8892379892228144, + "grad_norm": 6.074313640594482, + "learning_rate": 0.0017970159585490387, + "loss": 7.5992, + "step": 954700 + }, + { + "epoch": 3.889645367246196, + "grad_norm": 8.574957847595215, + "learning_rate": 0.0017965402711922296, + "loss": 7.6452, + "step": 954800 + }, + { + "epoch": 3.8900527452695774, + "grad_norm": 6.722024440765381, + "learning_rate": 0.0017960646118446234, + "loss": 7.5882, + "step": 954900 + }, + { + "epoch": 3.890460123292959, + "grad_norm": 6.899868488311768, + "learning_rate": 0.0017955889805250235, + "loss": 7.6136, + "step": 955000 + }, + { + "epoch": 3.890460123292959, + "eval_MaskedAccuracy": 0.509182417232623, + "eval_loss": 1.6039791107177734, + "eval_runtime": 172.1183, + "eval_samples_per_second": 368.793, + "eval_steps_per_second": 1.441, + "step": 955000 + }, + { + "epoch": 3.8908675013163405, + "grad_norm": 15.284417152404785, + "learning_rate": 0.0017951133772522363, + "loss": 7.6104, + "step": 955100 + }, + { + "epoch": 3.8912748793397216, + "grad_norm": 6.407052993774414, + "learning_rate": 0.0017946378020450644, + "loss": 7.6112, + "step": 955200 + }, + { + "epoch": 3.891682257363103, + "grad_norm": 9.482291221618652, + "learning_rate": 0.0017941622549223083, + "loss": 7.6071, + "step": 955300 + }, + { + "epoch": 3.8920896353864847, + "grad_norm": 8.136209487915039, + "learning_rate": 0.00179368673590277, + "loss": 7.5947, + "step": 955400 + }, + { + "epoch": 3.892497013409866, + "grad_norm": 4.7726593017578125, + "learning_rate": 0.0017932112450052492, + "loss": 7.5978, + "step": 955500 + }, + { + "epoch": 3.8929043914332473, + "grad_norm": 8.94983196258545, + "learning_rate": 0.0017927357822485454, + "loss": 7.595, + "step": 955600 + }, + { + "epoch": 3.893311769456629, + "grad_norm": 3.8978352546691895, + "learning_rate": 0.0017922603476514564, + "loss": 7.6174, + "step": 955700 + }, + { + "epoch": 3.8937191474800104, + "grad_norm": 3.3412771224975586, + "learning_rate": 0.001791784941232777, + "loss": 7.5995, + "step": 955800 + }, + { + "epoch": 3.894126525503392, + "grad_norm": 7.2563276290893555, + "learning_rate": 0.001791309563011302, + "loss": 7.6281, + "step": 955900 + }, + { + "epoch": 3.8945339035267734, + "grad_norm": 5.581161975860596, + "learning_rate": 0.0017908342130058292, + "loss": 7.6131, + "step": 956000 + }, + { + "epoch": 3.8945339035267734, + "eval_MaskedAccuracy": 0.5084608431798913, + "eval_loss": 1.6025763750076294, + "eval_runtime": 175.0895, + "eval_samples_per_second": 362.535, + "eval_steps_per_second": 1.416, + "step": 956000 + }, + { + "epoch": 3.894941281550155, + "grad_norm": 13.122600555419922, + "learning_rate": 0.0017903588912351505, + "loss": 7.6163, + "step": 956100 + }, + { + "epoch": 3.8953486595735365, + "grad_norm": 6.420519828796387, + "learning_rate": 0.0017898835977180553, + "loss": 7.6012, + "step": 956200 + }, + { + "epoch": 3.895756037596918, + "grad_norm": 4.720472812652588, + "learning_rate": 0.0017894083324733357, + "loss": 7.5975, + "step": 956300 + }, + { + "epoch": 3.896163415620299, + "grad_norm": 3.968144655227661, + "learning_rate": 0.001788933095519783, + "loss": 7.6479, + "step": 956400 + }, + { + "epoch": 3.8965707936436806, + "grad_norm": 7.333112716674805, + "learning_rate": 0.0017884578868761845, + "loss": 7.617, + "step": 956500 + }, + { + "epoch": 3.896978171667062, + "grad_norm": 6.139729976654053, + "learning_rate": 0.0017879827065613268, + "loss": 7.616, + "step": 956600 + }, + { + "epoch": 3.8973855496904437, + "grad_norm": 5.550684452056885, + "learning_rate": 0.0017875075545939997, + "loss": 7.614, + "step": 956700 + }, + { + "epoch": 3.8977929277138252, + "grad_norm": 5.037663459777832, + "learning_rate": 0.0017870324309929861, + "loss": 7.6058, + "step": 956800 + }, + { + "epoch": 3.8982003057372063, + "grad_norm": 5.413069725036621, + "learning_rate": 0.0017865573357770685, + "loss": 7.6169, + "step": 956900 + }, + { + "epoch": 3.898607683760588, + "grad_norm": 7.729787826538086, + "learning_rate": 0.0017860822689650326, + "loss": 7.6048, + "step": 957000 + }, + { + "epoch": 3.898607683760588, + "eval_MaskedAccuracy": 0.5091071710516075, + "eval_loss": 1.5998338460922241, + "eval_runtime": 188.7024, + "eval_samples_per_second": 336.382, + "eval_steps_per_second": 1.314, + "step": 957000 + }, + { + "epoch": 3.8990150617839694, + "grad_norm": 6.152901649475098, + "learning_rate": 0.0017856072305756593, + "loss": 7.582, + "step": 957100 + }, + { + "epoch": 3.899422439807351, + "grad_norm": 3.584260940551758, + "learning_rate": 0.001785132220627732, + "loss": 7.6105, + "step": 957200 + }, + { + "epoch": 3.8998298178307325, + "grad_norm": 9.127362251281738, + "learning_rate": 0.001784657239140025, + "loss": 7.6376, + "step": 957300 + }, + { + "epoch": 3.900237195854114, + "grad_norm": 6.2226176261901855, + "learning_rate": 0.0017841822861313214, + "loss": 7.633, + "step": 957400 + }, + { + "epoch": 3.9006445738774955, + "grad_norm": 10.394842147827148, + "learning_rate": 0.0017837073616203969, + "loss": 7.6098, + "step": 957500 + }, + { + "epoch": 3.9010519519008766, + "grad_norm": 4.542253494262695, + "learning_rate": 0.001783232465626029, + "loss": 7.6327, + "step": 957600 + }, + { + "epoch": 3.901459329924258, + "grad_norm": 3.889946937561035, + "learning_rate": 0.001782757598166993, + "loss": 7.6256, + "step": 957700 + }, + { + "epoch": 3.9018667079476397, + "grad_norm": 4.671126365661621, + "learning_rate": 0.0017822827592620604, + "loss": 7.5918, + "step": 957800 + }, + { + "epoch": 3.902274085971021, + "grad_norm": 10.369836807250977, + "learning_rate": 0.001781807948930006, + "loss": 7.6108, + "step": 957900 + }, + { + "epoch": 3.9026814639944027, + "grad_norm": 5.292967319488525, + "learning_rate": 0.0017813331671896024, + "loss": 7.632, + "step": 958000 + }, + { + "epoch": 3.9026814639944027, + "eval_MaskedAccuracy": 0.5089657313638909, + "eval_loss": 1.600001573562622, + "eval_runtime": 191.4429, + "eval_samples_per_second": 331.566, + "eval_steps_per_second": 1.295, + "step": 958000 + }, + { + "epoch": 3.903088842017784, + "grad_norm": 12.011616706848145, + "learning_rate": 0.0017808584140596186, + "loss": 7.6033, + "step": 958100 + }, + { + "epoch": 3.9034962200411654, + "grad_norm": 6.7674560546875, + "learning_rate": 0.0017803836895588245, + "loss": 7.608, + "step": 958200 + }, + { + "epoch": 3.903903598064547, + "grad_norm": 3.2566893100738525, + "learning_rate": 0.0017799089937059888, + "loss": 7.5775, + "step": 958300 + }, + { + "epoch": 3.9043109760879284, + "grad_norm": 8.980989456176758, + "learning_rate": 0.0017794343265198785, + "loss": 7.628, + "step": 958400 + }, + { + "epoch": 3.90471835411131, + "grad_norm": 5.547982692718506, + "learning_rate": 0.001778959688019261, + "loss": 7.6237, + "step": 958500 + }, + { + "epoch": 3.9051257321346915, + "grad_norm": 5.356801509857178, + "learning_rate": 0.0017784850782228997, + "loss": 7.6141, + "step": 958600 + }, + { + "epoch": 3.905533110158073, + "grad_norm": 5.821535110473633, + "learning_rate": 0.0017780104971495583, + "loss": 7.6203, + "step": 958700 + }, + { + "epoch": 3.9059404881814546, + "grad_norm": 3.865333080291748, + "learning_rate": 0.001777535944818001, + "loss": 7.5888, + "step": 958800 + }, + { + "epoch": 3.9063478662048357, + "grad_norm": 4.598865985870361, + "learning_rate": 0.001777061421246987, + "loss": 7.6083, + "step": 958900 + }, + { + "epoch": 3.906755244228217, + "grad_norm": 2.5107874870300293, + "learning_rate": 0.0017765869264552787, + "loss": 7.6133, + "step": 959000 + }, + { + "epoch": 3.906755244228217, + "eval_MaskedAccuracy": 0.5089750421296063, + "eval_loss": 1.6102250814437866, + "eval_runtime": 180.9229, + "eval_samples_per_second": 350.846, + "eval_steps_per_second": 1.371, + "step": 959000 + }, + { + "epoch": 3.9071626222515987, + "grad_norm": 6.061223030090332, + "learning_rate": 0.0017761124604616337, + "loss": 7.6094, + "step": 959100 + }, + { + "epoch": 3.9075700002749802, + "grad_norm": 6.3825507164001465, + "learning_rate": 0.001775638023284814, + "loss": 7.617, + "step": 959200 + }, + { + "epoch": 3.907977378298362, + "grad_norm": 4.168699741363525, + "learning_rate": 0.0017751636149435756, + "loss": 7.5893, + "step": 959300 + }, + { + "epoch": 3.908384756321743, + "grad_norm": 6.144078731536865, + "learning_rate": 0.001774689235456671, + "loss": 7.614, + "step": 959400 + }, + { + "epoch": 3.9087921343451244, + "grad_norm": 4.921387195587158, + "learning_rate": 0.0017742148848428558, + "loss": 7.5867, + "step": 959500 + }, + { + "epoch": 3.909199512368506, + "grad_norm": 4.409626007080078, + "learning_rate": 0.0017737405631208852, + "loss": 7.6084, + "step": 959600 + }, + { + "epoch": 3.9096068903918875, + "grad_norm": 11.31052303314209, + "learning_rate": 0.0017732662703095119, + "loss": 7.6115, + "step": 959700 + }, + { + "epoch": 3.910014268415269, + "grad_norm": 7.96267032623291, + "learning_rate": 0.0017727920064274866, + "loss": 7.5897, + "step": 959800 + }, + { + "epoch": 3.9104216464386505, + "grad_norm": 7.232409477233887, + "learning_rate": 0.0017723177714935585, + "loss": 7.615, + "step": 959900 + }, + { + "epoch": 3.910829024462032, + "grad_norm": 8.711453437805176, + "learning_rate": 0.0017718435655264782, + "loss": 7.6072, + "step": 960000 + }, + { + "epoch": 3.910829024462032, + "eval_MaskedAccuracy": 0.5096968113028026, + "eval_loss": 1.5956754684448242, + "eval_runtime": 186.8973, + "eval_samples_per_second": 339.63, + "eval_steps_per_second": 1.327, + "step": 960000 + }, + { + "epoch": 3.911236402485413, + "grad_norm": 5.794650554656982, + "learning_rate": 0.0017713693885449923, + "loss": 7.6014, + "step": 960100 + }, + { + "epoch": 3.9116437805087947, + "grad_norm": 6.808177947998047, + "learning_rate": 0.0017708952405678495, + "loss": 7.6244, + "step": 960200 + }, + { + "epoch": 3.912051158532176, + "grad_norm": 6.023766994476318, + "learning_rate": 0.0017704211216137936, + "loss": 7.6023, + "step": 960300 + }, + { + "epoch": 3.9124585365555578, + "grad_norm": 15.710735321044922, + "learning_rate": 0.001769947031701568, + "loss": 7.6043, + "step": 960400 + }, + { + "epoch": 3.9128659145789393, + "grad_norm": 9.586337089538574, + "learning_rate": 0.0017694729708499183, + "loss": 7.6036, + "step": 960500 + }, + { + "epoch": 3.9132732926023204, + "grad_norm": 6.30381441116333, + "learning_rate": 0.0017689989390775856, + "loss": 7.5745, + "step": 960600 + }, + { + "epoch": 3.913680670625702, + "grad_norm": 6.227634906768799, + "learning_rate": 0.001768524936403311, + "loss": 7.6371, + "step": 960700 + }, + { + "epoch": 3.9140880486490834, + "grad_norm": 8.263335227966309, + "learning_rate": 0.001768050962845835, + "loss": 7.5995, + "step": 960800 + }, + { + "epoch": 3.914495426672465, + "grad_norm": 9.07831859588623, + "learning_rate": 0.0017675770184238972, + "loss": 7.6242, + "step": 960900 + }, + { + "epoch": 3.9149028046958465, + "grad_norm": 7.0234293937683105, + "learning_rate": 0.0017671031031562317, + "loss": 7.6544, + "step": 961000 + }, + { + "epoch": 3.9149028046958465, + "eval_MaskedAccuracy": 0.5089093694398151, + "eval_loss": 1.6114834547042847, + "eval_runtime": 184.1641, + "eval_samples_per_second": 344.671, + "eval_steps_per_second": 1.347, + "step": 961000 + }, + { + "epoch": 3.915310182719228, + "grad_norm": 9.08312702178955, + "learning_rate": 0.0017666292170615795, + "loss": 7.6247, + "step": 961100 + }, + { + "epoch": 3.9157175607426096, + "grad_norm": 4.557876110076904, + "learning_rate": 0.0017661553601586717, + "loss": 7.5911, + "step": 961200 + }, + { + "epoch": 3.916124938765991, + "grad_norm": 7.523808479309082, + "learning_rate": 0.0017656815324662454, + "loss": 7.6428, + "step": 961300 + }, + { + "epoch": 3.916532316789372, + "grad_norm": 6.667905330657959, + "learning_rate": 0.001765207734003032, + "loss": 7.6174, + "step": 961400 + }, + { + "epoch": 3.9169396948127537, + "grad_norm": 8.559423446655273, + "learning_rate": 0.0017647339647877628, + "loss": 7.6077, + "step": 961500 + }, + { + "epoch": 3.9173470728361353, + "grad_norm": 6.447027683258057, + "learning_rate": 0.0017642602248391698, + "loss": 7.6136, + "step": 961600 + }, + { + "epoch": 3.917754450859517, + "grad_norm": 7.463019847869873, + "learning_rate": 0.001763786514175982, + "loss": 7.6081, + "step": 961700 + }, + { + "epoch": 3.9181618288828983, + "grad_norm": 3.941450595855713, + "learning_rate": 0.0017633128328169271, + "loss": 7.5928, + "step": 961800 + }, + { + "epoch": 3.9185692069062794, + "grad_norm": 10.279614448547363, + "learning_rate": 0.0017628391807807317, + "loss": 7.6013, + "step": 961900 + }, + { + "epoch": 3.918976584929661, + "grad_norm": 5.762928009033203, + "learning_rate": 0.0017623655580861245, + "loss": 7.5891, + "step": 962000 + }, + { + "epoch": 3.918976584929661, + "eval_MaskedAccuracy": 0.509734933997005, + "eval_loss": 1.6062231063842773, + "eval_runtime": 253.4031, + "eval_samples_per_second": 250.494, + "eval_steps_per_second": 0.979, + "step": 962000 + }, + { + "epoch": 3.9193839629530425, + "grad_norm": 7.73829984664917, + "learning_rate": 0.0017618919647518288, + "loss": 7.6409, + "step": 962100 + }, + { + "epoch": 3.919791340976424, + "grad_norm": 3.7185044288635254, + "learning_rate": 0.001761418400796568, + "loss": 7.6207, + "step": 962200 + }, + { + "epoch": 3.9201987189998055, + "grad_norm": 9.785548210144043, + "learning_rate": 0.0017609448662390655, + "loss": 7.5845, + "step": 962300 + }, + { + "epoch": 3.920606097023187, + "grad_norm": 6.399051189422607, + "learning_rate": 0.0017604713610980382, + "loss": 7.6083, + "step": 962400 + }, + { + "epoch": 3.9210134750465686, + "grad_norm": 8.724244117736816, + "learning_rate": 0.0017599978853922115, + "loss": 7.6192, + "step": 962500 + }, + { + "epoch": 3.9214208530699497, + "grad_norm": 7.021622180938721, + "learning_rate": 0.0017595244391403057, + "loss": 7.5972, + "step": 962600 + }, + { + "epoch": 3.9218282310933312, + "grad_norm": 11.517878532409668, + "learning_rate": 0.0017590510223610327, + "loss": 7.5885, + "step": 962700 + }, + { + "epoch": 3.9222356091167128, + "grad_norm": 8.714223861694336, + "learning_rate": 0.0017585776350731155, + "loss": 7.6014, + "step": 962800 + }, + { + "epoch": 3.9226429871400943, + "grad_norm": 6.0792741775512695, + "learning_rate": 0.0017581042772952648, + "loss": 7.585, + "step": 962900 + }, + { + "epoch": 3.923050365163476, + "grad_norm": 13.479080200195312, + "learning_rate": 0.0017576309490461974, + "loss": 7.606, + "step": 963000 + }, + { + "epoch": 3.923050365163476, + "eval_MaskedAccuracy": 0.50965225417915, + "eval_loss": 1.6007351875305176, + "eval_runtime": 174.2466, + "eval_samples_per_second": 364.288, + "eval_steps_per_second": 1.423, + "step": 963000 + }, + { + "epoch": 3.923457743186857, + "grad_norm": 10.573036193847656, + "learning_rate": 0.0017571576503446235, + "loss": 7.6, + "step": 963100 + }, + { + "epoch": 3.9238651212102384, + "grad_norm": 3.684870719909668, + "learning_rate": 0.001756684381209259, + "loss": 7.6255, + "step": 963200 + }, + { + "epoch": 3.92427249923362, + "grad_norm": 4.756135940551758, + "learning_rate": 0.001756211141658813, + "loss": 7.6018, + "step": 963300 + }, + { + "epoch": 3.9246798772570015, + "grad_norm": 3.6143898963928223, + "learning_rate": 0.0017557379317119962, + "loss": 7.6029, + "step": 963400 + }, + { + "epoch": 3.925087255280383, + "grad_norm": 19.82906723022461, + "learning_rate": 0.0017552647513875167, + "loss": 7.6159, + "step": 963500 + }, + { + "epoch": 3.9254946333037646, + "grad_norm": 3.3934342861175537, + "learning_rate": 0.0017547916007040806, + "loss": 7.6193, + "step": 963600 + }, + { + "epoch": 3.925902011327146, + "grad_norm": 6.459085941314697, + "learning_rate": 0.001754318479680394, + "loss": 7.604, + "step": 963700 + }, + { + "epoch": 3.9263093893505276, + "grad_norm": 2.7584681510925293, + "learning_rate": 0.0017538453883351635, + "loss": 7.6087, + "step": 963800 + }, + { + "epoch": 3.9267167673739087, + "grad_norm": 5.214748859405518, + "learning_rate": 0.0017533723266870934, + "loss": 7.6072, + "step": 963900 + }, + { + "epoch": 3.9271241453972903, + "grad_norm": 9.482467651367188, + "learning_rate": 0.0017528992947548842, + "loss": 7.5804, + "step": 964000 + }, + { + "epoch": 3.9271241453972903, + "eval_MaskedAccuracy": 0.5089183610110668, + "eval_loss": 1.5995738506317139, + "eval_runtime": 179.4685, + "eval_samples_per_second": 353.689, + "eval_steps_per_second": 1.382, + "step": 964000 + }, + { + "epoch": 3.927531523420672, + "grad_norm": 8.670701026916504, + "learning_rate": 0.001752426292557239, + "loss": 7.6204, + "step": 964100 + }, + { + "epoch": 3.9279389014440533, + "grad_norm": 4.509506702423096, + "learning_rate": 0.001751953320112857, + "loss": 7.5954, + "step": 964200 + }, + { + "epoch": 3.928346279467435, + "grad_norm": 5.005879878997803, + "learning_rate": 0.0017514803774404375, + "loss": 7.6298, + "step": 964300 + }, + { + "epoch": 3.928753657490816, + "grad_norm": 3.8375208377838135, + "learning_rate": 0.001751007464558679, + "loss": 7.6267, + "step": 964400 + }, + { + "epoch": 3.9291610355141975, + "grad_norm": 5.750095844268799, + "learning_rate": 0.0017505345814862777, + "loss": 7.6051, + "step": 964500 + }, + { + "epoch": 3.929568413537579, + "grad_norm": 14.052850723266602, + "learning_rate": 0.00175006172824193, + "loss": 7.5784, + "step": 964600 + }, + { + "epoch": 3.9299757915609606, + "grad_norm": 4.827563762664795, + "learning_rate": 0.0017495889048443297, + "loss": 7.6119, + "step": 964700 + }, + { + "epoch": 3.930383169584342, + "grad_norm": 4.114831447601318, + "learning_rate": 0.0017491161113121691, + "loss": 7.6101, + "step": 964800 + }, + { + "epoch": 3.9307905476077236, + "grad_norm": 7.301882266998291, + "learning_rate": 0.0017486433476641423, + "loss": 7.6026, + "step": 964900 + }, + { + "epoch": 3.931197925631105, + "grad_norm": 6.055305004119873, + "learning_rate": 0.00174817061391894, + "loss": 7.605, + "step": 965000 + }, + { + "epoch": 3.931197925631105, + "eval_MaskedAccuracy": 0.5094075744219674, + "eval_loss": 1.6030793190002441, + "eval_runtime": 170.5433, + "eval_samples_per_second": 372.199, + "eval_steps_per_second": 1.454, + "step": 965000 + }, + { + "epoch": 3.9316053036544862, + "grad_norm": 7.509965896606445, + "learning_rate": 0.0017476979100952516, + "loss": 7.5782, + "step": 965100 + }, + { + "epoch": 3.9320126816778678, + "grad_norm": 11.511940956115723, + "learning_rate": 0.0017472252362117642, + "loss": 7.5963, + "step": 965200 + }, + { + "epoch": 3.9324200597012493, + "grad_norm": 8.02153205871582, + "learning_rate": 0.0017467525922871662, + "loss": 7.5949, + "step": 965300 + }, + { + "epoch": 3.932827437724631, + "grad_norm": 10.607674598693848, + "learning_rate": 0.0017462799783401447, + "loss": 7.6329, + "step": 965400 + }, + { + "epoch": 3.9332348157480124, + "grad_norm": 7.6859331130981445, + "learning_rate": 0.0017458073943893825, + "loss": 7.6219, + "step": 965500 + }, + { + "epoch": 3.9336421937713935, + "grad_norm": 6.8652520179748535, + "learning_rate": 0.001745334840453566, + "loss": 7.6236, + "step": 965600 + }, + { + "epoch": 3.934049571794775, + "grad_norm": 6.446781158447266, + "learning_rate": 0.0017448623165513748, + "loss": 7.6278, + "step": 965700 + }, + { + "epoch": 3.9344569498181565, + "grad_norm": 9.547246932983398, + "learning_rate": 0.0017443898227014916, + "loss": 7.6108, + "step": 965800 + }, + { + "epoch": 3.934864327841538, + "grad_norm": 12.40895938873291, + "learning_rate": 0.0017439173589225963, + "loss": 7.5992, + "step": 965900 + }, + { + "epoch": 3.9352717058649196, + "grad_norm": 13.574623107910156, + "learning_rate": 0.001743444925233369, + "loss": 7.6124, + "step": 966000 + }, + { + "epoch": 3.9352717058649196, + "eval_MaskedAccuracy": 0.5092441180934023, + "eval_loss": 1.6030031442642212, + "eval_runtime": 218.0381, + "eval_samples_per_second": 291.123, + "eval_steps_per_second": 1.137, + "step": 966000 + }, + { + "epoch": 3.935679083888301, + "grad_norm": 10.19545841217041, + "learning_rate": 0.0017429725216524861, + "loss": 7.6029, + "step": 966100 + }, + { + "epoch": 3.9360864619116827, + "grad_norm": 8.474671363830566, + "learning_rate": 0.0017425001481986258, + "loss": 7.6071, + "step": 966200 + }, + { + "epoch": 3.936493839935064, + "grad_norm": 5.1744513511657715, + "learning_rate": 0.0017420278048904632, + "loss": 7.636, + "step": 966300 + }, + { + "epoch": 3.9369012179584453, + "grad_norm": 4.764209747314453, + "learning_rate": 0.0017415554917466717, + "loss": 7.6004, + "step": 966400 + }, + { + "epoch": 3.937308595981827, + "grad_norm": 3.4906716346740723, + "learning_rate": 0.0017410832087859248, + "loss": 7.6217, + "step": 966500 + }, + { + "epoch": 3.9377159740052083, + "grad_norm": 7.8018622398376465, + "learning_rate": 0.0017406109560268955, + "loss": 7.6183, + "step": 966600 + }, + { + "epoch": 3.93812335202859, + "grad_norm": 8.805583000183105, + "learning_rate": 0.0017401387334882506, + "loss": 7.6126, + "step": 966700 + }, + { + "epoch": 3.9385307300519714, + "grad_norm": 4.326712131500244, + "learning_rate": 0.001739666541188665, + "loss": 7.5709, + "step": 966800 + }, + { + "epoch": 3.9389381080753525, + "grad_norm": 10.123628616333008, + "learning_rate": 0.0017391943791468026, + "loss": 7.6283, + "step": 966900 + }, + { + "epoch": 3.939345486098734, + "grad_norm": 7.375999450683594, + "learning_rate": 0.0017387222473813327, + "loss": 7.6254, + "step": 967000 + }, + { + "epoch": 3.939345486098734, + "eval_MaskedAccuracy": 0.509036903561673, + "eval_loss": 1.612851858139038, + "eval_runtime": 172.39, + "eval_samples_per_second": 368.212, + "eval_steps_per_second": 1.439, + "step": 967000 + }, + { + "epoch": 3.9397528641221156, + "grad_norm": 9.184760093688965, + "learning_rate": 0.00173825014591092, + "loss": 7.6214, + "step": 967100 + }, + { + "epoch": 3.940160242145497, + "grad_norm": 3.240121603012085, + "learning_rate": 0.0017377780747542307, + "loss": 7.6016, + "step": 967200 + }, + { + "epoch": 3.9405676201688786, + "grad_norm": 12.256264686584473, + "learning_rate": 0.0017373060339299265, + "loss": 7.6188, + "step": 967300 + }, + { + "epoch": 3.94097499819226, + "grad_norm": 3.1830883026123047, + "learning_rate": 0.001736834023456671, + "loss": 7.5812, + "step": 967400 + }, + { + "epoch": 3.9413823762156417, + "grad_norm": 5.6326775550842285, + "learning_rate": 0.001736362043353126, + "loss": 7.6057, + "step": 967500 + }, + { + "epoch": 3.941789754239023, + "grad_norm": 7.11724328994751, + "learning_rate": 0.0017358900936379502, + "loss": 7.6092, + "step": 967600 + }, + { + "epoch": 3.9421971322624043, + "grad_norm": 9.236283302307129, + "learning_rate": 0.0017354181743298033, + "loss": 7.6022, + "step": 967700 + }, + { + "epoch": 3.942604510285786, + "grad_norm": 10.890929222106934, + "learning_rate": 0.001734946285447342, + "loss": 7.6339, + "step": 967800 + }, + { + "epoch": 3.9430118883091674, + "grad_norm": 4.577576160430908, + "learning_rate": 0.0017344744270092205, + "loss": 7.5962, + "step": 967900 + }, + { + "epoch": 3.943419266332549, + "grad_norm": 4.31857967376709, + "learning_rate": 0.0017340025990340954, + "loss": 7.5695, + "step": 968000 + }, + { + "epoch": 3.943419266332549, + "eval_MaskedAccuracy": 0.5087030499207391, + "eval_loss": 1.6019302606582642, + "eval_runtime": 194.1768, + "eval_samples_per_second": 326.898, + "eval_steps_per_second": 1.277, + "step": 968000 + }, + { + "epoch": 3.94382664435593, + "grad_norm": 7.419013977050781, + "learning_rate": 0.0017335308015406228, + "loss": 7.5877, + "step": 968100 + }, + { + "epoch": 3.9442340223793115, + "grad_norm": 7.132618427276611, + "learning_rate": 0.0017330590345474527, + "loss": 7.6275, + "step": 968200 + }, + { + "epoch": 3.944641400402693, + "grad_norm": 5.5635294914245605, + "learning_rate": 0.0017325872980732374, + "loss": 7.609, + "step": 968300 + }, + { + "epoch": 3.9450487784260746, + "grad_norm": 5.992742538452148, + "learning_rate": 0.0017321155921366273, + "loss": 7.5958, + "step": 968400 + }, + { + "epoch": 3.945456156449456, + "grad_norm": 3.753831386566162, + "learning_rate": 0.0017316439167562722, + "loss": 7.6208, + "step": 968500 + }, + { + "epoch": 3.9458635344728377, + "grad_norm": 4.255530834197998, + "learning_rate": 0.0017311722719508204, + "loss": 7.599, + "step": 968600 + }, + { + "epoch": 3.946270912496219, + "grad_norm": 5.118152618408203, + "learning_rate": 0.0017307006577389158, + "loss": 7.6313, + "step": 968700 + }, + { + "epoch": 3.9466782905196007, + "grad_norm": 6.278252601623535, + "learning_rate": 0.001730229074139205, + "loss": 7.5869, + "step": 968800 + }, + { + "epoch": 3.947085668542982, + "grad_norm": 3.765784740447998, + "learning_rate": 0.001729757521170333, + "loss": 7.6278, + "step": 968900 + }, + { + "epoch": 3.9474930465663634, + "grad_norm": 6.677095413208008, + "learning_rate": 0.0017292859988509428, + "loss": 7.6083, + "step": 969000 + }, + { + "epoch": 3.9474930465663634, + "eval_MaskedAccuracy": 0.5089457616204685, + "eval_loss": 1.6062957048416138, + "eval_runtime": 244.5024, + "eval_samples_per_second": 259.613, + "eval_steps_per_second": 1.014, + "step": 969000 + }, + { + "epoch": 3.947900424589745, + "grad_norm": 5.654714584350586, + "learning_rate": 0.0017288145071996778, + "loss": 7.5821, + "step": 969100 + }, + { + "epoch": 3.9483078026131264, + "grad_norm": 6.486881732940674, + "learning_rate": 0.0017283430462351763, + "loss": 7.609, + "step": 969200 + }, + { + "epoch": 3.948715180636508, + "grad_norm": 6.111940860748291, + "learning_rate": 0.0017278716159760778, + "loss": 7.6086, + "step": 969300 + }, + { + "epoch": 3.949122558659889, + "grad_norm": 10.307491302490234, + "learning_rate": 0.0017274002164410205, + "loss": 7.6249, + "step": 969400 + }, + { + "epoch": 3.9495299366832706, + "grad_norm": 6.614736557006836, + "learning_rate": 0.0017269288476486414, + "loss": 7.5924, + "step": 969500 + }, + { + "epoch": 3.949937314706652, + "grad_norm": 9.90733814239502, + "learning_rate": 0.0017264575096175763, + "loss": 7.6071, + "step": 969600 + }, + { + "epoch": 3.9503446927300336, + "grad_norm": 7.4483561515808105, + "learning_rate": 0.001725986202366459, + "loss": 7.5843, + "step": 969700 + }, + { + "epoch": 3.950752070753415, + "grad_norm": 8.133569717407227, + "learning_rate": 0.0017255149259139243, + "loss": 7.6001, + "step": 969800 + }, + { + "epoch": 3.9511594487767967, + "grad_norm": 3.9263856410980225, + "learning_rate": 0.001725043680278603, + "loss": 7.5881, + "step": 969900 + }, + { + "epoch": 3.9515668268001782, + "grad_norm": 8.540437698364258, + "learning_rate": 0.001724572465479127, + "loss": 7.6145, + "step": 970000 + }, + { + "epoch": 3.9515668268001782, + "eval_MaskedAccuracy": 0.5089144960412382, + "eval_loss": 1.612705111503601, + "eval_runtime": 171.0321, + "eval_samples_per_second": 371.135, + "eval_steps_per_second": 1.45, + "step": 970000 + }, + { + "epoch": 3.9519742048235593, + "grad_norm": 5.326159477233887, + "learning_rate": 0.001724101281534126, + "loss": 7.5963, + "step": 970100 + }, + { + "epoch": 3.952381582846941, + "grad_norm": 10.774977684020996, + "learning_rate": 0.0017236301284622268, + "loss": 7.5999, + "step": 970200 + }, + { + "epoch": 3.9527889608703224, + "grad_norm": 11.163222312927246, + "learning_rate": 0.0017231590062820577, + "loss": 7.5947, + "step": 970300 + }, + { + "epoch": 3.953196338893704, + "grad_norm": 8.657665252685547, + "learning_rate": 0.001722687915012246, + "loss": 7.6081, + "step": 970400 + }, + { + "epoch": 3.9536037169170855, + "grad_norm": 8.744972229003906, + "learning_rate": 0.0017222168546714152, + "loss": 7.6163, + "step": 970500 + }, + { + "epoch": 3.9540110949404665, + "grad_norm": 7.80633544921875, + "learning_rate": 0.001721745825278188, + "loss": 7.5912, + "step": 970600 + }, + { + "epoch": 3.954418472963848, + "grad_norm": 9.006880760192871, + "learning_rate": 0.0017212748268511874, + "loss": 7.6229, + "step": 970700 + }, + { + "epoch": 3.9548258509872296, + "grad_norm": 6.297878265380859, + "learning_rate": 0.0017208038594090347, + "loss": 7.6206, + "step": 970800 + }, + { + "epoch": 3.955233229010611, + "grad_norm": 4.641481876373291, + "learning_rate": 0.0017203329229703505, + "loss": 7.5948, + "step": 970900 + }, + { + "epoch": 3.9556406070339927, + "grad_norm": 2.9865880012512207, + "learning_rate": 0.0017198620175537522, + "loss": 7.6016, + "step": 971000 + }, + { + "epoch": 3.9556406070339927, + "eval_MaskedAccuracy": 0.5096631115317253, + "eval_loss": 1.6074703931808472, + "eval_runtime": 167.0376, + "eval_samples_per_second": 380.01, + "eval_steps_per_second": 1.485, + "step": 971000 + }, + { + "epoch": 3.956047985057374, + "grad_norm": 3.9123666286468506, + "learning_rate": 0.001719391143177859, + "loss": 7.5958, + "step": 971100 + }, + { + "epoch": 3.9564553630807557, + "grad_norm": 4.388565540313721, + "learning_rate": 0.0017189202998612813, + "loss": 7.6196, + "step": 971200 + }, + { + "epoch": 3.9568627411041373, + "grad_norm": 3.3493175506591797, + "learning_rate": 0.0017184494876226403, + "loss": 7.5778, + "step": 971300 + }, + { + "epoch": 3.9572701191275184, + "grad_norm": 10.546339988708496, + "learning_rate": 0.001717978706480547, + "loss": 7.6032, + "step": 971400 + }, + { + "epoch": 3.9576774971509, + "grad_norm": 5.917593479156494, + "learning_rate": 0.0017175079564536165, + "loss": 7.6366, + "step": 971500 + }, + { + "epoch": 3.9580848751742814, + "grad_norm": 4.218117713928223, + "learning_rate": 0.0017170372375604577, + "loss": 7.598, + "step": 971600 + }, + { + "epoch": 3.958492253197663, + "grad_norm": 4.911594390869141, + "learning_rate": 0.001716566549819682, + "loss": 7.6187, + "step": 971700 + }, + { + "epoch": 3.9588996312210445, + "grad_norm": 13.799729347229004, + "learning_rate": 0.001716095893249896, + "loss": 7.5556, + "step": 971800 + }, + { + "epoch": 3.9593070092444256, + "grad_norm": 7.282037734985352, + "learning_rate": 0.0017156252678697103, + "loss": 7.581, + "step": 971900 + }, + { + "epoch": 3.959714387267807, + "grad_norm": 10.808502197265625, + "learning_rate": 0.0017151546736977295, + "loss": 7.6168, + "step": 972000 + }, + { + "epoch": 3.959714387267807, + "eval_MaskedAccuracy": 0.5091707189279271, + "eval_loss": 1.6053344011306763, + "eval_runtime": 169.707, + "eval_samples_per_second": 374.033, + "eval_steps_per_second": 1.461, + "step": 972000 + }, + { + "epoch": 3.9601217652911886, + "grad_norm": 4.173104763031006, + "learning_rate": 0.0017146841107525577, + "loss": 7.6016, + "step": 972100 + }, + { + "epoch": 3.96052914331457, + "grad_norm": 7.0381622314453125, + "learning_rate": 0.0017142135790527997, + "loss": 7.6365, + "step": 972200 + }, + { + "epoch": 3.9609365213379517, + "grad_norm": 7.141263961791992, + "learning_rate": 0.0017137430786170601, + "loss": 7.5984, + "step": 972300 + }, + { + "epoch": 3.9613438993613332, + "grad_norm": 8.423711776733398, + "learning_rate": 0.0017132726094639373, + "loss": 7.6243, + "step": 972400 + }, + { + "epoch": 3.9617512773847148, + "grad_norm": 6.234470367431641, + "learning_rate": 0.0017128021716120326, + "loss": 7.5877, + "step": 972500 + }, + { + "epoch": 3.962158655408096, + "grad_norm": 7.703559875488281, + "learning_rate": 0.001712331765079945, + "loss": 7.6258, + "step": 972600 + }, + { + "epoch": 3.9625660334314774, + "grad_norm": 8.55420207977295, + "learning_rate": 0.0017118613898862722, + "loss": 7.6121, + "step": 972700 + }, + { + "epoch": 3.962973411454859, + "grad_norm": 3.7489073276519775, + "learning_rate": 0.0017113910460496123, + "loss": 7.6163, + "step": 972800 + }, + { + "epoch": 3.9633807894782405, + "grad_norm": 4.63359260559082, + "learning_rate": 0.001710920733588557, + "loss": 7.6142, + "step": 972900 + }, + { + "epoch": 3.963788167501622, + "grad_norm": 4.0801591873168945, + "learning_rate": 0.0017104504525217038, + "loss": 7.6015, + "step": 973000 + }, + { + "epoch": 3.963788167501622, + "eval_MaskedAccuracy": 0.5098385944356569, + "eval_loss": 1.6042712926864624, + "eval_runtime": 179.3414, + "eval_samples_per_second": 353.939, + "eval_steps_per_second": 1.383, + "step": 973000 + }, + { + "epoch": 3.964195545525003, + "grad_norm": 4.366600036621094, + "learning_rate": 0.001709980202867641, + "loss": 7.6032, + "step": 973100 + }, + { + "epoch": 3.9646029235483846, + "grad_norm": 5.428745269775391, + "learning_rate": 0.001709509984644964, + "loss": 7.6191, + "step": 973200 + }, + { + "epoch": 3.965010301571766, + "grad_norm": 5.178256511688232, + "learning_rate": 0.0017090397978722625, + "loss": 7.6357, + "step": 973300 + }, + { + "epoch": 3.9654176795951477, + "grad_norm": 7.077785491943359, + "learning_rate": 0.0017085696425681242, + "loss": 7.6124, + "step": 973400 + }, + { + "epoch": 3.965825057618529, + "grad_norm": 13.619942665100098, + "learning_rate": 0.0017080995187511376, + "loss": 7.6049, + "step": 973500 + }, + { + "epoch": 3.9662324356419107, + "grad_norm": 8.637674331665039, + "learning_rate": 0.001707629426439889, + "loss": 7.6215, + "step": 973600 + }, + { + "epoch": 3.9666398136652923, + "grad_norm": 10.719552040100098, + "learning_rate": 0.0017071593656529647, + "loss": 7.6161, + "step": 973700 + }, + { + "epoch": 3.967047191688674, + "grad_norm": 5.537758827209473, + "learning_rate": 0.0017066893364089464, + "loss": 7.6187, + "step": 973800 + }, + { + "epoch": 3.967454569712055, + "grad_norm": 5.392650604248047, + "learning_rate": 0.0017062193387264188, + "loss": 7.6237, + "step": 973900 + }, + { + "epoch": 3.9678619477354364, + "grad_norm": 3.8435933589935303, + "learning_rate": 0.0017057493726239635, + "loss": 7.6246, + "step": 974000 + }, + { + "epoch": 3.9678619477354364, + "eval_MaskedAccuracy": 0.5092166668473508, + "eval_loss": 1.6011650562286377, + "eval_runtime": 175.5725, + "eval_samples_per_second": 361.537, + "eval_steps_per_second": 1.413, + "step": 974000 + }, + { + "epoch": 3.968269325758818, + "grad_norm": 9.764803886413574, + "learning_rate": 0.001705279438120159, + "loss": 7.6261, + "step": 974100 + }, + { + "epoch": 3.9686767037821995, + "grad_norm": 4.899872303009033, + "learning_rate": 0.0017048095352335858, + "loss": 7.612, + "step": 974200 + }, + { + "epoch": 3.969084081805581, + "grad_norm": 10.642428398132324, + "learning_rate": 0.0017043396639828234, + "loss": 7.6152, + "step": 974300 + }, + { + "epoch": 3.969491459828962, + "grad_norm": 4.387692451477051, + "learning_rate": 0.0017038698243864439, + "loss": 7.5949, + "step": 974400 + }, + { + "epoch": 3.9698988378523437, + "grad_norm": 9.260013580322266, + "learning_rate": 0.0017034000164630257, + "loss": 7.5844, + "step": 974500 + }, + { + "epoch": 3.970306215875725, + "grad_norm": 6.511177062988281, + "learning_rate": 0.0017029302402311421, + "loss": 7.6172, + "step": 974600 + }, + { + "epoch": 3.9707135938991067, + "grad_norm": 10.507186889648438, + "learning_rate": 0.0017024604957093668, + "loss": 7.6099, + "step": 974700 + }, + { + "epoch": 3.9711209719224883, + "grad_norm": 4.583454608917236, + "learning_rate": 0.0017019907829162696, + "loss": 7.6258, + "step": 974800 + }, + { + "epoch": 3.97152834994587, + "grad_norm": 5.074526786804199, + "learning_rate": 0.0017015211018704221, + "loss": 7.586, + "step": 974900 + }, + { + "epoch": 3.9719357279692513, + "grad_norm": 6.329677581787109, + "learning_rate": 0.001701051452590394, + "loss": 7.5945, + "step": 975000 + }, + { + "epoch": 3.9719357279692513, + "eval_MaskedAccuracy": 0.5089593009373351, + "eval_loss": 1.614335060119629, + "eval_runtime": 181.824, + "eval_samples_per_second": 349.107, + "eval_steps_per_second": 1.364, + "step": 975000 + }, + { + "epoch": 3.9723431059926324, + "grad_norm": 3.5868403911590576, + "learning_rate": 0.0017005818350947512, + "loss": 7.623, + "step": 975100 + }, + { + "epoch": 3.972750484016014, + "grad_norm": 5.082174301147461, + "learning_rate": 0.0017001122494020608, + "loss": 7.5911, + "step": 975200 + }, + { + "epoch": 3.9731578620393955, + "grad_norm": 9.512438774108887, + "learning_rate": 0.0016996426955308878, + "loss": 7.6167, + "step": 975300 + }, + { + "epoch": 3.973565240062777, + "grad_norm": 7.000560283660889, + "learning_rate": 0.0016991731734997963, + "loss": 7.6084, + "step": 975400 + }, + { + "epoch": 3.9739726180861585, + "grad_norm": 6.193122863769531, + "learning_rate": 0.0016987036833273482, + "loss": 7.6148, + "step": 975500 + }, + { + "epoch": 3.9743799961095396, + "grad_norm": 8.048707008361816, + "learning_rate": 0.001698234225032108, + "loss": 7.6017, + "step": 975600 + }, + { + "epoch": 3.974787374132921, + "grad_norm": 8.468701362609863, + "learning_rate": 0.0016977647986326306, + "loss": 7.6221, + "step": 975700 + }, + { + "epoch": 3.9751947521563027, + "grad_norm": 4.622277736663818, + "learning_rate": 0.0016972954041474797, + "loss": 7.5971, + "step": 975800 + }, + { + "epoch": 3.9756021301796842, + "grad_norm": 9.617279052734375, + "learning_rate": 0.0016968260415952117, + "loss": 7.6124, + "step": 975900 + }, + { + "epoch": 3.9760095082030658, + "grad_norm": 4.895694732666016, + "learning_rate": 0.001696356710994383, + "loss": 7.6079, + "step": 976000 + }, + { + "epoch": 3.9760095082030658, + "eval_MaskedAccuracy": 0.50983445511661, + "eval_loss": 1.5986188650131226, + "eval_runtime": 159.5097, + "eval_samples_per_second": 397.944, + "eval_steps_per_second": 1.555, + "step": 976000 + }, + { + "epoch": 3.9764168862264473, + "grad_norm": 6.036518096923828, + "learning_rate": 0.0016958874123635483, + "loss": 7.585, + "step": 976100 + }, + { + "epoch": 3.976824264249829, + "grad_norm": 5.62017297744751, + "learning_rate": 0.0016954181457212634, + "loss": 7.5829, + "step": 976200 + }, + { + "epoch": 3.9772316422732104, + "grad_norm": 4.487478256225586, + "learning_rate": 0.001694948911086078, + "loss": 7.5964, + "step": 976300 + }, + { + "epoch": 3.9776390202965914, + "grad_norm": 4.295134544372559, + "learning_rate": 0.0016944797084765442, + "loss": 7.6068, + "step": 976400 + }, + { + "epoch": 3.978046398319973, + "grad_norm": 13.148701667785645, + "learning_rate": 0.0016940105379112125, + "loss": 7.6207, + "step": 976500 + }, + { + "epoch": 3.9784537763433545, + "grad_norm": 5.530330657958984, + "learning_rate": 0.0016935413994086294, + "loss": 7.6016, + "step": 976600 + }, + { + "epoch": 3.978861154366736, + "grad_norm": 3.2301185131073, + "learning_rate": 0.0016930722929873465, + "loss": 7.5981, + "step": 976700 + }, + { + "epoch": 3.9792685323901176, + "grad_norm": 4.73488187789917, + "learning_rate": 0.001692603218665906, + "loss": 7.5991, + "step": 976800 + }, + { + "epoch": 3.9796759104134987, + "grad_norm": 7.956972122192383, + "learning_rate": 0.0016921341764628577, + "loss": 7.5862, + "step": 976900 + }, + { + "epoch": 3.98008328843688, + "grad_norm": 3.1814346313476562, + "learning_rate": 0.0016916651663967416, + "loss": 7.6019, + "step": 977000 + }, + { + "epoch": 3.98008328843688, + "eval_MaskedAccuracy": 0.509535113683961, + "eval_loss": 1.6061532497406006, + "eval_runtime": 200.0001, + "eval_samples_per_second": 317.38, + "eval_steps_per_second": 1.24, + "step": 977000 + }, + { + "epoch": 3.9804906664602617, + "grad_norm": 9.073088645935059, + "learning_rate": 0.0016911961884861035, + "loss": 7.5911, + "step": 977100 + }, + { + "epoch": 3.9808980444836433, + "grad_norm": 11.336462020874023, + "learning_rate": 0.0016907272427494826, + "loss": 7.5966, + "step": 977200 + }, + { + "epoch": 3.981305422507025, + "grad_norm": 5.6354146003723145, + "learning_rate": 0.001690258329205419, + "loss": 7.5939, + "step": 977300 + }, + { + "epoch": 3.9817128005304063, + "grad_norm": 4.563399791717529, + "learning_rate": 0.0016897894478724495, + "loss": 7.5959, + "step": 977400 + }, + { + "epoch": 3.982120178553788, + "grad_norm": 9.256034851074219, + "learning_rate": 0.0016893205987691133, + "loss": 7.598, + "step": 977500 + }, + { + "epoch": 3.982527556577169, + "grad_norm": 4.321191310882568, + "learning_rate": 0.0016888517819139454, + "loss": 7.5936, + "step": 977600 + }, + { + "epoch": 3.9829349346005505, + "grad_norm": 15.723010063171387, + "learning_rate": 0.001688382997325483, + "loss": 7.5967, + "step": 977700 + }, + { + "epoch": 3.983342312623932, + "grad_norm": 3.105146646499634, + "learning_rate": 0.0016879142450222577, + "loss": 7.6134, + "step": 977800 + }, + { + "epoch": 3.9837496906473135, + "grad_norm": 10.083918571472168, + "learning_rate": 0.0016874455250228044, + "loss": 7.5799, + "step": 977900 + }, + { + "epoch": 3.984157068670695, + "grad_norm": 3.9866275787353516, + "learning_rate": 0.00168697683734565, + "loss": 7.5977, + "step": 978000 + }, + { + "epoch": 3.984157068670695, + "eval_MaskedAccuracy": 0.5096758100810419, + "eval_loss": 1.6063538789749146, + "eval_runtime": 232.7136, + "eval_samples_per_second": 272.765, + "eval_steps_per_second": 1.066, + "step": 978000 + }, + { + "epoch": 3.984564446694076, + "grad_norm": 4.538233757019043, + "learning_rate": 0.0016865081820093273, + "loss": 7.6205, + "step": 978100 + }, + { + "epoch": 3.9849718247174577, + "grad_norm": 15.1337251663208, + "learning_rate": 0.0016860395590323645, + "loss": 7.5709, + "step": 978200 + }, + { + "epoch": 3.9853792027408392, + "grad_norm": 3.3948187828063965, + "learning_rate": 0.0016855709684332866, + "loss": 7.5849, + "step": 978300 + }, + { + "epoch": 3.9857865807642208, + "grad_norm": 5.072014331817627, + "learning_rate": 0.0016851024102306207, + "loss": 7.6037, + "step": 978400 + }, + { + "epoch": 3.9861939587876023, + "grad_norm": 3.318345785140991, + "learning_rate": 0.0016846338844428927, + "loss": 7.6003, + "step": 978500 + }, + { + "epoch": 3.986601336810984, + "grad_norm": 4.902764320373535, + "learning_rate": 0.0016841653910886253, + "loss": 7.6191, + "step": 978600 + }, + { + "epoch": 3.9870087148343654, + "grad_norm": 17.06964111328125, + "learning_rate": 0.0016836969301863385, + "loss": 7.6105, + "step": 978700 + }, + { + "epoch": 3.987416092857747, + "grad_norm": 6.311084270477295, + "learning_rate": 0.0016832285017545558, + "loss": 7.6117, + "step": 978800 + }, + { + "epoch": 3.987823470881128, + "grad_norm": 7.435804843902588, + "learning_rate": 0.0016827601058117963, + "loss": 7.5868, + "step": 978900 + }, + { + "epoch": 3.9882308489045095, + "grad_norm": 5.136353969573975, + "learning_rate": 0.0016822917423765765, + "loss": 7.5817, + "step": 979000 + }, + { + "epoch": 3.9882308489045095, + "eval_MaskedAccuracy": 0.5093313363367059, + "eval_loss": 1.6005762815475464, + "eval_runtime": 185.8371, + "eval_samples_per_second": 341.568, + "eval_steps_per_second": 1.335, + "step": 979000 + }, + { + "epoch": 3.988638226927891, + "grad_norm": 5.9919843673706055, + "learning_rate": 0.0016818234114674131, + "loss": 7.6002, + "step": 979100 + }, + { + "epoch": 3.9890456049512726, + "grad_norm": 7.936841011047363, + "learning_rate": 0.0016813551131028238, + "loss": 7.5694, + "step": 979200 + }, + { + "epoch": 3.989452982974654, + "grad_norm": 7.31936502456665, + "learning_rate": 0.0016808868473013221, + "loss": 7.6011, + "step": 979300 + }, + { + "epoch": 3.989860360998035, + "grad_norm": 3.525270938873291, + "learning_rate": 0.0016804186140814228, + "loss": 7.5873, + "step": 979400 + }, + { + "epoch": 3.9902677390214167, + "grad_norm": 6.6307573318481445, + "learning_rate": 0.001679950413461635, + "loss": 7.5841, + "step": 979500 + }, + { + "epoch": 3.9906751170447983, + "grad_norm": 5.909872531890869, + "learning_rate": 0.0016794822454604698, + "loss": 7.6352, + "step": 979600 + }, + { + "epoch": 3.99108249506818, + "grad_norm": 4.36875581741333, + "learning_rate": 0.001679014110096436, + "loss": 7.5939, + "step": 979700 + }, + { + "epoch": 3.9914898730915613, + "grad_norm": 7.524434566497803, + "learning_rate": 0.0016785460073880415, + "loss": 7.5861, + "step": 979800 + }, + { + "epoch": 3.991897251114943, + "grad_norm": 14.55718994140625, + "learning_rate": 0.0016780779373537955, + "loss": 7.5725, + "step": 979900 + }, + { + "epoch": 3.9923046291383244, + "grad_norm": 4.0364603996276855, + "learning_rate": 0.0016776099000121991, + "loss": 7.5801, + "step": 980000 + }, + { + "epoch": 3.9923046291383244, + "eval_MaskedAccuracy": 0.5095230067761933, + "eval_loss": 1.6059062480926514, + "eval_runtime": 183.7045, + "eval_samples_per_second": 345.533, + "eval_steps_per_second": 1.35, + "step": 980000 + }, + { + "epoch": 3.9927120071617055, + "grad_norm": 7.353693008422852, + "learning_rate": 0.0016771418953817595, + "loss": 7.6074, + "step": 980100 + }, + { + "epoch": 3.993119385185087, + "grad_norm": 9.953048706054688, + "learning_rate": 0.0016766739234809792, + "loss": 7.6262, + "step": 980200 + }, + { + "epoch": 3.9935267632084686, + "grad_norm": 9.833102226257324, + "learning_rate": 0.0016762059843283582, + "loss": 7.613, + "step": 980300 + }, + { + "epoch": 3.99393414123185, + "grad_norm": 3.764328956604004, + "learning_rate": 0.0016757380779423984, + "loss": 7.6269, + "step": 980400 + }, + { + "epoch": 3.9943415192552316, + "grad_norm": 11.513262748718262, + "learning_rate": 0.0016752702043415966, + "loss": 7.5752, + "step": 980500 + }, + { + "epoch": 3.9947488972786127, + "grad_norm": 6.5440850257873535, + "learning_rate": 0.0016748023635444504, + "loss": 7.6159, + "step": 980600 + }, + { + "epoch": 3.9951562753019942, + "grad_norm": 8.95826530456543, + "learning_rate": 0.0016743345555694574, + "loss": 7.6012, + "step": 980700 + }, + { + "epoch": 3.9955636533253758, + "grad_norm": 5.079588890075684, + "learning_rate": 0.0016738667804351115, + "loss": 7.5855, + "step": 980800 + }, + { + "epoch": 3.9959710313487573, + "grad_norm": 2.878133535385132, + "learning_rate": 0.0016733990381599072, + "loss": 7.5802, + "step": 980900 + }, + { + "epoch": 3.996378409372139, + "grad_norm": 3.4497833251953125, + "learning_rate": 0.0016729313287623354, + "loss": 7.6233, + "step": 981000 + }, + { + "epoch": 3.996378409372139, + "eval_MaskedAccuracy": 0.5100188147294573, + "eval_loss": 1.6072598695755005, + "eval_runtime": 202.7131, + "eval_samples_per_second": 313.132, + "eval_steps_per_second": 1.223, + "step": 981000 + }, + { + "epoch": 3.9967857873955204, + "grad_norm": 5.88665771484375, + "learning_rate": 0.0016724636522608906, + "loss": 7.621, + "step": 981100 + }, + { + "epoch": 3.997193165418902, + "grad_norm": 5.950596332550049, + "learning_rate": 0.0016719960086740602, + "loss": 7.6084, + "step": 981200 + }, + { + "epoch": 3.9976005434422834, + "grad_norm": 4.331485271453857, + "learning_rate": 0.0016715283980203329, + "loss": 7.6046, + "step": 981300 + }, + { + "epoch": 3.9980079214656645, + "grad_norm": 2.948194742202759, + "learning_rate": 0.0016710608203181947, + "loss": 7.5938, + "step": 981400 + }, + { + "epoch": 3.998415299489046, + "grad_norm": 14.660123825073242, + "learning_rate": 0.0016705932755861326, + "loss": 7.5907, + "step": 981500 + }, + { + "epoch": 3.9988226775124276, + "grad_norm": 7.557115077972412, + "learning_rate": 0.001670125763842631, + "loss": 7.6077, + "step": 981600 + }, + { + "epoch": 3.999230055535809, + "grad_norm": 6.886069297790527, + "learning_rate": 0.0016696582851061742, + "loss": 7.6244, + "step": 981700 + }, + { + "epoch": 3.9996374335591907, + "grad_norm": 10.294854164123535, + "learning_rate": 0.001669190839395242, + "loss": 7.6077, + "step": 981800 + }, + { + "epoch": 4.000044811582572, + "grad_norm": 3.667914390563965, + "learning_rate": 0.0016687234267283172, + "loss": 7.603, + "step": 981900 + }, + { + "epoch": 4.000452189605953, + "grad_norm": 3.2173373699188232, + "learning_rate": 0.001668256047123878, + "loss": 7.6231, + "step": 982000 + }, + { + "epoch": 4.000452189605953, + "eval_MaskedAccuracy": 0.5094959772927058, + "eval_loss": 1.6051380634307861, + "eval_runtime": 148.8675, + "eval_samples_per_second": 426.393, + "eval_steps_per_second": 1.666, + "step": 982000 + }, + { + "epoch": 4.000859567629335, + "grad_norm": 5.491061687469482, + "learning_rate": 0.0016677887006004016, + "loss": 7.6002, + "step": 982100 + }, + { + "epoch": 4.001266945652716, + "grad_norm": 12.87983226776123, + "learning_rate": 0.0016673213871763659, + "loss": 7.6504, + "step": 982200 + }, + { + "epoch": 4.001674323676098, + "grad_norm": 7.052155017852783, + "learning_rate": 0.0016668541068702472, + "loss": 7.6467, + "step": 982300 + }, + { + "epoch": 4.002081701699479, + "grad_norm": 2.9802677631378174, + "learning_rate": 0.0016663868597005167, + "loss": 7.6518, + "step": 982400 + }, + { + "epoch": 4.002489079722861, + "grad_norm": 8.544371604919434, + "learning_rate": 0.0016659196456856483, + "loss": 7.6359, + "step": 982500 + }, + { + "epoch": 4.0028964577462425, + "grad_norm": 9.191143035888672, + "learning_rate": 0.0016654524648441146, + "loss": 7.6313, + "step": 982600 + }, + { + "epoch": 4.003303835769624, + "grad_norm": 3.2784583568573, + "learning_rate": 0.0016649853171943865, + "loss": 7.6256, + "step": 982700 + }, + { + "epoch": 4.0037112137930055, + "grad_norm": 3.854095935821533, + "learning_rate": 0.0016645182027549292, + "loss": 7.5983, + "step": 982800 + }, + { + "epoch": 4.004118591816386, + "grad_norm": 7.503828048706055, + "learning_rate": 0.0016640511215442137, + "loss": 7.6112, + "step": 982900 + }, + { + "epoch": 4.004525969839768, + "grad_norm": 4.859063625335693, + "learning_rate": 0.0016635840735807047, + "loss": 7.5607, + "step": 983000 + }, + { + "epoch": 4.004525969839768, + "eval_MaskedAccuracy": 0.509650864568538, + "eval_loss": 1.601935863494873, + "eval_runtime": 149.9558, + "eval_samples_per_second": 423.298, + "eval_steps_per_second": 1.654, + "step": 983000 + }, + { + "epoch": 4.004933347863149, + "grad_norm": 7.183450698852539, + "learning_rate": 0.0016631170588828686, + "loss": 7.5983, + "step": 983100 + }, + { + "epoch": 4.005340725886531, + "grad_norm": 4.626616477966309, + "learning_rate": 0.0016626500774691677, + "loss": 7.6083, + "step": 983200 + }, + { + "epoch": 4.005748103909912, + "grad_norm": 7.315069198608398, + "learning_rate": 0.001662183129358065, + "loss": 7.6087, + "step": 983300 + }, + { + "epoch": 4.006155481933294, + "grad_norm": 4.400745391845703, + "learning_rate": 0.0016617162145680213, + "loss": 7.6365, + "step": 983400 + }, + { + "epoch": 4.006562859956675, + "grad_norm": 6.4837775230407715, + "learning_rate": 0.001661249333117496, + "loss": 7.6326, + "step": 983500 + }, + { + "epoch": 4.006970237980057, + "grad_norm": 5.988894462585449, + "learning_rate": 0.0016607824850249481, + "loss": 7.5876, + "step": 983600 + }, + { + "epoch": 4.0073776160034384, + "grad_norm": 4.420219421386719, + "learning_rate": 0.0016603156703088318, + "loss": 7.6013, + "step": 983700 + }, + { + "epoch": 4.00778499402682, + "grad_norm": 8.457923889160156, + "learning_rate": 0.0016598488889876047, + "loss": 7.6053, + "step": 983800 + }, + { + "epoch": 4.0081923720502015, + "grad_norm": 9.585550308227539, + "learning_rate": 0.0016593821410797213, + "loss": 7.6126, + "step": 983900 + }, + { + "epoch": 4.008599750073583, + "grad_norm": 5.95546293258667, + "learning_rate": 0.001658915426603634, + "loss": 7.6169, + "step": 984000 + }, + { + "epoch": 4.008599750073583, + "eval_MaskedAccuracy": 0.5094694198042954, + "eval_loss": 1.61019766330719, + "eval_runtime": 149.01, + "eval_samples_per_second": 425.985, + "eval_steps_per_second": 1.664, + "step": 984000 + }, + { + "epoch": 4.009007128096964, + "grad_norm": 5.147625923156738, + "learning_rate": 0.001658448745577795, + "loss": 7.6507, + "step": 984100 + }, + { + "epoch": 4.009414506120345, + "grad_norm": 3.7334346771240234, + "learning_rate": 0.001657982098020656, + "loss": 7.6011, + "step": 984200 + }, + { + "epoch": 4.009821884143727, + "grad_norm": 2.3425774574279785, + "learning_rate": 0.001657515483950663, + "loss": 7.6154, + "step": 984300 + }, + { + "epoch": 4.010229262167108, + "grad_norm": 7.669173717498779, + "learning_rate": 0.0016570489033862663, + "loss": 7.5999, + "step": 984400 + }, + { + "epoch": 4.01063664019049, + "grad_norm": 5.005259037017822, + "learning_rate": 0.001656582356345912, + "loss": 7.6213, + "step": 984500 + }, + { + "epoch": 4.011044018213871, + "grad_norm": 3.5709750652313232, + "learning_rate": 0.0016561158428480445, + "loss": 7.6224, + "step": 984600 + }, + { + "epoch": 4.011451396237253, + "grad_norm": 3.1985106468200684, + "learning_rate": 0.0016556493629111056, + "loss": 7.6002, + "step": 984700 + }, + { + "epoch": 4.011858774260634, + "grad_norm": 4.026301383972168, + "learning_rate": 0.001655182916553543, + "loss": 7.6022, + "step": 984800 + }, + { + "epoch": 4.012266152284016, + "grad_norm": 7.075306415557861, + "learning_rate": 0.0016547165037937945, + "loss": 7.6124, + "step": 984900 + }, + { + "epoch": 4.0126735303073975, + "grad_norm": 4.863493919372559, + "learning_rate": 0.0016542501246503004, + "loss": 7.5892, + "step": 985000 + }, + { + "epoch": 4.0126735303073975, + "eval_MaskedAccuracy": 0.5098572916061271, + "eval_loss": 1.6003751754760742, + "eval_runtime": 149.1404, + "eval_samples_per_second": 425.612, + "eval_steps_per_second": 1.663, + "step": 985000 + }, + { + "epoch": 4.013080908330779, + "grad_norm": 5.730146408081055, + "learning_rate": 0.0016537837791414979, + "loss": 7.5818, + "step": 985100 + }, + { + "epoch": 4.0134882863541606, + "grad_norm": 8.001017570495605, + "learning_rate": 0.0016533174672858262, + "loss": 7.5501, + "step": 985200 + }, + { + "epoch": 4.013895664377542, + "grad_norm": 5.380482196807861, + "learning_rate": 0.0016528511891017196, + "loss": 7.6155, + "step": 985300 + }, + { + "epoch": 4.014303042400923, + "grad_norm": 7.937867641448975, + "learning_rate": 0.001652384944607614, + "loss": 7.597, + "step": 985400 + }, + { + "epoch": 4.014710420424304, + "grad_norm": 6.902302265167236, + "learning_rate": 0.0016519187338219412, + "loss": 7.6014, + "step": 985500 + }, + { + "epoch": 4.015117798447686, + "grad_norm": 4.475093364715576, + "learning_rate": 0.0016514525567631324, + "loss": 7.6247, + "step": 985600 + }, + { + "epoch": 4.015525176471067, + "grad_norm": 7.708363056182861, + "learning_rate": 0.0016509864134496201, + "loss": 7.5872, + "step": 985700 + }, + { + "epoch": 4.015932554494449, + "grad_norm": 9.547569274902344, + "learning_rate": 0.0016505203038998325, + "loss": 7.6135, + "step": 985800 + }, + { + "epoch": 4.01633993251783, + "grad_norm": 6.526007175445557, + "learning_rate": 0.0016500542281321974, + "loss": 7.6144, + "step": 985900 + }, + { + "epoch": 4.016747310541212, + "grad_norm": 3.0797715187072754, + "learning_rate": 0.0016495881861651417, + "loss": 7.618, + "step": 986000 + }, + { + "epoch": 4.016747310541212, + "eval_MaskedAccuracy": 0.5102666310395091, + "eval_loss": 1.6040263175964355, + "eval_runtime": 150.5843, + "eval_samples_per_second": 421.531, + "eval_steps_per_second": 1.647, + "step": 986000 + }, + { + "epoch": 4.0171546885645935, + "grad_norm": 4.383269786834717, + "learning_rate": 0.0016491221780170894, + "loss": 7.6177, + "step": 986100 + }, + { + "epoch": 4.017562066587975, + "grad_norm": 4.634573459625244, + "learning_rate": 0.0016486562037064652, + "loss": 7.608, + "step": 986200 + }, + { + "epoch": 4.0179694446113565, + "grad_norm": 6.890470027923584, + "learning_rate": 0.0016481902632516927, + "loss": 7.6174, + "step": 986300 + }, + { + "epoch": 4.018376822634738, + "grad_norm": 4.450423240661621, + "learning_rate": 0.001647724356671189, + "loss": 7.6121, + "step": 986400 + }, + { + "epoch": 4.01878420065812, + "grad_norm": 9.86596393585205, + "learning_rate": 0.0016472584839833787, + "loss": 7.5766, + "step": 986500 + }, + { + "epoch": 4.0191915786815, + "grad_norm": 5.9871134757995605, + "learning_rate": 0.0016467926452066763, + "loss": 7.5972, + "step": 986600 + }, + { + "epoch": 4.019598956704882, + "grad_norm": 5.728337287902832, + "learning_rate": 0.0016463268403595012, + "loss": 7.6381, + "step": 986700 + }, + { + "epoch": 4.020006334728263, + "grad_norm": 5.415317058563232, + "learning_rate": 0.0016458610694602702, + "loss": 7.6249, + "step": 986800 + }, + { + "epoch": 4.020413712751645, + "grad_norm": 9.332226753234863, + "learning_rate": 0.001645395332527394, + "loss": 7.6199, + "step": 986900 + }, + { + "epoch": 4.020821090775026, + "grad_norm": 7.070971965789795, + "learning_rate": 0.0016449296295792878, + "loss": 7.6067, + "step": 987000 + }, + { + "epoch": 4.020821090775026, + "eval_MaskedAccuracy": 0.5097597074861391, + "eval_loss": 1.6057909727096558, + "eval_runtime": 151.2127, + "eval_samples_per_second": 419.78, + "eval_steps_per_second": 1.64, + "step": 987000 + }, + { + "epoch": 4.021228468798408, + "grad_norm": 9.921390533447266, + "learning_rate": 0.0016444639606343634, + "loss": 7.5979, + "step": 987100 + }, + { + "epoch": 4.021635846821789, + "grad_norm": 6.192162990570068, + "learning_rate": 0.0016439983257110322, + "loss": 7.5848, + "step": 987200 + }, + { + "epoch": 4.022043224845171, + "grad_norm": 7.434414386749268, + "learning_rate": 0.001643532724827702, + "loss": 7.6073, + "step": 987300 + }, + { + "epoch": 4.0224506028685525, + "grad_norm": 4.407102584838867, + "learning_rate": 0.0016430671580027806, + "loss": 7.6069, + "step": 987400 + }, + { + "epoch": 4.022857980891934, + "grad_norm": 5.222054958343506, + "learning_rate": 0.0016426016252546745, + "loss": 7.6095, + "step": 987500 + }, + { + "epoch": 4.023265358915316, + "grad_norm": 10.153409004211426, + "learning_rate": 0.0016421361266017879, + "loss": 7.6373, + "step": 987600 + }, + { + "epoch": 4.023672736938697, + "grad_norm": 3.4026260375976562, + "learning_rate": 0.001641670662062526, + "loss": 7.6217, + "step": 987700 + }, + { + "epoch": 4.024080114962079, + "grad_norm": 4.457971572875977, + "learning_rate": 0.0016412052316552898, + "loss": 7.6221, + "step": 987800 + }, + { + "epoch": 4.024487492985459, + "grad_norm": 3.766571521759033, + "learning_rate": 0.0016407398353984804, + "loss": 7.6067, + "step": 987900 + }, + { + "epoch": 4.024894871008841, + "grad_norm": 5.549269676208496, + "learning_rate": 0.0016402744733104982, + "loss": 7.6168, + "step": 988000 + }, + { + "epoch": 4.024894871008841, + "eval_MaskedAccuracy": 0.5099150393420648, + "eval_loss": 1.5979849100112915, + "eval_runtime": 149.1503, + "eval_samples_per_second": 425.584, + "eval_steps_per_second": 1.663, + "step": 988000 + }, + { + "epoch": 4.025302249032222, + "grad_norm": 13.185425758361816, + "learning_rate": 0.001639809145409741, + "loss": 7.6263, + "step": 988100 + }, + { + "epoch": 4.025709627055604, + "grad_norm": 6.519994258880615, + "learning_rate": 0.0016393438517146067, + "loss": 7.5787, + "step": 988200 + }, + { + "epoch": 4.026117005078985, + "grad_norm": 3.994704008102417, + "learning_rate": 0.0016388785922434874, + "loss": 7.6051, + "step": 988300 + }, + { + "epoch": 4.026524383102367, + "grad_norm": 9.871971130371094, + "learning_rate": 0.0016384133670147814, + "loss": 7.5802, + "step": 988400 + }, + { + "epoch": 4.0269317611257485, + "grad_norm": 6.690221309661865, + "learning_rate": 0.0016379481760468797, + "loss": 7.6185, + "step": 988500 + }, + { + "epoch": 4.02733913914913, + "grad_norm": 9.233526229858398, + "learning_rate": 0.0016374830193581717, + "loss": 7.5756, + "step": 988600 + }, + { + "epoch": 4.0277465171725115, + "grad_norm": 4.646430492401123, + "learning_rate": 0.0016370178969670529, + "loss": 7.5945, + "step": 988700 + }, + { + "epoch": 4.028153895195893, + "grad_norm": 9.425500869750977, + "learning_rate": 0.0016365528088919086, + "loss": 7.6269, + "step": 988800 + }, + { + "epoch": 4.028561273219275, + "grad_norm": 6.34346866607666, + "learning_rate": 0.0016360877551511253, + "loss": 7.6043, + "step": 988900 + }, + { + "epoch": 4.028968651242656, + "grad_norm": 3.5296967029571533, + "learning_rate": 0.0016356227357630902, + "loss": 7.6116, + "step": 989000 + }, + { + "epoch": 4.028968651242656, + "eval_MaskedAccuracy": 0.5102494184687643, + "eval_loss": 1.5941894054412842, + "eval_runtime": 152.0137, + "eval_samples_per_second": 417.568, + "eval_steps_per_second": 1.631, + "step": 989000 + }, + { + "epoch": 4.029376029266037, + "grad_norm": 8.836751937866211, + "learning_rate": 0.0016351577507461887, + "loss": 7.6095, + "step": 989100 + }, + { + "epoch": 4.029783407289418, + "grad_norm": 10.11208438873291, + "learning_rate": 0.0016346928001188044, + "loss": 7.6392, + "step": 989200 + }, + { + "epoch": 4.0301907853128, + "grad_norm": 6.892326354980469, + "learning_rate": 0.001634227883899316, + "loss": 7.6189, + "step": 989300 + }, + { + "epoch": 4.030598163336181, + "grad_norm": 5.5347676277160645, + "learning_rate": 0.0016337630021061084, + "loss": 7.5896, + "step": 989400 + }, + { + "epoch": 4.031005541359563, + "grad_norm": 3.7020421028137207, + "learning_rate": 0.0016332981547575577, + "loss": 7.6069, + "step": 989500 + }, + { + "epoch": 4.031412919382944, + "grad_norm": 11.612074851989746, + "learning_rate": 0.0016328333418720417, + "loss": 7.61, + "step": 989600 + }, + { + "epoch": 4.031820297406326, + "grad_norm": 10.339410781860352, + "learning_rate": 0.0016323685634679404, + "loss": 7.5975, + "step": 989700 + }, + { + "epoch": 4.0322276754297075, + "grad_norm": 3.8707027435302734, + "learning_rate": 0.0016319038195636267, + "loss": 7.6044, + "step": 989800 + }, + { + "epoch": 4.032635053453089, + "grad_norm": 9.976296424865723, + "learning_rate": 0.0016314391101774752, + "loss": 7.6275, + "step": 989900 + }, + { + "epoch": 4.033042431476471, + "grad_norm": 4.477710723876953, + "learning_rate": 0.0016309744353278568, + "loss": 7.6148, + "step": 990000 + }, + { + "epoch": 4.033042431476471, + "eval_MaskedAccuracy": 0.5096885277811427, + "eval_loss": 1.6079641580581665, + "eval_runtime": 152.6634, + "eval_samples_per_second": 415.791, + "eval_steps_per_second": 1.624, + "step": 990000 + }, + { + "epoch": 4.033449809499852, + "grad_norm": 5.303618907928467, + "learning_rate": 0.0016305097950331442, + "loss": 7.6572, + "step": 990100 + }, + { + "epoch": 4.033857187523234, + "grad_norm": 5.709973335266113, + "learning_rate": 0.0016300451893117066, + "loss": 7.5981, + "step": 990200 + }, + { + "epoch": 4.034264565546615, + "grad_norm": 7.839023590087891, + "learning_rate": 0.0016295806181819128, + "loss": 7.6312, + "step": 990300 + }, + { + "epoch": 4.034671943569996, + "grad_norm": 9.670504570007324, + "learning_rate": 0.0016291160816621312, + "loss": 7.6089, + "step": 990400 + }, + { + "epoch": 4.035079321593377, + "grad_norm": 4.9290242195129395, + "learning_rate": 0.0016286515797707215, + "loss": 7.5979, + "step": 990500 + }, + { + "epoch": 4.035486699616759, + "grad_norm": 6.394056797027588, + "learning_rate": 0.0016281871125260526, + "loss": 7.6154, + "step": 990600 + }, + { + "epoch": 4.03589407764014, + "grad_norm": 3.729290723800659, + "learning_rate": 0.0016277226799464872, + "loss": 7.5927, + "step": 990700 + }, + { + "epoch": 4.036301455663522, + "grad_norm": 3.4127912521362305, + "learning_rate": 0.0016272582820503864, + "loss": 7.6005, + "step": 990800 + }, + { + "epoch": 4.0367088336869035, + "grad_norm": 5.75628137588501, + "learning_rate": 0.00162679391885611, + "loss": 7.6042, + "step": 990900 + }, + { + "epoch": 4.037116211710285, + "grad_norm": 3.784318447113037, + "learning_rate": 0.0016263295903820177, + "loss": 7.6334, + "step": 991000 + }, + { + "epoch": 4.037116211710285, + "eval_MaskedAccuracy": 0.5097704659043, + "eval_loss": 1.6059882640838623, + "eval_runtime": 151.5063, + "eval_samples_per_second": 418.966, + "eval_steps_per_second": 1.637, + "step": 991000 + }, + { + "epoch": 4.0375235897336665, + "grad_norm": 5.234433650970459, + "learning_rate": 0.001625865296646467, + "loss": 7.5861, + "step": 991100 + }, + { + "epoch": 4.037930967757048, + "grad_norm": 8.855758666992188, + "learning_rate": 0.0016254010376678116, + "loss": 7.6207, + "step": 991200 + }, + { + "epoch": 4.03833834578043, + "grad_norm": 4.37705135345459, + "learning_rate": 0.0016249368134644093, + "loss": 7.6377, + "step": 991300 + }, + { + "epoch": 4.038745723803811, + "grad_norm": 5.495611190795898, + "learning_rate": 0.0016244726240546114, + "loss": 7.6066, + "step": 991400 + }, + { + "epoch": 4.039153101827193, + "grad_norm": 6.446033477783203, + "learning_rate": 0.0016240084694567692, + "loss": 7.6175, + "step": 991500 + }, + { + "epoch": 4.039560479850573, + "grad_norm": 3.4947476387023926, + "learning_rate": 0.0016235443496892326, + "loss": 7.6115, + "step": 991600 + }, + { + "epoch": 4.039967857873955, + "grad_norm": 6.529889106750488, + "learning_rate": 0.001623080264770353, + "loss": 7.618, + "step": 991700 + }, + { + "epoch": 4.040375235897336, + "grad_norm": 4.808955192565918, + "learning_rate": 0.0016226162147184753, + "loss": 7.62, + "step": 991800 + }, + { + "epoch": 4.040782613920718, + "grad_norm": 5.324098587036133, + "learning_rate": 0.00162215219955195, + "loss": 7.6277, + "step": 991900 + }, + { + "epoch": 4.0411899919440994, + "grad_norm": 5.577237129211426, + "learning_rate": 0.0016216882192891199, + "loss": 7.6204, + "step": 992000 + }, + { + "epoch": 4.0411899919440994, + "eval_MaskedAccuracy": 0.5092054552655934, + "eval_loss": 1.6071096658706665, + "eval_runtime": 154.4256, + "eval_samples_per_second": 411.046, + "eval_steps_per_second": 1.606, + "step": 992000 + }, + { + "epoch": 4.041597369967481, + "grad_norm": 3.5991404056549072, + "learning_rate": 0.0016212242739483292, + "loss": 7.5895, + "step": 992100 + }, + { + "epoch": 4.0420047479908625, + "grad_norm": 7.769160270690918, + "learning_rate": 0.0016207603635479207, + "loss": 7.6234, + "step": 992200 + }, + { + "epoch": 4.042412126014244, + "grad_norm": 4.4813361167907715, + "learning_rate": 0.0016202964881062338, + "loss": 7.6024, + "step": 992300 + }, + { + "epoch": 4.042819504037626, + "grad_norm": 4.528625011444092, + "learning_rate": 0.001619832647641608, + "loss": 7.6168, + "step": 992400 + }, + { + "epoch": 4.043226882061007, + "grad_norm": 5.218510150909424, + "learning_rate": 0.0016193688421723832, + "loss": 7.5872, + "step": 992500 + }, + { + "epoch": 4.043634260084389, + "grad_norm": 6.234872341156006, + "learning_rate": 0.0016189050717168917, + "loss": 7.6172, + "step": 992600 + }, + { + "epoch": 4.04404163810777, + "grad_norm": 5.389354228973389, + "learning_rate": 0.0016184413362934737, + "loss": 7.6093, + "step": 992700 + }, + { + "epoch": 4.044449016131152, + "grad_norm": 8.0420560836792, + "learning_rate": 0.0016179776359204606, + "loss": 7.626, + "step": 992800 + }, + { + "epoch": 4.044856394154532, + "grad_norm": 4.45302152633667, + "learning_rate": 0.0016175139706161878, + "loss": 7.6349, + "step": 992900 + }, + { + "epoch": 4.045263772177914, + "grad_norm": 5.611714839935303, + "learning_rate": 0.0016170503403989834, + "loss": 7.6196, + "step": 993000 + }, + { + "epoch": 4.045263772177914, + "eval_MaskedAccuracy": 0.510505442108171, + "eval_loss": 1.6032553911209106, + "eval_runtime": 152.13, + "eval_samples_per_second": 417.248, + "eval_steps_per_second": 1.63, + "step": 993000 + }, + { + "epoch": 4.045671150201295, + "grad_norm": 6.103917598724365, + "learning_rate": 0.0016165867452871774, + "loss": 7.6103, + "step": 993100 + }, + { + "epoch": 4.046078528224677, + "grad_norm": 6.502655506134033, + "learning_rate": 0.0016161231852990993, + "loss": 7.5727, + "step": 993200 + }, + { + "epoch": 4.0464859062480585, + "grad_norm": 3.6164493560791016, + "learning_rate": 0.0016156596604530759, + "loss": 7.6015, + "step": 993300 + }, + { + "epoch": 4.04689328427144, + "grad_norm": 12.996712684631348, + "learning_rate": 0.0016151961707674323, + "loss": 7.6171, + "step": 993400 + }, + { + "epoch": 4.0473006622948215, + "grad_norm": 9.091630935668945, + "learning_rate": 0.0016147327162604945, + "loss": 7.5743, + "step": 993500 + }, + { + "epoch": 4.047708040318203, + "grad_norm": 5.92149543762207, + "learning_rate": 0.0016142692969505815, + "loss": 7.6215, + "step": 993600 + }, + { + "epoch": 4.048115418341585, + "grad_norm": 2.944608211517334, + "learning_rate": 0.001613805912856018, + "loss": 7.6067, + "step": 993700 + }, + { + "epoch": 4.048522796364966, + "grad_norm": 5.606834411621094, + "learning_rate": 0.0016133425639951239, + "loss": 7.6413, + "step": 993800 + }, + { + "epoch": 4.048930174388348, + "grad_norm": 4.560391902923584, + "learning_rate": 0.0016128792503862168, + "loss": 7.619, + "step": 993900 + }, + { + "epoch": 4.049337552411729, + "grad_norm": 8.38029670715332, + "learning_rate": 0.0016124159720476146, + "loss": 7.6077, + "step": 994000 + }, + { + "epoch": 4.049337552411729, + "eval_MaskedAccuracy": 0.5097825120515572, + "eval_loss": 1.6010316610336304, + "eval_runtime": 154.5815, + "eval_samples_per_second": 410.631, + "eval_steps_per_second": 1.604, + "step": 994000 + }, + { + "epoch": 4.04974493043511, + "grad_norm": 6.94528865814209, + "learning_rate": 0.0016119527289976318, + "loss": 7.6216, + "step": 994100 + }, + { + "epoch": 4.050152308458491, + "grad_norm": 7.764667510986328, + "learning_rate": 0.0016114895212545849, + "loss": 7.6427, + "step": 994200 + }, + { + "epoch": 4.050559686481873, + "grad_norm": 5.185911655426025, + "learning_rate": 0.0016110263488367858, + "loss": 7.6261, + "step": 994300 + }, + { + "epoch": 4.0509670645052545, + "grad_norm": 10.981317520141602, + "learning_rate": 0.001610563211762547, + "loss": 7.6634, + "step": 994400 + }, + { + "epoch": 4.051374442528636, + "grad_norm": 6.662348747253418, + "learning_rate": 0.0016101001100501785, + "loss": 7.6104, + "step": 994500 + }, + { + "epoch": 4.0517818205520175, + "grad_norm": 8.90304183959961, + "learning_rate": 0.0016096370437179881, + "loss": 7.6094, + "step": 994600 + }, + { + "epoch": 4.052189198575399, + "grad_norm": 3.5736660957336426, + "learning_rate": 0.0016091740127842846, + "loss": 7.62, + "step": 994700 + }, + { + "epoch": 4.052596576598781, + "grad_norm": 6.343061447143555, + "learning_rate": 0.001608711017267372, + "loss": 7.62, + "step": 994800 + }, + { + "epoch": 4.053003954622162, + "grad_norm": 3.335782051086426, + "learning_rate": 0.0016082480571855582, + "loss": 7.6294, + "step": 994900 + }, + { + "epoch": 4.053411332645544, + "grad_norm": 8.102592468261719, + "learning_rate": 0.001607785132557147, + "loss": 7.5968, + "step": 995000 + }, + { + "epoch": 4.053411332645544, + "eval_MaskedAccuracy": 0.5096118635349681, + "eval_loss": 1.6051462888717651, + "eval_runtime": 184.6131, + "eval_samples_per_second": 343.833, + "eval_steps_per_second": 1.343, + "step": 995000 + }, + { + "epoch": 4.053818710668925, + "grad_norm": 4.340451240539551, + "learning_rate": 0.0016073222434004341, + "loss": 7.6314, + "step": 995100 + }, + { + "epoch": 4.054226088692307, + "grad_norm": 6.552633285522461, + "learning_rate": 0.001606859389733726, + "loss": 7.6269, + "step": 995200 + }, + { + "epoch": 4.054633466715688, + "grad_norm": 6.394277572631836, + "learning_rate": 0.0016063965715753207, + "loss": 7.629, + "step": 995300 + }, + { + "epoch": 4.055040844739069, + "grad_norm": 3.224449634552002, + "learning_rate": 0.0016059337889435129, + "loss": 7.6144, + "step": 995400 + }, + { + "epoch": 4.05544822276245, + "grad_norm": 6.690535545349121, + "learning_rate": 0.0016054710418566002, + "loss": 7.5875, + "step": 995500 + }, + { + "epoch": 4.055855600785832, + "grad_norm": 10.263288497924805, + "learning_rate": 0.0016050083303328787, + "loss": 7.6265, + "step": 995600 + }, + { + "epoch": 4.0562629788092135, + "grad_norm": 7.773268699645996, + "learning_rate": 0.0016045456543906432, + "loss": 7.6158, + "step": 995700 + }, + { + "epoch": 4.056670356832595, + "grad_norm": 4.887666702270508, + "learning_rate": 0.0016040830140481835, + "loss": 7.594, + "step": 995800 + }, + { + "epoch": 4.057077734855977, + "grad_norm": 4.525401592254639, + "learning_rate": 0.0016036204093237904, + "loss": 7.6172, + "step": 995900 + }, + { + "epoch": 4.057485112879358, + "grad_norm": 4.199537754058838, + "learning_rate": 0.0016031578402357536, + "loss": 7.6219, + "step": 996000 + }, + { + "epoch": 4.057485112879358, + "eval_MaskedAccuracy": 0.5098717340069447, + "eval_loss": 1.6112149953842163, + "eval_runtime": 152.5527, + "eval_samples_per_second": 416.092, + "eval_steps_per_second": 1.626, + "step": 996000 + }, + { + "epoch": 4.05789249090274, + "grad_norm": 4.9359002113342285, + "learning_rate": 0.001602695306802361, + "loss": 7.6123, + "step": 996100 + }, + { + "epoch": 4.058299868926121, + "grad_norm": 2.5359299182891846, + "learning_rate": 0.0016022328090418978, + "loss": 7.6179, + "step": 996200 + }, + { + "epoch": 4.058707246949503, + "grad_norm": 4.1552581787109375, + "learning_rate": 0.0016017703469726487, + "loss": 7.6299, + "step": 996300 + }, + { + "epoch": 4.059114624972884, + "grad_norm": 4.035418510437012, + "learning_rate": 0.001601307920612902, + "loss": 7.6203, + "step": 996400 + }, + { + "epoch": 4.059522002996266, + "grad_norm": 6.833289623260498, + "learning_rate": 0.0016008455299809361, + "loss": 7.5886, + "step": 996500 + }, + { + "epoch": 4.059929381019646, + "grad_norm": 4.115273952484131, + "learning_rate": 0.0016003831750950293, + "loss": 7.6369, + "step": 996600 + }, + { + "epoch": 4.060336759043028, + "grad_norm": 5.999115943908691, + "learning_rate": 0.0015999208559734658, + "loss": 7.6249, + "step": 996700 + }, + { + "epoch": 4.0607441370664095, + "grad_norm": 7.40685510635376, + "learning_rate": 0.0015994585726345207, + "loss": 7.6339, + "step": 996800 + }, + { + "epoch": 4.061151515089791, + "grad_norm": 6.59105110168457, + "learning_rate": 0.0015989963250964715, + "loss": 7.6084, + "step": 996900 + }, + { + "epoch": 4.0615588931131725, + "grad_norm": 4.839681148529053, + "learning_rate": 0.001598534113377593, + "loss": 7.6405, + "step": 997000 + }, + { + "epoch": 4.0615588931131725, + "eval_MaskedAccuracy": 0.5093082663863335, + "eval_loss": 1.6117303371429443, + "eval_runtime": 156.9476, + "eval_samples_per_second": 404.441, + "eval_steps_per_second": 1.58, + "step": 997000 + }, + { + "epoch": 4.061966271136554, + "grad_norm": 3.898164987564087, + "learning_rate": 0.0015980719374961607, + "loss": 7.6197, + "step": 997100 + }, + { + "epoch": 4.062373649159936, + "grad_norm": 11.058553695678711, + "learning_rate": 0.001597609797470446, + "loss": 7.6005, + "step": 997200 + }, + { + "epoch": 4.062781027183317, + "grad_norm": 3.065469741821289, + "learning_rate": 0.0015971476933187194, + "loss": 7.6122, + "step": 997300 + }, + { + "epoch": 4.063188405206699, + "grad_norm": 9.281084060668945, + "learning_rate": 0.0015966856250592492, + "loss": 7.6556, + "step": 997400 + }, + { + "epoch": 4.06359578323008, + "grad_norm": 7.05238676071167, + "learning_rate": 0.0015962235927103024, + "loss": 7.6558, + "step": 997500 + }, + { + "epoch": 4.064003161253462, + "grad_norm": 4.852511882781982, + "learning_rate": 0.0015957615962901494, + "loss": 7.6202, + "step": 997600 + }, + { + "epoch": 4.064410539276843, + "grad_norm": 6.081695556640625, + "learning_rate": 0.0015952996358170536, + "loss": 7.6344, + "step": 997700 + }, + { + "epoch": 4.064817917300225, + "grad_norm": 12.288058280944824, + "learning_rate": 0.001594837711309277, + "loss": 7.6243, + "step": 997800 + }, + { + "epoch": 4.065225295323605, + "grad_norm": 3.905149221420288, + "learning_rate": 0.0015943758227850845, + "loss": 7.5941, + "step": 997900 + }, + { + "epoch": 4.065632673346987, + "grad_norm": 7.204836845397949, + "learning_rate": 0.0015939139702627356, + "loss": 7.6221, + "step": 998000 + }, + { + "epoch": 4.065632673346987, + "eval_MaskedAccuracy": 0.5095109931374482, + "eval_loss": 1.6079601049423218, + "eval_runtime": 211.6343, + "eval_samples_per_second": 299.932, + "eval_steps_per_second": 1.172, + "step": 998000 + }, + { + "epoch": 4.0660400513703685, + "grad_norm": 7.875951766967773, + "learning_rate": 0.0015934521537604896, + "loss": 7.6319, + "step": 998100 + }, + { + "epoch": 4.06644742939375, + "grad_norm": 9.63471794128418, + "learning_rate": 0.0015929903732966062, + "loss": 7.6374, + "step": 998200 + }, + { + "epoch": 4.066854807417132, + "grad_norm": 9.316731452941895, + "learning_rate": 0.0015925286288893405, + "loss": 7.6151, + "step": 998300 + }, + { + "epoch": 4.067262185440513, + "grad_norm": 6.268803119659424, + "learning_rate": 0.001592066920556949, + "loss": 7.646, + "step": 998400 + }, + { + "epoch": 4.067669563463895, + "grad_norm": 5.5083770751953125, + "learning_rate": 0.0015916052483176837, + "loss": 7.5851, + "step": 998500 + }, + { + "epoch": 4.068076941487276, + "grad_norm": 4.724705219268799, + "learning_rate": 0.001591143612189798, + "loss": 7.6039, + "step": 998600 + }, + { + "epoch": 4.068484319510658, + "grad_norm": 7.567147254943848, + "learning_rate": 0.0015906820121915446, + "loss": 7.6138, + "step": 998700 + }, + { + "epoch": 4.068891697534039, + "grad_norm": 6.970294952392578, + "learning_rate": 0.0015902204483411696, + "loss": 7.5879, + "step": 998800 + }, + { + "epoch": 4.069299075557421, + "grad_norm": 4.607194900512695, + "learning_rate": 0.001589758920656924, + "loss": 7.5753, + "step": 998900 + }, + { + "epoch": 4.069706453580802, + "grad_norm": 16.934329986572266, + "learning_rate": 0.0015892974291570534, + "loss": 7.617, + "step": 999000 + }, + { + "epoch": 4.069706453580802, + "eval_MaskedAccuracy": 0.5097562622975296, + "eval_loss": 1.5989048480987549, + "eval_runtime": 160.4169, + "eval_samples_per_second": 395.694, + "eval_steps_per_second": 1.546, + "step": 999000 + }, + { + "epoch": 4.070113831604183, + "grad_norm": 5.921639442443848, + "learning_rate": 0.0015888359738598025, + "loss": 7.6129, + "step": 999100 + }, + { + "epoch": 4.0705212096275645, + "grad_norm": 10.488616943359375, + "learning_rate": 0.0015883745547834159, + "loss": 7.5983, + "step": 999200 + }, + { + "epoch": 4.070928587650946, + "grad_norm": 8.565415382385254, + "learning_rate": 0.0015879131719461353, + "loss": 7.6103, + "step": 999300 + }, + { + "epoch": 4.0713359656743275, + "grad_norm": 5.272050857543945, + "learning_rate": 0.0015874518253662024, + "loss": 7.6472, + "step": 999400 + }, + { + "epoch": 4.071743343697709, + "grad_norm": 6.496955871582031, + "learning_rate": 0.0015869905150618555, + "loss": 7.5782, + "step": 999500 + }, + { + "epoch": 4.072150721721091, + "grad_norm": 10.269801139831543, + "learning_rate": 0.0015865292410513344, + "loss": 7.5903, + "step": 999600 + }, + { + "epoch": 4.072558099744472, + "grad_norm": 7.985873222351074, + "learning_rate": 0.001586068003352875, + "loss": 7.6196, + "step": 999700 + }, + { + "epoch": 4.072965477767854, + "grad_norm": 3.554377794265747, + "learning_rate": 0.001585606801984711, + "loss": 7.6271, + "step": 999800 + }, + { + "epoch": 4.073372855791235, + "grad_norm": 6.6276140213012695, + "learning_rate": 0.0015851456369650782, + "loss": 7.6388, + "step": 999900 + }, + { + "epoch": 4.073780233814617, + "grad_norm": 4.672713756561279, + "learning_rate": 0.0015846845083122068, + "loss": 7.6125, + "step": 1000000 + }, + { + "epoch": 4.073780233814617, + "eval_MaskedAccuracy": 0.5093898608426418, + "eval_loss": 1.595578908920288, + "eval_runtime": 163.6962, + "eval_samples_per_second": 387.767, + "eval_steps_per_second": 1.515, + "step": 1000000 + }, + { + "epoch": 4.074187611837998, + "grad_norm": 5.099730968475342, + "learning_rate": 0.0015842234160443301, + "loss": 7.635, + "step": 1000100 + }, + { + "epoch": 4.07459498986138, + "grad_norm": 3.789768695831299, + "learning_rate": 0.0015837623601796787, + "loss": 7.6247, + "step": 1000200 + }, + { + "epoch": 4.075002367884761, + "grad_norm": 4.410281181335449, + "learning_rate": 0.001583301340736476, + "loss": 7.6248, + "step": 1000300 + }, + { + "epoch": 4.075409745908142, + "grad_norm": 11.91579532623291, + "learning_rate": 0.001582840357732951, + "loss": 7.6268, + "step": 1000400 + }, + { + "epoch": 4.0758171239315235, + "grad_norm": 5.041029930114746, + "learning_rate": 0.0015823794111873269, + "loss": 7.6216, + "step": 1000500 + }, + { + "epoch": 4.076224501954905, + "grad_norm": 3.9867160320281982, + "learning_rate": 0.0015819185011178296, + "loss": 7.6004, + "step": 1000600 + }, + { + "epoch": 4.076631879978287, + "grad_norm": 6.8187150955200195, + "learning_rate": 0.001581457627542683, + "loss": 7.639, + "step": 1000700 + }, + { + "epoch": 4.077039258001668, + "grad_norm": 13.169952392578125, + "learning_rate": 0.0015809967904801055, + "loss": 7.5955, + "step": 1000800 + }, + { + "epoch": 4.07744663602505, + "grad_norm": 15.388030052185059, + "learning_rate": 0.0015805359899483179, + "loss": 7.6198, + "step": 1000900 + }, + { + "epoch": 4.077854014048431, + "grad_norm": 9.966484069824219, + "learning_rate": 0.0015800752259655358, + "loss": 7.6413, + "step": 1001000 + }, + { + "epoch": 4.077854014048431, + "eval_MaskedAccuracy": 0.5095671997960911, + "eval_loss": 1.600243091583252, + "eval_runtime": 159.2321, + "eval_samples_per_second": 398.638, + "eval_steps_per_second": 1.557, + "step": 1001000 + }, + { + "epoch": 4.078261392071813, + "grad_norm": 3.6168150901794434, + "learning_rate": 0.0015796144985499767, + "loss": 7.6015, + "step": 1001100 + }, + { + "epoch": 4.078668770095194, + "grad_norm": 3.234494209289551, + "learning_rate": 0.0015791538077198546, + "loss": 7.5954, + "step": 1001200 + }, + { + "epoch": 4.079076148118576, + "grad_norm": 8.642640113830566, + "learning_rate": 0.0015786931534933872, + "loss": 7.6037, + "step": 1001300 + }, + { + "epoch": 4.079483526141957, + "grad_norm": 4.173941135406494, + "learning_rate": 0.001578232535888782, + "loss": 7.6303, + "step": 1001400 + }, + { + "epoch": 4.079890904165339, + "grad_norm": 4.0859785079956055, + "learning_rate": 0.00157777195492425, + "loss": 7.627, + "step": 1001500 + }, + { + "epoch": 4.0802982821887195, + "grad_norm": 7.465732574462891, + "learning_rate": 0.001577311410618002, + "loss": 7.6308, + "step": 1001600 + }, + { + "epoch": 4.080705660212101, + "grad_norm": 9.598640441894531, + "learning_rate": 0.0015768509029882465, + "loss": 7.5924, + "step": 1001700 + }, + { + "epoch": 4.0811130382354825, + "grad_norm": 8.930276870727539, + "learning_rate": 0.0015763904320531894, + "loss": 7.6109, + "step": 1001800 + }, + { + "epoch": 4.081520416258864, + "grad_norm": 5.911867618560791, + "learning_rate": 0.0015759299978310352, + "loss": 7.6128, + "step": 1001900 + }, + { + "epoch": 4.081927794282246, + "grad_norm": 3.2866337299346924, + "learning_rate": 0.001575469600339987, + "loss": 7.6519, + "step": 1002000 + }, + { + "epoch": 4.081927794282246, + "eval_MaskedAccuracy": 0.5103328025454996, + "eval_loss": 1.590938687324524, + "eval_runtime": 160.5887, + "eval_samples_per_second": 395.271, + "eval_steps_per_second": 1.544, + "step": 1002000 + }, + { + "epoch": 4.082335172305627, + "grad_norm": 6.869104862213135, + "learning_rate": 0.0015750092395982473, + "loss": 7.6288, + "step": 1002100 + }, + { + "epoch": 4.082742550329009, + "grad_norm": 7.4071831703186035, + "learning_rate": 0.0015745489156240152, + "loss": 7.5954, + "step": 1002200 + }, + { + "epoch": 4.08314992835239, + "grad_norm": 5.9728899002075195, + "learning_rate": 0.0015740886284354913, + "loss": 7.6114, + "step": 1002300 + }, + { + "epoch": 4.083557306375772, + "grad_norm": 9.210744857788086, + "learning_rate": 0.001573628378050873, + "loss": 7.643, + "step": 1002400 + }, + { + "epoch": 4.083964684399153, + "grad_norm": 4.123696327209473, + "learning_rate": 0.0015731681644883544, + "loss": 7.6331, + "step": 1002500 + }, + { + "epoch": 4.084372062422535, + "grad_norm": 4.110208034515381, + "learning_rate": 0.0015727079877661328, + "loss": 7.5809, + "step": 1002600 + }, + { + "epoch": 4.084779440445916, + "grad_norm": 6.685558795928955, + "learning_rate": 0.001572247847902402, + "loss": 7.6414, + "step": 1002700 + }, + { + "epoch": 4.085186818469298, + "grad_norm": 9.374321937561035, + "learning_rate": 0.0015717877449153528, + "loss": 7.5948, + "step": 1002800 + }, + { + "epoch": 4.0855941964926785, + "grad_norm": 7.754372596740723, + "learning_rate": 0.0015713276788231736, + "loss": 7.5892, + "step": 1002900 + }, + { + "epoch": 4.08600157451606, + "grad_norm": 3.866623878479004, + "learning_rate": 0.0015708676496440563, + "loss": 7.6056, + "step": 1003000 + }, + { + "epoch": 4.08600157451606, + "eval_MaskedAccuracy": 0.510134365929565, + "eval_loss": 1.6037026643753052, + "eval_runtime": 168.95, + "eval_samples_per_second": 375.709, + "eval_steps_per_second": 1.468, + "step": 1003000 + }, + { + "epoch": 4.086408952539442, + "grad_norm": 3.749837875366211, + "learning_rate": 0.0015704076573961844, + "loss": 7.5931, + "step": 1003100 + }, + { + "epoch": 4.086816330562823, + "grad_norm": 4.7531256675720215, + "learning_rate": 0.001569947702097749, + "loss": 7.5845, + "step": 1003200 + }, + { + "epoch": 4.087223708586205, + "grad_norm": 7.454687118530273, + "learning_rate": 0.0015694877837669316, + "loss": 7.6232, + "step": 1003300 + }, + { + "epoch": 4.087631086609586, + "grad_norm": 4.9076433181762695, + "learning_rate": 0.001569027902421916, + "loss": 7.5976, + "step": 1003400 + }, + { + "epoch": 4.088038464632968, + "grad_norm": 7.2205891609191895, + "learning_rate": 0.0015685680580808828, + "loss": 7.5995, + "step": 1003500 + }, + { + "epoch": 4.088445842656349, + "grad_norm": 10.62397575378418, + "learning_rate": 0.0015681082507620124, + "loss": 7.62, + "step": 1003600 + }, + { + "epoch": 4.088853220679731, + "grad_norm": 5.233545780181885, + "learning_rate": 0.0015676484804834838, + "loss": 7.6229, + "step": 1003700 + }, + { + "epoch": 4.089260598703112, + "grad_norm": 3.8612184524536133, + "learning_rate": 0.0015671887472634735, + "loss": 7.613, + "step": 1003800 + }, + { + "epoch": 4.089667976726494, + "grad_norm": 6.494972229003906, + "learning_rate": 0.0015667290511201565, + "loss": 7.6151, + "step": 1003900 + }, + { + "epoch": 4.090075354749875, + "grad_norm": 7.1356892585754395, + "learning_rate": 0.0015662693920717105, + "loss": 7.5971, + "step": 1004000 + }, + { + "epoch": 4.090075354749875, + "eval_MaskedAccuracy": 0.5094093150471627, + "eval_loss": 1.6064566373825073, + "eval_runtime": 154.5638, + "eval_samples_per_second": 410.678, + "eval_steps_per_second": 1.605, + "step": 1004000 + }, + { + "epoch": 4.090482732773256, + "grad_norm": 6.189616680145264, + "learning_rate": 0.0015658097701363061, + "loss": 7.6011, + "step": 1004100 + }, + { + "epoch": 4.090890110796638, + "grad_norm": 6.372153282165527, + "learning_rate": 0.0015653501853321145, + "loss": 7.577, + "step": 1004200 + }, + { + "epoch": 4.091297488820019, + "grad_norm": 9.208247184753418, + "learning_rate": 0.001564890637677305, + "loss": 7.6186, + "step": 1004300 + }, + { + "epoch": 4.091704866843401, + "grad_norm": 6.365452766418457, + "learning_rate": 0.001564431127190048, + "loss": 7.5769, + "step": 1004400 + }, + { + "epoch": 4.092112244866782, + "grad_norm": 5.940836429595947, + "learning_rate": 0.0015639716538885069, + "loss": 7.5911, + "step": 1004500 + }, + { + "epoch": 4.092519622890164, + "grad_norm": 10.46280288696289, + "learning_rate": 0.001563512217790851, + "loss": 7.6008, + "step": 1004600 + }, + { + "epoch": 4.092927000913545, + "grad_norm": 3.738527297973633, + "learning_rate": 0.0015630528189152428, + "loss": 7.5919, + "step": 1004700 + }, + { + "epoch": 4.093334378936927, + "grad_norm": 11.295626640319824, + "learning_rate": 0.001562593457279842, + "loss": 7.622, + "step": 1004800 + }, + { + "epoch": 4.093741756960308, + "grad_norm": 7.543964862823486, + "learning_rate": 0.0015621341329028135, + "loss": 7.6281, + "step": 1004900 + }, + { + "epoch": 4.09414913498369, + "grad_norm": 5.9705047607421875, + "learning_rate": 0.001561674845802315, + "loss": 7.629, + "step": 1005000 + }, + { + "epoch": 4.09414913498369, + "eval_MaskedAccuracy": 0.5098191979939412, + "eval_loss": 1.5994396209716797, + "eval_runtime": 150.1648, + "eval_samples_per_second": 422.709, + "eval_steps_per_second": 1.652, + "step": 1005000 + }, + { + "epoch": 4.094556513007071, + "grad_norm": 4.041198253631592, + "learning_rate": 0.0015612155959965055, + "loss": 7.5617, + "step": 1005100 + }, + { + "epoch": 4.094963891030453, + "grad_norm": 12.734151840209961, + "learning_rate": 0.001560756383503542, + "loss": 7.5798, + "step": 1005200 + }, + { + "epoch": 4.095371269053834, + "grad_norm": 10.603124618530273, + "learning_rate": 0.0015602972083415788, + "loss": 7.5945, + "step": 1005300 + }, + { + "epoch": 4.095778647077215, + "grad_norm": 8.641983985900879, + "learning_rate": 0.0015598380705287693, + "loss": 7.5958, + "step": 1005400 + }, + { + "epoch": 4.096186025100597, + "grad_norm": 3.963597536087036, + "learning_rate": 0.0015593789700832634, + "loss": 7.6038, + "step": 1005500 + }, + { + "epoch": 4.096593403123978, + "grad_norm": 10.408990859985352, + "learning_rate": 0.0015589199070232171, + "loss": 7.5924, + "step": 1005600 + }, + { + "epoch": 4.09700078114736, + "grad_norm": 9.836311340332031, + "learning_rate": 0.0015584608813667773, + "loss": 7.6157, + "step": 1005700 + }, + { + "epoch": 4.097408159170741, + "grad_norm": 6.25810432434082, + "learning_rate": 0.00155800189313209, + "loss": 7.5978, + "step": 1005800 + }, + { + "epoch": 4.097815537194123, + "grad_norm": 12.14841365814209, + "learning_rate": 0.0015575429423373042, + "loss": 7.5971, + "step": 1005900 + }, + { + "epoch": 4.098222915217504, + "grad_norm": 9.926202774047852, + "learning_rate": 0.0015570840290005628, + "loss": 7.6078, + "step": 1006000 + }, + { + "epoch": 4.098222915217504, + "eval_MaskedAccuracy": 0.5101178245561072, + "eval_loss": 1.602278232574463, + "eval_runtime": 151.9913, + "eval_samples_per_second": 417.629, + "eval_steps_per_second": 1.632, + "step": 1006000 + }, + { + "epoch": 4.098630293240886, + "grad_norm": 5.3686442375183105, + "learning_rate": 0.001556625153140009, + "loss": 7.5883, + "step": 1006100 + }, + { + "epoch": 4.099037671264267, + "grad_norm": 9.6718111038208, + "learning_rate": 0.0015561663147737857, + "loss": 7.5826, + "step": 1006200 + }, + { + "epoch": 4.099445049287649, + "grad_norm": 4.49713134765625, + "learning_rate": 0.001555707513920033, + "loss": 7.619, + "step": 1006300 + }, + { + "epoch": 4.09985242731103, + "grad_norm": 6.915175437927246, + "learning_rate": 0.0015552487505968878, + "loss": 7.6089, + "step": 1006400 + }, + { + "epoch": 4.100259805334412, + "grad_norm": 12.97745132446289, + "learning_rate": 0.0015547900248224895, + "loss": 7.6195, + "step": 1006500 + }, + { + "epoch": 4.100667183357793, + "grad_norm": 6.967992782592773, + "learning_rate": 0.0015543313366149737, + "loss": 7.5922, + "step": 1006600 + }, + { + "epoch": 4.101074561381174, + "grad_norm": 3.911487340927124, + "learning_rate": 0.001553872685992476, + "loss": 7.5906, + "step": 1006700 + }, + { + "epoch": 4.101481939404556, + "grad_norm": 8.793313026428223, + "learning_rate": 0.0015534140729731266, + "loss": 7.6096, + "step": 1006800 + }, + { + "epoch": 4.101889317427937, + "grad_norm": 7.74241828918457, + "learning_rate": 0.0015529554975750592, + "loss": 7.5587, + "step": 1006900 + }, + { + "epoch": 4.102296695451319, + "grad_norm": 11.245430946350098, + "learning_rate": 0.0015524969598164034, + "loss": 7.6231, + "step": 1007000 + }, + { + "epoch": 4.102296695451319, + "eval_MaskedAccuracy": 0.509991830061726, + "eval_loss": 1.5961247682571411, + "eval_runtime": 157.9598, + "eval_samples_per_second": 401.849, + "eval_steps_per_second": 1.57, + "step": 1007000 + }, + { + "epoch": 4.1027040734747, + "grad_norm": 5.222518444061279, + "learning_rate": 0.0015520384597152866, + "loss": 7.5836, + "step": 1007100 + }, + { + "epoch": 4.103111451498082, + "grad_norm": 7.076982498168945, + "learning_rate": 0.0015515799972898351, + "loss": 7.5813, + "step": 1007200 + }, + { + "epoch": 4.103518829521463, + "grad_norm": 4.8812947273254395, + "learning_rate": 0.0015511215725581766, + "loss": 7.5946, + "step": 1007300 + }, + { + "epoch": 4.103926207544845, + "grad_norm": 5.624736309051514, + "learning_rate": 0.0015506631855384337, + "loss": 7.6165, + "step": 1007400 + }, + { + "epoch": 4.104333585568226, + "grad_norm": 6.506237983703613, + "learning_rate": 0.001550204836248731, + "loss": 7.6067, + "step": 1007500 + }, + { + "epoch": 4.104740963591608, + "grad_norm": 4.0211896896362305, + "learning_rate": 0.0015497465247071868, + "loss": 7.5996, + "step": 1007600 + }, + { + "epoch": 4.105148341614989, + "grad_norm": 5.129552364349365, + "learning_rate": 0.00154928825093192, + "loss": 7.5939, + "step": 1007700 + }, + { + "epoch": 4.105555719638371, + "grad_norm": 5.615102291107178, + "learning_rate": 0.00154883001494105, + "loss": 7.5863, + "step": 1007800 + }, + { + "epoch": 4.105963097661752, + "grad_norm": 3.3849809169769287, + "learning_rate": 0.0015483718167526948, + "loss": 7.623, + "step": 1007900 + }, + { + "epoch": 4.106370475685133, + "grad_norm": 4.826959609985352, + "learning_rate": 0.001547913656384966, + "loss": 7.6, + "step": 1008000 + }, + { + "epoch": 4.106370475685133, + "eval_MaskedAccuracy": 0.5105324922947476, + "eval_loss": 1.59247887134552, + "eval_runtime": 169.9465, + "eval_samples_per_second": 373.506, + "eval_steps_per_second": 1.459, + "step": 1008000 + }, + { + "epoch": 4.106777853708515, + "grad_norm": 7.931018352508545, + "learning_rate": 0.0015474555338559819, + "loss": 7.6056, + "step": 1008100 + }, + { + "epoch": 4.107185231731896, + "grad_norm": 3.6159827709198, + "learning_rate": 0.0015469974491838507, + "loss": 7.6173, + "step": 1008200 + }, + { + "epoch": 4.107592609755278, + "grad_norm": 4.925380229949951, + "learning_rate": 0.001546539402386685, + "loss": 7.6099, + "step": 1008300 + }, + { + "epoch": 4.107999987778659, + "grad_norm": 6.228870868682861, + "learning_rate": 0.001546081393482591, + "loss": 7.6284, + "step": 1008400 + }, + { + "epoch": 4.108407365802041, + "grad_norm": 4.068419933319092, + "learning_rate": 0.0015456234224896779, + "loss": 7.586, + "step": 1008500 + }, + { + "epoch": 4.108814743825422, + "grad_norm": 5.585260391235352, + "learning_rate": 0.0015451654894260538, + "loss": 7.5905, + "step": 1008600 + }, + { + "epoch": 4.109222121848804, + "grad_norm": 9.680155754089355, + "learning_rate": 0.001544707594309821, + "loss": 7.6205, + "step": 1008700 + }, + { + "epoch": 4.109629499872185, + "grad_norm": 6.640585422515869, + "learning_rate": 0.0015442497371590835, + "loss": 7.597, + "step": 1008800 + }, + { + "epoch": 4.110036877895567, + "grad_norm": 6.251063823699951, + "learning_rate": 0.0015437919179919425, + "loss": 7.625, + "step": 1008900 + }, + { + "epoch": 4.1104442559189485, + "grad_norm": 6.639671802520752, + "learning_rate": 0.0015433341368264976, + "loss": 7.6054, + "step": 1009000 + }, + { + "epoch": 4.1104442559189485, + "eval_MaskedAccuracy": 0.5102449508291053, + "eval_loss": 1.5968902111053467, + "eval_runtime": 151.4579, + "eval_samples_per_second": 419.1, + "eval_steps_per_second": 1.637, + "step": 1009000 + }, + { + "epoch": 4.110851633942329, + "grad_norm": 5.833513259887695, + "learning_rate": 0.0015428763936808465, + "loss": 7.5765, + "step": 1009100 + }, + { + "epoch": 4.111259011965711, + "grad_norm": 13.104619026184082, + "learning_rate": 0.0015424186885730888, + "loss": 7.6393, + "step": 1009200 + }, + { + "epoch": 4.111666389989092, + "grad_norm": 8.669432640075684, + "learning_rate": 0.0015419610215213183, + "loss": 7.5906, + "step": 1009300 + }, + { + "epoch": 4.112073768012474, + "grad_norm": 6.3840227127075195, + "learning_rate": 0.001541503392543628, + "loss": 7.6109, + "step": 1009400 + }, + { + "epoch": 4.112481146035855, + "grad_norm": 9.165207862854004, + "learning_rate": 0.001541045801658114, + "loss": 7.6238, + "step": 1009500 + }, + { + "epoch": 4.112888524059237, + "grad_norm": 10.769502639770508, + "learning_rate": 0.001540588248882864, + "loss": 7.5711, + "step": 1009600 + }, + { + "epoch": 4.113295902082618, + "grad_norm": 4.284952640533447, + "learning_rate": 0.0015401307342359696, + "loss": 7.616, + "step": 1009700 + }, + { + "epoch": 4.113703280106, + "grad_norm": 8.703279495239258, + "learning_rate": 0.0015396732577355173, + "loss": 7.616, + "step": 1009800 + }, + { + "epoch": 4.114110658129381, + "grad_norm": 6.439189434051514, + "learning_rate": 0.0015392158193995938, + "loss": 7.6108, + "step": 1009900 + }, + { + "epoch": 4.114518036152763, + "grad_norm": 3.0160834789276123, + "learning_rate": 0.0015387584192462836, + "loss": 7.5934, + "step": 1010000 + }, + { + "epoch": 4.114518036152763, + "eval_MaskedAccuracy": 0.5099782953950945, + "eval_loss": 1.5968457460403442, + "eval_runtime": 152.8504, + "eval_samples_per_second": 415.282, + "eval_steps_per_second": 1.623, + "step": 1010000 + }, + { + "epoch": 4.114925414176144, + "grad_norm": 4.228946208953857, + "learning_rate": 0.001538301057293671, + "loss": 7.6321, + "step": 1010100 + }, + { + "epoch": 4.115332792199526, + "grad_norm": 4.3874616622924805, + "learning_rate": 0.0015378437335598373, + "loss": 7.616, + "step": 1010200 + }, + { + "epoch": 4.1157401702229075, + "grad_norm": 4.825639724731445, + "learning_rate": 0.0015373864480628648, + "loss": 7.6088, + "step": 1010300 + }, + { + "epoch": 4.116147548246288, + "grad_norm": 4.5141448974609375, + "learning_rate": 0.00153692920082083, + "loss": 7.6245, + "step": 1010400 + }, + { + "epoch": 4.11655492626967, + "grad_norm": 8.228910446166992, + "learning_rate": 0.0015364719918518114, + "loss": 7.5834, + "step": 1010500 + }, + { + "epoch": 4.116962304293051, + "grad_norm": 7.053731918334961, + "learning_rate": 0.001536014821173885, + "loss": 7.6027, + "step": 1010600 + }, + { + "epoch": 4.117369682316433, + "grad_norm": 6.305589199066162, + "learning_rate": 0.0015355576888051263, + "loss": 7.5999, + "step": 1010700 + }, + { + "epoch": 4.117777060339814, + "grad_norm": 6.593777656555176, + "learning_rate": 0.0015351005947636073, + "loss": 7.584, + "step": 1010800 + }, + { + "epoch": 4.118184438363196, + "grad_norm": 7.992525100708008, + "learning_rate": 0.0015346435390673988, + "loss": 7.5915, + "step": 1010900 + }, + { + "epoch": 4.118591816386577, + "grad_norm": 4.64131498336792, + "learning_rate": 0.0015341865217345708, + "loss": 7.596, + "step": 1011000 + }, + { + "epoch": 4.118591816386577, + "eval_MaskedAccuracy": 0.5103032010326884, + "eval_loss": 1.5907819271087646, + "eval_runtime": 152.8091, + "eval_samples_per_second": 415.394, + "eval_steps_per_second": 1.623, + "step": 1011000 + }, + { + "epoch": 4.118999194409959, + "grad_norm": 4.767360687255859, + "learning_rate": 0.0015337295427831914, + "loss": 7.6241, + "step": 1011100 + }, + { + "epoch": 4.11940657243334, + "grad_norm": 12.26150894165039, + "learning_rate": 0.0015332726022313271, + "loss": 7.6232, + "step": 1011200 + }, + { + "epoch": 4.119813950456722, + "grad_norm": 3.7885890007019043, + "learning_rate": 0.0015328157000970447, + "loss": 7.5975, + "step": 1011300 + }, + { + "epoch": 4.1202213284801035, + "grad_norm": 9.345982551574707, + "learning_rate": 0.0015323588363984075, + "loss": 7.6507, + "step": 1011400 + }, + { + "epoch": 4.120628706503485, + "grad_norm": 3.2871835231781006, + "learning_rate": 0.0015319020111534778, + "loss": 7.5993, + "step": 1011500 + }, + { + "epoch": 4.121036084526866, + "grad_norm": 12.509317398071289, + "learning_rate": 0.0015314452243803155, + "loss": 7.5736, + "step": 1011600 + }, + { + "epoch": 4.121443462550247, + "grad_norm": 10.545855522155762, + "learning_rate": 0.001530988476096982, + "loss": 7.6432, + "step": 1011700 + }, + { + "epoch": 4.121850840573629, + "grad_norm": 12.254087448120117, + "learning_rate": 0.001530531766321532, + "loss": 7.6374, + "step": 1011800 + }, + { + "epoch": 4.12225821859701, + "grad_norm": 7.642628192901611, + "learning_rate": 0.0015300750950720237, + "loss": 7.6145, + "step": 1011900 + }, + { + "epoch": 4.122665596620392, + "grad_norm": 8.72175121307373, + "learning_rate": 0.0015296184623665113, + "loss": 7.5868, + "step": 1012000 + }, + { + "epoch": 4.122665596620392, + "eval_MaskedAccuracy": 0.509399024333583, + "eval_loss": 1.596692681312561, + "eval_runtime": 160.3397, + "eval_samples_per_second": 395.884, + "eval_steps_per_second": 1.547, + "step": 1012000 + }, + { + "epoch": 4.123072974643773, + "grad_norm": 3.646857500076294, + "learning_rate": 0.0015291618682230473, + "loss": 7.609, + "step": 1012100 + }, + { + "epoch": 4.123480352667155, + "grad_norm": 8.230676651000977, + "learning_rate": 0.0015287053126596844, + "loss": 7.6032, + "step": 1012200 + }, + { + "epoch": 4.123887730690536, + "grad_norm": 8.112112045288086, + "learning_rate": 0.0015282487956944717, + "loss": 7.6091, + "step": 1012300 + }, + { + "epoch": 4.124295108713918, + "grad_norm": 8.373854637145996, + "learning_rate": 0.001527792317345457, + "loss": 7.5804, + "step": 1012400 + }, + { + "epoch": 4.1247024867372994, + "grad_norm": 4.3560943603515625, + "learning_rate": 0.0015273358776306905, + "loss": 7.6364, + "step": 1012500 + }, + { + "epoch": 4.125109864760681, + "grad_norm": 4.420809745788574, + "learning_rate": 0.001526879476568214, + "loss": 7.5761, + "step": 1012600 + }, + { + "epoch": 4.1255172427840625, + "grad_norm": 9.671296119689941, + "learning_rate": 0.0015264231141760753, + "loss": 7.6387, + "step": 1012700 + }, + { + "epoch": 4.125924620807444, + "grad_norm": 6.722683906555176, + "learning_rate": 0.0015259667904723136, + "loss": 7.6265, + "step": 1012800 + }, + { + "epoch": 4.126331998830825, + "grad_norm": 7.6011834144592285, + "learning_rate": 0.001525510505474971, + "loss": 7.6063, + "step": 1012900 + }, + { + "epoch": 4.126739376854206, + "grad_norm": 8.455955505371094, + "learning_rate": 0.0015250542592020857, + "loss": 7.6501, + "step": 1013000 + }, + { + "epoch": 4.126739376854206, + "eval_MaskedAccuracy": 0.5094283299674538, + "eval_loss": 1.5934301614761353, + "eval_runtime": 155.974, + "eval_samples_per_second": 406.965, + "eval_steps_per_second": 1.59, + "step": 1013000 + }, + { + "epoch": 4.127146754877588, + "grad_norm": 4.501232624053955, + "learning_rate": 0.0015245980516716974, + "loss": 7.6197, + "step": 1013100 + }, + { + "epoch": 4.127554132900969, + "grad_norm": 4.817880630493164, + "learning_rate": 0.0015241418829018414, + "loss": 7.6162, + "step": 1013200 + }, + { + "epoch": 4.127961510924351, + "grad_norm": 11.781376838684082, + "learning_rate": 0.0015236857529105532, + "loss": 7.6102, + "step": 1013300 + }, + { + "epoch": 4.128368888947732, + "grad_norm": 11.634780883789062, + "learning_rate": 0.001523229661715866, + "loss": 7.6006, + "step": 1013400 + }, + { + "epoch": 4.128776266971114, + "grad_norm": 4.63670539855957, + "learning_rate": 0.001522773609335811, + "loss": 7.5974, + "step": 1013500 + }, + { + "epoch": 4.129183644994495, + "grad_norm": 6.439024925231934, + "learning_rate": 0.0015223175957884171, + "loss": 7.5839, + "step": 1013600 + }, + { + "epoch": 4.129591023017877, + "grad_norm": 6.234420299530029, + "learning_rate": 0.0015218616210917163, + "loss": 7.6136, + "step": 1013700 + }, + { + "epoch": 4.1299984010412585, + "grad_norm": 4.24252462387085, + "learning_rate": 0.0015214056852637325, + "loss": 7.6078, + "step": 1013800 + }, + { + "epoch": 4.13040577906464, + "grad_norm": 12.226213455200195, + "learning_rate": 0.0015209497883224919, + "loss": 7.6127, + "step": 1013900 + }, + { + "epoch": 4.1308131570880215, + "grad_norm": 12.963217735290527, + "learning_rate": 0.0015204939302860202, + "loss": 7.5984, + "step": 1014000 + }, + { + "epoch": 4.1308131570880215, + "eval_MaskedAccuracy": 0.5100630276593748, + "eval_loss": 1.5979079008102417, + "eval_runtime": 151.547, + "eval_samples_per_second": 418.854, + "eval_steps_per_second": 1.636, + "step": 1014000 + }, + { + "epoch": 4.131220535111402, + "grad_norm": 3.741776704788208, + "learning_rate": 0.0015200381111723402, + "loss": 7.6189, + "step": 1014100 + }, + { + "epoch": 4.131627913134784, + "grad_norm": 5.306728363037109, + "learning_rate": 0.0015195823309994698, + "loss": 7.6562, + "step": 1014200 + }, + { + "epoch": 4.132035291158165, + "grad_norm": 3.6316115856170654, + "learning_rate": 0.0015191265897854314, + "loss": 7.6176, + "step": 1014300 + }, + { + "epoch": 4.132442669181547, + "grad_norm": 10.061392784118652, + "learning_rate": 0.001518670887548241, + "loss": 7.6344, + "step": 1014400 + }, + { + "epoch": 4.132850047204928, + "grad_norm": 4.140845775604248, + "learning_rate": 0.0015182152243059172, + "loss": 7.6132, + "step": 1014500 + }, + { + "epoch": 4.13325742522831, + "grad_norm": 7.131834983825684, + "learning_rate": 0.001517759600076473, + "loss": 7.6036, + "step": 1014600 + }, + { + "epoch": 4.133664803251691, + "grad_norm": 3.7401998043060303, + "learning_rate": 0.0015173040148779215, + "loss": 7.6173, + "step": 1014700 + }, + { + "epoch": 4.134072181275073, + "grad_norm": 6.527963638305664, + "learning_rate": 0.0015168484687282742, + "loss": 7.6034, + "step": 1014800 + }, + { + "epoch": 4.1344795592984545, + "grad_norm": 12.643240928649902, + "learning_rate": 0.0015163929616455426, + "loss": 7.6096, + "step": 1014900 + }, + { + "epoch": 4.134886937321836, + "grad_norm": 6.257790565490723, + "learning_rate": 0.0015159374936477344, + "loss": 7.6189, + "step": 1015000 + }, + { + "epoch": 4.134886937321836, + "eval_MaskedAccuracy": 0.5102536230156649, + "eval_loss": 1.6067798137664795, + "eval_runtime": 156.6268, + "eval_samples_per_second": 405.269, + "eval_steps_per_second": 1.583, + "step": 1015000 + }, + { + "epoch": 4.1352943153452175, + "grad_norm": 9.284547805786133, + "learning_rate": 0.001515482064752858, + "loss": 7.602, + "step": 1015100 + }, + { + "epoch": 4.135701693368599, + "grad_norm": 6.334458827972412, + "learning_rate": 0.0015150266749789161, + "loss": 7.5702, + "step": 1015200 + }, + { + "epoch": 4.13610907139198, + "grad_norm": 11.795574188232422, + "learning_rate": 0.0015145713243439152, + "loss": 7.5911, + "step": 1015300 + }, + { + "epoch": 4.136516449415361, + "grad_norm": 6.572207450866699, + "learning_rate": 0.0015141160128658575, + "loss": 7.5995, + "step": 1015400 + }, + { + "epoch": 4.136923827438743, + "grad_norm": 9.94412899017334, + "learning_rate": 0.001513660740562742, + "loss": 7.5867, + "step": 1015500 + }, + { + "epoch": 4.137331205462124, + "grad_norm": 15.848379135131836, + "learning_rate": 0.0015132055074525707, + "loss": 7.5937, + "step": 1015600 + }, + { + "epoch": 4.137738583485506, + "grad_norm": 8.111241340637207, + "learning_rate": 0.0015127503135533393, + "loss": 7.6214, + "step": 1015700 + }, + { + "epoch": 4.138145961508887, + "grad_norm": 8.875723838806152, + "learning_rate": 0.0015122951588830453, + "loss": 7.612, + "step": 1015800 + }, + { + "epoch": 4.138553339532269, + "grad_norm": 7.9057817459106445, + "learning_rate": 0.001511840043459683, + "loss": 7.6189, + "step": 1015900 + }, + { + "epoch": 4.13896071755565, + "grad_norm": 3.75134015083313, + "learning_rate": 0.001511384967301245, + "loss": 7.6028, + "step": 1016000 + }, + { + "epoch": 4.13896071755565, + "eval_MaskedAccuracy": 0.5100770322511757, + "eval_loss": 1.59774649143219, + "eval_runtime": 159.2981, + "eval_samples_per_second": 398.473, + "eval_steps_per_second": 1.557, + "step": 1016000 + }, + { + "epoch": 4.139368095579032, + "grad_norm": 4.936377048492432, + "learning_rate": 0.001510929930425725, + "loss": 7.5967, + "step": 1016100 + }, + { + "epoch": 4.1397754736024135, + "grad_norm": 6.553722858428955, + "learning_rate": 0.0015104749328511118, + "loss": 7.5972, + "step": 1016200 + }, + { + "epoch": 4.140182851625795, + "grad_norm": 8.784997940063477, + "learning_rate": 0.0015100199745953918, + "loss": 7.6006, + "step": 1016300 + }, + { + "epoch": 4.140590229649177, + "grad_norm": 4.709084987640381, + "learning_rate": 0.001509565055676554, + "loss": 7.5659, + "step": 1016400 + }, + { + "epoch": 4.140997607672558, + "grad_norm": 9.501605987548828, + "learning_rate": 0.0015091101761125843, + "loss": 7.6151, + "step": 1016500 + }, + { + "epoch": 4.141404985695939, + "grad_norm": 4.299465656280518, + "learning_rate": 0.0015086553359214647, + "loss": 7.6033, + "step": 1016600 + }, + { + "epoch": 4.14181236371932, + "grad_norm": 19.92583465576172, + "learning_rate": 0.0015082005351211793, + "loss": 7.6114, + "step": 1016700 + }, + { + "epoch": 4.142219741742702, + "grad_norm": 7.212285995483398, + "learning_rate": 0.001507745773729707, + "loss": 7.6489, + "step": 1016800 + }, + { + "epoch": 4.142627119766083, + "grad_norm": 5.368351936340332, + "learning_rate": 0.0015072910517650296, + "loss": 7.607, + "step": 1016900 + }, + { + "epoch": 4.143034497789465, + "grad_norm": 5.146826267242432, + "learning_rate": 0.0015068363692451218, + "loss": 7.6289, + "step": 1017000 + }, + { + "epoch": 4.143034497789465, + "eval_MaskedAccuracy": 0.5098016886272805, + "eval_loss": 1.6056424379348755, + "eval_runtime": 166.4361, + "eval_samples_per_second": 381.384, + "eval_steps_per_second": 1.49, + "step": 1017000 + }, + { + "epoch": 4.143441875812846, + "grad_norm": 4.130514144897461, + "learning_rate": 0.0015063817261879615, + "loss": 7.6085, + "step": 1017100 + }, + { + "epoch": 4.143849253836228, + "grad_norm": 13.537511825561523, + "learning_rate": 0.0015059271226115219, + "loss": 7.6193, + "step": 1017200 + }, + { + "epoch": 4.1442566318596095, + "grad_norm": 5.954915523529053, + "learning_rate": 0.001505472558533777, + "loss": 7.5905, + "step": 1017300 + }, + { + "epoch": 4.144664009882991, + "grad_norm": 6.077690601348877, + "learning_rate": 0.0015050180339726965, + "loss": 7.6382, + "step": 1017400 + }, + { + "epoch": 4.1450713879063725, + "grad_norm": 4.083377838134766, + "learning_rate": 0.0015045635489462526, + "loss": 7.626, + "step": 1017500 + }, + { + "epoch": 4.145478765929754, + "grad_norm": 6.283422470092773, + "learning_rate": 0.0015041091034724118, + "loss": 7.6278, + "step": 1017600 + }, + { + "epoch": 4.145886143953136, + "grad_norm": 3.2693064212799072, + "learning_rate": 0.0015036546975691425, + "loss": 7.6108, + "step": 1017700 + }, + { + "epoch": 4.146293521976517, + "grad_norm": 5.898975372314453, + "learning_rate": 0.0015032003312544081, + "loss": 7.6138, + "step": 1017800 + }, + { + "epoch": 4.146700899999898, + "grad_norm": 5.674659252166748, + "learning_rate": 0.0015027460045461706, + "loss": 7.6308, + "step": 1017900 + }, + { + "epoch": 4.147108278023279, + "grad_norm": 9.477431297302246, + "learning_rate": 0.0015022917174623966, + "loss": 7.6202, + "step": 1018000 + }, + { + "epoch": 4.147108278023279, + "eval_MaskedAccuracy": 0.5103109231301669, + "eval_loss": 1.5969358682632446, + "eval_runtime": 167.1107, + "eval_samples_per_second": 379.844, + "eval_steps_per_second": 1.484, + "step": 1018000 + }, + { + "epoch": 4.147515656046661, + "grad_norm": 4.549951076507568, + "learning_rate": 0.0015018374700210449, + "loss": 7.6322, + "step": 1018100 + }, + { + "epoch": 4.147923034070042, + "grad_norm": 2.744879961013794, + "learning_rate": 0.00150138326224007, + "loss": 7.6188, + "step": 1018200 + }, + { + "epoch": 4.148330412093424, + "grad_norm": 10.048369407653809, + "learning_rate": 0.001500929094137433, + "loss": 7.6332, + "step": 1018300 + }, + { + "epoch": 4.148737790116805, + "grad_norm": 5.693751811981201, + "learning_rate": 0.0015004749657310898, + "loss": 7.5786, + "step": 1018400 + }, + { + "epoch": 4.149145168140187, + "grad_norm": 6.913130760192871, + "learning_rate": 0.0015000208770389943, + "loss": 7.5854, + "step": 1018500 + }, + { + "epoch": 4.1495525461635685, + "grad_norm": 5.684288501739502, + "learning_rate": 0.0014995668280790985, + "loss": 7.5938, + "step": 1018600 + }, + { + "epoch": 4.14995992418695, + "grad_norm": 4.903898239135742, + "learning_rate": 0.0014991128188693535, + "loss": 7.6359, + "step": 1018700 + }, + { + "epoch": 4.150367302210332, + "grad_norm": 6.3219313621521, + "learning_rate": 0.0014986588494277082, + "loss": 7.5987, + "step": 1018800 + }, + { + "epoch": 4.150774680233713, + "grad_norm": 5.960380554199219, + "learning_rate": 0.001498204919772111, + "loss": 7.5887, + "step": 1018900 + }, + { + "epoch": 4.151182058257095, + "grad_norm": 4.532342433929443, + "learning_rate": 0.0014977510299205093, + "loss": 7.6135, + "step": 1019000 + }, + { + "epoch": 4.151182058257095, + "eval_MaskedAccuracy": 0.5101085505391345, + "eval_loss": 1.608246088027954, + "eval_runtime": 156.2264, + "eval_samples_per_second": 406.308, + "eval_steps_per_second": 1.587, + "step": 1019000 + }, + { + "epoch": 4.151589436280475, + "grad_norm": 6.990480422973633, + "learning_rate": 0.0014972971798908447, + "loss": 7.5806, + "step": 1019100 + }, + { + "epoch": 4.151996814303857, + "grad_norm": 5.496598243713379, + "learning_rate": 0.0014968433697010622, + "loss": 7.6284, + "step": 1019200 + }, + { + "epoch": 4.152404192327238, + "grad_norm": 8.429397583007812, + "learning_rate": 0.0014963895993691026, + "loss": 7.6237, + "step": 1019300 + }, + { + "epoch": 4.15281157035062, + "grad_norm": 2.5694942474365234, + "learning_rate": 0.0014959358689129074, + "loss": 7.6191, + "step": 1019400 + }, + { + "epoch": 4.153218948374001, + "grad_norm": 7.12736701965332, + "learning_rate": 0.0014954821783504152, + "loss": 7.6026, + "step": 1019500 + }, + { + "epoch": 4.153626326397383, + "grad_norm": 10.624343872070312, + "learning_rate": 0.0014950285276995616, + "loss": 7.6264, + "step": 1019600 + }, + { + "epoch": 4.1540337044207645, + "grad_norm": 3.3696537017822266, + "learning_rate": 0.0014945749169782824, + "loss": 7.613, + "step": 1019700 + }, + { + "epoch": 4.154441082444146, + "grad_norm": 4.400647163391113, + "learning_rate": 0.0014941213462045097, + "loss": 7.618, + "step": 1019800 + }, + { + "epoch": 4.1548484604675275, + "grad_norm": 4.743301868438721, + "learning_rate": 0.0014936678153961778, + "loss": 7.5831, + "step": 1019900 + }, + { + "epoch": 4.155255838490909, + "grad_norm": 5.98439884185791, + "learning_rate": 0.001493214324571215, + "loss": 7.589, + "step": 1020000 + }, + { + "epoch": 4.155255838490909, + "eval_MaskedAccuracy": 0.5094930484841899, + "eval_loss": 1.588067889213562, + "eval_runtime": 182.8127, + "eval_samples_per_second": 347.219, + "eval_steps_per_second": 1.357, + "step": 1020000 + }, + { + "epoch": 4.155663216514291, + "grad_norm": 9.075858116149902, + "learning_rate": 0.0014927608737475527, + "loss": 7.6092, + "step": 1020100 + }, + { + "epoch": 4.156070594537672, + "grad_norm": 4.691498756408691, + "learning_rate": 0.0014923074629431177, + "loss": 7.6096, + "step": 1020200 + }, + { + "epoch": 4.156477972561053, + "grad_norm": 4.332222938537598, + "learning_rate": 0.0014918540921758342, + "loss": 7.6053, + "step": 1020300 + }, + { + "epoch": 4.156885350584434, + "grad_norm": 6.397316932678223, + "learning_rate": 0.001491400761463626, + "loss": 7.6311, + "step": 1020400 + }, + { + "epoch": 4.157292728607816, + "grad_norm": 11.58827018737793, + "learning_rate": 0.0014909474708244147, + "loss": 7.6123, + "step": 1020500 + }, + { + "epoch": 4.157700106631197, + "grad_norm": 4.856008052825928, + "learning_rate": 0.0014904942202761241, + "loss": 7.6194, + "step": 1020600 + }, + { + "epoch": 4.158107484654579, + "grad_norm": 3.3364298343658447, + "learning_rate": 0.001490041009836675, + "loss": 7.5989, + "step": 1020700 + }, + { + "epoch": 4.1585148626779604, + "grad_norm": 4.057903289794922, + "learning_rate": 0.001489587839523983, + "loss": 7.5818, + "step": 1020800 + }, + { + "epoch": 4.158922240701342, + "grad_norm": 5.017449855804443, + "learning_rate": 0.0014891347093559642, + "loss": 7.5748, + "step": 1020900 + }, + { + "epoch": 4.1593296187247235, + "grad_norm": 2.404221296310425, + "learning_rate": 0.0014886816193505337, + "loss": 7.5745, + "step": 1021000 + }, + { + "epoch": 4.1593296187247235, + "eval_MaskedAccuracy": 0.5101413743344813, + "eval_loss": 1.6007146835327148, + "eval_runtime": 155.7336, + "eval_samples_per_second": 407.593, + "eval_steps_per_second": 1.592, + "step": 1021000 + }, + { + "epoch": 4.159736996748105, + "grad_norm": 4.9971795082092285, + "learning_rate": 0.0014882285695256045, + "loss": 7.6058, + "step": 1021100 + }, + { + "epoch": 4.160144374771487, + "grad_norm": 4.318365573883057, + "learning_rate": 0.0014877755598990865, + "loss": 7.6422, + "step": 1021200 + }, + { + "epoch": 4.160551752794868, + "grad_norm": 5.754800319671631, + "learning_rate": 0.0014873225904888903, + "loss": 7.5933, + "step": 1021300 + }, + { + "epoch": 4.16095913081825, + "grad_norm": 3.9653968811035156, + "learning_rate": 0.0014868696613129265, + "loss": 7.6267, + "step": 1021400 + }, + { + "epoch": 4.161366508841631, + "grad_norm": 5.344866752624512, + "learning_rate": 0.0014864167723891012, + "loss": 7.6064, + "step": 1021500 + }, + { + "epoch": 4.161773886865012, + "grad_norm": 4.454051494598389, + "learning_rate": 0.0014859639237353205, + "loss": 7.5842, + "step": 1021600 + }, + { + "epoch": 4.162181264888393, + "grad_norm": 14.928621292114258, + "learning_rate": 0.001485511115369483, + "loss": 7.6124, + "step": 1021700 + }, + { + "epoch": 4.162588642911775, + "grad_norm": 3.4864470958709717, + "learning_rate": 0.0014850583473094946, + "loss": 7.6281, + "step": 1021800 + }, + { + "epoch": 4.162996020935156, + "grad_norm": 5.536746501922607, + "learning_rate": 0.0014846056195732538, + "loss": 7.6176, + "step": 1021900 + }, + { + "epoch": 4.163403398958538, + "grad_norm": 7.407474994659424, + "learning_rate": 0.001484152932178661, + "loss": 7.6222, + "step": 1022000 + }, + { + "epoch": 4.163403398958538, + "eval_MaskedAccuracy": 0.5099852719236984, + "eval_loss": 1.6082144975662231, + "eval_runtime": 157.8049, + "eval_samples_per_second": 402.244, + "eval_steps_per_second": 1.572, + "step": 1022000 + }, + { + "epoch": 4.1638107769819195, + "grad_norm": 6.846193790435791, + "learning_rate": 0.0014837002851436137, + "loss": 7.6007, + "step": 1022100 + }, + { + "epoch": 4.164218155005301, + "grad_norm": 4.985725402832031, + "learning_rate": 0.0014832476784860044, + "loss": 7.5966, + "step": 1022200 + }, + { + "epoch": 4.1646255330286825, + "grad_norm": 9.667717933654785, + "learning_rate": 0.0014827951122237273, + "loss": 7.6015, + "step": 1022300 + }, + { + "epoch": 4.165032911052064, + "grad_norm": 5.9029130935668945, + "learning_rate": 0.0014823425863746812, + "loss": 7.6246, + "step": 1022400 + }, + { + "epoch": 4.165440289075446, + "grad_norm": 7.814671039581299, + "learning_rate": 0.0014818901009567508, + "loss": 7.5965, + "step": 1022500 + }, + { + "epoch": 4.165847667098827, + "grad_norm": 7.018571853637695, + "learning_rate": 0.0014814376559878259, + "loss": 7.605, + "step": 1022600 + }, + { + "epoch": 4.166255045122209, + "grad_norm": 5.882373332977295, + "learning_rate": 0.0014809852514857955, + "loss": 7.5994, + "step": 1022700 + }, + { + "epoch": 4.16666242314559, + "grad_norm": 7.16847038269043, + "learning_rate": 0.0014805328874685439, + "loss": 7.6254, + "step": 1022800 + }, + { + "epoch": 4.167069801168971, + "grad_norm": 8.592397689819336, + "learning_rate": 0.0014800805639539583, + "loss": 7.5882, + "step": 1022900 + }, + { + "epoch": 4.167477179192352, + "grad_norm": 3.2145416736602783, + "learning_rate": 0.0014796282809599192, + "loss": 7.5909, + "step": 1023000 + }, + { + "epoch": 4.167477179192352, + "eval_MaskedAccuracy": 0.5102767782158548, + "eval_loss": 1.5986086130142212, + "eval_runtime": 159.2053, + "eval_samples_per_second": 398.705, + "eval_steps_per_second": 1.558, + "step": 1023000 + }, + { + "epoch": 4.167884557215734, + "grad_norm": 4.349625110626221, + "learning_rate": 0.0014791760385043076, + "loss": 7.5913, + "step": 1023100 + }, + { + "epoch": 4.1682919352391155, + "grad_norm": 8.549214363098145, + "learning_rate": 0.0014787238366050037, + "loss": 7.5978, + "step": 1023200 + }, + { + "epoch": 4.168699313262497, + "grad_norm": 4.202352523803711, + "learning_rate": 0.0014782716752798862, + "loss": 7.614, + "step": 1023300 + }, + { + "epoch": 4.1691066912858785, + "grad_norm": 7.24500036239624, + "learning_rate": 0.0014778195545468314, + "loss": 7.5903, + "step": 1023400 + }, + { + "epoch": 4.16951406930926, + "grad_norm": 10.378110885620117, + "learning_rate": 0.0014773674744237115, + "loss": 7.5901, + "step": 1023500 + }, + { + "epoch": 4.169921447332642, + "grad_norm": 3.6616709232330322, + "learning_rate": 0.0014769154349284037, + "loss": 7.6289, + "step": 1023600 + }, + { + "epoch": 4.170328825356023, + "grad_norm": 7.426562309265137, + "learning_rate": 0.0014764634360787782, + "loss": 7.5777, + "step": 1023700 + }, + { + "epoch": 4.170736203379405, + "grad_norm": 9.659289360046387, + "learning_rate": 0.0014760114778927013, + "loss": 7.5904, + "step": 1023800 + }, + { + "epoch": 4.171143581402786, + "grad_norm": 9.703145027160645, + "learning_rate": 0.001475559560388045, + "loss": 7.6234, + "step": 1023900 + }, + { + "epoch": 4.171550959426168, + "grad_norm": 4.7512006759643555, + "learning_rate": 0.0014751076835826742, + "loss": 7.5886, + "step": 1024000 + }, + { + "epoch": 4.171550959426168, + "eval_MaskedAccuracy": 0.5098047120751074, + "eval_loss": 1.5959850549697876, + "eval_runtime": 159.3796, + "eval_samples_per_second": 398.269, + "eval_steps_per_second": 1.556, + "step": 1024000 + }, + { + "epoch": 4.171958337449548, + "grad_norm": 11.228114128112793, + "learning_rate": 0.0014746558474944566, + "loss": 7.5962, + "step": 1024100 + }, + { + "epoch": 4.17236571547293, + "grad_norm": 9.726095199584961, + "learning_rate": 0.001474204052141253, + "loss": 7.5837, + "step": 1024200 + }, + { + "epoch": 4.172773093496311, + "grad_norm": 3.0137314796447754, + "learning_rate": 0.001473752297540926, + "loss": 7.5867, + "step": 1024300 + }, + { + "epoch": 4.173180471519693, + "grad_norm": 9.817083358764648, + "learning_rate": 0.0014733005837113374, + "loss": 7.6123, + "step": 1024400 + }, + { + "epoch": 4.1735878495430745, + "grad_norm": 4.301794528961182, + "learning_rate": 0.0014728489106703444, + "loss": 7.5733, + "step": 1024500 + }, + { + "epoch": 4.173995227566456, + "grad_norm": 7.952913284301758, + "learning_rate": 0.0014723972784358033, + "loss": 7.5825, + "step": 1024600 + }, + { + "epoch": 4.174402605589838, + "grad_norm": 3.9031291007995605, + "learning_rate": 0.0014719456870255704, + "loss": 7.6271, + "step": 1024700 + }, + { + "epoch": 4.174809983613219, + "grad_norm": 5.106107711791992, + "learning_rate": 0.0014714941364575005, + "loss": 7.5446, + "step": 1024800 + }, + { + "epoch": 4.175217361636601, + "grad_norm": 13.670040130615234, + "learning_rate": 0.0014710426267494458, + "loss": 7.6107, + "step": 1024900 + }, + { + "epoch": 4.175624739659982, + "grad_norm": 10.635793685913086, + "learning_rate": 0.0014705911579192554, + "loss": 7.5953, + "step": 1025000 + }, + { + "epoch": 4.175624739659982, + "eval_MaskedAccuracy": 0.5106687895399447, + "eval_loss": 1.593947410583496, + "eval_runtime": 160.2436, + "eval_samples_per_second": 396.122, + "eval_steps_per_second": 1.548, + "step": 1025000 + }, + { + "epoch": 4.176032117683364, + "grad_norm": 8.559614181518555, + "learning_rate": 0.0014701397299847787, + "loss": 7.5839, + "step": 1025100 + }, + { + "epoch": 4.176439495706745, + "grad_norm": 9.795751571655273, + "learning_rate": 0.001469688342963864, + "loss": 7.615, + "step": 1025200 + }, + { + "epoch": 4.176846873730126, + "grad_norm": 4.664596080780029, + "learning_rate": 0.001469236996874355, + "loss": 7.6213, + "step": 1025300 + }, + { + "epoch": 4.177254251753507, + "grad_norm": 3.550440788269043, + "learning_rate": 0.0014687856917340986, + "loss": 7.6127, + "step": 1025400 + }, + { + "epoch": 4.177661629776889, + "grad_norm": 8.955238342285156, + "learning_rate": 0.0014683344275609337, + "loss": 7.5867, + "step": 1025500 + }, + { + "epoch": 4.1780690078002705, + "grad_norm": 3.6599338054656982, + "learning_rate": 0.0014678832043727042, + "loss": 7.592, + "step": 1025600 + }, + { + "epoch": 4.178476385823652, + "grad_norm": 5.006877422332764, + "learning_rate": 0.0014674320221872489, + "loss": 7.5848, + "step": 1025700 + }, + { + "epoch": 4.1788837638470335, + "grad_norm": 8.671599388122559, + "learning_rate": 0.0014669808810224034, + "loss": 7.589, + "step": 1025800 + }, + { + "epoch": 4.179291141870415, + "grad_norm": 5.992976665496826, + "learning_rate": 0.0014665297808960065, + "loss": 7.603, + "step": 1025900 + }, + { + "epoch": 4.179698519893797, + "grad_norm": 10.025277137756348, + "learning_rate": 0.0014660787218258914, + "loss": 7.5922, + "step": 1026000 + }, + { + "epoch": 4.179698519893797, + "eval_MaskedAccuracy": 0.5096075246703081, + "eval_loss": 1.6091586351394653, + "eval_runtime": 157.9571, + "eval_samples_per_second": 401.856, + "eval_steps_per_second": 1.57, + "step": 1026000 + }, + { + "epoch": 4.180105897917178, + "grad_norm": 8.122149467468262, + "learning_rate": 0.0014656277038298919, + "loss": 7.5758, + "step": 1026100 + }, + { + "epoch": 4.18051327594056, + "grad_norm": 3.658780813217163, + "learning_rate": 0.0014651767269258384, + "loss": 7.6023, + "step": 1026200 + }, + { + "epoch": 4.180920653963941, + "grad_norm": 7.74526834487915, + "learning_rate": 0.001464725791131558, + "loss": 7.6064, + "step": 1026300 + }, + { + "epoch": 4.181328031987323, + "grad_norm": 10.288622856140137, + "learning_rate": 0.0014642748964648821, + "loss": 7.6268, + "step": 1026400 + }, + { + "epoch": 4.181735410010704, + "grad_norm": 11.330092430114746, + "learning_rate": 0.001463824042943636, + "loss": 7.5888, + "step": 1026500 + }, + { + "epoch": 4.182142788034085, + "grad_norm": 6.9427080154418945, + "learning_rate": 0.0014633732305856443, + "loss": 7.6169, + "step": 1026600 + }, + { + "epoch": 4.182550166057466, + "grad_norm": 10.429247856140137, + "learning_rate": 0.00146292245940873, + "loss": 7.5902, + "step": 1026700 + }, + { + "epoch": 4.182957544080848, + "grad_norm": 14.039982795715332, + "learning_rate": 0.001462471729430715, + "loss": 7.6083, + "step": 1026800 + }, + { + "epoch": 4.1833649221042295, + "grad_norm": 2.7318310737609863, + "learning_rate": 0.001462021040669417, + "loss": 7.6211, + "step": 1026900 + }, + { + "epoch": 4.183772300127611, + "grad_norm": 4.351866245269775, + "learning_rate": 0.0014615703931426555, + "loss": 7.5958, + "step": 1027000 + }, + { + "epoch": 4.183772300127611, + "eval_MaskedAccuracy": 0.5106928532701904, + "eval_loss": 1.6009950637817383, + "eval_runtime": 158.6672, + "eval_samples_per_second": 400.057, + "eval_steps_per_second": 1.563, + "step": 1027000 + }, + { + "epoch": 4.184179678150993, + "grad_norm": 5.183883190155029, + "learning_rate": 0.001461119786868247, + "loss": 7.5966, + "step": 1027100 + }, + { + "epoch": 4.184587056174374, + "grad_norm": 12.216350555419922, + "learning_rate": 0.0014606692218640058, + "loss": 7.6025, + "step": 1027200 + }, + { + "epoch": 4.184994434197756, + "grad_norm": 3.1864845752716064, + "learning_rate": 0.0014602186981477446, + "loss": 7.6272, + "step": 1027300 + }, + { + "epoch": 4.185401812221137, + "grad_norm": 8.0393705368042, + "learning_rate": 0.0014597682157372787, + "loss": 7.6049, + "step": 1027400 + }, + { + "epoch": 4.185809190244519, + "grad_norm": 5.613422870635986, + "learning_rate": 0.001459317774650415, + "loss": 7.5624, + "step": 1027500 + }, + { + "epoch": 4.1862165682679, + "grad_norm": 11.337660789489746, + "learning_rate": 0.001458867374904963, + "loss": 7.5756, + "step": 1027600 + }, + { + "epoch": 4.186623946291282, + "grad_norm": 6.008993148803711, + "learning_rate": 0.0014584170165187287, + "loss": 7.5922, + "step": 1027700 + }, + { + "epoch": 4.187031324314663, + "grad_norm": 14.749534606933594, + "learning_rate": 0.0014579666995095167, + "loss": 7.5996, + "step": 1027800 + }, + { + "epoch": 4.187438702338044, + "grad_norm": 14.441166877746582, + "learning_rate": 0.0014575164238951315, + "loss": 7.6402, + "step": 1027900 + }, + { + "epoch": 4.1878460803614255, + "grad_norm": 3.8267993927001953, + "learning_rate": 0.0014570661896933725, + "loss": 7.591, + "step": 1028000 + }, + { + "epoch": 4.1878460803614255, + "eval_MaskedAccuracy": 0.5104208789487067, + "eval_loss": 1.5986571311950684, + "eval_runtime": 164.851, + "eval_samples_per_second": 385.051, + "eval_steps_per_second": 1.504, + "step": 1028000 + }, + { + "epoch": 4.188253458384807, + "grad_norm": 12.592936515808105, + "learning_rate": 0.001456615996922042, + "loss": 7.5936, + "step": 1028100 + }, + { + "epoch": 4.1886608364081885, + "grad_norm": 16.150226593017578, + "learning_rate": 0.0014561658455989404, + "loss": 7.57, + "step": 1028200 + }, + { + "epoch": 4.18906821443157, + "grad_norm": 16.680103302001953, + "learning_rate": 0.0014557157357418605, + "loss": 7.5961, + "step": 1028300 + }, + { + "epoch": 4.189475592454952, + "grad_norm": 4.220909118652344, + "learning_rate": 0.0014552656673686006, + "loss": 7.6108, + "step": 1028400 + }, + { + "epoch": 4.189882970478333, + "grad_norm": 9.812032699584961, + "learning_rate": 0.001454815640496952, + "loss": 7.5964, + "step": 1028500 + }, + { + "epoch": 4.190290348501715, + "grad_norm": 12.421441078186035, + "learning_rate": 0.00145436565514471, + "loss": 7.5877, + "step": 1028600 + }, + { + "epoch": 4.190697726525096, + "grad_norm": 5.953688144683838, + "learning_rate": 0.001453915711329661, + "loss": 7.5787, + "step": 1028700 + }, + { + "epoch": 4.191105104548478, + "grad_norm": 4.339315414428711, + "learning_rate": 0.001453465809069596, + "loss": 7.6027, + "step": 1028800 + }, + { + "epoch": 4.191512482571859, + "grad_norm": 5.007308006286621, + "learning_rate": 0.0014530159483823024, + "loss": 7.5843, + "step": 1028900 + }, + { + "epoch": 4.191919860595241, + "grad_norm": 2.6288020610809326, + "learning_rate": 0.0014525661292855642, + "loss": 7.6077, + "step": 1029000 + }, + { + "epoch": 4.191919860595241, + "eval_MaskedAccuracy": 0.5101682141368029, + "eval_loss": 1.6052380800247192, + "eval_runtime": 155.8335, + "eval_samples_per_second": 407.332, + "eval_steps_per_second": 1.591, + "step": 1029000 + }, + { + "epoch": 4.192327238618621, + "grad_norm": 7.667224407196045, + "learning_rate": 0.0014521163517971663, + "loss": 7.5938, + "step": 1029100 + }, + { + "epoch": 4.192734616642003, + "grad_norm": 12.914789199829102, + "learning_rate": 0.0014516666159348891, + "loss": 7.5844, + "step": 1029200 + }, + { + "epoch": 4.1931419946653845, + "grad_norm": 4.420145511627197, + "learning_rate": 0.0014512169217165151, + "loss": 7.5856, + "step": 1029300 + }, + { + "epoch": 4.193549372688766, + "grad_norm": 6.624450206756592, + "learning_rate": 0.0014507672691598215, + "loss": 7.5676, + "step": 1029400 + }, + { + "epoch": 4.193956750712148, + "grad_norm": 16.761884689331055, + "learning_rate": 0.0014503176582825875, + "loss": 7.5813, + "step": 1029500 + }, + { + "epoch": 4.194364128735529, + "grad_norm": 3.6186683177948, + "learning_rate": 0.0014498680891025866, + "loss": 7.5833, + "step": 1029600 + }, + { + "epoch": 4.194771506758911, + "grad_norm": 10.661664009094238, + "learning_rate": 0.0014494185616375924, + "loss": 7.599, + "step": 1029700 + }, + { + "epoch": 4.195178884782292, + "grad_norm": 3.963146448135376, + "learning_rate": 0.0014489690759053792, + "loss": 7.5886, + "step": 1029800 + }, + { + "epoch": 4.195586262805674, + "grad_norm": 13.324056625366211, + "learning_rate": 0.0014485196319237175, + "loss": 7.5916, + "step": 1029900 + }, + { + "epoch": 4.195993640829055, + "grad_norm": 11.302262306213379, + "learning_rate": 0.0014480702297103752, + "loss": 7.5888, + "step": 1030000 + }, + { + "epoch": 4.195993640829055, + "eval_MaskedAccuracy": 0.5096965536116576, + "eval_loss": 1.5952415466308594, + "eval_runtime": 168.002, + "eval_samples_per_second": 377.829, + "eval_steps_per_second": 1.476, + "step": 1030000 + }, + { + "epoch": 4.196401018852437, + "grad_norm": 7.486935138702393, + "learning_rate": 0.0014476208692831183, + "loss": 7.6255, + "step": 1030100 + }, + { + "epoch": 4.196808396875818, + "grad_norm": 8.094013214111328, + "learning_rate": 0.0014471715506597115, + "loss": 7.5757, + "step": 1030200 + }, + { + "epoch": 4.197215774899199, + "grad_norm": 6.376736640930176, + "learning_rate": 0.0014467222738579234, + "loss": 7.5918, + "step": 1030300 + }, + { + "epoch": 4.1976231529225805, + "grad_norm": 9.68858814239502, + "learning_rate": 0.0014462730388955147, + "loss": 7.6198, + "step": 1030400 + }, + { + "epoch": 4.198030530945962, + "grad_norm": 5.73710823059082, + "learning_rate": 0.001445823845790246, + "loss": 7.6224, + "step": 1030500 + }, + { + "epoch": 4.1984379089693435, + "grad_norm": 3.1832115650177, + "learning_rate": 0.0014453746945598735, + "loss": 7.6208, + "step": 1030600 + }, + { + "epoch": 4.198845286992725, + "grad_norm": 9.9255952835083, + "learning_rate": 0.0014449255852221573, + "loss": 7.5886, + "step": 1030700 + }, + { + "epoch": 4.199252665016107, + "grad_norm": 5.664330005645752, + "learning_rate": 0.0014444765177948518, + "loss": 7.6045, + "step": 1030800 + }, + { + "epoch": 4.199660043039488, + "grad_norm": 6.452301979064941, + "learning_rate": 0.0014440274922957123, + "loss": 7.6006, + "step": 1030900 + }, + { + "epoch": 4.20006742106287, + "grad_norm": 3.30295729637146, + "learning_rate": 0.0014435785087424905, + "loss": 7.6189, + "step": 1031000 + }, + { + "epoch": 4.20006742106287, + "eval_MaskedAccuracy": 0.5100643277420648, + "eval_loss": 1.6013303995132446, + "eval_runtime": 172.5314, + "eval_samples_per_second": 367.91, + "eval_steps_per_second": 1.437, + "step": 1031000 + }, + { + "epoch": 4.200474799086251, + "grad_norm": 5.853790760040283, + "learning_rate": 0.0014431295671529364, + "loss": 7.6163, + "step": 1031100 + }, + { + "epoch": 4.200882177109633, + "grad_norm": 9.750107765197754, + "learning_rate": 0.0014426806675447992, + "loss": 7.5881, + "step": 1031200 + }, + { + "epoch": 4.201289555133014, + "grad_norm": 11.970582008361816, + "learning_rate": 0.0014422318099358263, + "loss": 7.6326, + "step": 1031300 + }, + { + "epoch": 4.201696933156396, + "grad_norm": 9.420632362365723, + "learning_rate": 0.0014417829943437666, + "loss": 7.6037, + "step": 1031400 + }, + { + "epoch": 4.202104311179777, + "grad_norm": 13.569077491760254, + "learning_rate": 0.001441334220786359, + "loss": 7.6071, + "step": 1031500 + }, + { + "epoch": 4.202511689203158, + "grad_norm": 4.433078289031982, + "learning_rate": 0.0014408854892813492, + "loss": 7.5812, + "step": 1031600 + }, + { + "epoch": 4.2029190672265395, + "grad_norm": 5.293403148651123, + "learning_rate": 0.0014404367998464768, + "loss": 7.6147, + "step": 1031700 + }, + { + "epoch": 4.203326445249921, + "grad_norm": 5.65088415145874, + "learning_rate": 0.0014399881524994828, + "loss": 7.6042, + "step": 1031800 + }, + { + "epoch": 4.203733823273303, + "grad_norm": 12.346303939819336, + "learning_rate": 0.001439539547258102, + "loss": 7.6056, + "step": 1031900 + }, + { + "epoch": 4.204141201296684, + "grad_norm": 4.291621208190918, + "learning_rate": 0.0014390909841400735, + "loss": 7.5951, + "step": 1032000 + }, + { + "epoch": 4.204141201296684, + "eval_MaskedAccuracy": 0.5096514852280516, + "eval_loss": 1.6052285432815552, + "eval_runtime": 162.7154, + "eval_samples_per_second": 390.105, + "eval_steps_per_second": 1.524, + "step": 1032000 + }, + { + "epoch": 4.204548579320066, + "grad_norm": 14.775259017944336, + "learning_rate": 0.0014386424631631278, + "loss": 7.5904, + "step": 1032100 + }, + { + "epoch": 4.204955957343447, + "grad_norm": 9.05004596710205, + "learning_rate": 0.0014381939843449991, + "loss": 7.5775, + "step": 1032200 + }, + { + "epoch": 4.205363335366829, + "grad_norm": 3.9531989097595215, + "learning_rate": 0.0014377455477034187, + "loss": 7.5917, + "step": 1032300 + }, + { + "epoch": 4.20577071339021, + "grad_norm": 5.5266523361206055, + "learning_rate": 0.001437297153256115, + "loss": 7.5868, + "step": 1032400 + }, + { + "epoch": 4.206178091413592, + "grad_norm": 10.092397689819336, + "learning_rate": 0.0014368488010208142, + "loss": 7.5904, + "step": 1032500 + }, + { + "epoch": 4.206585469436973, + "grad_norm": 5.211591720581055, + "learning_rate": 0.0014364004910152439, + "loss": 7.607, + "step": 1032600 + }, + { + "epoch": 4.206992847460355, + "grad_norm": 10.154476165771484, + "learning_rate": 0.0014359522232571264, + "loss": 7.5883, + "step": 1032700 + }, + { + "epoch": 4.207400225483736, + "grad_norm": 8.700800895690918, + "learning_rate": 0.0014355039977641858, + "loss": 7.6247, + "step": 1032800 + }, + { + "epoch": 4.207807603507117, + "grad_norm": 7.243443965911865, + "learning_rate": 0.0014350558145541437, + "loss": 7.6093, + "step": 1032900 + }, + { + "epoch": 4.2082149815304986, + "grad_norm": 4.729804039001465, + "learning_rate": 0.001434607673644716, + "loss": 7.6221, + "step": 1033000 + }, + { + "epoch": 4.2082149815304986, + "eval_MaskedAccuracy": 0.5105174837283005, + "eval_loss": 1.6016790866851807, + "eval_runtime": 163.8821, + "eval_samples_per_second": 387.327, + "eval_steps_per_second": 1.513, + "step": 1033000 + }, + { + "epoch": 4.20862235955388, + "grad_norm": 13.310611724853516, + "learning_rate": 0.001434159575053622, + "loss": 7.613, + "step": 1033100 + }, + { + "epoch": 4.209029737577262, + "grad_norm": 11.235798835754395, + "learning_rate": 0.0014337115187985775, + "loss": 7.5982, + "step": 1033200 + }, + { + "epoch": 4.209437115600643, + "grad_norm": 7.849621295928955, + "learning_rate": 0.0014332635048972964, + "loss": 7.6235, + "step": 1033300 + }, + { + "epoch": 4.209844493624025, + "grad_norm": 6.901366710662842, + "learning_rate": 0.0014328155333674913, + "loss": 7.6068, + "step": 1033400 + }, + { + "epoch": 4.210251871647406, + "grad_norm": 4.541335105895996, + "learning_rate": 0.0014323676042268712, + "loss": 7.6033, + "step": 1033500 + }, + { + "epoch": 4.210659249670788, + "grad_norm": 5.252453804016113, + "learning_rate": 0.0014319197174931465, + "loss": 7.5956, + "step": 1033600 + }, + { + "epoch": 4.211066627694169, + "grad_norm": 6.729776859283447, + "learning_rate": 0.0014314718731840248, + "loss": 7.6165, + "step": 1033700 + }, + { + "epoch": 4.211474005717551, + "grad_norm": 5.497008800506592, + "learning_rate": 0.0014310240713172123, + "loss": 7.597, + "step": 1033800 + }, + { + "epoch": 4.211881383740932, + "grad_norm": 5.694904804229736, + "learning_rate": 0.0014305763119104125, + "loss": 7.6054, + "step": 1033900 + }, + { + "epoch": 4.212288761764314, + "grad_norm": 7.6464385986328125, + "learning_rate": 0.001430128594981329, + "loss": 7.6043, + "step": 1034000 + }, + { + "epoch": 4.212288761764314, + "eval_MaskedAccuracy": 0.509850729345662, + "eval_loss": 1.5982760190963745, + "eval_runtime": 152.0538, + "eval_samples_per_second": 417.458, + "eval_steps_per_second": 1.631, + "step": 1034000 + }, + { + "epoch": 4.2126961397876945, + "grad_norm": 3.917483329772949, + "learning_rate": 0.0014296809205476592, + "loss": 7.5896, + "step": 1034100 + }, + { + "epoch": 4.213103517811076, + "grad_norm": 5.192936897277832, + "learning_rate": 0.0014292332886271031, + "loss": 7.6009, + "step": 1034200 + }, + { + "epoch": 4.213510895834458, + "grad_norm": 6.1599602699279785, + "learning_rate": 0.0014287856992373602, + "loss": 7.6249, + "step": 1034300 + }, + { + "epoch": 4.213918273857839, + "grad_norm": 4.17832612991333, + "learning_rate": 0.0014283381523961223, + "loss": 7.6259, + "step": 1034400 + }, + { + "epoch": 4.214325651881221, + "grad_norm": 14.357792854309082, + "learning_rate": 0.001427890648121088, + "loss": 7.5974, + "step": 1034500 + }, + { + "epoch": 4.214733029904602, + "grad_norm": 7.662722110748291, + "learning_rate": 0.0014274431864299462, + "loss": 7.5946, + "step": 1034600 + }, + { + "epoch": 4.215140407927984, + "grad_norm": 4.965076923370361, + "learning_rate": 0.0014269957673403895, + "loss": 7.6232, + "step": 1034700 + }, + { + "epoch": 4.215547785951365, + "grad_norm": 4.649047374725342, + "learning_rate": 0.0014265483908701044, + "loss": 7.5993, + "step": 1034800 + }, + { + "epoch": 4.215955163974747, + "grad_norm": 9.61099910736084, + "learning_rate": 0.0014261010570367795, + "loss": 7.5981, + "step": 1034900 + }, + { + "epoch": 4.216362541998128, + "grad_norm": 5.705331802368164, + "learning_rate": 0.001425653765858099, + "loss": 7.5778, + "step": 1035000 + }, + { + "epoch": 4.216362541998128, + "eval_MaskedAccuracy": 0.5102395071453286, + "eval_loss": 1.589215874671936, + "eval_runtime": 179.1055, + "eval_samples_per_second": 354.406, + "eval_steps_per_second": 1.385, + "step": 1035000 + }, + { + "epoch": 4.21676992002151, + "grad_norm": 2.996056318283081, + "learning_rate": 0.0014252065173517492, + "loss": 7.5826, + "step": 1035100 + }, + { + "epoch": 4.217177298044891, + "grad_norm": 4.911251068115234, + "learning_rate": 0.0014247593115354097, + "loss": 7.6332, + "step": 1035200 + }, + { + "epoch": 4.217584676068272, + "grad_norm": 4.924323558807373, + "learning_rate": 0.0014243121484267633, + "loss": 7.5839, + "step": 1035300 + }, + { + "epoch": 4.217992054091654, + "grad_norm": 7.712258815765381, + "learning_rate": 0.0014238650280434877, + "loss": 7.5621, + "step": 1035400 + }, + { + "epoch": 4.218399432115035, + "grad_norm": 9.961227416992188, + "learning_rate": 0.00142341795040326, + "loss": 7.6159, + "step": 1035500 + }, + { + "epoch": 4.218806810138417, + "grad_norm": 8.253767013549805, + "learning_rate": 0.0014229709155237581, + "loss": 7.622, + "step": 1035600 + }, + { + "epoch": 4.219214188161798, + "grad_norm": 4.563155174255371, + "learning_rate": 0.0014225239234226506, + "loss": 7.5653, + "step": 1035700 + }, + { + "epoch": 4.21962156618518, + "grad_norm": 10.279685974121094, + "learning_rate": 0.0014220769741176116, + "loss": 7.59, + "step": 1035800 + }, + { + "epoch": 4.220028944208561, + "grad_norm": 8.487176895141602, + "learning_rate": 0.0014216300676263118, + "loss": 7.5725, + "step": 1035900 + }, + { + "epoch": 4.220436322231943, + "grad_norm": 11.135157585144043, + "learning_rate": 0.0014211832039664192, + "loss": 7.5991, + "step": 1036000 + }, + { + "epoch": 4.220436322231943, + "eval_MaskedAccuracy": 0.510516270683006, + "eval_loss": 1.6004313230514526, + "eval_runtime": 184.468, + "eval_samples_per_second": 344.103, + "eval_steps_per_second": 1.344, + "step": 1036000 + }, + { + "epoch": 4.220843700255324, + "grad_norm": 14.776406288146973, + "learning_rate": 0.0014207363831556011, + "loss": 7.5955, + "step": 1036100 + }, + { + "epoch": 4.221251078278706, + "grad_norm": 6.738020896911621, + "learning_rate": 0.0014202896052115208, + "loss": 7.5831, + "step": 1036200 + }, + { + "epoch": 4.221658456302087, + "grad_norm": 10.824565887451172, + "learning_rate": 0.001419842870151847, + "loss": 7.5895, + "step": 1036300 + }, + { + "epoch": 4.222065834325469, + "grad_norm": 3.366100549697876, + "learning_rate": 0.001419396177994238, + "loss": 7.6135, + "step": 1036400 + }, + { + "epoch": 4.22247321234885, + "grad_norm": 3.5348172187805176, + "learning_rate": 0.0014189495287563537, + "loss": 7.5733, + "step": 1036500 + }, + { + "epoch": 4.222880590372231, + "grad_norm": 5.367615699768066, + "learning_rate": 0.0014185029224558546, + "loss": 7.6033, + "step": 1036600 + }, + { + "epoch": 4.223287968395613, + "grad_norm": 8.914054870605469, + "learning_rate": 0.001418056359110395, + "loss": 7.6153, + "step": 1036700 + }, + { + "epoch": 4.223695346418994, + "grad_norm": 8.189297676086426, + "learning_rate": 0.0014176098387376306, + "loss": 7.5986, + "step": 1036800 + }, + { + "epoch": 4.224102724442376, + "grad_norm": 8.711004257202148, + "learning_rate": 0.001417163361355215, + "loss": 7.5879, + "step": 1036900 + }, + { + "epoch": 4.224510102465757, + "grad_norm": 7.876670837402344, + "learning_rate": 0.001416716926980801, + "loss": 7.5858, + "step": 1037000 + }, + { + "epoch": 4.224510102465757, + "eval_MaskedAccuracy": 0.5104020098319255, + "eval_loss": 1.5996097326278687, + "eval_runtime": 181.2853, + "eval_samples_per_second": 350.144, + "eval_steps_per_second": 1.368, + "step": 1037000 + }, + { + "epoch": 4.224917480489139, + "grad_norm": 4.606138706207275, + "learning_rate": 0.0014162705356320368, + "loss": 7.5649, + "step": 1037100 + }, + { + "epoch": 4.22532485851252, + "grad_norm": 8.091697692871094, + "learning_rate": 0.0014158241873265707, + "loss": 7.6043, + "step": 1037200 + }, + { + "epoch": 4.225732236535902, + "grad_norm": 3.9498138427734375, + "learning_rate": 0.0014153778820820506, + "loss": 7.6016, + "step": 1037300 + }, + { + "epoch": 4.226139614559283, + "grad_norm": 12.753782272338867, + "learning_rate": 0.0014149316199161205, + "loss": 7.606, + "step": 1037400 + }, + { + "epoch": 4.226546992582665, + "grad_norm": 4.535918235778809, + "learning_rate": 0.0014144854008464216, + "loss": 7.6137, + "step": 1037500 + }, + { + "epoch": 4.226954370606046, + "grad_norm": 3.7183914184570312, + "learning_rate": 0.0014140392248905985, + "loss": 7.6012, + "step": 1037600 + }, + { + "epoch": 4.227361748629428, + "grad_norm": 4.210314750671387, + "learning_rate": 0.0014135930920662909, + "loss": 7.5942, + "step": 1037700 + }, + { + "epoch": 4.2277691266528095, + "grad_norm": 6.041977882385254, + "learning_rate": 0.0014131470023911344, + "loss": 7.5879, + "step": 1037800 + }, + { + "epoch": 4.22817650467619, + "grad_norm": 8.699797630310059, + "learning_rate": 0.0014127009558827677, + "loss": 7.5819, + "step": 1037900 + }, + { + "epoch": 4.228583882699572, + "grad_norm": 7.279827117919922, + "learning_rate": 0.0014122549525588236, + "loss": 7.5784, + "step": 1038000 + }, + { + "epoch": 4.228583882699572, + "eval_MaskedAccuracy": 0.5103667723872006, + "eval_loss": 1.5985842943191528, + "eval_runtime": 160.8712, + "eval_samples_per_second": 394.577, + "eval_steps_per_second": 1.542, + "step": 1038000 + }, + { + "epoch": 4.228991260722953, + "grad_norm": 4.683993339538574, + "learning_rate": 0.0014118089924369357, + "loss": 7.5954, + "step": 1038100 + }, + { + "epoch": 4.229398638746335, + "grad_norm": 8.449949264526367, + "learning_rate": 0.001411363075534735, + "loss": 7.5867, + "step": 1038200 + }, + { + "epoch": 4.229806016769716, + "grad_norm": 7.157567977905273, + "learning_rate": 0.0014109172018698503, + "loss": 7.6008, + "step": 1038300 + }, + { + "epoch": 4.230213394793098, + "grad_norm": 8.54863166809082, + "learning_rate": 0.0014104713714599103, + "loss": 7.583, + "step": 1038400 + }, + { + "epoch": 4.230620772816479, + "grad_norm": 9.954252243041992, + "learning_rate": 0.0014100255843225405, + "loss": 7.5946, + "step": 1038500 + }, + { + "epoch": 4.231028150839861, + "grad_norm": 13.772327423095703, + "learning_rate": 0.0014095798404753664, + "loss": 7.6321, + "step": 1038600 + }, + { + "epoch": 4.231435528863242, + "grad_norm": 7.2670135498046875, + "learning_rate": 0.001409134139936011, + "loss": 7.5994, + "step": 1038700 + }, + { + "epoch": 4.231842906886624, + "grad_norm": 3.6352944374084473, + "learning_rate": 0.001408688482722093, + "loss": 7.5996, + "step": 1038800 + }, + { + "epoch": 4.232250284910005, + "grad_norm": 10.875031471252441, + "learning_rate": 0.0014082428688512328, + "loss": 7.6008, + "step": 1038900 + }, + { + "epoch": 4.232657662933387, + "grad_norm": 7.153680801391602, + "learning_rate": 0.0014077972983410488, + "loss": 7.6032, + "step": 1039000 + }, + { + "epoch": 4.232657662933387, + "eval_MaskedAccuracy": 0.51030481513918, + "eval_loss": 1.59966242313385, + "eval_runtime": 202.6644, + "eval_samples_per_second": 313.207, + "eval_steps_per_second": 1.224, + "step": 1039000 + }, + { + "epoch": 4.233065040956768, + "grad_norm": 4.558526039123535, + "learning_rate": 0.0014073517712091558, + "loss": 7.586, + "step": 1039100 + }, + { + "epoch": 4.233472418980149, + "grad_norm": 6.087801456451416, + "learning_rate": 0.001406906287473165, + "loss": 7.5847, + "step": 1039200 + }, + { + "epoch": 4.233879797003531, + "grad_norm": 6.982261657714844, + "learning_rate": 0.0014064608471506957, + "loss": 7.5899, + "step": 1039300 + }, + { + "epoch": 4.234287175026912, + "grad_norm": 14.22995376586914, + "learning_rate": 0.001406015450259353, + "loss": 7.6126, + "step": 1039400 + }, + { + "epoch": 4.234694553050294, + "grad_norm": 11.781092643737793, + "learning_rate": 0.001405570096816747, + "loss": 7.5737, + "step": 1039500 + }, + { + "epoch": 4.235101931073675, + "grad_norm": 11.608367919921875, + "learning_rate": 0.0014051247868404847, + "loss": 7.584, + "step": 1039600 + }, + { + "epoch": 4.235509309097057, + "grad_norm": 5.197314262390137, + "learning_rate": 0.0014046795203481724, + "loss": 7.5912, + "step": 1039700 + }, + { + "epoch": 4.235916687120438, + "grad_norm": 13.955909729003906, + "learning_rate": 0.0014042342973574148, + "loss": 7.5847, + "step": 1039800 + }, + { + "epoch": 4.23632406514382, + "grad_norm": 4.1139349937438965, + "learning_rate": 0.0014037891178858126, + "loss": 7.5895, + "step": 1039900 + }, + { + "epoch": 4.236731443167201, + "grad_norm": 3.392300605773926, + "learning_rate": 0.001403343981950965, + "loss": 7.581, + "step": 1040000 + }, + { + "epoch": 4.236731443167201, + "eval_MaskedAccuracy": 0.5107230557023007, + "eval_loss": 1.5984443426132202, + "eval_runtime": 160.6273, + "eval_samples_per_second": 395.176, + "eval_steps_per_second": 1.544, + "step": 1040000 + }, + { + "epoch": 4.237138821190583, + "grad_norm": 4.306652069091797, + "learning_rate": 0.0014028988895704718, + "loss": 7.6129, + "step": 1040100 + }, + { + "epoch": 4.2375461992139645, + "grad_norm": 6.447711944580078, + "learning_rate": 0.00140245384076193, + "loss": 7.5864, + "step": 1040200 + }, + { + "epoch": 4.237953577237345, + "grad_norm": 7.224948406219482, + "learning_rate": 0.0014020088355429353, + "loss": 7.5909, + "step": 1040300 + }, + { + "epoch": 4.238360955260727, + "grad_norm": 7.60656213760376, + "learning_rate": 0.0014015638739310813, + "loss": 7.5982, + "step": 1040400 + }, + { + "epoch": 4.238768333284108, + "grad_norm": 5.7304205894470215, + "learning_rate": 0.0014011189559439577, + "loss": 7.5931, + "step": 1040500 + }, + { + "epoch": 4.23917571130749, + "grad_norm": 3.2884790897369385, + "learning_rate": 0.001400674081599157, + "loss": 7.5858, + "step": 1040600 + }, + { + "epoch": 4.239583089330871, + "grad_norm": 5.337141513824463, + "learning_rate": 0.0014002292509142655, + "loss": 7.5829, + "step": 1040700 + }, + { + "epoch": 4.239990467354253, + "grad_norm": 13.813345909118652, + "learning_rate": 0.0013997844639068696, + "loss": 7.6143, + "step": 1040800 + }, + { + "epoch": 4.240397845377634, + "grad_norm": 10.081512451171875, + "learning_rate": 0.0013993397205945566, + "loss": 7.5724, + "step": 1040900 + }, + { + "epoch": 4.240805223401016, + "grad_norm": 5.806445121765137, + "learning_rate": 0.0013988950209949072, + "loss": 7.5967, + "step": 1041000 + }, + { + "epoch": 4.240805223401016, + "eval_MaskedAccuracy": 0.5106415551820171, + "eval_loss": 1.5973789691925049, + "eval_runtime": 151.5209, + "eval_samples_per_second": 418.926, + "eval_steps_per_second": 1.637, + "step": 1041000 + }, + { + "epoch": 4.241212601424397, + "grad_norm": 6.213423252105713, + "learning_rate": 0.001398450365125503, + "loss": 7.5996, + "step": 1041100 + }, + { + "epoch": 4.241619979447779, + "grad_norm": 3.1893868446350098, + "learning_rate": 0.0013980057530039243, + "loss": 7.5974, + "step": 1041200 + }, + { + "epoch": 4.24202735747116, + "grad_norm": 4.202600479125977, + "learning_rate": 0.0013975611846477483, + "loss": 7.5891, + "step": 1041300 + }, + { + "epoch": 4.242434735494542, + "grad_norm": 3.976889133453369, + "learning_rate": 0.0013971166600745522, + "loss": 7.6055, + "step": 1041400 + }, + { + "epoch": 4.2428421135179235, + "grad_norm": 4.6882429122924805, + "learning_rate": 0.001396672179301909, + "loss": 7.5942, + "step": 1041500 + }, + { + "epoch": 4.243249491541304, + "grad_norm": 8.26848030090332, + "learning_rate": 0.0013962277423473952, + "loss": 7.6036, + "step": 1041600 + }, + { + "epoch": 4.243656869564686, + "grad_norm": 7.022709846496582, + "learning_rate": 0.0013957833492285775, + "loss": 7.6177, + "step": 1041700 + }, + { + "epoch": 4.244064247588067, + "grad_norm": 6.45651388168335, + "learning_rate": 0.0013953389999630248, + "loss": 7.6096, + "step": 1041800 + }, + { + "epoch": 4.244471625611449, + "grad_norm": 10.376928329467773, + "learning_rate": 0.001394894694568308, + "loss": 7.6, + "step": 1041900 + }, + { + "epoch": 4.24487900363483, + "grad_norm": 4.617563247680664, + "learning_rate": 0.0013944504330619897, + "loss": 7.5957, + "step": 1042000 + }, + { + "epoch": 4.24487900363483, + "eval_MaskedAccuracy": 0.5098486364057767, + "eval_loss": 1.6006395816802979, + "eval_runtime": 166.303, + "eval_samples_per_second": 381.689, + "eval_steps_per_second": 1.491, + "step": 1042000 + }, + { + "epoch": 4.245286381658212, + "grad_norm": 12.784701347351074, + "learning_rate": 0.0013940062154616363, + "loss": 7.5798, + "step": 1042100 + }, + { + "epoch": 4.245693759681593, + "grad_norm": 5.599425792694092, + "learning_rate": 0.00139356204178481, + "loss": 7.5941, + "step": 1042200 + }, + { + "epoch": 4.246101137704975, + "grad_norm": 4.726510524749756, + "learning_rate": 0.00139311791204907, + "loss": 7.6388, + "step": 1042300 + }, + { + "epoch": 4.246508515728356, + "grad_norm": 2.8609824180603027, + "learning_rate": 0.001392673826271974, + "loss": 7.6139, + "step": 1042400 + }, + { + "epoch": 4.246915893751738, + "grad_norm": 3.4965760707855225, + "learning_rate": 0.0013922297844710836, + "loss": 7.5804, + "step": 1042500 + }, + { + "epoch": 4.2473232717751195, + "grad_norm": 6.381969928741455, + "learning_rate": 0.0013917857866639495, + "loss": 7.6251, + "step": 1042600 + }, + { + "epoch": 4.247730649798501, + "grad_norm": 17.653200149536133, + "learning_rate": 0.0013913418328681286, + "loss": 7.5774, + "step": 1042700 + }, + { + "epoch": 4.2481380278218825, + "grad_norm": 6.616718292236328, + "learning_rate": 0.0013908979231011703, + "loss": 7.6292, + "step": 1042800 + }, + { + "epoch": 4.248545405845263, + "grad_norm": 8.231491088867188, + "learning_rate": 0.0013904540573806265, + "loss": 7.5937, + "step": 1042900 + }, + { + "epoch": 4.248952783868645, + "grad_norm": 6.566020965576172, + "learning_rate": 0.0013900102357240441, + "loss": 7.5842, + "step": 1043000 + }, + { + "epoch": 4.248952783868645, + "eval_MaskedAccuracy": 0.5103305871340506, + "eval_loss": 1.597257375717163, + "eval_runtime": 163.7009, + "eval_samples_per_second": 387.756, + "eval_steps_per_second": 1.515, + "step": 1043000 + }, + { + "epoch": 4.249360161892026, + "grad_norm": 9.477441787719727, + "learning_rate": 0.0013895664581489715, + "loss": 7.6107, + "step": 1043100 + }, + { + "epoch": 4.249767539915408, + "grad_norm": 7.334949970245361, + "learning_rate": 0.001389122724672952, + "loss": 7.6041, + "step": 1043200 + }, + { + "epoch": 4.250174917938789, + "grad_norm": 6.031586647033691, + "learning_rate": 0.0013886790353135295, + "loss": 7.591, + "step": 1043300 + }, + { + "epoch": 4.250582295962171, + "grad_norm": 9.586661338806152, + "learning_rate": 0.0013882353900882443, + "loss": 7.5904, + "step": 1043400 + }, + { + "epoch": 4.250989673985552, + "grad_norm": 5.399155616760254, + "learning_rate": 0.0013877917890146392, + "loss": 7.6182, + "step": 1043500 + }, + { + "epoch": 4.251397052008934, + "grad_norm": 9.838690757751465, + "learning_rate": 0.0013873482321102477, + "loss": 7.5695, + "step": 1043600 + }, + { + "epoch": 4.2518044300323155, + "grad_norm": 4.266875267028809, + "learning_rate": 0.0013869047193926082, + "loss": 7.6018, + "step": 1043700 + }, + { + "epoch": 4.252211808055697, + "grad_norm": 3.9155490398406982, + "learning_rate": 0.001386461250879256, + "loss": 7.6015, + "step": 1043800 + }, + { + "epoch": 4.2526191860790785, + "grad_norm": 12.214604377746582, + "learning_rate": 0.0013860178265877234, + "loss": 7.5713, + "step": 1043900 + }, + { + "epoch": 4.25302656410246, + "grad_norm": 7.104584693908691, + "learning_rate": 0.0013855744465355415, + "loss": 7.5933, + "step": 1044000 + }, + { + "epoch": 4.25302656410246, + "eval_MaskedAccuracy": 0.5102716451615913, + "eval_loss": 1.5962787866592407, + "eval_runtime": 161.6995, + "eval_samples_per_second": 392.555, + "eval_steps_per_second": 1.534, + "step": 1044000 + }, + { + "epoch": 4.253433942125841, + "grad_norm": 12.101835250854492, + "learning_rate": 0.0013851311107402372, + "loss": 7.6064, + "step": 1044100 + }, + { + "epoch": 4.253841320149222, + "grad_norm": 2.8748786449432373, + "learning_rate": 0.0013846878192193396, + "loss": 7.5928, + "step": 1044200 + }, + { + "epoch": 4.254248698172604, + "grad_norm": 3.9991018772125244, + "learning_rate": 0.0013842445719903747, + "loss": 7.611, + "step": 1044300 + }, + { + "epoch": 4.254656076195985, + "grad_norm": 7.150573253631592, + "learning_rate": 0.0013838013690708664, + "loss": 7.6046, + "step": 1044400 + }, + { + "epoch": 4.255063454219367, + "grad_norm": 18.1768798828125, + "learning_rate": 0.001383358210478336, + "loss": 7.5745, + "step": 1044500 + }, + { + "epoch": 4.255470832242748, + "grad_norm": 6.408421039581299, + "learning_rate": 0.001382915096230304, + "loss": 7.6166, + "step": 1044600 + }, + { + "epoch": 4.25587821026613, + "grad_norm": 4.033831596374512, + "learning_rate": 0.0013824720263442894, + "loss": 7.595, + "step": 1044700 + }, + { + "epoch": 4.256285588289511, + "grad_norm": 18.95318031311035, + "learning_rate": 0.0013820290008378102, + "loss": 7.6036, + "step": 1044800 + }, + { + "epoch": 4.256692966312893, + "grad_norm": 15.923903465270996, + "learning_rate": 0.0013815860197283812, + "loss": 7.6255, + "step": 1044900 + }, + { + "epoch": 4.2571003443362745, + "grad_norm": 5.607954502105713, + "learning_rate": 0.0013811430830335137, + "loss": 7.6046, + "step": 1045000 + }, + { + "epoch": 4.2571003443362745, + "eval_MaskedAccuracy": 0.5101724898143549, + "eval_loss": 1.6080166101455688, + "eval_runtime": 158.8932, + "eval_samples_per_second": 399.489, + "eval_steps_per_second": 1.561, + "step": 1045000 + }, + { + "epoch": 4.257507722359656, + "grad_norm": 7.38274621963501, + "learning_rate": 0.0013807001907707212, + "loss": 7.5877, + "step": 1045100 + }, + { + "epoch": 4.2579151003830376, + "grad_norm": 4.9756903648376465, + "learning_rate": 0.0013802573429575144, + "loss": 7.5956, + "step": 1045200 + }, + { + "epoch": 4.258322478406418, + "grad_norm": 4.759303092956543, + "learning_rate": 0.0013798145396114007, + "loss": 7.5925, + "step": 1045300 + }, + { + "epoch": 4.2587298564298, + "grad_norm": 6.458241939544678, + "learning_rate": 0.001379371780749885, + "loss": 7.5946, + "step": 1045400 + }, + { + "epoch": 4.259137234453181, + "grad_norm": 5.849318504333496, + "learning_rate": 0.0013789290663904726, + "loss": 7.5974, + "step": 1045500 + }, + { + "epoch": 4.259544612476563, + "grad_norm": 3.0111122131347656, + "learning_rate": 0.0013784863965506677, + "loss": 7.5756, + "step": 1045600 + }, + { + "epoch": 4.259951990499944, + "grad_norm": 3.285658121109009, + "learning_rate": 0.0013780437712479705, + "loss": 7.6009, + "step": 1045700 + }, + { + "epoch": 4.260359368523326, + "grad_norm": 3.3373794555664062, + "learning_rate": 0.0013776011904998813, + "loss": 7.5979, + "step": 1045800 + }, + { + "epoch": 4.260766746546707, + "grad_norm": 6.0788679122924805, + "learning_rate": 0.0013771586543238974, + "loss": 7.608, + "step": 1045900 + }, + { + "epoch": 4.261174124570089, + "grad_norm": 6.220348834991455, + "learning_rate": 0.0013767161627375128, + "loss": 7.6254, + "step": 1046000 + }, + { + "epoch": 4.261174124570089, + "eval_MaskedAccuracy": 0.5105576711595019, + "eval_loss": 1.605486512184143, + "eval_runtime": 155.2058, + "eval_samples_per_second": 408.98, + "eval_steps_per_second": 1.598, + "step": 1046000 + }, + { + "epoch": 4.2615815025934705, + "grad_norm": 8.3060302734375, + "learning_rate": 0.0013762737157582215, + "loss": 7.5906, + "step": 1046100 + }, + { + "epoch": 4.261988880616852, + "grad_norm": 6.216963291168213, + "learning_rate": 0.0013758313134035185, + "loss": 7.5889, + "step": 1046200 + }, + { + "epoch": 4.2623962586402335, + "grad_norm": 10.733205795288086, + "learning_rate": 0.001375388955690892, + "loss": 7.6103, + "step": 1046300 + }, + { + "epoch": 4.262803636663615, + "grad_norm": 4.733798503875732, + "learning_rate": 0.0013749466426378333, + "loss": 7.5805, + "step": 1046400 + }, + { + "epoch": 4.263211014686997, + "grad_norm": 9.358062744140625, + "learning_rate": 0.0013745043742618272, + "loss": 7.6144, + "step": 1046500 + }, + { + "epoch": 4.263618392710377, + "grad_norm": 5.283135414123535, + "learning_rate": 0.0013740621505803597, + "loss": 7.5833, + "step": 1046600 + }, + { + "epoch": 4.264025770733759, + "grad_norm": 13.439477920532227, + "learning_rate": 0.001373619971610914, + "loss": 7.5921, + "step": 1046700 + }, + { + "epoch": 4.26443314875714, + "grad_norm": 10.138379096984863, + "learning_rate": 0.001373177837370972, + "loss": 7.5857, + "step": 1046800 + }, + { + "epoch": 4.264840526780522, + "grad_norm": 10.753410339355469, + "learning_rate": 0.001372735747878013, + "loss": 7.6175, + "step": 1046900 + }, + { + "epoch": 4.265247904803903, + "grad_norm": 8.986906051635742, + "learning_rate": 0.0013722937031495144, + "loss": 7.5749, + "step": 1047000 + }, + { + "epoch": 4.265247904803903, + "eval_MaskedAccuracy": 0.510165483161183, + "eval_loss": 1.5947030782699585, + "eval_runtime": 175.7149, + "eval_samples_per_second": 361.244, + "eval_steps_per_second": 1.411, + "step": 1047000 + }, + { + "epoch": 4.265655282827285, + "grad_norm": 5.9366912841796875, + "learning_rate": 0.0013718517032029558, + "loss": 7.611, + "step": 1047100 + }, + { + "epoch": 4.266062660850666, + "grad_norm": 4.973095893859863, + "learning_rate": 0.00137140974805581, + "loss": 7.587, + "step": 1047200 + }, + { + "epoch": 4.266470038874048, + "grad_norm": 9.203110694885254, + "learning_rate": 0.001370967837725549, + "loss": 7.5996, + "step": 1047300 + }, + { + "epoch": 4.2668774168974295, + "grad_norm": 17.141218185424805, + "learning_rate": 0.0013705259722296443, + "loss": 7.6149, + "step": 1047400 + }, + { + "epoch": 4.267284794920811, + "grad_norm": 3.210751533508301, + "learning_rate": 0.001370084151585565, + "loss": 7.5693, + "step": 1047500 + }, + { + "epoch": 4.267692172944193, + "grad_norm": 11.829336166381836, + "learning_rate": 0.0013696423758107795, + "loss": 7.6087, + "step": 1047600 + }, + { + "epoch": 4.268099550967574, + "grad_norm": 5.585712909698486, + "learning_rate": 0.0013692006449227518, + "loss": 7.5905, + "step": 1047700 + }, + { + "epoch": 4.268506928990956, + "grad_norm": 3.361020088195801, + "learning_rate": 0.0013687589589389479, + "loss": 7.5976, + "step": 1047800 + }, + { + "epoch": 4.268914307014336, + "grad_norm": 6.276754856109619, + "learning_rate": 0.001368317317876831, + "loss": 7.5851, + "step": 1047900 + }, + { + "epoch": 4.269321685037718, + "grad_norm": 7.628347873687744, + "learning_rate": 0.0013678757217538587, + "loss": 7.6, + "step": 1048000 + }, + { + "epoch": 4.269321685037718, + "eval_MaskedAccuracy": 0.510544796005558, + "eval_loss": 1.601022720336914, + "eval_runtime": 155.935, + "eval_samples_per_second": 407.067, + "eval_steps_per_second": 1.59, + "step": 1048000 + }, + { + "epoch": 4.269729063061099, + "grad_norm": 4.361343860626221, + "learning_rate": 0.0013674341705874912, + "loss": 7.5912, + "step": 1048100 + }, + { + "epoch": 4.270136441084481, + "grad_norm": 10.683218002319336, + "learning_rate": 0.0013669926643951852, + "loss": 7.5705, + "step": 1048200 + }, + { + "epoch": 4.270543819107862, + "grad_norm": 5.449821472167969, + "learning_rate": 0.0013665512031943957, + "loss": 7.5813, + "step": 1048300 + }, + { + "epoch": 4.270951197131244, + "grad_norm": 6.593095779418945, + "learning_rate": 0.0013661097870025756, + "loss": 7.6175, + "step": 1048400 + }, + { + "epoch": 4.2713585751546255, + "grad_norm": 5.898374557495117, + "learning_rate": 0.001365668415837177, + "loss": 7.6092, + "step": 1048500 + }, + { + "epoch": 4.271765953178007, + "grad_norm": 21.504728317260742, + "learning_rate": 0.0013652270897156489, + "loss": 7.5872, + "step": 1048600 + }, + { + "epoch": 4.2721733312013885, + "grad_norm": 3.979430913925171, + "learning_rate": 0.001364785808655441, + "loss": 7.5815, + "step": 1048700 + }, + { + "epoch": 4.27258070922477, + "grad_norm": 15.458041191101074, + "learning_rate": 0.0013643445726739976, + "loss": 7.5795, + "step": 1048800 + }, + { + "epoch": 4.272988087248152, + "grad_norm": 6.766093730926514, + "learning_rate": 0.001363903381788764, + "loss": 7.5902, + "step": 1048900 + }, + { + "epoch": 4.273395465271533, + "grad_norm": 11.906927108764648, + "learning_rate": 0.0013634622360171836, + "loss": 7.5898, + "step": 1049000 + }, + { + "epoch": 4.273395465271533, + "eval_MaskedAccuracy": 0.5105699382233186, + "eval_loss": 1.6041675806045532, + "eval_runtime": 157.58, + "eval_samples_per_second": 402.818, + "eval_steps_per_second": 1.574, + "step": 1049000 + }, + { + "epoch": 4.273802843294914, + "grad_norm": 5.8816375732421875, + "learning_rate": 0.0013630211353766943, + "loss": 7.5897, + "step": 1049100 + }, + { + "epoch": 4.274210221318295, + "grad_norm": 7.39239501953125, + "learning_rate": 0.0013625800798847394, + "loss": 7.6126, + "step": 1049200 + }, + { + "epoch": 4.274617599341677, + "grad_norm": 8.080638885498047, + "learning_rate": 0.001362139069558753, + "loss": 7.6085, + "step": 1049300 + }, + { + "epoch": 4.275024977365058, + "grad_norm": 4.267389297485352, + "learning_rate": 0.0013616981044161716, + "loss": 7.6104, + "step": 1049400 + }, + { + "epoch": 4.27543235538844, + "grad_norm": 10.538228988647461, + "learning_rate": 0.0013612571844744306, + "loss": 7.6089, + "step": 1049500 + }, + { + "epoch": 4.275839733411821, + "grad_norm": 5.671195030212402, + "learning_rate": 0.0013608163097509603, + "loss": 7.5492, + "step": 1049600 + }, + { + "epoch": 4.276247111435203, + "grad_norm": 8.128995895385742, + "learning_rate": 0.0013603754802631908, + "loss": 7.6204, + "step": 1049700 + }, + { + "epoch": 4.2766544894585845, + "grad_norm": 15.104260444641113, + "learning_rate": 0.0013599346960285505, + "loss": 7.6049, + "step": 1049800 + }, + { + "epoch": 4.277061867481966, + "grad_norm": 6.141758918762207, + "learning_rate": 0.0013594939570644633, + "loss": 7.5944, + "step": 1049900 + }, + { + "epoch": 4.277469245505348, + "grad_norm": 6.137265682220459, + "learning_rate": 0.001359053263388357, + "loss": 7.5827, + "step": 1050000 + }, + { + "epoch": 4.277469245505348, + "eval_MaskedAccuracy": 0.5104539762579546, + "eval_loss": 1.604261040687561, + "eval_runtime": 160.1805, + "eval_samples_per_second": 396.278, + "eval_steps_per_second": 1.548, + "step": 1050000 + }, + { + "epoch": 4.277876623528729, + "grad_norm": 6.1768035888671875, + "learning_rate": 0.001358612615017653, + "loss": 7.5911, + "step": 1050100 + }, + { + "epoch": 4.278284001552111, + "grad_norm": 2.9127068519592285, + "learning_rate": 0.0013581720119697752, + "loss": 7.6041, + "step": 1050200 + }, + { + "epoch": 4.278691379575491, + "grad_norm": 5.4770426750183105, + "learning_rate": 0.0013577314542621418, + "loss": 7.573, + "step": 1050300 + }, + { + "epoch": 4.279098757598873, + "grad_norm": 2.914484977722168, + "learning_rate": 0.0013572909419121686, + "loss": 7.5765, + "step": 1050400 + }, + { + "epoch": 4.279506135622254, + "grad_norm": 4.840549945831299, + "learning_rate": 0.001356850474937274, + "loss": 7.6156, + "step": 1050500 + }, + { + "epoch": 4.279913513645636, + "grad_norm": 10.579102516174316, + "learning_rate": 0.0013564100533548715, + "loss": 7.5796, + "step": 1050600 + }, + { + "epoch": 4.280320891669017, + "grad_norm": 8.241044998168945, + "learning_rate": 0.0013559696771823734, + "loss": 7.5974, + "step": 1050700 + }, + { + "epoch": 4.280728269692399, + "grad_norm": 17.360485076904297, + "learning_rate": 0.001355529346437188, + "loss": 7.6033, + "step": 1050800 + }, + { + "epoch": 4.2811356477157805, + "grad_norm": 12.6903657913208, + "learning_rate": 0.001355089061136727, + "loss": 7.6016, + "step": 1050900 + }, + { + "epoch": 4.281543025739162, + "grad_norm": 11.648893356323242, + "learning_rate": 0.001354648821298394, + "loss": 7.6138, + "step": 1051000 + }, + { + "epoch": 4.281543025739162, + "eval_MaskedAccuracy": 0.5107127187312243, + "eval_loss": 1.595258355140686, + "eval_runtime": 158.3991, + "eval_samples_per_second": 400.735, + "eval_steps_per_second": 1.566, + "step": 1051000 + }, + { + "epoch": 4.2819504037625435, + "grad_norm": 10.427696228027344, + "learning_rate": 0.0013542086269395966, + "loss": 7.6047, + "step": 1051100 + }, + { + "epoch": 4.282357781785925, + "grad_norm": 4.498067378997803, + "learning_rate": 0.0013537684780777367, + "loss": 7.6153, + "step": 1051200 + }, + { + "epoch": 4.282765159809307, + "grad_norm": 6.52211332321167, + "learning_rate": 0.0013533283747302147, + "loss": 7.6057, + "step": 1051300 + }, + { + "epoch": 4.283172537832688, + "grad_norm": 3.8469252586364746, + "learning_rate": 0.0013528883169144329, + "loss": 7.6022, + "step": 1051400 + }, + { + "epoch": 4.28357991585607, + "grad_norm": 15.737393379211426, + "learning_rate": 0.0013524483046477884, + "loss": 7.5877, + "step": 1051500 + }, + { + "epoch": 4.28398729387945, + "grad_norm": 9.453332901000977, + "learning_rate": 0.0013520083379476757, + "loss": 7.6081, + "step": 1051600 + }, + { + "epoch": 4.284394671902832, + "grad_norm": 4.673506736755371, + "learning_rate": 0.0013515684168314904, + "loss": 7.6194, + "step": 1051700 + }, + { + "epoch": 4.284802049926213, + "grad_norm": 5.137613296508789, + "learning_rate": 0.0013511285413166255, + "loss": 7.6118, + "step": 1051800 + }, + { + "epoch": 4.285209427949595, + "grad_norm": 4.768191814422607, + "learning_rate": 0.001350688711420469, + "loss": 7.5906, + "step": 1051900 + }, + { + "epoch": 4.2856168059729765, + "grad_norm": 10.33692455291748, + "learning_rate": 0.0013502489271604132, + "loss": 7.573, + "step": 1052000 + }, + { + "epoch": 4.2856168059729765, + "eval_MaskedAccuracy": 0.5105846314590503, + "eval_loss": 1.6071923971176147, + "eval_runtime": 175.5904, + "eval_samples_per_second": 361.5, + "eval_steps_per_second": 1.412, + "step": 1052000 + }, + { + "epoch": 4.286024183996358, + "grad_norm": 4.2482171058654785, + "learning_rate": 0.0013498091885538435, + "loss": 7.6372, + "step": 1052100 + }, + { + "epoch": 4.2864315620197395, + "grad_norm": 8.275416374206543, + "learning_rate": 0.0013493694956181429, + "loss": 7.5969, + "step": 1052200 + }, + { + "epoch": 4.286838940043121, + "grad_norm": 8.151731491088867, + "learning_rate": 0.0013489298483706984, + "loss": 7.586, + "step": 1052300 + }, + { + "epoch": 4.287246318066503, + "grad_norm": 7.892337799072266, + "learning_rate": 0.0013484902468288904, + "loss": 7.5684, + "step": 1052400 + }, + { + "epoch": 4.287653696089884, + "grad_norm": 8.394124984741211, + "learning_rate": 0.001348050691010097, + "loss": 7.6165, + "step": 1052500 + }, + { + "epoch": 4.288061074113266, + "grad_norm": 6.7744879722595215, + "learning_rate": 0.0013476111809316964, + "loss": 7.5881, + "step": 1052600 + }, + { + "epoch": 4.288468452136647, + "grad_norm": 4.3457770347595215, + "learning_rate": 0.0013471717166110664, + "loss": 7.5829, + "step": 1052700 + }, + { + "epoch": 4.288875830160029, + "grad_norm": 6.017189979553223, + "learning_rate": 0.0013467322980655826, + "loss": 7.5883, + "step": 1052800 + }, + { + "epoch": 4.289283208183409, + "grad_norm": 8.758934020996094, + "learning_rate": 0.0013462929253126165, + "loss": 7.5988, + "step": 1052900 + }, + { + "epoch": 4.289690586206791, + "grad_norm": 4.954868793487549, + "learning_rate": 0.001345853598369537, + "loss": 7.5871, + "step": 1053000 + }, + { + "epoch": 4.289690586206791, + "eval_MaskedAccuracy": 0.5101855293138061, + "eval_loss": 1.5933858156204224, + "eval_runtime": 158.1265, + "eval_samples_per_second": 401.425, + "eval_steps_per_second": 1.568, + "step": 1053000 + }, + { + "epoch": 4.290097964230172, + "grad_norm": 3.5732669830322266, + "learning_rate": 0.0013454143172537143, + "loss": 7.6088, + "step": 1053100 + }, + { + "epoch": 4.290505342253554, + "grad_norm": 5.973149299621582, + "learning_rate": 0.0013449750819825156, + "loss": 7.5845, + "step": 1053200 + }, + { + "epoch": 4.2909127202769355, + "grad_norm": 7.148528575897217, + "learning_rate": 0.0013445358925733074, + "loss": 7.5924, + "step": 1053300 + }, + { + "epoch": 4.291320098300317, + "grad_norm": 4.60052490234375, + "learning_rate": 0.0013440967490434504, + "loss": 7.5947, + "step": 1053400 + }, + { + "epoch": 4.2917274763236986, + "grad_norm": 4.36650276184082, + "learning_rate": 0.0013436576514103096, + "loss": 7.5904, + "step": 1053500 + }, + { + "epoch": 4.29213485434708, + "grad_norm": 5.558254718780518, + "learning_rate": 0.001343218599691242, + "loss": 7.6059, + "step": 1053600 + }, + { + "epoch": 4.292542232370462, + "grad_norm": 8.331778526306152, + "learning_rate": 0.0013427795939036072, + "loss": 7.6071, + "step": 1053700 + }, + { + "epoch": 4.292949610393843, + "grad_norm": 4.540841102600098, + "learning_rate": 0.0013423406340647614, + "loss": 7.6168, + "step": 1053800 + }, + { + "epoch": 4.293356988417225, + "grad_norm": 13.898836135864258, + "learning_rate": 0.0013419017201920572, + "loss": 7.5812, + "step": 1053900 + }, + { + "epoch": 4.293764366440606, + "grad_norm": 4.2923736572265625, + "learning_rate": 0.001341462852302848, + "loss": 7.6087, + "step": 1054000 + }, + { + "epoch": 4.293764366440606, + "eval_MaskedAccuracy": 0.5105729284503422, + "eval_loss": 1.599921464920044, + "eval_runtime": 158.362, + "eval_samples_per_second": 400.828, + "eval_steps_per_second": 1.566, + "step": 1054000 + }, + { + "epoch": 4.294171744463987, + "grad_norm": 9.693986892700195, + "learning_rate": 0.0013410240304144857, + "loss": 7.6009, + "step": 1054100 + }, + { + "epoch": 4.294579122487368, + "grad_norm": 5.510178565979004, + "learning_rate": 0.0013405852545443201, + "loss": 7.6049, + "step": 1054200 + }, + { + "epoch": 4.29498650051075, + "grad_norm": 6.429800987243652, + "learning_rate": 0.0013401465247096964, + "loss": 7.598, + "step": 1054300 + }, + { + "epoch": 4.2953938785341315, + "grad_norm": 9.711774826049805, + "learning_rate": 0.0013397078409279613, + "loss": 7.5863, + "step": 1054400 + }, + { + "epoch": 4.295801256557513, + "grad_norm": 6.588146686553955, + "learning_rate": 0.0013392692032164566, + "loss": 7.5982, + "step": 1054500 + }, + { + "epoch": 4.2962086345808945, + "grad_norm": 4.1650261878967285, + "learning_rate": 0.0013388306115925262, + "loss": 7.5887, + "step": 1054600 + }, + { + "epoch": 4.296616012604276, + "grad_norm": 4.303426742553711, + "learning_rate": 0.0013383920660735079, + "loss": 7.598, + "step": 1054700 + }, + { + "epoch": 4.297023390627658, + "grad_norm": 6.5245771408081055, + "learning_rate": 0.0013379535666767429, + "loss": 7.5787, + "step": 1054800 + }, + { + "epoch": 4.297430768651039, + "grad_norm": 5.145477294921875, + "learning_rate": 0.0013375151134195633, + "loss": 7.5935, + "step": 1054900 + }, + { + "epoch": 4.297838146674421, + "grad_norm": 3.6686227321624756, + "learning_rate": 0.0013370767063193049, + "loss": 7.5919, + "step": 1055000 + }, + { + "epoch": 4.297838146674421, + "eval_MaskedAccuracy": 0.5110081258676045, + "eval_loss": 1.5933496952056885, + "eval_runtime": 155.7441, + "eval_samples_per_second": 407.566, + "eval_steps_per_second": 1.592, + "step": 1055000 + }, + { + "epoch": 4.298245524697802, + "grad_norm": 10.672622680664062, + "learning_rate": 0.0013366383453933017, + "loss": 7.607, + "step": 1055100 + }, + { + "epoch": 4.298652902721184, + "grad_norm": 4.603048801422119, + "learning_rate": 0.0013362000306588828, + "loss": 7.6094, + "step": 1055200 + }, + { + "epoch": 4.299060280744564, + "grad_norm": 4.876296043395996, + "learning_rate": 0.001335761762133378, + "loss": 7.5943, + "step": 1055300 + }, + { + "epoch": 4.299467658767946, + "grad_norm": 3.4054372310638428, + "learning_rate": 0.0013353235398341147, + "loss": 7.5754, + "step": 1055400 + }, + { + "epoch": 4.299875036791327, + "grad_norm": 6.237635135650635, + "learning_rate": 0.0013348853637784183, + "loss": 7.6041, + "step": 1055500 + }, + { + "epoch": 4.300282414814709, + "grad_norm": 6.513390064239502, + "learning_rate": 0.0013344472339836128, + "loss": 7.5821, + "step": 1055600 + }, + { + "epoch": 4.3006897928380905, + "grad_norm": 4.691460609436035, + "learning_rate": 0.0013340091504670183, + "loss": 7.5944, + "step": 1055700 + }, + { + "epoch": 4.301097170861472, + "grad_norm": 10.522651672363281, + "learning_rate": 0.0013335711132459538, + "loss": 7.6126, + "step": 1055800 + }, + { + "epoch": 4.301504548884854, + "grad_norm": 4.642693042755127, + "learning_rate": 0.001333133122337738, + "loss": 7.6016, + "step": 1055900 + }, + { + "epoch": 4.301911926908235, + "grad_norm": 6.95733118057251, + "learning_rate": 0.0013326951777596892, + "loss": 7.6056, + "step": 1056000 + }, + { + "epoch": 4.301911926908235, + "eval_MaskedAccuracy": 0.5115172713143512, + "eval_loss": 1.5941381454467773, + "eval_runtime": 166.1302, + "eval_samples_per_second": 382.086, + "eval_steps_per_second": 1.493, + "step": 1056000 + }, + { + "epoch": 4.302319304931617, + "grad_norm": 5.869209289550781, + "learning_rate": 0.0013322572795291196, + "loss": 7.5703, + "step": 1056100 + }, + { + "epoch": 4.302726682954998, + "grad_norm": 5.532618999481201, + "learning_rate": 0.0013318194276633426, + "loss": 7.611, + "step": 1056200 + }, + { + "epoch": 4.30313406097838, + "grad_norm": 11.812193870544434, + "learning_rate": 0.0013313816221796673, + "loss": 7.5935, + "step": 1056300 + }, + { + "epoch": 4.303541439001761, + "grad_norm": 6.252383708953857, + "learning_rate": 0.001330943863095403, + "loss": 7.6089, + "step": 1056400 + }, + { + "epoch": 4.303948817025143, + "grad_norm": 5.153746128082275, + "learning_rate": 0.001330506150427856, + "loss": 7.6089, + "step": 1056500 + }, + { + "epoch": 4.304356195048523, + "grad_norm": 9.104736328125, + "learning_rate": 0.0013300684841943335, + "loss": 7.5613, + "step": 1056600 + }, + { + "epoch": 4.304763573071905, + "grad_norm": 12.152100563049316, + "learning_rate": 0.0013296308644121396, + "loss": 7.616, + "step": 1056700 + }, + { + "epoch": 4.3051709510952865, + "grad_norm": 4.863597393035889, + "learning_rate": 0.0013291932910985728, + "loss": 7.6433, + "step": 1056800 + }, + { + "epoch": 4.305578329118668, + "grad_norm": 5.482358455657959, + "learning_rate": 0.001328755764270934, + "loss": 7.5724, + "step": 1056900 + }, + { + "epoch": 4.3059857071420495, + "grad_norm": 15.2509183883667, + "learning_rate": 0.0013283182839465178, + "loss": 7.6107, + "step": 1057000 + }, + { + "epoch": 4.3059857071420495, + "eval_MaskedAccuracy": 0.5105492409469372, + "eval_loss": 1.58795166015625, + "eval_runtime": 163.1464, + "eval_samples_per_second": 389.074, + "eval_steps_per_second": 1.52, + "step": 1057000 + }, + { + "epoch": 4.306393085165431, + "grad_norm": 4.565687656402588, + "learning_rate": 0.0013278808501426232, + "loss": 7.5928, + "step": 1057100 + }, + { + "epoch": 4.306800463188813, + "grad_norm": 7.921701431274414, + "learning_rate": 0.0013274434628765457, + "loss": 7.6182, + "step": 1057200 + }, + { + "epoch": 4.307207841212194, + "grad_norm": 8.388190269470215, + "learning_rate": 0.0013270061221655762, + "loss": 7.6275, + "step": 1057300 + }, + { + "epoch": 4.307615219235576, + "grad_norm": 6.327116012573242, + "learning_rate": 0.0013265688280270057, + "loss": 7.6206, + "step": 1057400 + }, + { + "epoch": 4.308022597258957, + "grad_norm": 3.048905611038208, + "learning_rate": 0.0013261315804781223, + "loss": 7.6101, + "step": 1057500 + }, + { + "epoch": 4.308429975282339, + "grad_norm": 2.9544801712036133, + "learning_rate": 0.0013256943795362124, + "loss": 7.5938, + "step": 1057600 + }, + { + "epoch": 4.30883735330572, + "grad_norm": 4.046011447906494, + "learning_rate": 0.0013252572252185609, + "loss": 7.6113, + "step": 1057700 + }, + { + "epoch": 4.309244731329102, + "grad_norm": 4.219917297363281, + "learning_rate": 0.0013248201175424508, + "loss": 7.5877, + "step": 1057800 + }, + { + "epoch": 4.309652109352482, + "grad_norm": 6.405755996704102, + "learning_rate": 0.0013243830565251619, + "loss": 7.5735, + "step": 1057900 + }, + { + "epoch": 4.310059487375864, + "grad_norm": 7.971467018127441, + "learning_rate": 0.0013239460421839771, + "loss": 7.6056, + "step": 1058000 + }, + { + "epoch": 4.310059487375864, + "eval_MaskedAccuracy": 0.5106496008305519, + "eval_loss": 1.5981202125549316, + "eval_runtime": 168.8455, + "eval_samples_per_second": 375.941, + "eval_steps_per_second": 1.469, + "step": 1058000 + }, + { + "epoch": 4.3104668653992455, + "grad_norm": 3.9189813137054443, + "learning_rate": 0.001323509074536172, + "loss": 7.6137, + "step": 1058100 + }, + { + "epoch": 4.310874243422627, + "grad_norm": 4.11265754699707, + "learning_rate": 0.001323072153599021, + "loss": 7.5849, + "step": 1058200 + }, + { + "epoch": 4.311281621446009, + "grad_norm": 5.129466533660889, + "learning_rate": 0.0013226352793898007, + "loss": 7.6074, + "step": 1058300 + }, + { + "epoch": 4.31168899946939, + "grad_norm": 9.669137001037598, + "learning_rate": 0.001322198451925782, + "loss": 7.6006, + "step": 1058400 + }, + { + "epoch": 4.312096377492772, + "grad_norm": 5.684308052062988, + "learning_rate": 0.0013217616712242343, + "loss": 7.5882, + "step": 1058500 + }, + { + "epoch": 4.312503755516153, + "grad_norm": 10.73115348815918, + "learning_rate": 0.0013213249373024265, + "loss": 7.5805, + "step": 1058600 + }, + { + "epoch": 4.312911133539535, + "grad_norm": 3.2344930171966553, + "learning_rate": 0.0013208882501776244, + "loss": 7.6101, + "step": 1058700 + }, + { + "epoch": 4.313318511562916, + "grad_norm": 16.917919158935547, + "learning_rate": 0.001320451609867093, + "loss": 7.5866, + "step": 1058800 + }, + { + "epoch": 4.313725889586298, + "grad_norm": 5.793231010437012, + "learning_rate": 0.0013200150163880935, + "loss": 7.6191, + "step": 1058900 + }, + { + "epoch": 4.314133267609679, + "grad_norm": 10.23379898071289, + "learning_rate": 0.0013195784697578887, + "loss": 7.5969, + "step": 1059000 + }, + { + "epoch": 4.314133267609679, + "eval_MaskedAccuracy": 0.5108911332159818, + "eval_loss": 1.5956271886825562, + "eval_runtime": 165.7102, + "eval_samples_per_second": 383.054, + "eval_steps_per_second": 1.497, + "step": 1059000 + }, + { + "epoch": 4.31454064563306, + "grad_norm": 11.354375839233398, + "learning_rate": 0.0013191419699937383, + "loss": 7.5962, + "step": 1059100 + }, + { + "epoch": 4.3149480236564415, + "grad_norm": 6.699918746948242, + "learning_rate": 0.0013187055171128969, + "loss": 7.5858, + "step": 1059200 + }, + { + "epoch": 4.315355401679823, + "grad_norm": 8.553930282592773, + "learning_rate": 0.0013182691111326198, + "loss": 7.5681, + "step": 1059300 + }, + { + "epoch": 4.3157627797032045, + "grad_norm": 4.089259147644043, + "learning_rate": 0.001317832752070163, + "loss": 7.6107, + "step": 1059400 + }, + { + "epoch": 4.316170157726586, + "grad_norm": 7.99910831451416, + "learning_rate": 0.0013173964399427762, + "loss": 7.5744, + "step": 1059500 + }, + { + "epoch": 4.316577535749968, + "grad_norm": 11.323901176452637, + "learning_rate": 0.0013169601747677108, + "loss": 7.5732, + "step": 1059600 + }, + { + "epoch": 4.316984913773349, + "grad_norm": 5.703823566436768, + "learning_rate": 0.001316523956562213, + "loss": 7.5945, + "step": 1059700 + }, + { + "epoch": 4.317392291796731, + "grad_norm": 14.853983879089355, + "learning_rate": 0.0013160877853435307, + "loss": 7.6008, + "step": 1059800 + }, + { + "epoch": 4.317799669820112, + "grad_norm": 8.825891494750977, + "learning_rate": 0.0013156516611289068, + "loss": 7.605, + "step": 1059900 + }, + { + "epoch": 4.318207047843494, + "grad_norm": 4.946917533874512, + "learning_rate": 0.0013152155839355844, + "loss": 7.5915, + "step": 1060000 + }, + { + "epoch": 4.318207047843494, + "eval_MaskedAccuracy": 0.5103674138807739, + "eval_loss": 1.5986543893814087, + "eval_runtime": 161.7593, + "eval_samples_per_second": 392.41, + "eval_steps_per_second": 1.533, + "step": 1060000 + }, + { + "epoch": 4.318614425866875, + "grad_norm": 4.960884094238281, + "learning_rate": 0.0013147795537808021, + "loss": 7.608, + "step": 1060100 + }, + { + "epoch": 4.319021803890257, + "grad_norm": 15.583209991455078, + "learning_rate": 0.0013143435706818, + "loss": 7.5951, + "step": 1060200 + }, + { + "epoch": 4.3194291819136374, + "grad_norm": 10.425910949707031, + "learning_rate": 0.001313907634655815, + "loss": 7.5948, + "step": 1060300 + }, + { + "epoch": 4.319836559937019, + "grad_norm": 8.672053337097168, + "learning_rate": 0.0013134717457200823, + "loss": 7.5649, + "step": 1060400 + }, + { + "epoch": 4.3202439379604005, + "grad_norm": 9.029831886291504, + "learning_rate": 0.001313035903891833, + "loss": 7.5745, + "step": 1060500 + }, + { + "epoch": 4.320651315983782, + "grad_norm": 5.3896355628967285, + "learning_rate": 0.0013126001091883005, + "loss": 7.5787, + "step": 1060600 + }, + { + "epoch": 4.321058694007164, + "grad_norm": 8.003243446350098, + "learning_rate": 0.0013121643616267134, + "loss": 7.5771, + "step": 1060700 + }, + { + "epoch": 4.321466072030545, + "grad_norm": 3.931807279586792, + "learning_rate": 0.001311728661224299, + "loss": 7.6167, + "step": 1060800 + }, + { + "epoch": 4.321873450053927, + "grad_norm": 2.4016737937927246, + "learning_rate": 0.0013112930079982807, + "loss": 7.5967, + "step": 1060900 + }, + { + "epoch": 4.322280828077308, + "grad_norm": 7.975673675537109, + "learning_rate": 0.001310857401965886, + "loss": 7.6061, + "step": 1061000 + }, + { + "epoch": 4.322280828077308, + "eval_MaskedAccuracy": 0.510862917167637, + "eval_loss": 1.5919030904769897, + "eval_runtime": 158.5265, + "eval_samples_per_second": 400.413, + "eval_steps_per_second": 1.564, + "step": 1061000 + }, + { + "epoch": 4.32268820610069, + "grad_norm": 7.915943145751953, + "learning_rate": 0.001310421843144334, + "loss": 7.6006, + "step": 1061100 + }, + { + "epoch": 4.323095584124071, + "grad_norm": 10.837762832641602, + "learning_rate": 0.0013099863315508451, + "loss": 7.5836, + "step": 1061200 + }, + { + "epoch": 4.323502962147453, + "grad_norm": 10.817831993103027, + "learning_rate": 0.0013095508672026379, + "loss": 7.567, + "step": 1061300 + }, + { + "epoch": 4.323910340170834, + "grad_norm": 5.905236721038818, + "learning_rate": 0.001309115450116929, + "loss": 7.5539, + "step": 1061400 + }, + { + "epoch": 4.324317718194216, + "grad_norm": 10.229114532470703, + "learning_rate": 0.0013086800803109317, + "loss": 7.6162, + "step": 1061500 + }, + { + "epoch": 4.3247250962175965, + "grad_norm": 8.440672874450684, + "learning_rate": 0.0013082447578018604, + "loss": 7.6045, + "step": 1061600 + }, + { + "epoch": 4.325132474240978, + "grad_norm": 5.581210136413574, + "learning_rate": 0.0013078094826069252, + "loss": 7.6123, + "step": 1061700 + }, + { + "epoch": 4.3255398522643596, + "grad_norm": 16.999338150024414, + "learning_rate": 0.001307374254743332, + "loss": 7.5482, + "step": 1061800 + }, + { + "epoch": 4.325947230287741, + "grad_norm": 4.336997032165527, + "learning_rate": 0.0013069390742282895, + "loss": 7.5791, + "step": 1061900 + }, + { + "epoch": 4.326354608311123, + "grad_norm": 9.085204124450684, + "learning_rate": 0.0013065039410790047, + "loss": 7.5664, + "step": 1062000 + }, + { + "epoch": 4.326354608311123, + "eval_MaskedAccuracy": 0.5104186976277901, + "eval_loss": 1.5993964672088623, + "eval_runtime": 157.3186, + "eval_samples_per_second": 403.487, + "eval_steps_per_second": 1.576, + "step": 1062000 + }, + { + "epoch": 4.326761986334504, + "grad_norm": 4.992656230926514, + "learning_rate": 0.001306068855312679, + "loss": 7.6227, + "step": 1062100 + }, + { + "epoch": 4.327169364357886, + "grad_norm": 8.597797393798828, + "learning_rate": 0.0013056338169465133, + "loss": 7.5815, + "step": 1062200 + }, + { + "epoch": 4.327576742381267, + "grad_norm": 9.000292778015137, + "learning_rate": 0.0013051988259977084, + "loss": 7.6169, + "step": 1062300 + }, + { + "epoch": 4.327984120404649, + "grad_norm": 15.233333587646484, + "learning_rate": 0.0013047638824834605, + "loss": 7.5652, + "step": 1062400 + }, + { + "epoch": 4.32839149842803, + "grad_norm": 9.209403991699219, + "learning_rate": 0.001304328986420965, + "loss": 7.5839, + "step": 1062500 + }, + { + "epoch": 4.328798876451412, + "grad_norm": 11.246329307556152, + "learning_rate": 0.0013038941378274167, + "loss": 7.6156, + "step": 1062600 + }, + { + "epoch": 4.329206254474793, + "grad_norm": 8.702144622802734, + "learning_rate": 0.0013034593367200076, + "loss": 7.5804, + "step": 1062700 + }, + { + "epoch": 4.329613632498175, + "grad_norm": 6.944125652313232, + "learning_rate": 0.0013030245831159271, + "loss": 7.5906, + "step": 1062800 + }, + { + "epoch": 4.3300210105215555, + "grad_norm": 10.930323600769043, + "learning_rate": 0.0013025898770323624, + "loss": 7.5963, + "step": 1062900 + }, + { + "epoch": 4.330428388544937, + "grad_norm": 5.019044876098633, + "learning_rate": 0.0013021552184865027, + "loss": 7.5525, + "step": 1063000 + }, + { + "epoch": 4.330428388544937, + "eval_MaskedAccuracy": 0.5101939467960676, + "eval_loss": 1.5975756645202637, + "eval_runtime": 158.1521, + "eval_samples_per_second": 401.36, + "eval_steps_per_second": 1.568, + "step": 1063000 + }, + { + "epoch": 4.330835766568319, + "grad_norm": 8.213278770446777, + "learning_rate": 0.001301720607495529, + "loss": 7.6018, + "step": 1063100 + }, + { + "epoch": 4.3312431445917, + "grad_norm": 6.062457084655762, + "learning_rate": 0.0013012860440766243, + "loss": 7.5964, + "step": 1063200 + }, + { + "epoch": 4.331650522615082, + "grad_norm": 3.820427894592285, + "learning_rate": 0.0013008515282469705, + "loss": 7.5949, + "step": 1063300 + }, + { + "epoch": 4.332057900638463, + "grad_norm": 6.628716945648193, + "learning_rate": 0.0013004170600237471, + "loss": 7.607, + "step": 1063400 + }, + { + "epoch": 4.332465278661845, + "grad_norm": 3.9690921306610107, + "learning_rate": 0.0012999826394241298, + "loss": 7.596, + "step": 1063500 + }, + { + "epoch": 4.332872656685226, + "grad_norm": 5.554694175720215, + "learning_rate": 0.001299548266465293, + "loss": 7.5683, + "step": 1063600 + }, + { + "epoch": 4.333280034708608, + "grad_norm": 10.183398246765137, + "learning_rate": 0.0012991139411644094, + "loss": 7.6026, + "step": 1063700 + }, + { + "epoch": 4.333687412731989, + "grad_norm": 5.579558372497559, + "learning_rate": 0.001298679663538653, + "loss": 7.5957, + "step": 1063800 + }, + { + "epoch": 4.334094790755371, + "grad_norm": 5.486879348754883, + "learning_rate": 0.0012982454336051906, + "loss": 7.6042, + "step": 1063900 + }, + { + "epoch": 4.334502168778752, + "grad_norm": 10.786160469055176, + "learning_rate": 0.001297811251381193, + "loss": 7.5803, + "step": 1064000 + }, + { + "epoch": 4.334502168778752, + "eval_MaskedAccuracy": 0.51076194041149, + "eval_loss": 1.5966955423355103, + "eval_runtime": 173.4607, + "eval_samples_per_second": 365.939, + "eval_steps_per_second": 1.43, + "step": 1064000 + }, + { + "epoch": 4.334909546802133, + "grad_norm": 8.863340377807617, + "learning_rate": 0.0012973771168838214, + "loss": 7.5892, + "step": 1064100 + }, + { + "epoch": 4.335316924825515, + "grad_norm": 4.055553436279297, + "learning_rate": 0.0012969430301302412, + "loss": 7.6095, + "step": 1064200 + }, + { + "epoch": 4.335724302848896, + "grad_norm": 9.943106651306152, + "learning_rate": 0.0012965089911376128, + "loss": 7.5997, + "step": 1064300 + }, + { + "epoch": 4.336131680872278, + "grad_norm": 8.628669738769531, + "learning_rate": 0.001296074999923099, + "loss": 7.5851, + "step": 1064400 + }, + { + "epoch": 4.336539058895659, + "grad_norm": 7.279753684997559, + "learning_rate": 0.0012956410565038562, + "loss": 7.5732, + "step": 1064500 + }, + { + "epoch": 4.336946436919041, + "grad_norm": 5.713074684143066, + "learning_rate": 0.001295207160897041, + "loss": 7.6035, + "step": 1064600 + }, + { + "epoch": 4.337353814942422, + "grad_norm": 6.424903392791748, + "learning_rate": 0.001294773313119807, + "loss": 7.5754, + "step": 1064700 + }, + { + "epoch": 4.337761192965804, + "grad_norm": 7.36214017868042, + "learning_rate": 0.0012943395131893049, + "loss": 7.5841, + "step": 1064800 + }, + { + "epoch": 4.338168570989185, + "grad_norm": 10.240035057067871, + "learning_rate": 0.0012939057611226885, + "loss": 7.5897, + "step": 1064900 + }, + { + "epoch": 4.338575949012567, + "grad_norm": 5.936344623565674, + "learning_rate": 0.001293472056937105, + "loss": 7.5685, + "step": 1065000 + }, + { + "epoch": 4.338575949012567, + "eval_MaskedAccuracy": 0.509844156468321, + "eval_loss": 1.6008100509643555, + "eval_runtime": 158.0993, + "eval_samples_per_second": 401.495, + "eval_steps_per_second": 1.569, + "step": 1065000 + }, + { + "epoch": 4.338983327035948, + "grad_norm": 5.6455864906311035, + "learning_rate": 0.0012930384006496994, + "loss": 7.5927, + "step": 1065100 + }, + { + "epoch": 4.33939070505933, + "grad_norm": 7.265775203704834, + "learning_rate": 0.0012926047922776187, + "loss": 7.6001, + "step": 1065200 + }, + { + "epoch": 4.3397980830827105, + "grad_norm": 9.657461166381836, + "learning_rate": 0.0012921712318380051, + "loss": 7.5824, + "step": 1065300 + }, + { + "epoch": 4.340205461106092, + "grad_norm": 10.5460786819458, + "learning_rate": 0.0012917377193479994, + "loss": 7.5894, + "step": 1065400 + }, + { + "epoch": 4.340612839129474, + "grad_norm": 4.517817974090576, + "learning_rate": 0.0012913042548247412, + "loss": 7.593, + "step": 1065500 + }, + { + "epoch": 4.341020217152855, + "grad_norm": 12.844802856445312, + "learning_rate": 0.0012908708382853673, + "loss": 7.6035, + "step": 1065600 + }, + { + "epoch": 4.341427595176237, + "grad_norm": 4.840984344482422, + "learning_rate": 0.0012904374697470123, + "loss": 7.5903, + "step": 1065700 + }, + { + "epoch": 4.341834973199618, + "grad_norm": 12.027416229248047, + "learning_rate": 0.00129000414922681, + "loss": 7.5852, + "step": 1065800 + }, + { + "epoch": 4.342242351223, + "grad_norm": 5.543475151062012, + "learning_rate": 0.0012895708767418918, + "loss": 7.5684, + "step": 1065900 + }, + { + "epoch": 4.342649729246381, + "grad_norm": 8.735795021057129, + "learning_rate": 0.0012891376523093865, + "loss": 7.5857, + "step": 1066000 + }, + { + "epoch": 4.342649729246381, + "eval_MaskedAccuracy": 0.5101855763272692, + "eval_loss": 1.5994043350219727, + "eval_runtime": 154.8085, + "eval_samples_per_second": 410.029, + "eval_steps_per_second": 1.602, + "step": 1066000 + }, + { + "epoch": 4.343057107269763, + "grad_norm": 4.258831977844238, + "learning_rate": 0.0012887044759464234, + "loss": 7.6141, + "step": 1066100 + }, + { + "epoch": 4.343464485293144, + "grad_norm": 10.738426208496094, + "learning_rate": 0.0012882713476701272, + "loss": 7.589, + "step": 1066200 + }, + { + "epoch": 4.343871863316526, + "grad_norm": 4.778125762939453, + "learning_rate": 0.0012878382674976223, + "loss": 7.6103, + "step": 1066300 + }, + { + "epoch": 4.344279241339907, + "grad_norm": 6.218105792999268, + "learning_rate": 0.0012874052354460307, + "loss": 7.5739, + "step": 1066400 + }, + { + "epoch": 4.344686619363289, + "grad_norm": 4.286153793334961, + "learning_rate": 0.0012869722515324725, + "loss": 7.5593, + "step": 1066500 + }, + { + "epoch": 4.34509399738667, + "grad_norm": 3.8365478515625, + "learning_rate": 0.0012865393157740653, + "loss": 7.5927, + "step": 1066600 + }, + { + "epoch": 4.345501375410051, + "grad_norm": 11.182649612426758, + "learning_rate": 0.0012861064281879256, + "loss": 7.5998, + "step": 1066700 + }, + { + "epoch": 4.345908753433433, + "grad_norm": 5.147824287414551, + "learning_rate": 0.0012856735887911684, + "loss": 7.6066, + "step": 1066800 + }, + { + "epoch": 4.346316131456814, + "grad_norm": 3.963681936264038, + "learning_rate": 0.0012852407976009048, + "loss": 7.5603, + "step": 1066900 + }, + { + "epoch": 4.346723509480196, + "grad_norm": 12.69094181060791, + "learning_rate": 0.0012848080546342458, + "loss": 7.5934, + "step": 1067000 + }, + { + "epoch": 4.346723509480196, + "eval_MaskedAccuracy": 0.5104228786458219, + "eval_loss": 1.605615258216858, + "eval_runtime": 162.6117, + "eval_samples_per_second": 390.353, + "eval_steps_per_second": 1.525, + "step": 1067000 + }, + { + "epoch": 4.347130887503577, + "grad_norm": 4.976997375488281, + "learning_rate": 0.0012843753599083004, + "loss": 7.6242, + "step": 1067100 + }, + { + "epoch": 4.347538265526959, + "grad_norm": 3.4787328243255615, + "learning_rate": 0.0012839427134401769, + "loss": 7.5837, + "step": 1067200 + }, + { + "epoch": 4.34794564355034, + "grad_norm": 8.680774688720703, + "learning_rate": 0.0012835101152469778, + "loss": 7.5942, + "step": 1067300 + }, + { + "epoch": 4.348353021573722, + "grad_norm": 8.437795639038086, + "learning_rate": 0.0012830775653458076, + "loss": 7.5711, + "step": 1067400 + }, + { + "epoch": 4.348760399597103, + "grad_norm": 7.789732933044434, + "learning_rate": 0.0012826450637537662, + "loss": 7.5909, + "step": 1067500 + }, + { + "epoch": 4.349167777620485, + "grad_norm": 7.138700008392334, + "learning_rate": 0.0012822126104879519, + "loss": 7.6016, + "step": 1067600 + }, + { + "epoch": 4.349575155643866, + "grad_norm": 3.996811628341675, + "learning_rate": 0.0012817802055654635, + "loss": 7.5711, + "step": 1067700 + }, + { + "epoch": 4.349982533667248, + "grad_norm": 8.46236515045166, + "learning_rate": 0.001281347849003394, + "loss": 7.5659, + "step": 1067800 + }, + { + "epoch": 4.350389911690629, + "grad_norm": 15.083024024963379, + "learning_rate": 0.0012809155408188399, + "loss": 7.6039, + "step": 1067900 + }, + { + "epoch": 4.35079728971401, + "grad_norm": 3.8241968154907227, + "learning_rate": 0.00128048328102889, + "loss": 7.5949, + "step": 1068000 + }, + { + "epoch": 4.35079728971401, + "eval_MaskedAccuracy": 0.5108676400674896, + "eval_loss": 1.596187710762024, + "eval_runtime": 153.5642, + "eval_samples_per_second": 413.351, + "eval_steps_per_second": 1.615, + "step": 1068000 + }, + { + "epoch": 4.351204667737392, + "grad_norm": 12.495206832885742, + "learning_rate": 0.001280051069650638, + "loss": 7.589, + "step": 1068100 + }, + { + "epoch": 4.351612045760773, + "grad_norm": 9.804719924926758, + "learning_rate": 0.0012796189067011679, + "loss": 7.586, + "step": 1068200 + }, + { + "epoch": 4.352019423784155, + "grad_norm": 4.911618709564209, + "learning_rate": 0.0012791867921975667, + "loss": 7.5939, + "step": 1068300 + }, + { + "epoch": 4.352426801807536, + "grad_norm": 10.578044891357422, + "learning_rate": 0.0012787547261569163, + "loss": 7.5736, + "step": 1068400 + }, + { + "epoch": 4.352834179830918, + "grad_norm": 4.151943683624268, + "learning_rate": 0.001278322708596301, + "loss": 7.6047, + "step": 1068500 + }, + { + "epoch": 4.353241557854299, + "grad_norm": 11.867549896240234, + "learning_rate": 0.0012778907395327996, + "loss": 7.5943, + "step": 1068600 + }, + { + "epoch": 4.353648935877681, + "grad_norm": 16.011520385742188, + "learning_rate": 0.001277458818983491, + "loss": 7.5673, + "step": 1068700 + }, + { + "epoch": 4.354056313901062, + "grad_norm": 3.1570136547088623, + "learning_rate": 0.0012770269469654513, + "loss": 7.5858, + "step": 1068800 + }, + { + "epoch": 4.354463691924444, + "grad_norm": 11.839057922363281, + "learning_rate": 0.0012765951234957537, + "loss": 7.6403, + "step": 1068900 + }, + { + "epoch": 4.3548710699478255, + "grad_norm": 5.1624650955200195, + "learning_rate": 0.00127616334859147, + "loss": 7.5585, + "step": 1069000 + }, + { + "epoch": 4.3548710699478255, + "eval_MaskedAccuracy": 0.5108027954903482, + "eval_loss": 1.5931923389434814, + "eval_runtime": 152.3651, + "eval_samples_per_second": 416.605, + "eval_steps_per_second": 1.628, + "step": 1069000 + }, + { + "epoch": 4.355278447971206, + "grad_norm": 11.090703964233398, + "learning_rate": 0.0012757316222696718, + "loss": 7.5973, + "step": 1069100 + }, + { + "epoch": 4.355685825994588, + "grad_norm": 16.196626663208008, + "learning_rate": 0.0012752999445474263, + "loss": 7.5732, + "step": 1069200 + }, + { + "epoch": 4.356093204017969, + "grad_norm": 14.063507080078125, + "learning_rate": 0.0012748683154418018, + "loss": 7.6002, + "step": 1069300 + }, + { + "epoch": 4.356500582041351, + "grad_norm": 4.258674621582031, + "learning_rate": 0.0012744367349698624, + "loss": 7.565, + "step": 1069400 + }, + { + "epoch": 4.356907960064732, + "grad_norm": 7.568656921386719, + "learning_rate": 0.0012740052031486708, + "loss": 7.5596, + "step": 1069500 + }, + { + "epoch": 4.357315338088114, + "grad_norm": 8.4083890914917, + "learning_rate": 0.0012735737199952857, + "loss": 7.5934, + "step": 1069600 + }, + { + "epoch": 4.357722716111495, + "grad_norm": 9.756007194519043, + "learning_rate": 0.0012731422855267699, + "loss": 7.5897, + "step": 1069700 + }, + { + "epoch": 4.358130094134877, + "grad_norm": 10.273256301879883, + "learning_rate": 0.0012727108997601757, + "loss": 7.5617, + "step": 1069800 + }, + { + "epoch": 4.358537472158258, + "grad_norm": 12.686280250549316, + "learning_rate": 0.0012722795627125609, + "loss": 7.5649, + "step": 1069900 + }, + { + "epoch": 4.35894485018164, + "grad_norm": 4.117603778839111, + "learning_rate": 0.0012718482744009778, + "loss": 7.5786, + "step": 1070000 + }, + { + "epoch": 4.35894485018164, + "eval_MaskedAccuracy": 0.5102318150094306, + "eval_loss": 1.5989423990249634, + "eval_runtime": 168.4935, + "eval_samples_per_second": 376.727, + "eval_steps_per_second": 1.472, + "step": 1070000 + }, + { + "epoch": 4.359352228205021, + "grad_norm": 6.047852993011475, + "learning_rate": 0.0012714170348424791, + "loss": 7.5837, + "step": 1070100 + }, + { + "epoch": 4.359759606228403, + "grad_norm": 10.343796730041504, + "learning_rate": 0.001270985844054111, + "loss": 7.5884, + "step": 1070200 + }, + { + "epoch": 4.360166984251784, + "grad_norm": 4.143160343170166, + "learning_rate": 0.0012705547020529224, + "loss": 7.5587, + "step": 1070300 + }, + { + "epoch": 4.360574362275165, + "grad_norm": 3.8405749797821045, + "learning_rate": 0.0012701236088559617, + "loss": 7.6016, + "step": 1070400 + }, + { + "epoch": 4.360981740298547, + "grad_norm": 15.33755111694336, + "learning_rate": 0.0012696925644802672, + "loss": 7.5771, + "step": 1070500 + }, + { + "epoch": 4.361389118321928, + "grad_norm": 6.435998916625977, + "learning_rate": 0.0012692615689428833, + "loss": 7.5763, + "step": 1070600 + }, + { + "epoch": 4.36179649634531, + "grad_norm": 7.149518013000488, + "learning_rate": 0.001268830622260848, + "loss": 7.5817, + "step": 1070700 + }, + { + "epoch": 4.362203874368691, + "grad_norm": 5.455778121948242, + "learning_rate": 0.0012683997244511992, + "loss": 7.5637, + "step": 1070800 + }, + { + "epoch": 4.362611252392073, + "grad_norm": 7.155077934265137, + "learning_rate": 0.0012679688755309734, + "loss": 7.5535, + "step": 1070900 + }, + { + "epoch": 4.363018630415454, + "grad_norm": 4.293442726135254, + "learning_rate": 0.0012675380755172044, + "loss": 7.6067, + "step": 1071000 + }, + { + "epoch": 4.363018630415454, + "eval_MaskedAccuracy": 0.5106091601963929, + "eval_loss": 1.5963608026504517, + "eval_runtime": 165.3609, + "eval_samples_per_second": 383.863, + "eval_steps_per_second": 1.5, + "step": 1071000 + }, + { + "epoch": 4.363426008438836, + "grad_norm": 9.340729713439941, + "learning_rate": 0.0012671073244269228, + "loss": 7.6366, + "step": 1071100 + }, + { + "epoch": 4.363833386462217, + "grad_norm": 9.988269805908203, + "learning_rate": 0.0012666766222771578, + "loss": 7.5923, + "step": 1071200 + }, + { + "epoch": 4.364240764485599, + "grad_norm": 6.921746253967285, + "learning_rate": 0.00126624596908494, + "loss": 7.5636, + "step": 1071300 + }, + { + "epoch": 4.3646481425089805, + "grad_norm": 11.134282112121582, + "learning_rate": 0.0012658153648672959, + "loss": 7.5798, + "step": 1071400 + }, + { + "epoch": 4.365055520532362, + "grad_norm": 4.560843467712402, + "learning_rate": 0.0012653848096412477, + "loss": 7.5861, + "step": 1071500 + }, + { + "epoch": 4.365462898555743, + "grad_norm": 9.250143051147461, + "learning_rate": 0.0012649543034238177, + "loss": 7.5835, + "step": 1071600 + }, + { + "epoch": 4.365870276579124, + "grad_norm": 6.468830108642578, + "learning_rate": 0.0012645238462320268, + "loss": 7.5474, + "step": 1071700 + }, + { + "epoch": 4.366277654602506, + "grad_norm": 10.883308410644531, + "learning_rate": 0.0012640934380828924, + "loss": 7.5726, + "step": 1071800 + }, + { + "epoch": 4.366685032625887, + "grad_norm": 13.532042503356934, + "learning_rate": 0.00126366307899343, + "loss": 7.5847, + "step": 1071900 + }, + { + "epoch": 4.367092410649269, + "grad_norm": 9.324649810791016, + "learning_rate": 0.0012632327689806565, + "loss": 7.562, + "step": 1072000 + }, + { + "epoch": 4.367092410649269, + "eval_MaskedAccuracy": 0.5105239092277821, + "eval_loss": 1.5967248678207397, + "eval_runtime": 152.6265, + "eval_samples_per_second": 415.891, + "eval_steps_per_second": 1.625, + "step": 1072000 + }, + { + "epoch": 4.36749978867265, + "grad_norm": 11.059331893920898, + "learning_rate": 0.0012628025080615828, + "loss": 7.598, + "step": 1072100 + }, + { + "epoch": 4.367907166696032, + "grad_norm": 7.78730583190918, + "learning_rate": 0.0012623722962532204, + "loss": 7.5879, + "step": 1072200 + }, + { + "epoch": 4.368314544719413, + "grad_norm": 8.362425804138184, + "learning_rate": 0.0012619421335725762, + "loss": 7.5957, + "step": 1072300 + }, + { + "epoch": 4.368721922742795, + "grad_norm": 9.769828796386719, + "learning_rate": 0.0012615120200366592, + "loss": 7.6245, + "step": 1072400 + }, + { + "epoch": 4.3691293007661764, + "grad_norm": 4.522059917449951, + "learning_rate": 0.0012610819556624709, + "loss": 7.6131, + "step": 1072500 + }, + { + "epoch": 4.369536678789558, + "grad_norm": 11.34514331817627, + "learning_rate": 0.0012606519404670161, + "loss": 7.5927, + "step": 1072600 + }, + { + "epoch": 4.3699440568129395, + "grad_norm": 3.706719160079956, + "learning_rate": 0.0012602219744672959, + "loss": 7.6163, + "step": 1072700 + }, + { + "epoch": 4.370351434836321, + "grad_norm": 7.14650821685791, + "learning_rate": 0.001259792057680308, + "loss": 7.5554, + "step": 1072800 + }, + { + "epoch": 4.370758812859702, + "grad_norm": 9.408866882324219, + "learning_rate": 0.0012593621901230512, + "loss": 7.5682, + "step": 1072900 + }, + { + "epoch": 4.371166190883083, + "grad_norm": 6.3065690994262695, + "learning_rate": 0.0012589323718125189, + "loss": 7.5868, + "step": 1073000 + }, + { + "epoch": 4.371166190883083, + "eval_MaskedAccuracy": 0.5104471424898137, + "eval_loss": 1.5974905490875244, + "eval_runtime": 163.3728, + "eval_samples_per_second": 388.535, + "eval_steps_per_second": 1.518, + "step": 1073000 + }, + { + "epoch": 4.371573568906465, + "grad_norm": 10.61609935760498, + "learning_rate": 0.0012585026027657027, + "loss": 7.6045, + "step": 1073100 + }, + { + "epoch": 4.371980946929846, + "grad_norm": 3.768588066101074, + "learning_rate": 0.0012580728829995958, + "loss": 7.5806, + "step": 1073200 + }, + { + "epoch": 4.372388324953228, + "grad_norm": 10.519930839538574, + "learning_rate": 0.0012576432125311887, + "loss": 7.6019, + "step": 1073300 + }, + { + "epoch": 4.372795702976609, + "grad_norm": 3.8004915714263916, + "learning_rate": 0.001257213591377466, + "loss": 7.6136, + "step": 1073400 + }, + { + "epoch": 4.373203080999991, + "grad_norm": 7.739326477050781, + "learning_rate": 0.001256784019555413, + "loss": 7.5987, + "step": 1073500 + }, + { + "epoch": 4.373610459023372, + "grad_norm": 6.079395771026611, + "learning_rate": 0.0012563544970820137, + "loss": 7.5881, + "step": 1073600 + }, + { + "epoch": 4.374017837046754, + "grad_norm": 15.077213287353516, + "learning_rate": 0.0012559250239742494, + "loss": 7.5886, + "step": 1073700 + }, + { + "epoch": 4.3744252150701355, + "grad_norm": 4.818278789520264, + "learning_rate": 0.0012554956002491006, + "loss": 7.6213, + "step": 1073800 + }, + { + "epoch": 4.374832593093517, + "grad_norm": 15.234819412231445, + "learning_rate": 0.0012550662259235427, + "loss": 7.5854, + "step": 1073900 + }, + { + "epoch": 4.3752399711168986, + "grad_norm": 11.360335350036621, + "learning_rate": 0.0012546369010145533, + "loss": 7.5524, + "step": 1074000 + }, + { + "epoch": 4.3752399711168986, + "eval_MaskedAccuracy": 0.5106042494614875, + "eval_loss": 1.59951651096344, + "eval_runtime": 164.5784, + "eval_samples_per_second": 385.689, + "eval_steps_per_second": 1.507, + "step": 1074000 + }, + { + "epoch": 4.375647349140279, + "grad_norm": 9.735004425048828, + "learning_rate": 0.0012542076255391025, + "loss": 7.5879, + "step": 1074100 + }, + { + "epoch": 4.376054727163661, + "grad_norm": 18.597047805786133, + "learning_rate": 0.0012537783995141651, + "loss": 7.6059, + "step": 1074200 + }, + { + "epoch": 4.376462105187042, + "grad_norm": 14.264388084411621, + "learning_rate": 0.0012533492229567097, + "loss": 7.5719, + "step": 1074300 + }, + { + "epoch": 4.376869483210424, + "grad_norm": 3.560699224472046, + "learning_rate": 0.0012529200958837038, + "loss": 7.6011, + "step": 1074400 + }, + { + "epoch": 4.377276861233805, + "grad_norm": 6.42880916595459, + "learning_rate": 0.0012524910183121121, + "loss": 7.5836, + "step": 1074500 + }, + { + "epoch": 4.377684239257187, + "grad_norm": 4.812469959259033, + "learning_rate": 0.0012520619902589001, + "loss": 7.5817, + "step": 1074600 + }, + { + "epoch": 4.378091617280568, + "grad_norm": 6.2986249923706055, + "learning_rate": 0.0012516330117410296, + "loss": 7.6005, + "step": 1074700 + }, + { + "epoch": 4.37849899530395, + "grad_norm": 8.868752479553223, + "learning_rate": 0.0012512040827754572, + "loss": 7.6097, + "step": 1074800 + }, + { + "epoch": 4.3789063733273315, + "grad_norm": 13.603480339050293, + "learning_rate": 0.001250775203379144, + "loss": 7.5842, + "step": 1074900 + }, + { + "epoch": 4.379313751350713, + "grad_norm": 9.5311279296875, + "learning_rate": 0.0012503463735690453, + "loss": 7.5775, + "step": 1075000 + }, + { + "epoch": 4.379313751350713, + "eval_MaskedAccuracy": 0.5112514387056566, + "eval_loss": 1.5999317169189453, + "eval_runtime": 165.7465, + "eval_samples_per_second": 382.97, + "eval_steps_per_second": 1.496, + "step": 1075000 + }, + { + "epoch": 4.3797211293740945, + "grad_norm": 9.72091293334961, + "learning_rate": 0.001249917593362115, + "loss": 7.6107, + "step": 1075100 + }, + { + "epoch": 4.380128507397476, + "grad_norm": 7.305984973907471, + "learning_rate": 0.001249488862775306, + "loss": 7.6228, + "step": 1075200 + }, + { + "epoch": 4.380535885420857, + "grad_norm": 5.1745829582214355, + "learning_rate": 0.0012490601818255665, + "loss": 7.5918, + "step": 1075300 + }, + { + "epoch": 4.380943263444238, + "grad_norm": 9.361069679260254, + "learning_rate": 0.0012486315505298446, + "loss": 7.578, + "step": 1075400 + }, + { + "epoch": 4.38135064146762, + "grad_norm": 5.21118688583374, + "learning_rate": 0.0012482029689050882, + "loss": 7.5887, + "step": 1075500 + }, + { + "epoch": 4.381758019491001, + "grad_norm": 2.8380625247955322, + "learning_rate": 0.0012477744369682395, + "loss": 7.5884, + "step": 1075600 + }, + { + "epoch": 4.382165397514383, + "grad_norm": 13.930927276611328, + "learning_rate": 0.0012473459547362419, + "loss": 7.5569, + "step": 1075700 + }, + { + "epoch": 4.382572775537764, + "grad_norm": 5.366053581237793, + "learning_rate": 0.0012469175222260345, + "loss": 7.5963, + "step": 1075800 + }, + { + "epoch": 4.382980153561146, + "grad_norm": 3.2593436241149902, + "learning_rate": 0.0012464891394545579, + "loss": 7.6076, + "step": 1075900 + }, + { + "epoch": 4.383387531584527, + "grad_norm": 8.94016170501709, + "learning_rate": 0.0012460608064387459, + "loss": 7.606, + "step": 1076000 + }, + { + "epoch": 4.383387531584527, + "eval_MaskedAccuracy": 0.5099280734437895, + "eval_loss": 1.6040574312210083, + "eval_runtime": 163.1411, + "eval_samples_per_second": 389.086, + "eval_steps_per_second": 1.52, + "step": 1076000 + }, + { + "epoch": 4.383794909607909, + "grad_norm": 4.465366363525391, + "learning_rate": 0.0012456325231955325, + "loss": 7.5911, + "step": 1076100 + }, + { + "epoch": 4.3842022876312905, + "grad_norm": 11.04880142211914, + "learning_rate": 0.001245204289741853, + "loss": 7.5984, + "step": 1076200 + }, + { + "epoch": 4.384609665654672, + "grad_norm": 9.210282325744629, + "learning_rate": 0.0012447761060946346, + "loss": 7.585, + "step": 1076300 + }, + { + "epoch": 4.385017043678054, + "grad_norm": 5.569253444671631, + "learning_rate": 0.0012443479722708069, + "loss": 7.5811, + "step": 1076400 + }, + { + "epoch": 4.385424421701435, + "grad_norm": 5.126228332519531, + "learning_rate": 0.0012439198882872955, + "loss": 7.5979, + "step": 1076500 + }, + { + "epoch": 4.385831799724816, + "grad_norm": 8.947504997253418, + "learning_rate": 0.0012434918541610264, + "loss": 7.5892, + "step": 1076600 + }, + { + "epoch": 4.386239177748197, + "grad_norm": 9.127795219421387, + "learning_rate": 0.00124306386990892, + "loss": 7.5509, + "step": 1076700 + }, + { + "epoch": 4.386646555771579, + "grad_norm": 5.873353004455566, + "learning_rate": 0.0012426359355478965, + "loss": 7.6015, + "step": 1076800 + }, + { + "epoch": 4.38705393379496, + "grad_norm": 10.857064247131348, + "learning_rate": 0.0012422080510948795, + "loss": 7.5986, + "step": 1076900 + }, + { + "epoch": 4.387461311818342, + "grad_norm": 3.498786449432373, + "learning_rate": 0.0012417802165667813, + "loss": 7.6264, + "step": 1077000 + }, + { + "epoch": 4.387461311818342, + "eval_MaskedAccuracy": 0.5107773997206252, + "eval_loss": 1.602007508277893, + "eval_runtime": 168.0554, + "eval_samples_per_second": 377.709, + "eval_steps_per_second": 1.476, + "step": 1077000 + }, + { + "epoch": 4.387868689841723, + "grad_norm": 5.145960330963135, + "learning_rate": 0.0012413524319805163, + "loss": 7.6106, + "step": 1077100 + }, + { + "epoch": 4.388276067865105, + "grad_norm": 7.251046657562256, + "learning_rate": 0.0012409246973529992, + "loss": 7.6026, + "step": 1077200 + }, + { + "epoch": 4.3886834458884865, + "grad_norm": 2.875115394592285, + "learning_rate": 0.001240497012701139, + "loss": 7.5612, + "step": 1077300 + }, + { + "epoch": 4.389090823911868, + "grad_norm": 8.643251419067383, + "learning_rate": 0.0012400693780418447, + "loss": 7.637, + "step": 1077400 + }, + { + "epoch": 4.3894982019352495, + "grad_norm": 9.116021156311035, + "learning_rate": 0.0012396417933920247, + "loss": 7.5846, + "step": 1077500 + }, + { + "epoch": 4.389905579958631, + "grad_norm": 7.570412635803223, + "learning_rate": 0.0012392142587685816, + "loss": 7.6097, + "step": 1077600 + }, + { + "epoch": 4.390312957982013, + "grad_norm": 6.775341033935547, + "learning_rate": 0.0012387867741884189, + "loss": 7.589, + "step": 1077700 + }, + { + "epoch": 4.390720336005394, + "grad_norm": 3.919856309890747, + "learning_rate": 0.0012383593396684373, + "loss": 7.5687, + "step": 1077800 + }, + { + "epoch": 4.391127714028775, + "grad_norm": 5.87698221206665, + "learning_rate": 0.0012379319552255357, + "loss": 7.6009, + "step": 1077900 + }, + { + "epoch": 4.391535092052156, + "grad_norm": 10.432027816772461, + "learning_rate": 0.0012375046208766098, + "loss": 7.5815, + "step": 1078000 + }, + { + "epoch": 4.391535092052156, + "eval_MaskedAccuracy": 0.5110336118225574, + "eval_loss": 1.5920871496200562, + "eval_runtime": 166.6891, + "eval_samples_per_second": 380.805, + "eval_steps_per_second": 1.488, + "step": 1078000 + }, + { + "epoch": 4.391942470075538, + "grad_norm": 8.6183443069458, + "learning_rate": 0.0012370773366385535, + "loss": 7.5998, + "step": 1078100 + }, + { + "epoch": 4.392349848098919, + "grad_norm": 7.473204612731934, + "learning_rate": 0.001236650102528263, + "loss": 7.5326, + "step": 1078200 + }, + { + "epoch": 4.392757226122301, + "grad_norm": 13.099045753479004, + "learning_rate": 0.0012362229185626274, + "loss": 7.585, + "step": 1078300 + }, + { + "epoch": 4.393164604145682, + "grad_norm": 4.473751068115234, + "learning_rate": 0.0012357957847585362, + "loss": 7.577, + "step": 1078400 + }, + { + "epoch": 4.393571982169064, + "grad_norm": 12.312962532043457, + "learning_rate": 0.0012353687011328754, + "loss": 7.6053, + "step": 1078500 + }, + { + "epoch": 4.3939793601924455, + "grad_norm": 4.935724258422852, + "learning_rate": 0.0012349416677025296, + "loss": 7.5757, + "step": 1078600 + }, + { + "epoch": 4.394386738215827, + "grad_norm": 4.584693431854248, + "learning_rate": 0.0012345146844843834, + "loss": 7.5887, + "step": 1078700 + }, + { + "epoch": 4.394794116239209, + "grad_norm": 3.618016481399536, + "learning_rate": 0.0012340877514953165, + "loss": 7.6074, + "step": 1078800 + }, + { + "epoch": 4.39520149426259, + "grad_norm": 13.403838157653809, + "learning_rate": 0.0012336608687522094, + "loss": 7.5633, + "step": 1078900 + }, + { + "epoch": 4.395608872285972, + "grad_norm": 7.863653659820557, + "learning_rate": 0.0012332340362719372, + "loss": 7.5719, + "step": 1079000 + }, + { + "epoch": 4.395608872285972, + "eval_MaskedAccuracy": 0.5112417628732163, + "eval_loss": 1.5996752977371216, + "eval_runtime": 152.9429, + "eval_samples_per_second": 415.031, + "eval_steps_per_second": 1.622, + "step": 1079000 + }, + { + "epoch": 4.396016250309352, + "grad_norm": 6.779383182525635, + "learning_rate": 0.001232807254071375, + "loss": 7.5974, + "step": 1079100 + }, + { + "epoch": 4.396423628332734, + "grad_norm": 7.017134189605713, + "learning_rate": 0.0012323805221673952, + "loss": 7.5547, + "step": 1079200 + }, + { + "epoch": 4.396831006356115, + "grad_norm": 6.765597343444824, + "learning_rate": 0.0012319538405768718, + "loss": 7.5843, + "step": 1079300 + }, + { + "epoch": 4.397238384379497, + "grad_norm": 11.13979434967041, + "learning_rate": 0.0012315272093166708, + "loss": 7.5937, + "step": 1079400 + }, + { + "epoch": 4.397645762402878, + "grad_norm": 12.424825668334961, + "learning_rate": 0.001231100628403661, + "loss": 7.5907, + "step": 1079500 + }, + { + "epoch": 4.39805314042626, + "grad_norm": 5.490167140960693, + "learning_rate": 0.0012306740978547078, + "loss": 7.5813, + "step": 1079600 + }, + { + "epoch": 4.3984605184496415, + "grad_norm": 15.808433532714844, + "learning_rate": 0.0012302476176866723, + "loss": 7.6011, + "step": 1079700 + }, + { + "epoch": 4.398867896473023, + "grad_norm": 3.578862190246582, + "learning_rate": 0.0012298211879164132, + "loss": 7.5841, + "step": 1079800 + }, + { + "epoch": 4.3992752744964045, + "grad_norm": 6.766655921936035, + "learning_rate": 0.001229394808560797, + "loss": 7.5828, + "step": 1079900 + }, + { + "epoch": 4.399682652519786, + "grad_norm": 6.24049711227417, + "learning_rate": 0.001228968479636674, + "loss": 7.5878, + "step": 1080000 + }, + { + "epoch": 4.399682652519786, + "eval_MaskedAccuracy": 0.5112307424352367, + "eval_loss": 1.5944077968597412, + "eval_runtime": 158.2273, + "eval_samples_per_second": 401.17, + "eval_steps_per_second": 1.567, + "step": 1080000 + }, + { + "epoch": 4.400090030543168, + "grad_norm": 15.223663330078125, + "learning_rate": 0.0012285422011609024, + "loss": 7.5534, + "step": 1080100 + }, + { + "epoch": 4.400497408566549, + "grad_norm": 11.505247116088867, + "learning_rate": 0.0012281159731503352, + "loss": 7.5738, + "step": 1080200 + }, + { + "epoch": 4.40090478658993, + "grad_norm": 3.8178861141204834, + "learning_rate": 0.001227689795621821, + "loss": 7.5777, + "step": 1080300 + }, + { + "epoch": 4.401312164613311, + "grad_norm": 6.735739231109619, + "learning_rate": 0.0012272636685922108, + "loss": 7.5766, + "step": 1080400 + }, + { + "epoch": 4.401719542636693, + "grad_norm": 17.2441463470459, + "learning_rate": 0.0012268375920783524, + "loss": 7.5491, + "step": 1080500 + }, + { + "epoch": 4.402126920660074, + "grad_norm": 6.725284099578857, + "learning_rate": 0.0012264115660970895, + "loss": 7.5856, + "step": 1080600 + }, + { + "epoch": 4.402534298683456, + "grad_norm": 10.950495719909668, + "learning_rate": 0.0012259855906652647, + "loss": 7.5661, + "step": 1080700 + }, + { + "epoch": 4.4029416767068374, + "grad_norm": 3.5515589714050293, + "learning_rate": 0.0012255596657997209, + "loss": 7.5844, + "step": 1080800 + }, + { + "epoch": 4.403349054730219, + "grad_norm": 8.926765441894531, + "learning_rate": 0.0012251337915172979, + "loss": 7.5928, + "step": 1080900 + }, + { + "epoch": 4.4037564327536005, + "grad_norm": 8.883869171142578, + "learning_rate": 0.0012247079678348296, + "loss": 7.5739, + "step": 1081000 + }, + { + "epoch": 4.4037564327536005, + "eval_MaskedAccuracy": 0.5111937848838357, + "eval_loss": 1.5912305116653442, + "eval_runtime": 160.2994, + "eval_samples_per_second": 395.984, + "eval_steps_per_second": 1.547, + "step": 1081000 + }, + { + "epoch": 4.404163810776982, + "grad_norm": 7.667311191558838, + "learning_rate": 0.0012242821947691547, + "loss": 7.5878, + "step": 1081100 + }, + { + "epoch": 4.404571188800364, + "grad_norm": 5.586745262145996, + "learning_rate": 0.0012238564723371045, + "loss": 7.567, + "step": 1081200 + }, + { + "epoch": 4.404978566823745, + "grad_norm": 5.00696325302124, + "learning_rate": 0.0012234308005555075, + "loss": 7.5621, + "step": 1081300 + }, + { + "epoch": 4.405385944847127, + "grad_norm": 3.9975998401641846, + "learning_rate": 0.001223005179441196, + "loss": 7.5989, + "step": 1081400 + }, + { + "epoch": 4.405793322870508, + "grad_norm": 10.185312271118164, + "learning_rate": 0.0012225796090109965, + "loss": 7.5867, + "step": 1081500 + }, + { + "epoch": 4.406200700893889, + "grad_norm": 12.204691886901855, + "learning_rate": 0.001222154089281734, + "loss": 7.6038, + "step": 1081600 + }, + { + "epoch": 4.40660807891727, + "grad_norm": 6.501101016998291, + "learning_rate": 0.0012217286202702315, + "loss": 7.5944, + "step": 1081700 + }, + { + "epoch": 4.407015456940652, + "grad_norm": 9.930747985839844, + "learning_rate": 0.0012213032019933099, + "loss": 7.578, + "step": 1081800 + }, + { + "epoch": 4.407422834964033, + "grad_norm": 9.826106071472168, + "learning_rate": 0.00122087783446779, + "loss": 7.5861, + "step": 1081900 + }, + { + "epoch": 4.407830212987415, + "grad_norm": 6.22445011138916, + "learning_rate": 0.001220452517710486, + "loss": 7.5844, + "step": 1082000 + }, + { + "epoch": 4.407830212987415, + "eval_MaskedAccuracy": 0.5105746082038909, + "eval_loss": 1.5927674770355225, + "eval_runtime": 152.06, + "eval_samples_per_second": 417.441, + "eval_steps_per_second": 1.631, + "step": 1082000 + }, + { + "epoch": 4.4082375910107965, + "grad_norm": 11.709757804870605, + "learning_rate": 0.0012200272517382154, + "loss": 7.6239, + "step": 1082100 + }, + { + "epoch": 4.408644969034178, + "grad_norm": 5.423645973205566, + "learning_rate": 0.0012196020365677902, + "loss": 7.5756, + "step": 1082200 + }, + { + "epoch": 4.4090523470575596, + "grad_norm": 8.007949829101562, + "learning_rate": 0.00121917687221602, + "loss": 7.5693, + "step": 1082300 + }, + { + "epoch": 4.409459725080941, + "grad_norm": 16.422317504882812, + "learning_rate": 0.0012187517586997152, + "loss": 7.6059, + "step": 1082400 + }, + { + "epoch": 4.409867103104323, + "grad_norm": 7.6850762367248535, + "learning_rate": 0.001218326696035683, + "loss": 7.5635, + "step": 1082500 + }, + { + "epoch": 4.410274481127704, + "grad_norm": 4.924514293670654, + "learning_rate": 0.0012179016842407268, + "loss": 7.5817, + "step": 1082600 + }, + { + "epoch": 4.410681859151086, + "grad_norm": 3.909653902053833, + "learning_rate": 0.0012174767233316518, + "loss": 7.551, + "step": 1082700 + }, + { + "epoch": 4.411089237174467, + "grad_norm": 10.372475624084473, + "learning_rate": 0.0012170518133252594, + "loss": 7.6428, + "step": 1082800 + }, + { + "epoch": 4.411496615197848, + "grad_norm": 5.996923446655273, + "learning_rate": 0.0012166269542383458, + "loss": 7.58, + "step": 1082900 + }, + { + "epoch": 4.411903993221229, + "grad_norm": 4.6754913330078125, + "learning_rate": 0.0012162021460877098, + "loss": 7.5846, + "step": 1083000 + }, + { + "epoch": 4.411903993221229, + "eval_MaskedAccuracy": 0.5109740300946131, + "eval_loss": 1.5946656465530396, + "eval_runtime": 154.5479, + "eval_samples_per_second": 410.72, + "eval_steps_per_second": 1.605, + "step": 1083000 + }, + { + "epoch": 4.412311371244611, + "grad_norm": 5.254715919494629, + "learning_rate": 0.0012157773888901468, + "loss": 7.5685, + "step": 1083100 + }, + { + "epoch": 4.4127187492679925, + "grad_norm": 5.549961090087891, + "learning_rate": 0.0012153526826624488, + "loss": 7.5656, + "step": 1083200 + }, + { + "epoch": 4.413126127291374, + "grad_norm": 5.245545864105225, + "learning_rate": 0.0012149280274214068, + "loss": 7.5923, + "step": 1083300 + }, + { + "epoch": 4.4135335053147555, + "grad_norm": 4.338502883911133, + "learning_rate": 0.001214503423183809, + "loss": 7.5961, + "step": 1083400 + }, + { + "epoch": 4.413940883338137, + "grad_norm": 4.973672866821289, + "learning_rate": 0.001214078869966443, + "loss": 7.5644, + "step": 1083500 + }, + { + "epoch": 4.414348261361519, + "grad_norm": 5.538344860076904, + "learning_rate": 0.001213654367786093, + "loss": 7.6126, + "step": 1083600 + }, + { + "epoch": 4.4147556393849, + "grad_norm": 11.5521240234375, + "learning_rate": 0.001213229916659542, + "loss": 7.6021, + "step": 1083700 + }, + { + "epoch": 4.415163017408282, + "grad_norm": 7.949106216430664, + "learning_rate": 0.0012128055166035717, + "loss": 7.6021, + "step": 1083800 + }, + { + "epoch": 4.415570395431663, + "grad_norm": 6.9478302001953125, + "learning_rate": 0.0012123811676349604, + "loss": 7.5997, + "step": 1083900 + }, + { + "epoch": 4.415977773455045, + "grad_norm": 7.187282562255859, + "learning_rate": 0.0012119568697704858, + "loss": 7.6215, + "step": 1084000 + }, + { + "epoch": 4.415977773455045, + "eval_MaskedAccuracy": 0.5106979206601311, + "eval_loss": 1.6136956214904785, + "eval_runtime": 159.2963, + "eval_samples_per_second": 398.478, + "eval_steps_per_second": 1.557, + "step": 1084000 + }, + { + "epoch": 4.416385151478425, + "grad_norm": 8.625130653381348, + "learning_rate": 0.0012115326230269213, + "loss": 7.5802, + "step": 1084100 + }, + { + "epoch": 4.416792529501807, + "grad_norm": 4.124234199523926, + "learning_rate": 0.001211108427421041, + "loss": 7.5743, + "step": 1084200 + }, + { + "epoch": 4.417199907525188, + "grad_norm": 16.35880470275879, + "learning_rate": 0.0012106842829696142, + "loss": 7.5847, + "step": 1084300 + }, + { + "epoch": 4.41760728554857, + "grad_norm": 7.555070400238037, + "learning_rate": 0.0012102601896894093, + "loss": 7.6228, + "step": 1084400 + }, + { + "epoch": 4.4180146635719515, + "grad_norm": 8.547143936157227, + "learning_rate": 0.0012098361475971948, + "loss": 7.5763, + "step": 1084500 + }, + { + "epoch": 4.418422041595333, + "grad_norm": 10.39609146118164, + "learning_rate": 0.0012094121567097348, + "loss": 7.5958, + "step": 1084600 + }, + { + "epoch": 4.418829419618715, + "grad_norm": 4.880538463592529, + "learning_rate": 0.0012089882170437894, + "loss": 7.6122, + "step": 1084700 + }, + { + "epoch": 4.419236797642096, + "grad_norm": 5.7425665855407715, + "learning_rate": 0.001208564328616122, + "loss": 7.5848, + "step": 1084800 + }, + { + "epoch": 4.419644175665478, + "grad_norm": 19.172582626342773, + "learning_rate": 0.0012081404914434924, + "loss": 7.5912, + "step": 1084900 + }, + { + "epoch": 4.420051553688859, + "grad_norm": 9.863676071166992, + "learning_rate": 0.0012077167055426544, + "loss": 7.5921, + "step": 1085000 + }, + { + "epoch": 4.420051553688859, + "eval_MaskedAccuracy": 0.5111017514881756, + "eval_loss": 1.5961647033691406, + "eval_runtime": 151.5831, + "eval_samples_per_second": 418.754, + "eval_steps_per_second": 1.636, + "step": 1085000 + }, + { + "epoch": 4.420458931712241, + "grad_norm": 11.566855430603027, + "learning_rate": 0.0012072929709303639, + "loss": 7.5695, + "step": 1085100 + }, + { + "epoch": 4.420866309735622, + "grad_norm": 16.011913299560547, + "learning_rate": 0.0012068692876233723, + "loss": 7.6089, + "step": 1085200 + }, + { + "epoch": 4.421273687759003, + "grad_norm": 11.74954891204834, + "learning_rate": 0.00120644565563843, + "loss": 7.5623, + "step": 1085300 + }, + { + "epoch": 4.421681065782384, + "grad_norm": 12.427276611328125, + "learning_rate": 0.0012060220749922873, + "loss": 7.565, + "step": 1085400 + }, + { + "epoch": 4.422088443805766, + "grad_norm": 4.826947212219238, + "learning_rate": 0.001205598545701689, + "loss": 7.5716, + "step": 1085500 + }, + { + "epoch": 4.4224958218291475, + "grad_norm": 5.644373893737793, + "learning_rate": 0.001205175067783379, + "loss": 7.5933, + "step": 1085600 + }, + { + "epoch": 4.422903199852529, + "grad_norm": 6.411406517028809, + "learning_rate": 0.0012047516412541015, + "loss": 7.5879, + "step": 1085700 + }, + { + "epoch": 4.4233105778759105, + "grad_norm": 6.444009304046631, + "learning_rate": 0.0012043282661305956, + "loss": 7.5915, + "step": 1085800 + }, + { + "epoch": 4.423717955899292, + "grad_norm": 8.614859580993652, + "learning_rate": 0.0012039049424295996, + "loss": 7.5614, + "step": 1085900 + }, + { + "epoch": 4.424125333922674, + "grad_norm": 4.692538261413574, + "learning_rate": 0.001203481670167852, + "loss": 7.5842, + "step": 1086000 + }, + { + "epoch": 4.424125333922674, + "eval_MaskedAccuracy": 0.511321580160768, + "eval_loss": 1.5966092348098755, + "eval_runtime": 151.3759, + "eval_samples_per_second": 419.327, + "eval_steps_per_second": 1.638, + "step": 1086000 + }, + { + "epoch": 4.424532711946055, + "grad_norm": 2.622321844100952, + "learning_rate": 0.0012030584493620826, + "loss": 7.6025, + "step": 1086100 + }, + { + "epoch": 4.424940089969437, + "grad_norm": 14.408987998962402, + "learning_rate": 0.0012026352800290267, + "loss": 7.5859, + "step": 1086200 + }, + { + "epoch": 4.425347467992818, + "grad_norm": 10.01328182220459, + "learning_rate": 0.0012022121621854133, + "loss": 7.5963, + "step": 1086300 + }, + { + "epoch": 4.4257548460162, + "grad_norm": 3.0204148292541504, + "learning_rate": 0.0012017890958479715, + "loss": 7.5842, + "step": 1086400 + }, + { + "epoch": 4.426162224039581, + "grad_norm": 4.547139644622803, + "learning_rate": 0.0012013660810334264, + "loss": 7.6014, + "step": 1086500 + }, + { + "epoch": 4.426569602062962, + "grad_norm": 7.615049362182617, + "learning_rate": 0.0012009431177585042, + "loss": 7.5841, + "step": 1086600 + }, + { + "epoch": 4.426976980086343, + "grad_norm": 15.627482414245605, + "learning_rate": 0.0012005202060399226, + "loss": 7.5759, + "step": 1086700 + }, + { + "epoch": 4.427384358109725, + "grad_norm": 3.7921981811523438, + "learning_rate": 0.0012000973458944047, + "loss": 7.5868, + "step": 1086800 + }, + { + "epoch": 4.4277917361331065, + "grad_norm": 9.941889762878418, + "learning_rate": 0.0011996745373386677, + "loss": 7.5671, + "step": 1086900 + }, + { + "epoch": 4.428199114156488, + "grad_norm": 10.985698699951172, + "learning_rate": 0.0011992517803894277, + "loss": 7.5999, + "step": 1087000 + }, + { + "epoch": 4.428199114156488, + "eval_MaskedAccuracy": 0.5106609983064447, + "eval_loss": 1.6057345867156982, + "eval_runtime": 153.6328, + "eval_samples_per_second": 413.167, + "eval_steps_per_second": 1.614, + "step": 1087000 + }, + { + "epoch": 4.42860649217987, + "grad_norm": 3.85752272605896, + "learning_rate": 0.001198829075063398, + "loss": 7.5876, + "step": 1087100 + }, + { + "epoch": 4.429013870203251, + "grad_norm": 3.826331377029419, + "learning_rate": 0.001198406421377292, + "loss": 7.5646, + "step": 1087200 + }, + { + "epoch": 4.429421248226633, + "grad_norm": 5.111105442047119, + "learning_rate": 0.0011979838193478171, + "loss": 7.6104, + "step": 1087300 + }, + { + "epoch": 4.429828626250014, + "grad_norm": 6.366291046142578, + "learning_rate": 0.0011975612689916837, + "loss": 7.5915, + "step": 1087400 + }, + { + "epoch": 4.430236004273396, + "grad_norm": 8.502144813537598, + "learning_rate": 0.0011971387703255951, + "loss": 7.5837, + "step": 1087500 + }, + { + "epoch": 4.430643382296777, + "grad_norm": 3.6586952209472656, + "learning_rate": 0.0011967163233662554, + "loss": 7.5898, + "step": 1087600 + }, + { + "epoch": 4.431050760320159, + "grad_norm": 2.7903194427490234, + "learning_rate": 0.0011962939281303674, + "loss": 7.5713, + "step": 1087700 + }, + { + "epoch": 4.43145813834354, + "grad_norm": 6.929922103881836, + "learning_rate": 0.0011958715846346305, + "loss": 7.592, + "step": 1087800 + }, + { + "epoch": 4.431865516366921, + "grad_norm": 5.419848442077637, + "learning_rate": 0.001195449292895742, + "loss": 7.5619, + "step": 1087900 + }, + { + "epoch": 4.4322728943903025, + "grad_norm": 5.424269676208496, + "learning_rate": 0.0011950270529303968, + "loss": 7.5614, + "step": 1088000 + }, + { + "epoch": 4.4322728943903025, + "eval_MaskedAccuracy": 0.5111898012969128, + "eval_loss": 1.5937535762786865, + "eval_runtime": 153.6066, + "eval_samples_per_second": 413.237, + "eval_steps_per_second": 1.615, + "step": 1088000 + }, + { + "epoch": 4.432680272413684, + "grad_norm": 3.4336557388305664, + "learning_rate": 0.0011946048647552868, + "loss": 7.5644, + "step": 1088100 + }, + { + "epoch": 4.4330876504370655, + "grad_norm": 6.4227614402771, + "learning_rate": 0.0011941827283871052, + "loss": 7.5607, + "step": 1088200 + }, + { + "epoch": 4.433495028460447, + "grad_norm": 3.375950813293457, + "learning_rate": 0.0011937606438425423, + "loss": 7.5485, + "step": 1088300 + }, + { + "epoch": 4.433902406483829, + "grad_norm": 6.799926280975342, + "learning_rate": 0.001193338611138283, + "loss": 7.5628, + "step": 1088400 + }, + { + "epoch": 4.43430978450721, + "grad_norm": 5.494699001312256, + "learning_rate": 0.0011929166302910145, + "loss": 7.5864, + "step": 1088500 + }, + { + "epoch": 4.434717162530592, + "grad_norm": 6.4925456047058105, + "learning_rate": 0.0011924947013174185, + "loss": 7.545, + "step": 1088600 + }, + { + "epoch": 4.435124540553973, + "grad_norm": 6.94785213470459, + "learning_rate": 0.0011920728242341759, + "loss": 7.5698, + "step": 1088700 + }, + { + "epoch": 4.435531918577355, + "grad_norm": 12.0343599319458, + "learning_rate": 0.0011916509990579683, + "loss": 7.6076, + "step": 1088800 + }, + { + "epoch": 4.435939296600736, + "grad_norm": 4.20711612701416, + "learning_rate": 0.0011912292258054705, + "loss": 7.5962, + "step": 1088900 + }, + { + "epoch": 4.436346674624118, + "grad_norm": 4.755501747131348, + "learning_rate": 0.0011908075044933582, + "loss": 7.5634, + "step": 1089000 + }, + { + "epoch": 4.436346674624118, + "eval_MaskedAccuracy": 0.5104738030339839, + "eval_loss": 1.5926874876022339, + "eval_runtime": 154.5136, + "eval_samples_per_second": 410.812, + "eval_steps_per_second": 1.605, + "step": 1089000 + }, + { + "epoch": 4.4367540526474984, + "grad_norm": 8.358015060424805, + "learning_rate": 0.001190385835138303, + "loss": 7.5721, + "step": 1089100 + }, + { + "epoch": 4.43716143067088, + "grad_norm": 6.558157920837402, + "learning_rate": 0.0011899642177569758, + "loss": 7.5767, + "step": 1089200 + }, + { + "epoch": 4.4375688086942615, + "grad_norm": 4.260998249053955, + "learning_rate": 0.0011895426523660475, + "loss": 7.578, + "step": 1089300 + }, + { + "epoch": 4.437976186717643, + "grad_norm": 6.9902119636535645, + "learning_rate": 0.0011891211389821852, + "loss": 7.5777, + "step": 1089400 + }, + { + "epoch": 4.438383564741025, + "grad_norm": 5.035665512084961, + "learning_rate": 0.0011886996776220512, + "loss": 7.5873, + "step": 1089500 + }, + { + "epoch": 4.438790942764406, + "grad_norm": 4.066616535186768, + "learning_rate": 0.0011882782683023104, + "loss": 7.5808, + "step": 1089600 + }, + { + "epoch": 4.439198320787788, + "grad_norm": 5.067715644836426, + "learning_rate": 0.0011878569110396193, + "loss": 7.5963, + "step": 1089700 + }, + { + "epoch": 4.439605698811169, + "grad_norm": 9.498620986938477, + "learning_rate": 0.0011874356058506406, + "loss": 7.5426, + "step": 1089800 + }, + { + "epoch": 4.440013076834551, + "grad_norm": 4.712803363800049, + "learning_rate": 0.001187014352752028, + "loss": 7.5929, + "step": 1089900 + }, + { + "epoch": 4.440420454857932, + "grad_norm": 11.693345069885254, + "learning_rate": 0.0011865931517604379, + "loss": 7.5669, + "step": 1090000 + }, + { + "epoch": 4.440420454857932, + "eval_MaskedAccuracy": 0.5109665043950206, + "eval_loss": 1.5994694232940674, + "eval_runtime": 151.7163, + "eval_samples_per_second": 418.386, + "eval_steps_per_second": 1.635, + "step": 1090000 + }, + { + "epoch": 4.440827832881314, + "grad_norm": 13.621916770935059, + "learning_rate": 0.0011861720028925205, + "loss": 7.6227, + "step": 1090100 + }, + { + "epoch": 4.441235210904695, + "grad_norm": 11.77079963684082, + "learning_rate": 0.0011857509061649278, + "loss": 7.5836, + "step": 1090200 + }, + { + "epoch": 4.441642588928076, + "grad_norm": 6.685378551483154, + "learning_rate": 0.001185329861594309, + "loss": 7.5556, + "step": 1090300 + }, + { + "epoch": 4.4420499669514575, + "grad_norm": 4.267038822174072, + "learning_rate": 0.0011849088691973082, + "loss": 7.5452, + "step": 1090400 + }, + { + "epoch": 4.442457344974839, + "grad_norm": 13.093615531921387, + "learning_rate": 0.0011844879289905723, + "loss": 7.6044, + "step": 1090500 + }, + { + "epoch": 4.4428647229982206, + "grad_norm": 5.280068874359131, + "learning_rate": 0.00118406704099074, + "loss": 7.5898, + "step": 1090600 + }, + { + "epoch": 4.443272101021602, + "grad_norm": 11.806068420410156, + "learning_rate": 0.0011836462052144503, + "loss": 7.5706, + "step": 1090700 + }, + { + "epoch": 4.443679479044984, + "grad_norm": 6.73701286315918, + "learning_rate": 0.0011832254216783457, + "loss": 7.5986, + "step": 1090800 + }, + { + "epoch": 4.444086857068365, + "grad_norm": 4.740633010864258, + "learning_rate": 0.0011828046903990582, + "loss": 7.6002, + "step": 1090900 + }, + { + "epoch": 4.444494235091747, + "grad_norm": 3.2756428718566895, + "learning_rate": 0.0011823840113932237, + "loss": 7.6033, + "step": 1091000 + }, + { + "epoch": 4.444494235091747, + "eval_MaskedAccuracy": 0.5105687841075114, + "eval_loss": 1.608370304107666, + "eval_runtime": 155.7681, + "eval_samples_per_second": 407.503, + "eval_steps_per_second": 1.592, + "step": 1091000 + }, + { + "epoch": 4.444901613115128, + "grad_norm": 4.4155378341674805, + "learning_rate": 0.0011819633846774718, + "loss": 7.5621, + "step": 1091100 + }, + { + "epoch": 4.44530899113851, + "grad_norm": 10.781631469726562, + "learning_rate": 0.0011815428102684331, + "loss": 7.5782, + "step": 1091200 + }, + { + "epoch": 4.445716369161891, + "grad_norm": 8.093812942504883, + "learning_rate": 0.0011811222881827363, + "loss": 7.5588, + "step": 1091300 + }, + { + "epoch": 4.446123747185273, + "grad_norm": 12.548389434814453, + "learning_rate": 0.0011807018184370065, + "loss": 7.5939, + "step": 1091400 + }, + { + "epoch": 4.446531125208654, + "grad_norm": 7.770724773406982, + "learning_rate": 0.0011802814010478662, + "loss": 7.5897, + "step": 1091500 + }, + { + "epoch": 4.446938503232035, + "grad_norm": 16.370756149291992, + "learning_rate": 0.0011798610360319386, + "loss": 7.5856, + "step": 1091600 + }, + { + "epoch": 4.4473458812554165, + "grad_norm": 10.37841510772705, + "learning_rate": 0.001179440723405841, + "loss": 7.5576, + "step": 1091700 + }, + { + "epoch": 4.447753259278798, + "grad_norm": 3.476844310760498, + "learning_rate": 0.001179020463186191, + "loss": 7.5757, + "step": 1091800 + }, + { + "epoch": 4.44816063730218, + "grad_norm": 4.5110602378845215, + "learning_rate": 0.0011786002553896033, + "loss": 7.5616, + "step": 1091900 + }, + { + "epoch": 4.448568015325561, + "grad_norm": 4.60313081741333, + "learning_rate": 0.0011781801000326922, + "loss": 7.5999, + "step": 1092000 + }, + { + "epoch": 4.448568015325561, + "eval_MaskedAccuracy": 0.5106858458393869, + "eval_loss": 1.5967841148376465, + "eval_runtime": 154.5113, + "eval_samples_per_second": 410.818, + "eval_steps_per_second": 1.605, + "step": 1092000 + }, + { + "epoch": 4.448975393348943, + "grad_norm": 4.803823947906494, + "learning_rate": 0.0011777599971320677, + "loss": 7.5799, + "step": 1092100 + }, + { + "epoch": 4.449382771372324, + "grad_norm": 8.219694137573242, + "learning_rate": 0.0011773399467043405, + "loss": 7.6024, + "step": 1092200 + }, + { + "epoch": 4.449790149395706, + "grad_norm": 5.870816707611084, + "learning_rate": 0.0011769199487661158, + "loss": 7.5963, + "step": 1092300 + }, + { + "epoch": 4.450197527419087, + "grad_norm": 6.064818382263184, + "learning_rate": 0.0011765000033339983, + "loss": 7.585, + "step": 1092400 + }, + { + "epoch": 4.450604905442469, + "grad_norm": 6.240481853485107, + "learning_rate": 0.0011760801104245902, + "loss": 7.5826, + "step": 1092500 + }, + { + "epoch": 4.45101228346585, + "grad_norm": 6.479210376739502, + "learning_rate": 0.0011756602700544927, + "loss": 7.5915, + "step": 1092600 + }, + { + "epoch": 4.451419661489232, + "grad_norm": 4.48755407333374, + "learning_rate": 0.0011752404822403034, + "loss": 7.5721, + "step": 1092700 + }, + { + "epoch": 4.451827039512613, + "grad_norm": 6.282494068145752, + "learning_rate": 0.001174820746998621, + "loss": 7.5633, + "step": 1092800 + }, + { + "epoch": 4.452234417535994, + "grad_norm": 7.3973798751831055, + "learning_rate": 0.0011744010643460392, + "loss": 7.5684, + "step": 1092900 + }, + { + "epoch": 4.452641795559376, + "grad_norm": 10.869181632995605, + "learning_rate": 0.0011739814342991495, + "loss": 7.5983, + "step": 1093000 + }, + { + "epoch": 4.452641795559376, + "eval_MaskedAccuracy": 0.5115278303939732, + "eval_loss": 1.5952904224395752, + "eval_runtime": 161.1135, + "eval_samples_per_second": 393.983, + "eval_steps_per_second": 1.539, + "step": 1093000 + }, + { + "epoch": 4.453049173582757, + "grad_norm": 9.483210563659668, + "learning_rate": 0.0011735618568745423, + "loss": 7.5681, + "step": 1093100 + }, + { + "epoch": 4.453456551606139, + "grad_norm": 22.26955223083496, + "learning_rate": 0.0011731423320888049, + "loss": 7.5787, + "step": 1093200 + }, + { + "epoch": 4.45386392962952, + "grad_norm": 4.262212753295898, + "learning_rate": 0.0011727228599585258, + "loss": 7.5832, + "step": 1093300 + }, + { + "epoch": 4.454271307652902, + "grad_norm": 12.767822265625, + "learning_rate": 0.001172303440500288, + "loss": 7.5825, + "step": 1093400 + }, + { + "epoch": 4.454678685676283, + "grad_norm": 4.370329856872559, + "learning_rate": 0.0011718840737306728, + "loss": 7.5992, + "step": 1093500 + }, + { + "epoch": 4.455086063699665, + "grad_norm": 10.459832191467285, + "learning_rate": 0.0011714647596662608, + "loss": 7.5409, + "step": 1093600 + }, + { + "epoch": 4.455493441723046, + "grad_norm": 5.395259857177734, + "learning_rate": 0.0011710454983236286, + "loss": 7.5962, + "step": 1093700 + }, + { + "epoch": 4.455900819746428, + "grad_norm": 13.553374290466309, + "learning_rate": 0.0011706262897193522, + "loss": 7.5825, + "step": 1093800 + }, + { + "epoch": 4.456308197769809, + "grad_norm": 12.136026382446289, + "learning_rate": 0.0011702071338700055, + "loss": 7.6182, + "step": 1093900 + }, + { + "epoch": 4.456715575793191, + "grad_norm": 8.024489402770996, + "learning_rate": 0.0011697880307921598, + "loss": 7.5562, + "step": 1094000 + }, + { + "epoch": 4.456715575793191, + "eval_MaskedAccuracy": 0.5102432589090533, + "eval_loss": 1.600201964378357, + "eval_runtime": 152.8216, + "eval_samples_per_second": 415.36, + "eval_steps_per_second": 1.623, + "step": 1094000 + }, + { + "epoch": 4.4571229538165715, + "grad_norm": 6.045042514801025, + "learning_rate": 0.0011693689805023838, + "loss": 7.5699, + "step": 1094100 + }, + { + "epoch": 4.457530331839953, + "grad_norm": 8.82064437866211, + "learning_rate": 0.0011689499830172458, + "loss": 7.5747, + "step": 1094200 + }, + { + "epoch": 4.457937709863335, + "grad_norm": 14.469527244567871, + "learning_rate": 0.001168531038353309, + "loss": 7.6117, + "step": 1094300 + }, + { + "epoch": 4.458345087886716, + "grad_norm": 7.1125922203063965, + "learning_rate": 0.0011681121465271403, + "loss": 7.5571, + "step": 1094400 + }, + { + "epoch": 4.458752465910098, + "grad_norm": 7.602787017822266, + "learning_rate": 0.0011676933075552979, + "loss": 7.6027, + "step": 1094500 + }, + { + "epoch": 4.459159843933479, + "grad_norm": 6.530357360839844, + "learning_rate": 0.001167274521454341, + "loss": 7.5878, + "step": 1094600 + }, + { + "epoch": 4.459567221956861, + "grad_norm": 4.430764675140381, + "learning_rate": 0.0011668557882408286, + "loss": 7.5595, + "step": 1094700 + }, + { + "epoch": 4.459974599980242, + "grad_norm": 5.939826011657715, + "learning_rate": 0.0011664371079313123, + "loss": 7.5885, + "step": 1094800 + }, + { + "epoch": 4.460381978003624, + "grad_norm": 9.3687162399292, + "learning_rate": 0.0011660184805423464, + "loss": 7.562, + "step": 1094900 + }, + { + "epoch": 4.460789356027005, + "grad_norm": 10.468751907348633, + "learning_rate": 0.0011655999060904807, + "loss": 7.5865, + "step": 1095000 + }, + { + "epoch": 4.460789356027005, + "eval_MaskedAccuracy": 0.5109743135864614, + "eval_loss": 1.5934405326843262, + "eval_runtime": 169.5334, + "eval_samples_per_second": 374.416, + "eval_steps_per_second": 1.463, + "step": 1095000 + }, + { + "epoch": 4.461196734050387, + "grad_norm": 8.076781272888184, + "learning_rate": 0.0011651813845922653, + "loss": 7.5814, + "step": 1095100 + }, + { + "epoch": 4.461604112073768, + "grad_norm": 6.898649215698242, + "learning_rate": 0.0011647629160642461, + "loss": 7.5781, + "step": 1095200 + }, + { + "epoch": 4.462011490097149, + "grad_norm": 10.0347318649292, + "learning_rate": 0.0011643445005229671, + "loss": 7.5619, + "step": 1095300 + }, + { + "epoch": 4.462418868120531, + "grad_norm": 3.8157432079315186, + "learning_rate": 0.0011639261379849698, + "loss": 7.5844, + "step": 1095400 + }, + { + "epoch": 4.462826246143912, + "grad_norm": 8.751187324523926, + "learning_rate": 0.0011635078284667946, + "loss": 7.5724, + "step": 1095500 + }, + { + "epoch": 4.463233624167294, + "grad_norm": 8.541107177734375, + "learning_rate": 0.0011630895719849797, + "loss": 7.56, + "step": 1095600 + }, + { + "epoch": 4.463641002190675, + "grad_norm": 11.511309623718262, + "learning_rate": 0.0011626713685560608, + "loss": 7.6009, + "step": 1095700 + }, + { + "epoch": 4.464048380214057, + "grad_norm": 4.9507551193237305, + "learning_rate": 0.001162253218196572, + "loss": 7.5872, + "step": 1095800 + }, + { + "epoch": 4.464455758237438, + "grad_norm": 5.776829719543457, + "learning_rate": 0.0011618351209230458, + "loss": 7.5693, + "step": 1095900 + }, + { + "epoch": 4.46486313626082, + "grad_norm": 4.541170597076416, + "learning_rate": 0.00116141707675201, + "loss": 7.5666, + "step": 1096000 + }, + { + "epoch": 4.46486313626082, + "eval_MaskedAccuracy": 0.5106916249196752, + "eval_loss": 1.5983508825302124, + "eval_runtime": 158.9889, + "eval_samples_per_second": 399.248, + "eval_steps_per_second": 1.56, + "step": 1096000 + }, + { + "epoch": 4.465270514284201, + "grad_norm": 6.547701835632324, + "learning_rate": 0.0011609990856999933, + "loss": 7.5948, + "step": 1096100 + }, + { + "epoch": 4.465677892307583, + "grad_norm": 8.93804931640625, + "learning_rate": 0.001160581147783522, + "loss": 7.618, + "step": 1096200 + }, + { + "epoch": 4.466085270330964, + "grad_norm": 7.38311767578125, + "learning_rate": 0.0011601632630191178, + "loss": 7.5622, + "step": 1096300 + }, + { + "epoch": 4.466492648354346, + "grad_norm": 12.070695877075195, + "learning_rate": 0.0011597454314233031, + "loss": 7.5446, + "step": 1096400 + }, + { + "epoch": 4.466900026377727, + "grad_norm": 5.291610240936279, + "learning_rate": 0.001159327653012594, + "loss": 7.5962, + "step": 1096500 + }, + { + "epoch": 4.467307404401108, + "grad_norm": 8.69069766998291, + "learning_rate": 0.0011589099278035118, + "loss": 7.5772, + "step": 1096600 + }, + { + "epoch": 4.46771478242449, + "grad_norm": 4.978950023651123, + "learning_rate": 0.0011584922558125702, + "loss": 7.5725, + "step": 1096700 + }, + { + "epoch": 4.468122160447871, + "grad_norm": 9.29713249206543, + "learning_rate": 0.0011580746370562825, + "loss": 7.5776, + "step": 1096800 + }, + { + "epoch": 4.468529538471253, + "grad_norm": 7.964879989624023, + "learning_rate": 0.0011576570715511553, + "loss": 7.5843, + "step": 1096900 + }, + { + "epoch": 4.468936916494634, + "grad_norm": 5.121318340301514, + "learning_rate": 0.0011572395593137006, + "loss": 7.605, + "step": 1097000 + }, + { + "epoch": 4.468936916494634, + "eval_MaskedAccuracy": 0.5107859315154086, + "eval_loss": 1.599096655845642, + "eval_runtime": 156.7402, + "eval_samples_per_second": 404.976, + "eval_steps_per_second": 1.582, + "step": 1097000 + }, + { + "epoch": 4.469344294518016, + "grad_norm": 8.140433311462402, + "learning_rate": 0.0011568221003604255, + "loss": 7.5799, + "step": 1097100 + }, + { + "epoch": 4.469751672541397, + "grad_norm": 6.903355121612549, + "learning_rate": 0.001156404694707832, + "loss": 7.5554, + "step": 1097200 + }, + { + "epoch": 4.470159050564779, + "grad_norm": 6.56154203414917, + "learning_rate": 0.0011559873423724251, + "loss": 7.581, + "step": 1097300 + }, + { + "epoch": 4.47056642858816, + "grad_norm": 7.409020900726318, + "learning_rate": 0.0011555700433707032, + "loss": 7.562, + "step": 1097400 + }, + { + "epoch": 4.470973806611542, + "grad_norm": 4.009287357330322, + "learning_rate": 0.0011551527977191648, + "loss": 7.5889, + "step": 1097500 + }, + { + "epoch": 4.471381184634923, + "grad_norm": 8.088984489440918, + "learning_rate": 0.001154735605434305, + "loss": 7.6211, + "step": 1097600 + }, + { + "epoch": 4.471788562658305, + "grad_norm": 5.861762523651123, + "learning_rate": 0.001154318466532618, + "loss": 7.6004, + "step": 1097700 + }, + { + "epoch": 4.4721959406816865, + "grad_norm": 14.661588668823242, + "learning_rate": 0.0011539013810305957, + "loss": 7.5999, + "step": 1097800 + }, + { + "epoch": 4.472603318705067, + "grad_norm": 10.20081615447998, + "learning_rate": 0.0011534843489447292, + "loss": 7.5885, + "step": 1097900 + }, + { + "epoch": 4.473010696728449, + "grad_norm": 11.643823623657227, + "learning_rate": 0.0011530673702915043, + "loss": 7.5972, + "step": 1098000 + }, + { + "epoch": 4.473010696728449, + "eval_MaskedAccuracy": 0.5111975728571545, + "eval_loss": 1.5915279388427734, + "eval_runtime": 153.0242, + "eval_samples_per_second": 414.81, + "eval_steps_per_second": 1.621, + "step": 1098000 + }, + { + "epoch": 4.47341807475183, + "grad_norm": 7.542632102966309, + "learning_rate": 0.0011526504450874073, + "loss": 7.6034, + "step": 1098100 + }, + { + "epoch": 4.473825452775212, + "grad_norm": 4.5746049880981445, + "learning_rate": 0.0011522335733489208, + "loss": 7.6155, + "step": 1098200 + }, + { + "epoch": 4.474232830798593, + "grad_norm": 11.481820106506348, + "learning_rate": 0.001151816755092527, + "loss": 7.5799, + "step": 1098300 + }, + { + "epoch": 4.474640208821975, + "grad_norm": 6.03773832321167, + "learning_rate": 0.0011513999903347044, + "loss": 7.6035, + "step": 1098400 + }, + { + "epoch": 4.475047586845356, + "grad_norm": 5.938760757446289, + "learning_rate": 0.0011509832790919277, + "loss": 7.5712, + "step": 1098500 + }, + { + "epoch": 4.475454964868738, + "grad_norm": 8.458950996398926, + "learning_rate": 0.0011505666213806741, + "loss": 7.5992, + "step": 1098600 + }, + { + "epoch": 4.475862342892119, + "grad_norm": 3.898512840270996, + "learning_rate": 0.001150150017217418, + "loss": 7.5664, + "step": 1098700 + }, + { + "epoch": 4.476269720915501, + "grad_norm": 7.1401519775390625, + "learning_rate": 0.0011497334666186272, + "loss": 7.5925, + "step": 1098800 + }, + { + "epoch": 4.476677098938882, + "grad_norm": 7.631970405578613, + "learning_rate": 0.0011493169696007693, + "loss": 7.5772, + "step": 1098900 + }, + { + "epoch": 4.477084476962264, + "grad_norm": 5.312163829803467, + "learning_rate": 0.0011489005261803142, + "loss": 7.5905, + "step": 1099000 + }, + { + "epoch": 4.477084476962264, + "eval_MaskedAccuracy": 0.5109278042613278, + "eval_loss": 1.5994435548782349, + "eval_runtime": 153.4923, + "eval_samples_per_second": 413.545, + "eval_steps_per_second": 1.616, + "step": 1099000 + }, + { + "epoch": 4.477491854985645, + "grad_norm": 4.4299445152282715, + "learning_rate": 0.0011484841363737235, + "loss": 7.5938, + "step": 1099100 + }, + { + "epoch": 4.477899233009026, + "grad_norm": 8.612445831298828, + "learning_rate": 0.00114806780019746, + "loss": 7.6022, + "step": 1099200 + }, + { + "epoch": 4.478306611032408, + "grad_norm": 8.290127754211426, + "learning_rate": 0.0011476515176679846, + "loss": 7.5825, + "step": 1099300 + }, + { + "epoch": 4.478713989055789, + "grad_norm": 8.893336296081543, + "learning_rate": 0.0011472352888017546, + "loss": 7.5821, + "step": 1099400 + }, + { + "epoch": 4.479121367079171, + "grad_norm": 10.074488639831543, + "learning_rate": 0.0011468191136152239, + "loss": 7.607, + "step": 1099500 + }, + { + "epoch": 4.479528745102552, + "grad_norm": 5.985406398773193, + "learning_rate": 0.001146402992124848, + "loss": 7.5869, + "step": 1099600 + }, + { + "epoch": 4.479936123125934, + "grad_norm": 13.412552833557129, + "learning_rate": 0.0011459869243470784, + "loss": 7.5837, + "step": 1099700 + }, + { + "epoch": 4.480343501149315, + "grad_norm": 7.82590913772583, + "learning_rate": 0.0011455709102983645, + "loss": 7.6066, + "step": 1099800 + }, + { + "epoch": 4.480750879172697, + "grad_norm": 5.195190906524658, + "learning_rate": 0.0011451549499951529, + "loss": 7.5673, + "step": 1099900 + }, + { + "epoch": 4.481158257196078, + "grad_norm": 5.314785957336426, + "learning_rate": 0.0011447390434538903, + "loss": 7.5783, + "step": 1100000 + }, + { + "epoch": 4.481158257196078, + "eval_MaskedAccuracy": 0.5108102540832667, + "eval_loss": 1.5934927463531494, + "eval_runtime": 169.4816, + "eval_samples_per_second": 374.53, + "eval_steps_per_second": 1.463, + "step": 1100000 + }, + { + "epoch": 4.48156563521946, + "grad_norm": 7.085400581359863, + "learning_rate": 0.001144323190691018, + "loss": 7.5407, + "step": 1100100 + }, + { + "epoch": 4.4819730132428415, + "grad_norm": 11.708148002624512, + "learning_rate": 0.001143907391722975, + "loss": 7.5778, + "step": 1100200 + }, + { + "epoch": 4.482380391266222, + "grad_norm": 4.760464191436768, + "learning_rate": 0.0011434916465662048, + "loss": 7.607, + "step": 1100300 + }, + { + "epoch": 4.482787769289604, + "grad_norm": 4.3901567459106445, + "learning_rate": 0.0011430759552371404, + "loss": 7.5536, + "step": 1100400 + }, + { + "epoch": 4.483195147312985, + "grad_norm": 8.597267150878906, + "learning_rate": 0.0011426603177522182, + "loss": 7.5832, + "step": 1100500 + }, + { + "epoch": 4.483602525336367, + "grad_norm": 7.280422210693359, + "learning_rate": 0.00114224473412787, + "loss": 7.5808, + "step": 1100600 + }, + { + "epoch": 4.484009903359748, + "grad_norm": 3.35546612739563, + "learning_rate": 0.0011418292043805245, + "loss": 7.5853, + "step": 1100700 + }, + { + "epoch": 4.48441728138313, + "grad_norm": 9.373496055603027, + "learning_rate": 0.0011414137285266136, + "loss": 7.5487, + "step": 1100800 + }, + { + "epoch": 4.484824659406511, + "grad_norm": 6.74008321762085, + "learning_rate": 0.0011409983065825626, + "loss": 7.5769, + "step": 1100900 + }, + { + "epoch": 4.485232037429893, + "grad_norm": 5.951924800872803, + "learning_rate": 0.001140582938564791, + "loss": 7.555, + "step": 1101000 + }, + { + "epoch": 4.485232037429893, + "eval_MaskedAccuracy": 0.5108742536780447, + "eval_loss": 1.6083155870437622, + "eval_runtime": 159.5517, + "eval_samples_per_second": 397.84, + "eval_steps_per_second": 1.554, + "step": 1101000 + }, + { + "epoch": 4.485639415453274, + "grad_norm": 3.0179784297943115, + "learning_rate": 0.0011401676244897241, + "loss": 7.5667, + "step": 1101100 + }, + { + "epoch": 4.486046793476656, + "grad_norm": 4.752579212188721, + "learning_rate": 0.0011397523643737807, + "loss": 7.5688, + "step": 1101200 + }, + { + "epoch": 4.4864541715000374, + "grad_norm": 10.299113273620605, + "learning_rate": 0.0011393371582333763, + "loss": 7.583, + "step": 1101300 + }, + { + "epoch": 4.486861549523419, + "grad_norm": 5.598707675933838, + "learning_rate": 0.0011389220060849282, + "loss": 7.6029, + "step": 1101400 + }, + { + "epoch": 4.4872689275468005, + "grad_norm": 4.877350330352783, + "learning_rate": 0.0011385069079448515, + "loss": 7.5969, + "step": 1101500 + }, + { + "epoch": 4.487676305570181, + "grad_norm": 11.625675201416016, + "learning_rate": 0.0011380918638295519, + "loss": 7.5557, + "step": 1101600 + }, + { + "epoch": 4.488083683593563, + "grad_norm": 5.352262496948242, + "learning_rate": 0.0011376768737554454, + "loss": 7.5396, + "step": 1101700 + }, + { + "epoch": 4.488491061616944, + "grad_norm": 4.063558101654053, + "learning_rate": 0.0011372619377389335, + "loss": 7.5678, + "step": 1101800 + }, + { + "epoch": 4.488898439640326, + "grad_norm": 6.834637641906738, + "learning_rate": 0.0011368470557964242, + "loss": 7.5794, + "step": 1101900 + }, + { + "epoch": 4.489305817663707, + "grad_norm": 13.786849975585938, + "learning_rate": 0.0011364322279443155, + "loss": 7.5751, + "step": 1102000 + }, + { + "epoch": 4.489305817663707, + "eval_MaskedAccuracy": 0.5115187416144197, + "eval_loss": 1.5978195667266846, + "eval_runtime": 157.5184, + "eval_samples_per_second": 402.975, + "eval_steps_per_second": 1.574, + "step": 1102000 + }, + { + "epoch": 4.489713195687089, + "grad_norm": 7.651937484741211, + "learning_rate": 0.0011360174541990118, + "loss": 7.5754, + "step": 1102100 + }, + { + "epoch": 4.49012057371047, + "grad_norm": 5.232959270477295, + "learning_rate": 0.00113560273457691, + "loss": 7.5747, + "step": 1102200 + }, + { + "epoch": 4.490527951733852, + "grad_norm": 8.001144409179688, + "learning_rate": 0.0011351880690944067, + "loss": 7.5895, + "step": 1102300 + }, + { + "epoch": 4.490935329757233, + "grad_norm": 6.840724468231201, + "learning_rate": 0.001134773457767896, + "loss": 7.5791, + "step": 1102400 + }, + { + "epoch": 4.491342707780615, + "grad_norm": 7.303138256072998, + "learning_rate": 0.0011343589006137683, + "loss": 7.5895, + "step": 1102500 + }, + { + "epoch": 4.4917500858039965, + "grad_norm": 7.090054512023926, + "learning_rate": 0.0011339443976484118, + "loss": 7.5712, + "step": 1102600 + }, + { + "epoch": 4.492157463827378, + "grad_norm": 6.0141472816467285, + "learning_rate": 0.0011335299488882196, + "loss": 7.5956, + "step": 1102700 + }, + { + "epoch": 4.4925648418507595, + "grad_norm": 5.018124580383301, + "learning_rate": 0.0011331155543495713, + "loss": 7.5981, + "step": 1102800 + }, + { + "epoch": 4.49297221987414, + "grad_norm": 7.8402099609375, + "learning_rate": 0.0011327012140488523, + "loss": 7.5694, + "step": 1102900 + }, + { + "epoch": 4.493379597897522, + "grad_norm": 5.013631343841553, + "learning_rate": 0.0011322869280024441, + "loss": 7.5893, + "step": 1103000 + }, + { + "epoch": 4.493379597897522, + "eval_MaskedAccuracy": 0.5105309000422119, + "eval_loss": 1.5985708236694336, + "eval_runtime": 153.7186, + "eval_samples_per_second": 412.936, + "eval_steps_per_second": 1.613, + "step": 1103000 + }, + { + "epoch": 4.493786975920903, + "grad_norm": 7.964849472045898, + "learning_rate": 0.0011318726962267253, + "loss": 7.5889, + "step": 1103100 + }, + { + "epoch": 4.494194353944285, + "grad_norm": 10.593032836914062, + "learning_rate": 0.0011314585187380726, + "loss": 7.5907, + "step": 1103200 + }, + { + "epoch": 4.494601731967666, + "grad_norm": 6.0983734130859375, + "learning_rate": 0.001131044395552862, + "loss": 7.5453, + "step": 1103300 + }, + { + "epoch": 4.495009109991048, + "grad_norm": 15.389650344848633, + "learning_rate": 0.0011306303266874656, + "loss": 7.5742, + "step": 1103400 + }, + { + "epoch": 4.495416488014429, + "grad_norm": 11.013498306274414, + "learning_rate": 0.0011302163121582514, + "loss": 7.5711, + "step": 1103500 + }, + { + "epoch": 4.495823866037811, + "grad_norm": 9.118451118469238, + "learning_rate": 0.0011298023519815902, + "loss": 7.5783, + "step": 1103600 + }, + { + "epoch": 4.4962312440611925, + "grad_norm": 9.36022663116455, + "learning_rate": 0.001129388446173847, + "loss": 7.5725, + "step": 1103700 + }, + { + "epoch": 4.496638622084574, + "grad_norm": 13.599825859069824, + "learning_rate": 0.001128974594751386, + "loss": 7.5771, + "step": 1103800 + }, + { + "epoch": 4.4970460001079555, + "grad_norm": 4.665948867797852, + "learning_rate": 0.0011285607977305674, + "loss": 7.5897, + "step": 1103900 + }, + { + "epoch": 4.497453378131337, + "grad_norm": 4.070620536804199, + "learning_rate": 0.0011281470551277536, + "loss": 7.5785, + "step": 1104000 + }, + { + "epoch": 4.497453378131337, + "eval_MaskedAccuracy": 0.510516774561958, + "eval_loss": 1.5918331146240234, + "eval_runtime": 153.3538, + "eval_samples_per_second": 413.919, + "eval_steps_per_second": 1.617, + "step": 1104000 + }, + { + "epoch": 4.497860756154718, + "grad_norm": 4.339487075805664, + "learning_rate": 0.0011277333669593, + "loss": 7.596, + "step": 1104100 + }, + { + "epoch": 4.498268134178099, + "grad_norm": 6.340195655822754, + "learning_rate": 0.001127319733241563, + "loss": 7.5638, + "step": 1104200 + }, + { + "epoch": 4.498675512201481, + "grad_norm": 8.324033737182617, + "learning_rate": 0.0011269061539908956, + "loss": 7.5983, + "step": 1104300 + }, + { + "epoch": 4.499082890224862, + "grad_norm": 9.860664367675781, + "learning_rate": 0.0011264926292236473, + "loss": 7.5914, + "step": 1104400 + }, + { + "epoch": 4.499490268248244, + "grad_norm": 9.858746528625488, + "learning_rate": 0.0011260791589561695, + "loss": 7.5984, + "step": 1104500 + }, + { + "epoch": 4.499897646271625, + "grad_norm": 6.180097579956055, + "learning_rate": 0.001125665743204807, + "loss": 7.5839, + "step": 1104600 + }, + { + "epoch": 4.500305024295007, + "grad_norm": 5.815901756286621, + "learning_rate": 0.0011252523819859056, + "loss": 7.5861, + "step": 1104700 + }, + { + "epoch": 4.500712402318388, + "grad_norm": 6.733134746551514, + "learning_rate": 0.0011248390753158055, + "loss": 7.6005, + "step": 1104800 + }, + { + "epoch": 4.50111978034177, + "grad_norm": 10.06464672088623, + "learning_rate": 0.0011244258232108479, + "loss": 7.5665, + "step": 1104900 + }, + { + "epoch": 4.5015271583651515, + "grad_norm": 4.955789089202881, + "learning_rate": 0.0011240126256873725, + "loss": 7.6031, + "step": 1105000 + }, + { + "epoch": 4.5015271583651515, + "eval_MaskedAccuracy": 0.5112919021049737, + "eval_loss": 1.5959364175796509, + "eval_runtime": 152.5655, + "eval_samples_per_second": 416.057, + "eval_steps_per_second": 1.626, + "step": 1105000 + }, + { + "epoch": 4.501934536388533, + "grad_norm": 4.404020309448242, + "learning_rate": 0.0011235994827617146, + "loss": 7.5809, + "step": 1105100 + }, + { + "epoch": 4.502341914411915, + "grad_norm": 5.39438009262085, + "learning_rate": 0.0011231863944502062, + "loss": 7.6003, + "step": 1105200 + }, + { + "epoch": 4.502749292435295, + "grad_norm": 3.4735138416290283, + "learning_rate": 0.0011227733607691805, + "loss": 7.5682, + "step": 1105300 + }, + { + "epoch": 4.503156670458677, + "grad_norm": 7.234744548797607, + "learning_rate": 0.0011223603817349668, + "loss": 7.6346, + "step": 1105400 + }, + { + "epoch": 4.503564048482058, + "grad_norm": 7.141273498535156, + "learning_rate": 0.0011219474573638916, + "loss": 7.5507, + "step": 1105500 + }, + { + "epoch": 4.50397142650544, + "grad_norm": 8.873198509216309, + "learning_rate": 0.001121534587672281, + "loss": 7.5842, + "step": 1105600 + }, + { + "epoch": 4.504378804528821, + "grad_norm": 9.169697761535645, + "learning_rate": 0.0011211217726764572, + "loss": 7.5806, + "step": 1105700 + }, + { + "epoch": 4.504786182552203, + "grad_norm": 7.4849467277526855, + "learning_rate": 0.0011207090123927423, + "loss": 7.5523, + "step": 1105800 + }, + { + "epoch": 4.505193560575584, + "grad_norm": 6.278785705566406, + "learning_rate": 0.0011202963068374532, + "loss": 7.5676, + "step": 1105900 + }, + { + "epoch": 4.505600938598966, + "grad_norm": 4.431141376495361, + "learning_rate": 0.0011198836560269063, + "loss": 7.5782, + "step": 1106000 + }, + { + "epoch": 4.505600938598966, + "eval_MaskedAccuracy": 0.5106673869629107, + "eval_loss": 1.5928922891616821, + "eval_runtime": 153.6369, + "eval_samples_per_second": 413.156, + "eval_steps_per_second": 1.614, + "step": 1106000 + }, + { + "epoch": 4.5060083166223475, + "grad_norm": 5.619106769561768, + "learning_rate": 0.0011194710599774175, + "loss": 7.5694, + "step": 1106100 + }, + { + "epoch": 4.506415694645729, + "grad_norm": 7.33353853225708, + "learning_rate": 0.001119058518705297, + "loss": 7.5745, + "step": 1106200 + }, + { + "epoch": 4.5068230726691105, + "grad_norm": 10.765326499938965, + "learning_rate": 0.001118646032226857, + "loss": 7.5548, + "step": 1106300 + }, + { + "epoch": 4.507230450692492, + "grad_norm": 7.372228622436523, + "learning_rate": 0.0011182336005584033, + "loss": 7.5957, + "step": 1106400 + }, + { + "epoch": 4.507637828715874, + "grad_norm": 12.633130073547363, + "learning_rate": 0.001117821223716241, + "loss": 7.5861, + "step": 1106500 + }, + { + "epoch": 4.508045206739254, + "grad_norm": 11.810508728027344, + "learning_rate": 0.0011174089017166762, + "loss": 7.5435, + "step": 1106600 + }, + { + "epoch": 4.508452584762636, + "grad_norm": 6.721157073974609, + "learning_rate": 0.001116996634576008, + "loss": 7.5774, + "step": 1106700 + }, + { + "epoch": 4.508859962786017, + "grad_norm": 6.119353294372559, + "learning_rate": 0.0011165844223105371, + "loss": 7.5656, + "step": 1106800 + }, + { + "epoch": 4.509267340809399, + "grad_norm": 14.55726146697998, + "learning_rate": 0.0011161722649365609, + "loss": 7.5757, + "step": 1106900 + }, + { + "epoch": 4.50967471883278, + "grad_norm": 7.758322238922119, + "learning_rate": 0.0011157601624703722, + "loss": 7.6214, + "step": 1107000 + }, + { + "epoch": 4.50967471883278, + "eval_MaskedAccuracy": 0.5110763911288638, + "eval_loss": 1.5922865867614746, + "eval_runtime": 153.6657, + "eval_samples_per_second": 413.078, + "eval_steps_per_second": 1.614, + "step": 1107000 + }, + { + "epoch": 4.510082096856162, + "grad_norm": 12.179095268249512, + "learning_rate": 0.0011153481149282652, + "loss": 7.5898, + "step": 1107100 + }, + { + "epoch": 4.510489474879543, + "grad_norm": 11.939443588256836, + "learning_rate": 0.0011149361223265276, + "loss": 7.5911, + "step": 1107200 + }, + { + "epoch": 4.510896852902925, + "grad_norm": 2.733443021774292, + "learning_rate": 0.0011145241846814507, + "loss": 7.5726, + "step": 1107300 + }, + { + "epoch": 4.5113042309263065, + "grad_norm": 4.503757953643799, + "learning_rate": 0.0011141123020093197, + "loss": 7.6024, + "step": 1107400 + }, + { + "epoch": 4.511711608949688, + "grad_norm": 2.994215726852417, + "learning_rate": 0.0011137004743264165, + "loss": 7.5971, + "step": 1107500 + }, + { + "epoch": 4.51211898697307, + "grad_norm": 6.558333396911621, + "learning_rate": 0.0011132887016490267, + "loss": 7.5714, + "step": 1107600 + }, + { + "epoch": 4.512526364996451, + "grad_norm": 5.901927947998047, + "learning_rate": 0.0011128769839934267, + "loss": 7.5853, + "step": 1107700 + }, + { + "epoch": 4.512933743019833, + "grad_norm": 3.7872025966644287, + "learning_rate": 0.0011124653213758955, + "loss": 7.5944, + "step": 1107800 + }, + { + "epoch": 4.513341121043213, + "grad_norm": 3.8414390087127686, + "learning_rate": 0.0011120537138127082, + "loss": 7.5813, + "step": 1107900 + }, + { + "epoch": 4.513748499066595, + "grad_norm": 3.5636935234069824, + "learning_rate": 0.0011116421613201392, + "loss": 7.5588, + "step": 1108000 + }, + { + "epoch": 4.513748499066595, + "eval_MaskedAccuracy": 0.5110843816749177, + "eval_loss": 1.5849589109420776, + "eval_runtime": 154.5512, + "eval_samples_per_second": 410.712, + "eval_steps_per_second": 1.605, + "step": 1108000 + }, + { + "epoch": 4.514155877089976, + "grad_norm": 6.344196319580078, + "learning_rate": 0.0011112306639144561, + "loss": 7.5914, + "step": 1108100 + }, + { + "epoch": 4.514563255113358, + "grad_norm": 11.019896507263184, + "learning_rate": 0.0011108192216119298, + "loss": 7.5995, + "step": 1108200 + }, + { + "epoch": 4.514970633136739, + "grad_norm": 11.762144088745117, + "learning_rate": 0.001110407834428826, + "loss": 7.5953, + "step": 1108300 + }, + { + "epoch": 4.515378011160121, + "grad_norm": 7.87576961517334, + "learning_rate": 0.0011099965023814106, + "loss": 7.5839, + "step": 1108400 + }, + { + "epoch": 4.5157853891835025, + "grad_norm": 8.868873596191406, + "learning_rate": 0.0011095852254859415, + "loss": 7.5734, + "step": 1108500 + }, + { + "epoch": 4.516192767206884, + "grad_norm": 5.231029033660889, + "learning_rate": 0.0011091740037586844, + "loss": 7.5795, + "step": 1108600 + }, + { + "epoch": 4.5166001452302655, + "grad_norm": 6.209876537322998, + "learning_rate": 0.0011087628372158942, + "loss": 7.5948, + "step": 1108700 + }, + { + "epoch": 4.517007523253647, + "grad_norm": 11.849695205688477, + "learning_rate": 0.0011083517258738272, + "loss": 7.5773, + "step": 1108800 + }, + { + "epoch": 4.517414901277029, + "grad_norm": 4.6103410720825195, + "learning_rate": 0.0011079406697487354, + "loss": 7.5971, + "step": 1108900 + }, + { + "epoch": 4.517822279300409, + "grad_norm": 5.200582504272461, + "learning_rate": 0.001107529668856871, + "loss": 7.5659, + "step": 1109000 + }, + { + "epoch": 4.517822279300409, + "eval_MaskedAccuracy": 0.5108471120669347, + "eval_loss": 1.5960307121276855, + "eval_runtime": 156.5972, + "eval_samples_per_second": 405.346, + "eval_steps_per_second": 1.584, + "step": 1109000 + }, + { + "epoch": 4.518229657323792, + "grad_norm": 3.9880106449127197, + "learning_rate": 0.0011071187232144823, + "loss": 7.5598, + "step": 1109100 + }, + { + "epoch": 4.518637035347172, + "grad_norm": 12.46435260772705, + "learning_rate": 0.0011067078328378165, + "loss": 7.5847, + "step": 1109200 + }, + { + "epoch": 4.519044413370554, + "grad_norm": 4.924691200256348, + "learning_rate": 0.0011062969977431183, + "loss": 7.6011, + "step": 1109300 + }, + { + "epoch": 4.519451791393935, + "grad_norm": 12.720778465270996, + "learning_rate": 0.0011058862179466325, + "loss": 7.608, + "step": 1109400 + }, + { + "epoch": 4.519859169417317, + "grad_norm": 3.4815590381622314, + "learning_rate": 0.001105475493464597, + "loss": 7.5707, + "step": 1109500 + }, + { + "epoch": 4.5202665474406984, + "grad_norm": 7.696225166320801, + "learning_rate": 0.0011050648243132496, + "loss": 7.5794, + "step": 1109600 + }, + { + "epoch": 4.52067392546408, + "grad_norm": 8.115972518920898, + "learning_rate": 0.0011046542105088277, + "loss": 7.6093, + "step": 1109700 + }, + { + "epoch": 4.5210813034874615, + "grad_norm": 9.846404075622559, + "learning_rate": 0.0011042436520675644, + "loss": 7.5725, + "step": 1109800 + }, + { + "epoch": 4.521488681510843, + "grad_norm": 8.468535423278809, + "learning_rate": 0.0011038331490056908, + "loss": 7.5819, + "step": 1109900 + }, + { + "epoch": 4.521896059534225, + "grad_norm": 3.5899410247802734, + "learning_rate": 0.0011034227013394374, + "loss": 7.5703, + "step": 1110000 + }, + { + "epoch": 4.521896059534225, + "eval_MaskedAccuracy": 0.5103302548077059, + "eval_loss": 1.6036618947982788, + "eval_runtime": 165.0916, + "eval_samples_per_second": 384.49, + "eval_steps_per_second": 1.502, + "step": 1110000 + }, + { + "epoch": 4.522303437557606, + "grad_norm": 15.857691764831543, + "learning_rate": 0.00110301230908503, + "loss": 7.5776, + "step": 1110100 + }, + { + "epoch": 4.522710815580988, + "grad_norm": 10.199382781982422, + "learning_rate": 0.0011026019722586951, + "loss": 7.5996, + "step": 1110200 + }, + { + "epoch": 4.523118193604368, + "grad_norm": 11.391201972961426, + "learning_rate": 0.001102191690876654, + "loss": 7.5761, + "step": 1110300 + }, + { + "epoch": 4.52352557162775, + "grad_norm": 3.94089412689209, + "learning_rate": 0.0011017814649551278, + "loss": 7.6011, + "step": 1110400 + }, + { + "epoch": 4.523932949651131, + "grad_norm": 12.507319450378418, + "learning_rate": 0.0011013712945103344, + "loss": 7.6186, + "step": 1110500 + }, + { + "epoch": 4.524340327674513, + "grad_norm": 10.309355735778809, + "learning_rate": 0.001100961179558493, + "loss": 7.5874, + "step": 1110600 + }, + { + "epoch": 4.524747705697894, + "grad_norm": 14.556020736694336, + "learning_rate": 0.0011005511201158148, + "loss": 7.5707, + "step": 1110700 + }, + { + "epoch": 4.525155083721276, + "grad_norm": 7.470675468444824, + "learning_rate": 0.0011001411161985122, + "loss": 7.5498, + "step": 1110800 + }, + { + "epoch": 4.5255624617446575, + "grad_norm": 3.487459659576416, + "learning_rate": 0.0010997311678227957, + "loss": 7.5726, + "step": 1110900 + }, + { + "epoch": 4.525969839768039, + "grad_norm": 7.366673946380615, + "learning_rate": 0.0010993212750048703, + "loss": 7.5897, + "step": 1111000 + }, + { + "epoch": 4.525969839768039, + "eval_MaskedAccuracy": 0.511501025603417, + "eval_loss": 1.5909398794174194, + "eval_runtime": 154.9582, + "eval_samples_per_second": 409.633, + "eval_steps_per_second": 1.6, + "step": 1111000 + }, + { + "epoch": 4.5263772177914205, + "grad_norm": 6.191447734832764, + "learning_rate": 0.0010989114377609453, + "loss": 7.587, + "step": 1111100 + }, + { + "epoch": 4.526784595814802, + "grad_norm": 4.839754104614258, + "learning_rate": 0.001098501656107221, + "loss": 7.5737, + "step": 1111200 + }, + { + "epoch": 4.527191973838184, + "grad_norm": 10.126070976257324, + "learning_rate": 0.0010980919300598976, + "loss": 7.5499, + "step": 1111300 + }, + { + "epoch": 4.527599351861565, + "grad_norm": 10.946996688842773, + "learning_rate": 0.0010976822596351747, + "loss": 7.5615, + "step": 1111400 + }, + { + "epoch": 4.528006729884947, + "grad_norm": 9.202312469482422, + "learning_rate": 0.0010972726448492491, + "loss": 7.5734, + "step": 1111500 + }, + { + "epoch": 4.528414107908327, + "grad_norm": 5.462584972381592, + "learning_rate": 0.001096863085718315, + "loss": 7.5489, + "step": 1111600 + }, + { + "epoch": 4.528821485931709, + "grad_norm": 5.888820648193359, + "learning_rate": 0.0010964535822585656, + "loss": 7.5567, + "step": 1111700 + }, + { + "epoch": 4.52922886395509, + "grad_norm": 3.6279215812683105, + "learning_rate": 0.001096044134486188, + "loss": 7.5847, + "step": 1111800 + }, + { + "epoch": 4.529636241978472, + "grad_norm": 5.458526134490967, + "learning_rate": 0.001095634742417373, + "loss": 7.6019, + "step": 1111900 + }, + { + "epoch": 4.5300436200018535, + "grad_norm": 6.370878219604492, + "learning_rate": 0.001095225406068303, + "loss": 7.5706, + "step": 1112000 + }, + { + "epoch": 4.5300436200018535, + "eval_MaskedAccuracy": 0.5114930069502411, + "eval_loss": 1.5956523418426514, + "eval_runtime": 155.7762, + "eval_samples_per_second": 407.482, + "eval_steps_per_second": 1.592, + "step": 1112000 + }, + { + "epoch": 4.530450998025235, + "grad_norm": 7.633890628814697, + "learning_rate": 0.0010948161254551636, + "loss": 7.5844, + "step": 1112100 + }, + { + "epoch": 4.5308583760486165, + "grad_norm": 9.703800201416016, + "learning_rate": 0.0010944069005941349, + "loss": 7.5652, + "step": 1112200 + }, + { + "epoch": 4.531265754071998, + "grad_norm": 10.638181686401367, + "learning_rate": 0.0010939977315013961, + "loss": 7.5798, + "step": 1112300 + }, + { + "epoch": 4.53167313209538, + "grad_norm": 10.037205696105957, + "learning_rate": 0.001093588618193123, + "loss": 7.5789, + "step": 1112400 + }, + { + "epoch": 4.532080510118761, + "grad_norm": 5.2120466232299805, + "learning_rate": 0.0010931795606854897, + "loss": 7.5769, + "step": 1112500 + }, + { + "epoch": 4.532487888142143, + "grad_norm": 10.160205841064453, + "learning_rate": 0.0010927705589946712, + "loss": 7.5521, + "step": 1112600 + }, + { + "epoch": 4.532895266165524, + "grad_norm": 3.2886061668395996, + "learning_rate": 0.001092361613136835, + "loss": 7.5638, + "step": 1112700 + }, + { + "epoch": 4.533302644188906, + "grad_norm": 3.281219959259033, + "learning_rate": 0.0010919527231281485, + "loss": 7.5669, + "step": 1112800 + }, + { + "epoch": 4.533710022212286, + "grad_norm": 10.00158977508545, + "learning_rate": 0.0010915438889847792, + "loss": 7.5775, + "step": 1112900 + }, + { + "epoch": 4.534117400235668, + "grad_norm": 8.850213050842285, + "learning_rate": 0.0010911351107228894, + "loss": 7.5842, + "step": 1113000 + }, + { + "epoch": 4.534117400235668, + "eval_MaskedAccuracy": 0.510891954563634, + "eval_loss": 1.6031019687652588, + "eval_runtime": 170.1512, + "eval_samples_per_second": 373.056, + "eval_steps_per_second": 1.458, + "step": 1113000 + }, + { + "epoch": 4.534524778259049, + "grad_norm": 11.173861503601074, + "learning_rate": 0.0010907263883586412, + "loss": 7.5714, + "step": 1113100 + }, + { + "epoch": 4.534932156282431, + "grad_norm": 4.537968158721924, + "learning_rate": 0.001090317721908192, + "loss": 7.5787, + "step": 1113200 + }, + { + "epoch": 4.5353395343058125, + "grad_norm": 6.516021728515625, + "learning_rate": 0.001089909111387701, + "loss": 7.5813, + "step": 1113300 + }, + { + "epoch": 4.535746912329194, + "grad_norm": 6.329670429229736, + "learning_rate": 0.0010895005568133192, + "loss": 7.6067, + "step": 1113400 + }, + { + "epoch": 4.536154290352576, + "grad_norm": 4.165231227874756, + "learning_rate": 0.0010890920582012013, + "loss": 7.5996, + "step": 1113500 + }, + { + "epoch": 4.536561668375957, + "grad_norm": 9.512068748474121, + "learning_rate": 0.001088683615567497, + "loss": 7.5564, + "step": 1113600 + }, + { + "epoch": 4.536969046399339, + "grad_norm": 3.5417563915252686, + "learning_rate": 0.0010882752289283555, + "loss": 7.5982, + "step": 1113700 + }, + { + "epoch": 4.53737642442272, + "grad_norm": 10.698873519897461, + "learning_rate": 0.0010878668982999226, + "loss": 7.5737, + "step": 1113800 + }, + { + "epoch": 4.537783802446102, + "grad_norm": 6.200733184814453, + "learning_rate": 0.001087458623698337, + "loss": 7.5809, + "step": 1113900 + }, + { + "epoch": 4.538191180469482, + "grad_norm": 6.130573272705078, + "learning_rate": 0.0010870504051397453, + "loss": 7.5746, + "step": 1114000 + }, + { + "epoch": 4.538191180469482, + "eval_MaskedAccuracy": 0.5102800815169236, + "eval_loss": 1.5996474027633667, + "eval_runtime": 232.3978, + "eval_samples_per_second": 273.135, + "eval_steps_per_second": 1.067, + "step": 1114000 + }, + { + "epoch": 4.538598558492865, + "grad_norm": 11.021953582763672, + "learning_rate": 0.0010866422426402853, + "loss": 7.5923, + "step": 1114100 + }, + { + "epoch": 4.539005936516245, + "grad_norm": 5.681769847869873, + "learning_rate": 0.001086234136216093, + "loss": 7.5695, + "step": 1114200 + }, + { + "epoch": 4.539413314539627, + "grad_norm": 10.913374900817871, + "learning_rate": 0.001085826085883302, + "loss": 7.5638, + "step": 1114300 + }, + { + "epoch": 4.5398206925630085, + "grad_norm": 3.1211705207824707, + "learning_rate": 0.0010854180916580484, + "loss": 7.565, + "step": 1114400 + }, + { + "epoch": 4.54022807058639, + "grad_norm": 8.329007148742676, + "learning_rate": 0.0010850101535564596, + "loss": 7.5831, + "step": 1114500 + }, + { + "epoch": 4.5406354486097715, + "grad_norm": 4.618702411651611, + "learning_rate": 0.001084602271594664, + "loss": 7.5554, + "step": 1114600 + }, + { + "epoch": 4.541042826633153, + "grad_norm": 4.494823455810547, + "learning_rate": 0.0010841944457887867, + "loss": 7.5761, + "step": 1114700 + }, + { + "epoch": 4.541450204656535, + "grad_norm": 3.781698703765869, + "learning_rate": 0.0010837866761549523, + "loss": 7.5645, + "step": 1114800 + }, + { + "epoch": 4.541857582679916, + "grad_norm": 3.5153980255126953, + "learning_rate": 0.0010833789627092817, + "loss": 7.5857, + "step": 1114900 + }, + { + "epoch": 4.542264960703298, + "grad_norm": 4.925096035003662, + "learning_rate": 0.0010829713054678943, + "loss": 7.5668, + "step": 1115000 + }, + { + "epoch": 4.542264960703298, + "eval_MaskedAccuracy": 0.5107099442867878, + "eval_loss": 1.5973669290542603, + "eval_runtime": 167.7262, + "eval_samples_per_second": 378.45, + "eval_steps_per_second": 1.479, + "step": 1115000 + }, + { + "epoch": 4.542672338726679, + "grad_norm": 5.03632116317749, + "learning_rate": 0.0010825637044469057, + "loss": 7.5835, + "step": 1115100 + }, + { + "epoch": 4.543079716750061, + "grad_norm": 7.940250873565674, + "learning_rate": 0.001082156159662433, + "loss": 7.5807, + "step": 1115200 + }, + { + "epoch": 4.543487094773441, + "grad_norm": 4.016974449157715, + "learning_rate": 0.0010817486711305864, + "loss": 7.5783, + "step": 1115300 + }, + { + "epoch": 4.543894472796823, + "grad_norm": 8.841370582580566, + "learning_rate": 0.001081341238867477, + "loss": 7.5237, + "step": 1115400 + }, + { + "epoch": 4.544301850820204, + "grad_norm": 9.194918632507324, + "learning_rate": 0.001080933862889213, + "loss": 7.5915, + "step": 1115500 + }, + { + "epoch": 4.544709228843586, + "grad_norm": 7.682662010192871, + "learning_rate": 0.0010805265432118996, + "loss": 7.5789, + "step": 1115600 + }, + { + "epoch": 4.5451166068669675, + "grad_norm": 5.003277778625488, + "learning_rate": 0.001080119279851642, + "loss": 7.5844, + "step": 1115700 + }, + { + "epoch": 4.545523984890349, + "grad_norm": 9.853026390075684, + "learning_rate": 0.0010797120728245407, + "loss": 7.5879, + "step": 1115800 + }, + { + "epoch": 4.545931362913731, + "grad_norm": 6.268301486968994, + "learning_rate": 0.001079304922146694, + "loss": 7.5754, + "step": 1115900 + }, + { + "epoch": 4.546338740937112, + "grad_norm": 4.028406620025635, + "learning_rate": 0.0010788978278341976, + "loss": 7.5678, + "step": 1116000 + }, + { + "epoch": 4.546338740937112, + "eval_MaskedAccuracy": 0.5113442019094754, + "eval_loss": 1.596558928489685, + "eval_runtime": 171.9151, + "eval_samples_per_second": 369.229, + "eval_steps_per_second": 1.443, + "step": 1116000 + }, + { + "epoch": 4.546746118960494, + "grad_norm": 3.8657755851745605, + "learning_rate": 0.0010784907899031484, + "loss": 7.5684, + "step": 1116100 + }, + { + "epoch": 4.547153496983875, + "grad_norm": 8.76175308227539, + "learning_rate": 0.0010780838083696374, + "loss": 7.5521, + "step": 1116200 + }, + { + "epoch": 4.547560875007257, + "grad_norm": 4.481683731079102, + "learning_rate": 0.0010776768832497547, + "loss": 7.5587, + "step": 1116300 + }, + { + "epoch": 4.547968253030638, + "grad_norm": 7.46676778793335, + "learning_rate": 0.0010772700145595872, + "loss": 7.5549, + "step": 1116400 + }, + { + "epoch": 4.54837563105402, + "grad_norm": 18.588315963745117, + "learning_rate": 0.0010768632023152233, + "loss": 7.587, + "step": 1116500 + }, + { + "epoch": 4.5487830090774, + "grad_norm": 4.303117752075195, + "learning_rate": 0.0010764564465327454, + "loss": 7.5673, + "step": 1116600 + }, + { + "epoch": 4.549190387100782, + "grad_norm": 4.791867256164551, + "learning_rate": 0.0010760497472282326, + "loss": 7.562, + "step": 1116700 + }, + { + "epoch": 4.5495977651241635, + "grad_norm": 9.731392860412598, + "learning_rate": 0.0010756431044177683, + "loss": 7.555, + "step": 1116800 + }, + { + "epoch": 4.550005143147545, + "grad_norm": 16.8034610748291, + "learning_rate": 0.0010752365181174257, + "loss": 7.5842, + "step": 1116900 + }, + { + "epoch": 4.5504125211709265, + "grad_norm": 11.9800386428833, + "learning_rate": 0.0010748299883432803, + "loss": 7.5738, + "step": 1117000 + }, + { + "epoch": 4.5504125211709265, + "eval_MaskedAccuracy": 0.5114025195152014, + "eval_loss": 1.5812039375305176, + "eval_runtime": 163.9994, + "eval_samples_per_second": 387.05, + "eval_steps_per_second": 1.512, + "step": 1117000 + }, + { + "epoch": 4.550819899194308, + "grad_norm": 10.361425399780273, + "learning_rate": 0.0010744235151114053, + "loss": 7.5542, + "step": 1117100 + }, + { + "epoch": 4.55122727721769, + "grad_norm": 8.61764144897461, + "learning_rate": 0.0010740170984378688, + "loss": 7.5817, + "step": 1117200 + }, + { + "epoch": 4.551634655241071, + "grad_norm": 8.817815780639648, + "learning_rate": 0.0010736107383387417, + "loss": 7.6032, + "step": 1117300 + }, + { + "epoch": 4.552042033264453, + "grad_norm": 2.282099723815918, + "learning_rate": 0.0010732044348300853, + "loss": 7.5587, + "step": 1117400 + }, + { + "epoch": 4.552449411287834, + "grad_norm": 10.575343132019043, + "learning_rate": 0.0010727981879279666, + "loss": 7.5449, + "step": 1117500 + }, + { + "epoch": 4.552856789311216, + "grad_norm": 13.9160795211792, + "learning_rate": 0.0010723919976484465, + "loss": 7.5514, + "step": 1117600 + }, + { + "epoch": 4.553264167334597, + "grad_norm": 9.172242164611816, + "learning_rate": 0.0010719858640075826, + "loss": 7.5638, + "step": 1117700 + }, + { + "epoch": 4.553671545357979, + "grad_norm": 7.916067600250244, + "learning_rate": 0.001071579787021432, + "loss": 7.5755, + "step": 1117800 + }, + { + "epoch": 4.5540789233813594, + "grad_norm": 3.5791852474212646, + "learning_rate": 0.0010711737667060484, + "loss": 7.5668, + "step": 1117900 + }, + { + "epoch": 4.554486301404741, + "grad_norm": 12.591864585876465, + "learning_rate": 0.0010707678030774837, + "loss": 7.586, + "step": 1118000 + }, + { + "epoch": 4.554486301404741, + "eval_MaskedAccuracy": 0.5105842546790761, + "eval_loss": 1.6017931699752808, + "eval_runtime": 173.2789, + "eval_samples_per_second": 366.323, + "eval_steps_per_second": 1.431, + "step": 1118000 + }, + { + "epoch": 4.5548936794281225, + "grad_norm": 6.084591388702393, + "learning_rate": 0.001070361896151789, + "loss": 7.5895, + "step": 1118100 + }, + { + "epoch": 4.555301057451504, + "grad_norm": 13.230748176574707, + "learning_rate": 0.0010699560459450108, + "loss": 7.5819, + "step": 1118200 + }, + { + "epoch": 4.555708435474886, + "grad_norm": 6.209146499633789, + "learning_rate": 0.001069550252473196, + "loss": 7.5958, + "step": 1118300 + }, + { + "epoch": 4.556115813498267, + "grad_norm": 6.586806774139404, + "learning_rate": 0.0010691445157523871, + "loss": 7.5711, + "step": 1118400 + }, + { + "epoch": 4.556523191521649, + "grad_norm": 4.204071998596191, + "learning_rate": 0.0010687388357986261, + "loss": 7.5655, + "step": 1118500 + }, + { + "epoch": 4.55693056954503, + "grad_norm": 9.868650436401367, + "learning_rate": 0.0010683332126279492, + "loss": 7.5773, + "step": 1118600 + }, + { + "epoch": 4.557337947568412, + "grad_norm": 5.84165620803833, + "learning_rate": 0.0010679276462563947, + "loss": 7.5928, + "step": 1118700 + }, + { + "epoch": 4.557745325591793, + "grad_norm": 5.934895038604736, + "learning_rate": 0.0010675221366999966, + "loss": 7.5915, + "step": 1118800 + }, + { + "epoch": 4.558152703615175, + "grad_norm": 8.131453514099121, + "learning_rate": 0.001067116683974785, + "loss": 7.5941, + "step": 1118900 + }, + { + "epoch": 4.558560081638555, + "grad_norm": 12.419584274291992, + "learning_rate": 0.0010667112880967918, + "loss": 7.5414, + "step": 1119000 + }, + { + "epoch": 4.558560081638555, + "eval_MaskedAccuracy": 0.5111769190311796, + "eval_loss": 1.6003597974777222, + "eval_runtime": 176.7664, + "eval_samples_per_second": 359.095, + "eval_steps_per_second": 1.403, + "step": 1119000 + }, + { + "epoch": 4.558967459661938, + "grad_norm": 9.122279167175293, + "learning_rate": 0.0010663059490820435, + "loss": 7.5885, + "step": 1119100 + }, + { + "epoch": 4.5593748376853185, + "grad_norm": 5.479848384857178, + "learning_rate": 0.0010659006669465668, + "loss": 7.5954, + "step": 1119200 + }, + { + "epoch": 4.5597822157087, + "grad_norm": 9.282177925109863, + "learning_rate": 0.0010654954417063827, + "loss": 7.5906, + "step": 1119300 + }, + { + "epoch": 4.5601895937320815, + "grad_norm": 4.657062530517578, + "learning_rate": 0.0010650902733775117, + "loss": 7.5873, + "step": 1119400 + }, + { + "epoch": 4.560596971755463, + "grad_norm": 11.059744834899902, + "learning_rate": 0.0010646851619759761, + "loss": 7.5997, + "step": 1119500 + }, + { + "epoch": 4.561004349778845, + "grad_norm": 13.795628547668457, + "learning_rate": 0.0010642801075177871, + "loss": 7.5797, + "step": 1119600 + }, + { + "epoch": 4.561411727802226, + "grad_norm": 3.236295223236084, + "learning_rate": 0.0010638751100189624, + "loss": 7.573, + "step": 1119700 + }, + { + "epoch": 4.561819105825608, + "grad_norm": 7.676112651824951, + "learning_rate": 0.0010634701694955113, + "loss": 7.5793, + "step": 1119800 + }, + { + "epoch": 4.562226483848989, + "grad_norm": 7.574018955230713, + "learning_rate": 0.0010630652859634452, + "loss": 7.5777, + "step": 1119900 + }, + { + "epoch": 4.562633861872371, + "grad_norm": 7.075233459472656, + "learning_rate": 0.0010626604594387709, + "loss": 7.6028, + "step": 1120000 + }, + { + "epoch": 4.562633861872371, + "eval_MaskedAccuracy": 0.5112010266145673, + "eval_loss": 1.581120491027832, + "eval_runtime": 158.2883, + "eval_samples_per_second": 401.015, + "eval_steps_per_second": 1.567, + "step": 1120000 + }, + { + "epoch": 4.563041239895752, + "grad_norm": 6.8384270668029785, + "learning_rate": 0.0010622556899374917, + "loss": 7.5601, + "step": 1120100 + }, + { + "epoch": 4.563448617919134, + "grad_norm": 6.318542003631592, + "learning_rate": 0.0010618509774756118, + "loss": 7.5672, + "step": 1120200 + }, + { + "epoch": 4.5638559959425145, + "grad_norm": 3.199270009994507, + "learning_rate": 0.001061446322069132, + "loss": 7.5738, + "step": 1120300 + }, + { + "epoch": 4.564263373965896, + "grad_norm": 3.4965708255767822, + "learning_rate": 0.0010610417237340488, + "loss": 7.5535, + "step": 1120400 + }, + { + "epoch": 4.5646707519892775, + "grad_norm": 8.460668563842773, + "learning_rate": 0.0010606371824863588, + "loss": 7.5691, + "step": 1120500 + }, + { + "epoch": 4.565078130012659, + "grad_norm": 5.669637203216553, + "learning_rate": 0.0010602326983420562, + "loss": 7.5642, + "step": 1120600 + }, + { + "epoch": 4.565485508036041, + "grad_norm": 10.30564022064209, + "learning_rate": 0.0010598282713171327, + "loss": 7.5627, + "step": 1120700 + }, + { + "epoch": 4.565892886059422, + "grad_norm": 4.847383975982666, + "learning_rate": 0.0010594239014275766, + "loss": 7.5927, + "step": 1120800 + }, + { + "epoch": 4.566300264082804, + "grad_norm": 4.886056900024414, + "learning_rate": 0.0010590195886893757, + "loss": 7.5778, + "step": 1120900 + }, + { + "epoch": 4.566707642106185, + "grad_norm": 14.0728120803833, + "learning_rate": 0.0010586153331185138, + "loss": 7.5856, + "step": 1121000 + }, + { + "epoch": 4.566707642106185, + "eval_MaskedAccuracy": 0.5101112160708272, + "eval_loss": 1.6021958589553833, + "eval_runtime": 170.5827, + "eval_samples_per_second": 372.113, + "eval_steps_per_second": 1.454, + "step": 1121000 + }, + { + "epoch": 4.567115020129567, + "grad_norm": 16.848453521728516, + "learning_rate": 0.0010582111347309732, + "loss": 7.5735, + "step": 1121100 + }, + { + "epoch": 4.567522398152948, + "grad_norm": 12.972195625305176, + "learning_rate": 0.001057806993542734, + "loss": 7.5976, + "step": 1121200 + }, + { + "epoch": 4.56792977617633, + "grad_norm": 6.517111778259277, + "learning_rate": 0.0010574029095697759, + "loss": 7.5521, + "step": 1121300 + }, + { + "epoch": 4.568337154199711, + "grad_norm": 9.902020454406738, + "learning_rate": 0.001056998882828071, + "loss": 7.5594, + "step": 1121400 + }, + { + "epoch": 4.568744532223093, + "grad_norm": 7.743771076202393, + "learning_rate": 0.0010565949133335974, + "loss": 7.5574, + "step": 1121500 + }, + { + "epoch": 4.5691519102464735, + "grad_norm": 5.402374744415283, + "learning_rate": 0.0010561910011023217, + "loss": 7.5652, + "step": 1121600 + }, + { + "epoch": 4.569559288269855, + "grad_norm": 17.277074813842773, + "learning_rate": 0.001055787146150215, + "loss": 7.5835, + "step": 1121700 + }, + { + "epoch": 4.569966666293237, + "grad_norm": 6.748904705047607, + "learning_rate": 0.0010553833484932436, + "loss": 7.5952, + "step": 1121800 + }, + { + "epoch": 4.570374044316618, + "grad_norm": 4.527615070343018, + "learning_rate": 0.0010549796081473705, + "loss": 7.5704, + "step": 1121900 + }, + { + "epoch": 4.57078142234, + "grad_norm": 4.347258567810059, + "learning_rate": 0.0010545759251285596, + "loss": 7.5871, + "step": 1122000 + }, + { + "epoch": 4.57078142234, + "eval_MaskedAccuracy": 0.5109850990092902, + "eval_loss": 1.5945192575454712, + "eval_runtime": 162.1547, + "eval_samples_per_second": 391.453, + "eval_steps_per_second": 1.529, + "step": 1122000 + }, + { + "epoch": 4.571188800363381, + "grad_norm": 9.114790916442871, + "learning_rate": 0.0010541722994527698, + "loss": 7.5719, + "step": 1122100 + }, + { + "epoch": 4.571596178386763, + "grad_norm": 3.812300682067871, + "learning_rate": 0.0010537687311359596, + "loss": 7.556, + "step": 1122200 + }, + { + "epoch": 4.572003556410144, + "grad_norm": 8.83957576751709, + "learning_rate": 0.0010533652201940818, + "loss": 7.5836, + "step": 1122300 + }, + { + "epoch": 4.572410934433526, + "grad_norm": 4.638280391693115, + "learning_rate": 0.0010529617666430922, + "loss": 7.5522, + "step": 1122400 + }, + { + "epoch": 4.572818312456907, + "grad_norm": 5.700409412384033, + "learning_rate": 0.0010525583704989396, + "loss": 7.5415, + "step": 1122500 + }, + { + "epoch": 4.573225690480289, + "grad_norm": 13.936721801757812, + "learning_rate": 0.001052155031777572, + "loss": 7.5813, + "step": 1122600 + }, + { + "epoch": 4.57363306850367, + "grad_norm": 7.647896766662598, + "learning_rate": 0.0010517517504949356, + "loss": 7.5843, + "step": 1122700 + }, + { + "epoch": 4.574040446527052, + "grad_norm": 6.669071674346924, + "learning_rate": 0.0010513485266669758, + "loss": 7.5789, + "step": 1122800 + }, + { + "epoch": 4.5744478245504325, + "grad_norm": 14.1397705078125, + "learning_rate": 0.0010509453603096321, + "loss": 7.6079, + "step": 1122900 + }, + { + "epoch": 4.574855202573814, + "grad_norm": 7.113717555999756, + "learning_rate": 0.0010505422514388472, + "loss": 7.5777, + "step": 1123000 + }, + { + "epoch": 4.574855202573814, + "eval_MaskedAccuracy": 0.5117404056359195, + "eval_loss": 1.5946736335754395, + "eval_runtime": 164.3125, + "eval_samples_per_second": 386.313, + "eval_steps_per_second": 1.509, + "step": 1123000 + }, + { + "epoch": 4.575262580597196, + "grad_norm": 5.9611639976501465, + "learning_rate": 0.0010501392000705555, + "loss": 7.5581, + "step": 1123100 + }, + { + "epoch": 4.575669958620577, + "grad_norm": 7.352646827697754, + "learning_rate": 0.0010497362062206921, + "loss": 7.5928, + "step": 1123200 + }, + { + "epoch": 4.576077336643959, + "grad_norm": 17.109018325805664, + "learning_rate": 0.0010493332699051878, + "loss": 7.589, + "step": 1123300 + }, + { + "epoch": 4.57648471466734, + "grad_norm": 12.888470649719238, + "learning_rate": 0.0010489303911399763, + "loss": 7.5735, + "step": 1123400 + }, + { + "epoch": 4.576892092690722, + "grad_norm": 8.116667747497559, + "learning_rate": 0.0010485275699409855, + "loss": 7.5698, + "step": 1123500 + }, + { + "epoch": 4.577299470714103, + "grad_norm": 8.50510025024414, + "learning_rate": 0.0010481248063241384, + "loss": 7.5857, + "step": 1123600 + }, + { + "epoch": 4.577706848737485, + "grad_norm": 9.210661888122559, + "learning_rate": 0.00104772210030536, + "loss": 7.5789, + "step": 1123700 + }, + { + "epoch": 4.578114226760866, + "grad_norm": 12.335138320922852, + "learning_rate": 0.0010473194519005718, + "loss": 7.5577, + "step": 1123800 + }, + { + "epoch": 4.578521604784248, + "grad_norm": 14.52278995513916, + "learning_rate": 0.0010469168611256918, + "loss": 7.5606, + "step": 1123900 + }, + { + "epoch": 4.5789289828076285, + "grad_norm": 4.502407073974609, + "learning_rate": 0.0010465143279966365, + "loss": 7.575, + "step": 1124000 + }, + { + "epoch": 4.5789289828076285, + "eval_MaskedAccuracy": 0.5108547609190957, + "eval_loss": 1.5947115421295166, + "eval_runtime": 177.8627, + "eval_samples_per_second": 356.882, + "eval_steps_per_second": 1.394, + "step": 1124000 + }, + { + "epoch": 4.579336360831011, + "grad_norm": 8.426820755004883, + "learning_rate": 0.0010461118525293195, + "loss": 7.5658, + "step": 1124100 + }, + { + "epoch": 4.579743738854392, + "grad_norm": 8.536303520202637, + "learning_rate": 0.001045709434739654, + "loss": 7.5717, + "step": 1124200 + }, + { + "epoch": 4.580151116877773, + "grad_norm": 12.823686599731445, + "learning_rate": 0.0010453070746435482, + "loss": 7.5661, + "step": 1124300 + }, + { + "epoch": 4.580558494901155, + "grad_norm": 11.003029823303223, + "learning_rate": 0.0010449047722569116, + "loss": 7.5612, + "step": 1124400 + }, + { + "epoch": 4.580965872924536, + "grad_norm": 13.4091796875, + "learning_rate": 0.0010445025275956474, + "loss": 7.5741, + "step": 1124500 + }, + { + "epoch": 4.581373250947918, + "grad_norm": 8.795026779174805, + "learning_rate": 0.0010441003406756603, + "loss": 7.5822, + "step": 1124600 + }, + { + "epoch": 4.581780628971299, + "grad_norm": 12.963661193847656, + "learning_rate": 0.0010436982115128507, + "loss": 7.5543, + "step": 1124700 + }, + { + "epoch": 4.582188006994681, + "grad_norm": 8.706557273864746, + "learning_rate": 0.0010432961401231166, + "loss": 7.562, + "step": 1124800 + }, + { + "epoch": 4.582595385018062, + "grad_norm": 3.7110719680786133, + "learning_rate": 0.0010428941265223544, + "loss": 7.5633, + "step": 1124900 + }, + { + "epoch": 4.583002763041444, + "grad_norm": 15.754704475402832, + "learning_rate": 0.0010424921707264572, + "loss": 7.5555, + "step": 1125000 + }, + { + "epoch": 4.583002763041444, + "eval_MaskedAccuracy": 0.5112558824422517, + "eval_loss": 1.588430404663086, + "eval_runtime": 178.381, + "eval_samples_per_second": 355.845, + "eval_steps_per_second": 1.39, + "step": 1125000 + }, + { + "epoch": 4.583410141064825, + "grad_norm": 7.041046619415283, + "learning_rate": 0.0010420902727513149, + "loss": 7.5687, + "step": 1125100 + }, + { + "epoch": 4.583817519088207, + "grad_norm": 4.503931999206543, + "learning_rate": 0.0010416884326128202, + "loss": 7.5734, + "step": 1125200 + }, + { + "epoch": 4.5842248971115875, + "grad_norm": 5.288613319396973, + "learning_rate": 0.0010412866503268581, + "loss": 7.5729, + "step": 1125300 + }, + { + "epoch": 4.584632275134969, + "grad_norm": 12.690825462341309, + "learning_rate": 0.0010408849259093118, + "loss": 7.5734, + "step": 1125400 + }, + { + "epoch": 4.585039653158351, + "grad_norm": 14.83486270904541, + "learning_rate": 0.001040483259376066, + "loss": 7.5675, + "step": 1125500 + }, + { + "epoch": 4.585447031181732, + "grad_norm": 5.501760959625244, + "learning_rate": 0.0010400816507430003, + "loss": 7.5875, + "step": 1125600 + }, + { + "epoch": 4.585854409205114, + "grad_norm": 10.312743186950684, + "learning_rate": 0.0010396801000259917, + "loss": 7.568, + "step": 1125700 + }, + { + "epoch": 4.586261787228495, + "grad_norm": 5.359569549560547, + "learning_rate": 0.0010392786072409154, + "loss": 7.5473, + "step": 1125800 + }, + { + "epoch": 4.586669165251877, + "grad_norm": 7.668104648590088, + "learning_rate": 0.001038877172403645, + "loss": 7.5719, + "step": 1125900 + }, + { + "epoch": 4.587076543275258, + "grad_norm": 9.105608940124512, + "learning_rate": 0.0010384757955300514, + "loss": 7.5493, + "step": 1126000 + }, + { + "epoch": 4.587076543275258, + "eval_MaskedAccuracy": 0.5116670547969623, + "eval_loss": 1.5944896936416626, + "eval_runtime": 170.7368, + "eval_samples_per_second": 371.777, + "eval_steps_per_second": 1.453, + "step": 1126000 + }, + { + "epoch": 4.58748392129864, + "grad_norm": 7.818266868591309, + "learning_rate": 0.001038074476636002, + "loss": 7.5657, + "step": 1126100 + }, + { + "epoch": 4.587891299322021, + "grad_norm": 5.807822227478027, + "learning_rate": 0.0010376732157373648, + "loss": 7.5645, + "step": 1126200 + }, + { + "epoch": 4.588298677345403, + "grad_norm": 5.894556999206543, + "learning_rate": 0.0010372720128500035, + "loss": 7.5616, + "step": 1126300 + }, + { + "epoch": 4.588706055368784, + "grad_norm": 9.41707706451416, + "learning_rate": 0.0010368708679897782, + "loss": 7.5584, + "step": 1126400 + }, + { + "epoch": 4.589113433392166, + "grad_norm": 10.203904151916504, + "learning_rate": 0.0010364697811725506, + "loss": 7.5567, + "step": 1126500 + }, + { + "epoch": 4.589520811415547, + "grad_norm": 12.825005531311035, + "learning_rate": 0.0010360687524141774, + "loss": 7.5901, + "step": 1126600 + }, + { + "epoch": 4.589928189438928, + "grad_norm": 7.494446277618408, + "learning_rate": 0.0010356677817305125, + "loss": 7.5719, + "step": 1126700 + }, + { + "epoch": 4.59033556746231, + "grad_norm": 5.06825590133667, + "learning_rate": 0.0010352668691374096, + "loss": 7.5904, + "step": 1126800 + }, + { + "epoch": 4.590742945485691, + "grad_norm": 7.149888515472412, + "learning_rate": 0.0010348660146507153, + "loss": 7.5604, + "step": 1126900 + }, + { + "epoch": 4.591150323509073, + "grad_norm": 18.231800079345703, + "learning_rate": 0.0010344652182862825, + "loss": 7.5706, + "step": 1127000 + }, + { + "epoch": 4.591150323509073, + "eval_MaskedAccuracy": 0.5105027030301975, + "eval_loss": 1.5943135023117065, + "eval_runtime": 175.4477, + "eval_samples_per_second": 361.794, + "eval_steps_per_second": 1.414, + "step": 1127000 + }, + { + "epoch": 4.591557701532454, + "grad_norm": 15.378019332885742, + "learning_rate": 0.001034064480059953, + "loss": 7.5635, + "step": 1127100 + }, + { + "epoch": 4.591965079555836, + "grad_norm": 8.497958183288574, + "learning_rate": 0.001033663799987573, + "loss": 7.5727, + "step": 1127200 + }, + { + "epoch": 4.592372457579217, + "grad_norm": 5.854098796844482, + "learning_rate": 0.0010332631780849823, + "loss": 7.5606, + "step": 1127300 + }, + { + "epoch": 4.592779835602599, + "grad_norm": 11.54566478729248, + "learning_rate": 0.0010328626143680188, + "loss": 7.5814, + "step": 1127400 + }, + { + "epoch": 4.59318721362598, + "grad_norm": 4.415400505065918, + "learning_rate": 0.00103246210885252, + "loss": 7.5459, + "step": 1127500 + }, + { + "epoch": 4.593594591649362, + "grad_norm": 9.316043853759766, + "learning_rate": 0.00103206166155432, + "loss": 7.5811, + "step": 1127600 + }, + { + "epoch": 4.594001969672743, + "grad_norm": 8.586962699890137, + "learning_rate": 0.0010316612724892505, + "loss": 7.5944, + "step": 1127700 + }, + { + "epoch": 4.594409347696125, + "grad_norm": 6.871153354644775, + "learning_rate": 0.0010312609416731407, + "loss": 7.5594, + "step": 1127800 + }, + { + "epoch": 4.594816725719506, + "grad_norm": 3.2205865383148193, + "learning_rate": 0.001030860669121818, + "loss": 7.5644, + "step": 1127900 + }, + { + "epoch": 4.595224103742887, + "grad_norm": 2.867100954055786, + "learning_rate": 0.0010304604548511073, + "loss": 7.5547, + "step": 1128000 + }, + { + "epoch": 4.595224103742887, + "eval_MaskedAccuracy": 0.5108949392155295, + "eval_loss": 1.5948376655578613, + "eval_runtime": 163.237, + "eval_samples_per_second": 388.858, + "eval_steps_per_second": 1.519, + "step": 1128000 + }, + { + "epoch": 4.595631481766269, + "grad_norm": 5.63533878326416, + "learning_rate": 0.0010300602988768314, + "loss": 7.5702, + "step": 1128100 + }, + { + "epoch": 4.59603885978965, + "grad_norm": 4.0221943855285645, + "learning_rate": 0.00102966020121481, + "loss": 7.5674, + "step": 1128200 + }, + { + "epoch": 4.596446237813032, + "grad_norm": 4.27165412902832, + "learning_rate": 0.0010292601618808609, + "loss": 7.5578, + "step": 1128300 + }, + { + "epoch": 4.596853615836413, + "grad_norm": 3.5418479442596436, + "learning_rate": 0.0010288601808908008, + "loss": 7.5607, + "step": 1128400 + }, + { + "epoch": 4.597260993859795, + "grad_norm": 4.3672871589660645, + "learning_rate": 0.0010284602582604422, + "loss": 7.5502, + "step": 1128500 + }, + { + "epoch": 4.597668371883176, + "grad_norm": 19.08277130126953, + "learning_rate": 0.0010280603940055976, + "loss": 7.5674, + "step": 1128600 + }, + { + "epoch": 4.598075749906558, + "grad_norm": 5.180901050567627, + "learning_rate": 0.0010276605881420749, + "loss": 7.5999, + "step": 1128700 + }, + { + "epoch": 4.598483127929939, + "grad_norm": 7.836824893951416, + "learning_rate": 0.00102726084068568, + "loss": 7.5926, + "step": 1128800 + }, + { + "epoch": 4.598890505953321, + "grad_norm": 4.071815013885498, + "learning_rate": 0.0010268611516522181, + "loss": 7.5707, + "step": 1128900 + }, + { + "epoch": 4.599297883976702, + "grad_norm": 3.604161500930786, + "learning_rate": 0.0010264615210574893, + "loss": 7.5708, + "step": 1129000 + }, + { + "epoch": 4.599297883976702, + "eval_MaskedAccuracy": 0.5113635999654071, + "eval_loss": 1.5902968645095825, + "eval_runtime": 179.3944, + "eval_samples_per_second": 353.835, + "eval_steps_per_second": 1.382, + "step": 1129000 + }, + { + "epoch": 4.599705262000084, + "grad_norm": 3.2433528900146484, + "learning_rate": 0.0010260619489172959, + "loss": 7.5727, + "step": 1129100 + }, + { + "epoch": 4.600112640023465, + "grad_norm": 7.34473180770874, + "learning_rate": 0.001025662435247433, + "loss": 7.5644, + "step": 1129200 + }, + { + "epoch": 4.600520018046846, + "grad_norm": 7.951183795928955, + "learning_rate": 0.0010252629800636965, + "loss": 7.5735, + "step": 1129300 + }, + { + "epoch": 4.600927396070228, + "grad_norm": 8.226068496704102, + "learning_rate": 0.001024863583381878, + "loss": 7.5836, + "step": 1129400 + }, + { + "epoch": 4.601334774093609, + "grad_norm": 11.12123966217041, + "learning_rate": 0.0010244642452177707, + "loss": 7.5349, + "step": 1129500 + }, + { + "epoch": 4.601742152116991, + "grad_norm": 4.654839992523193, + "learning_rate": 0.0010240649655871583, + "loss": 7.5896, + "step": 1129600 + }, + { + "epoch": 4.602149530140372, + "grad_norm": 5.445435523986816, + "learning_rate": 0.0010236657445058283, + "loss": 7.6001, + "step": 1129700 + }, + { + "epoch": 4.602556908163754, + "grad_norm": 9.480306625366211, + "learning_rate": 0.0010232665819895639, + "loss": 7.5434, + "step": 1129800 + }, + { + "epoch": 4.602964286187135, + "grad_norm": 8.214174270629883, + "learning_rate": 0.0010228674780541462, + "loss": 7.5839, + "step": 1129900 + }, + { + "epoch": 4.603371664210517, + "grad_norm": 10.587176322937012, + "learning_rate": 0.001022468432715355, + "loss": 7.5806, + "step": 1130000 + }, + { + "epoch": 4.603371664210517, + "eval_MaskedAccuracy": 0.5111672492458016, + "eval_loss": 1.5945781469345093, + "eval_runtime": 171.1909, + "eval_samples_per_second": 370.791, + "eval_steps_per_second": 1.449, + "step": 1130000 + }, + { + "epoch": 4.6037790422338984, + "grad_norm": 7.952520370483398, + "learning_rate": 0.0010220694459889654, + "loss": 7.5885, + "step": 1130100 + }, + { + "epoch": 4.60418642025728, + "grad_norm": 4.537388801574707, + "learning_rate": 0.001021670517890752, + "loss": 7.5772, + "step": 1130200 + }, + { + "epoch": 4.604593798280661, + "grad_norm": 25.86749267578125, + "learning_rate": 0.0010212716484364855, + "loss": 7.5998, + "step": 1130300 + }, + { + "epoch": 4.605001176304042, + "grad_norm": 16.674840927124023, + "learning_rate": 0.0010208728376419366, + "loss": 7.5769, + "step": 1130400 + }, + { + "epoch": 4.605408554327424, + "grad_norm": 5.859453201293945, + "learning_rate": 0.0010204740855228724, + "loss": 7.6014, + "step": 1130500 + }, + { + "epoch": 4.605815932350805, + "grad_norm": 7.464219570159912, + "learning_rate": 0.0010200753920950577, + "loss": 7.5805, + "step": 1130600 + }, + { + "epoch": 4.606223310374187, + "grad_norm": 7.1977057456970215, + "learning_rate": 0.0010196767573742542, + "loss": 7.5872, + "step": 1130700 + }, + { + "epoch": 4.606630688397568, + "grad_norm": 5.396115779876709, + "learning_rate": 0.0010192781813762233, + "loss": 7.5671, + "step": 1130800 + }, + { + "epoch": 4.60703806642095, + "grad_norm": 4.91425085067749, + "learning_rate": 0.0010188796641167213, + "loss": 7.5778, + "step": 1130900 + }, + { + "epoch": 4.607445444444331, + "grad_norm": 6.383526802062988, + "learning_rate": 0.0010184812056115043, + "loss": 7.5679, + "step": 1131000 + }, + { + "epoch": 4.607445444444331, + "eval_MaskedAccuracy": 0.5117166080500752, + "eval_loss": 1.5865892171859741, + "eval_runtime": 197.6892, + "eval_samples_per_second": 321.09, + "eval_steps_per_second": 1.254, + "step": 1131000 + }, + { + "epoch": 4.607852822467713, + "grad_norm": 4.455655574798584, + "learning_rate": 0.0010180828058763266, + "loss": 7.5653, + "step": 1131100 + }, + { + "epoch": 4.608260200491094, + "grad_norm": 4.2978081703186035, + "learning_rate": 0.001017684464926938, + "loss": 7.5493, + "step": 1131200 + }, + { + "epoch": 4.608667578514476, + "grad_norm": 6.322062015533447, + "learning_rate": 0.001017286182779087, + "loss": 7.5542, + "step": 1131300 + }, + { + "epoch": 4.6090749565378575, + "grad_norm": 10.278402328491211, + "learning_rate": 0.0010168879594485202, + "loss": 7.5369, + "step": 1131400 + }, + { + "epoch": 4.609482334561239, + "grad_norm": 13.21437931060791, + "learning_rate": 0.0010164897949509818, + "loss": 7.5549, + "step": 1131500 + }, + { + "epoch": 4.60988971258462, + "grad_norm": 5.219822883605957, + "learning_rate": 0.0010160916893022125, + "loss": 7.5572, + "step": 1131600 + }, + { + "epoch": 4.610297090608001, + "grad_norm": 8.883370399475098, + "learning_rate": 0.0010156936425179516, + "loss": 7.5831, + "step": 1131700 + }, + { + "epoch": 4.610704468631383, + "grad_norm": 5.648265361785889, + "learning_rate": 0.0010152956546139373, + "loss": 7.5822, + "step": 1131800 + }, + { + "epoch": 4.611111846654764, + "grad_norm": 10.747003555297852, + "learning_rate": 0.001014897725605902, + "loss": 7.5424, + "step": 1131900 + }, + { + "epoch": 4.611519224678146, + "grad_norm": 7.558552265167236, + "learning_rate": 0.00101449985550958, + "loss": 7.5522, + "step": 1132000 + }, + { + "epoch": 4.611519224678146, + "eval_MaskedAccuracy": 0.5109786866953794, + "eval_loss": 1.5969634056091309, + "eval_runtime": 212.6225, + "eval_samples_per_second": 298.538, + "eval_steps_per_second": 1.166, + "step": 1132000 + }, + { + "epoch": 4.611926602701527, + "grad_norm": 3.6435275077819824, + "learning_rate": 0.0010141020443406989, + "loss": 7.5524, + "step": 1132100 + }, + { + "epoch": 4.612333980724909, + "grad_norm": 4.184134483337402, + "learning_rate": 0.0010137042921149891, + "loss": 7.5903, + "step": 1132200 + }, + { + "epoch": 4.61274135874829, + "grad_norm": 7.438037395477295, + "learning_rate": 0.0010133065988481749, + "loss": 7.5521, + "step": 1132300 + }, + { + "epoch": 4.613148736771672, + "grad_norm": 5.46174955368042, + "learning_rate": 0.0010129089645559797, + "loss": 7.5624, + "step": 1132400 + }, + { + "epoch": 4.6135561147950535, + "grad_norm": 14.735342979431152, + "learning_rate": 0.001012511389254122, + "loss": 7.5784, + "step": 1132500 + }, + { + "epoch": 4.613963492818435, + "grad_norm": 4.234184741973877, + "learning_rate": 0.001012113872958322, + "loss": 7.5548, + "step": 1132600 + }, + { + "epoch": 4.6143708708418165, + "grad_norm": 4.062404632568359, + "learning_rate": 0.0010117164156842973, + "loss": 7.5537, + "step": 1132700 + }, + { + "epoch": 4.614778248865198, + "grad_norm": 11.664778709411621, + "learning_rate": 0.001011319017447758, + "loss": 7.6005, + "step": 1132800 + }, + { + "epoch": 4.615185626888579, + "grad_norm": 9.382136344909668, + "learning_rate": 0.0010109216782644165, + "loss": 7.5342, + "step": 1132900 + }, + { + "epoch": 4.61559300491196, + "grad_norm": 10.988692283630371, + "learning_rate": 0.0010105243981499816, + "loss": 7.6027, + "step": 1133000 + }, + { + "epoch": 4.61559300491196, + "eval_MaskedAccuracy": 0.511027137844796, + "eval_loss": 1.6006269454956055, + "eval_runtime": 169.4552, + "eval_samples_per_second": 374.589, + "eval_steps_per_second": 1.464, + "step": 1133000 + }, + { + "epoch": 4.616000382935342, + "grad_norm": 6.641720771789551, + "learning_rate": 0.0010101271771201614, + "loss": 7.572, + "step": 1133100 + }, + { + "epoch": 4.616407760958723, + "grad_norm": 4.952685356140137, + "learning_rate": 0.0010097300151906575, + "loss": 7.5532, + "step": 1133200 + }, + { + "epoch": 4.616815138982105, + "grad_norm": 6.696595191955566, + "learning_rate": 0.0010093329123771718, + "loss": 7.5571, + "step": 1133300 + }, + { + "epoch": 4.617222517005486, + "grad_norm": 5.600722789764404, + "learning_rate": 0.001008935868695409, + "loss": 7.5685, + "step": 1133400 + }, + { + "epoch": 4.617629895028868, + "grad_norm": 13.680784225463867, + "learning_rate": 0.0010085388841610601, + "loss": 7.56, + "step": 1133500 + }, + { + "epoch": 4.618037273052249, + "grad_norm": 5.2523417472839355, + "learning_rate": 0.0010081419587898229, + "loss": 7.5759, + "step": 1133600 + }, + { + "epoch": 4.618444651075631, + "grad_norm": 3.5989181995391846, + "learning_rate": 0.0010077450925973913, + "loss": 7.5614, + "step": 1133700 + }, + { + "epoch": 4.6188520290990125, + "grad_norm": 3.7085886001586914, + "learning_rate": 0.0010073482855994537, + "loss": 7.5725, + "step": 1133800 + }, + { + "epoch": 4.619259407122394, + "grad_norm": 10.798259735107422, + "learning_rate": 0.0010069515378116978, + "loss": 7.5735, + "step": 1133900 + }, + { + "epoch": 4.619666785145775, + "grad_norm": 5.2844390869140625, + "learning_rate": 0.0010065548492498094, + "loss": 7.5779, + "step": 1134000 + }, + { + "epoch": 4.619666785145775, + "eval_MaskedAccuracy": 0.5113493641710556, + "eval_loss": 1.5992459058761597, + "eval_runtime": 176.7383, + "eval_samples_per_second": 359.152, + "eval_steps_per_second": 1.403, + "step": 1134000 + }, + { + "epoch": 4.620074163169157, + "grad_norm": 5.834797382354736, + "learning_rate": 0.0010061582199294736, + "loss": 7.5809, + "step": 1134100 + }, + { + "epoch": 4.620481541192538, + "grad_norm": 3.8468172550201416, + "learning_rate": 0.0010057616498663682, + "loss": 7.5913, + "step": 1134200 + }, + { + "epoch": 4.620888919215919, + "grad_norm": 9.360038757324219, + "learning_rate": 0.0010053651390761734, + "loss": 7.5828, + "step": 1134300 + }, + { + "epoch": 4.621296297239301, + "grad_norm": 5.326218128204346, + "learning_rate": 0.0010049686875745656, + "loss": 7.5461, + "step": 1134400 + }, + { + "epoch": 4.621703675262682, + "grad_norm": 5.876611232757568, + "learning_rate": 0.001004572295377219, + "loss": 7.5581, + "step": 1134500 + }, + { + "epoch": 4.622111053286064, + "grad_norm": 10.328592300415039, + "learning_rate": 0.001004175962499803, + "loss": 7.5571, + "step": 1134600 + }, + { + "epoch": 4.622518431309445, + "grad_norm": 11.294349670410156, + "learning_rate": 0.001003779688957991, + "loss": 7.5911, + "step": 1134700 + }, + { + "epoch": 4.622925809332827, + "grad_norm": 5.186825752258301, + "learning_rate": 0.0010033834747674452, + "loss": 7.5542, + "step": 1134800 + }, + { + "epoch": 4.6233331873562085, + "grad_norm": 16.173288345336914, + "learning_rate": 0.001002987319943831, + "loss": 7.5833, + "step": 1134900 + }, + { + "epoch": 4.62374056537959, + "grad_norm": 13.111103057861328, + "learning_rate": 0.0010025912245028121, + "loss": 7.554, + "step": 1135000 + }, + { + "epoch": 4.62374056537959, + "eval_MaskedAccuracy": 0.5109919107297, + "eval_loss": 1.588957667350769, + "eval_runtime": 158.148, + "eval_samples_per_second": 401.371, + "eval_steps_per_second": 1.568, + "step": 1135000 + }, + { + "epoch": 4.6241479434029715, + "grad_norm": 3.178123712539673, + "learning_rate": 0.0010021951884600483, + "loss": 7.5383, + "step": 1135100 + }, + { + "epoch": 4.624555321426353, + "grad_norm": 8.649395942687988, + "learning_rate": 0.0010017992118311956, + "loss": 7.5498, + "step": 1135200 + }, + { + "epoch": 4.624962699449734, + "grad_norm": 6.623432636260986, + "learning_rate": 0.0010014032946319092, + "loss": 7.5677, + "step": 1135300 + }, + { + "epoch": 4.625370077473115, + "grad_norm": 7.139390468597412, + "learning_rate": 0.0010010074368778433, + "loss": 7.5651, + "step": 1135400 + }, + { + "epoch": 4.625777455496497, + "grad_norm": 5.244434356689453, + "learning_rate": 0.001000611638584647, + "loss": 7.5451, + "step": 1135500 + }, + { + "epoch": 4.626184833519878, + "grad_norm": 7.217027187347412, + "learning_rate": 0.001000215899767969, + "loss": 7.5459, + "step": 1135600 + }, + { + "epoch": 4.62659221154326, + "grad_norm": 5.953061103820801, + "learning_rate": 0.0009998202204434547, + "loss": 7.5492, + "step": 1135700 + }, + { + "epoch": 4.626999589566641, + "grad_norm": 5.7374467849731445, + "learning_rate": 0.0009994246006267467, + "loss": 7.5751, + "step": 1135800 + }, + { + "epoch": 4.627406967590023, + "grad_norm": 5.704726219177246, + "learning_rate": 0.0009990290403334864, + "loss": 7.5706, + "step": 1135900 + }, + { + "epoch": 4.627814345613404, + "grad_norm": 5.9133124351501465, + "learning_rate": 0.0009986335395793123, + "loss": 7.5748, + "step": 1136000 + }, + { + "epoch": 4.627814345613404, + "eval_MaskedAccuracy": 0.5111270056741465, + "eval_loss": 1.5983633995056152, + "eval_runtime": 163.718, + "eval_samples_per_second": 387.716, + "eval_steps_per_second": 1.515, + "step": 1136000 + }, + { + "epoch": 4.628221723636786, + "grad_norm": 5.866695880889893, + "learning_rate": 0.0009982380983798614, + "loss": 7.59, + "step": 1136100 + }, + { + "epoch": 4.6286291016601675, + "grad_norm": 6.089090824127197, + "learning_rate": 0.0009978427167507673, + "loss": 7.5661, + "step": 1136200 + }, + { + "epoch": 4.629036479683549, + "grad_norm": 5.132720947265625, + "learning_rate": 0.000997447394707662, + "loss": 7.5672, + "step": 1136300 + }, + { + "epoch": 4.629443857706931, + "grad_norm": 10.10950756072998, + "learning_rate": 0.0009970521322661746, + "loss": 7.6056, + "step": 1136400 + }, + { + "epoch": 4.629851235730312, + "grad_norm": 6.252572536468506, + "learning_rate": 0.000996656929441931, + "loss": 7.5389, + "step": 1136500 + }, + { + "epoch": 4.630258613753693, + "grad_norm": 6.584892272949219, + "learning_rate": 0.0009962617862505564, + "loss": 7.5638, + "step": 1136600 + }, + { + "epoch": 4.630665991777074, + "grad_norm": 7.151813983917236, + "learning_rate": 0.0009958667027076735, + "loss": 7.5606, + "step": 1136700 + }, + { + "epoch": 4.631073369800456, + "grad_norm": 6.8990960121154785, + "learning_rate": 0.0009954716788289008, + "loss": 7.5572, + "step": 1136800 + }, + { + "epoch": 4.631480747823837, + "grad_norm": 12.346681594848633, + "learning_rate": 0.0009950767146298556, + "loss": 7.5588, + "step": 1136900 + }, + { + "epoch": 4.631888125847219, + "grad_norm": 4.093268394470215, + "learning_rate": 0.0009946818101261529, + "loss": 7.5845, + "step": 1137000 + }, + { + "epoch": 4.631888125847219, + "eval_MaskedAccuracy": 0.5110830540974026, + "eval_loss": 1.6059156656265259, + "eval_runtime": 159.358, + "eval_samples_per_second": 398.323, + "eval_steps_per_second": 1.556, + "step": 1137000 + }, + { + "epoch": 4.6322955038706, + "grad_norm": 7.124499797821045, + "learning_rate": 0.0009942869653334066, + "loss": 7.5619, + "step": 1137100 + }, + { + "epoch": 4.632702881893982, + "grad_norm": 12.021746635437012, + "learning_rate": 0.0009938921802672268, + "loss": 7.5638, + "step": 1137200 + }, + { + "epoch": 4.6331102599173635, + "grad_norm": 7.023179054260254, + "learning_rate": 0.0009934974549432201, + "loss": 7.5889, + "step": 1137300 + }, + { + "epoch": 4.633517637940745, + "grad_norm": 5.897668838500977, + "learning_rate": 0.0009931027893769918, + "loss": 7.563, + "step": 1137400 + }, + { + "epoch": 4.6339250159641265, + "grad_norm": 5.407822132110596, + "learning_rate": 0.0009927081835841482, + "loss": 7.5535, + "step": 1137500 + }, + { + "epoch": 4.634332393987508, + "grad_norm": 4.564602375030518, + "learning_rate": 0.0009923136375802888, + "loss": 7.5615, + "step": 1137600 + }, + { + "epoch": 4.63473977201089, + "grad_norm": 7.788735866546631, + "learning_rate": 0.0009919191513810108, + "loss": 7.5493, + "step": 1137700 + }, + { + "epoch": 4.635147150034271, + "grad_norm": 7.4595441818237305, + "learning_rate": 0.0009915247250019106, + "loss": 7.5604, + "step": 1137800 + }, + { + "epoch": 4.635554528057652, + "grad_norm": 12.7979097366333, + "learning_rate": 0.0009911303584585838, + "loss": 7.5602, + "step": 1137900 + }, + { + "epoch": 4.635961906081033, + "grad_norm": 8.483467102050781, + "learning_rate": 0.0009907360517666202, + "loss": 7.5891, + "step": 1138000 + }, + { + "epoch": 4.635961906081033, + "eval_MaskedAccuracy": 0.5113504076574895, + "eval_loss": 1.5911567211151123, + "eval_runtime": 182.7869, + "eval_samples_per_second": 347.268, + "eval_steps_per_second": 1.357, + "step": 1138000 + }, + { + "epoch": 4.636369284104415, + "grad_norm": 13.062750816345215, + "learning_rate": 0.0009903418049416095, + "loss": 7.5754, + "step": 1138100 + }, + { + "epoch": 4.636776662127796, + "grad_norm": 6.437638282775879, + "learning_rate": 0.0009899476179991375, + "loss": 7.5613, + "step": 1138200 + }, + { + "epoch": 4.637184040151178, + "grad_norm": 7.485435962677002, + "learning_rate": 0.0009895534909547905, + "loss": 7.5303, + "step": 1138300 + }, + { + "epoch": 4.637591418174559, + "grad_norm": 11.320119857788086, + "learning_rate": 0.0009891594238241486, + "loss": 7.5563, + "step": 1138400 + }, + { + "epoch": 4.637998796197941, + "grad_norm": 3.7288763523101807, + "learning_rate": 0.0009887654166227926, + "loss": 7.5556, + "step": 1138500 + }, + { + "epoch": 4.6384061742213225, + "grad_norm": 4.068279266357422, + "learning_rate": 0.0009883714693662985, + "loss": 7.5811, + "step": 1138600 + }, + { + "epoch": 4.638813552244704, + "grad_norm": 22.849851608276367, + "learning_rate": 0.0009879775820702428, + "loss": 7.5548, + "step": 1138700 + }, + { + "epoch": 4.639220930268086, + "grad_norm": 4.552586078643799, + "learning_rate": 0.000987583754750195, + "loss": 7.5686, + "step": 1138800 + }, + { + "epoch": 4.639628308291467, + "grad_norm": 8.008922576904297, + "learning_rate": 0.000987189987421728, + "loss": 7.5439, + "step": 1138900 + }, + { + "epoch": 4.640035686314848, + "grad_norm": 7.918119430541992, + "learning_rate": 0.0009867962801004078, + "loss": 7.5674, + "step": 1139000 + }, + { + "epoch": 4.640035686314848, + "eval_MaskedAccuracy": 0.5108878226299906, + "eval_loss": 1.5939793586730957, + "eval_runtime": 186.1234, + "eval_samples_per_second": 341.042, + "eval_steps_per_second": 1.332, + "step": 1139000 + }, + { + "epoch": 4.64044306433823, + "grad_norm": 4.122742652893066, + "learning_rate": 0.0009864026328017983, + "loss": 7.565, + "step": 1139100 + }, + { + "epoch": 4.640850442361611, + "grad_norm": 5.125577926635742, + "learning_rate": 0.0009860090455414616, + "loss": 7.5802, + "step": 1139200 + }, + { + "epoch": 4.641257820384992, + "grad_norm": 4.104841709136963, + "learning_rate": 0.0009856155183349637, + "loss": 7.5967, + "step": 1139300 + }, + { + "epoch": 4.641665198408374, + "grad_norm": 12.787208557128906, + "learning_rate": 0.0009852220511978591, + "loss": 7.5735, + "step": 1139400 + }, + { + "epoch": 4.642072576431755, + "grad_norm": 7.688920021057129, + "learning_rate": 0.0009848286441457058, + "loss": 7.5604, + "step": 1139500 + }, + { + "epoch": 4.642479954455137, + "grad_norm": 4.265176296234131, + "learning_rate": 0.0009844352971940557, + "loss": 7.5465, + "step": 1139600 + }, + { + "epoch": 4.6428873324785185, + "grad_norm": 7.598531723022461, + "learning_rate": 0.0009840420103584616, + "loss": 7.5898, + "step": 1139700 + }, + { + "epoch": 4.6432947105019, + "grad_norm": 14.754538536071777, + "learning_rate": 0.0009836487836544718, + "loss": 7.5626, + "step": 1139800 + }, + { + "epoch": 4.6437020885252815, + "grad_norm": 13.873635292053223, + "learning_rate": 0.0009832556170976312, + "loss": 7.5814, + "step": 1139900 + }, + { + "epoch": 4.644109466548663, + "grad_norm": 5.928347110748291, + "learning_rate": 0.0009828625107034849, + "loss": 7.5458, + "step": 1140000 + }, + { + "epoch": 4.644109466548663, + "eval_MaskedAccuracy": 0.5112632507850131, + "eval_loss": 1.588282823562622, + "eval_runtime": 167.4255, + "eval_samples_per_second": 379.13, + "eval_steps_per_second": 1.481, + "step": 1140000 + }, + { + "epoch": 4.644516844572045, + "grad_norm": 11.163224220275879, + "learning_rate": 0.0009824694644875755, + "loss": 7.5592, + "step": 1140100 + }, + { + "epoch": 4.644924222595426, + "grad_norm": 5.4645514488220215, + "learning_rate": 0.0009820764784654412, + "loss": 7.5757, + "step": 1140200 + }, + { + "epoch": 4.645331600618807, + "grad_norm": 9.20749282836914, + "learning_rate": 0.0009816835526526174, + "loss": 7.5959, + "step": 1140300 + }, + { + "epoch": 4.645738978642188, + "grad_norm": 5.6333489418029785, + "learning_rate": 0.0009812906870646403, + "loss": 7.5675, + "step": 1140400 + }, + { + "epoch": 4.64614635666557, + "grad_norm": 10.956141471862793, + "learning_rate": 0.0009808978817170428, + "loss": 7.5533, + "step": 1140500 + }, + { + "epoch": 4.646553734688951, + "grad_norm": 4.930355072021484, + "learning_rate": 0.000980505136625352, + "loss": 7.5678, + "step": 1140600 + }, + { + "epoch": 4.646961112712333, + "grad_norm": 11.471359252929688, + "learning_rate": 0.000980112451805098, + "loss": 7.5696, + "step": 1140700 + }, + { + "epoch": 4.6473684907357145, + "grad_norm": 11.46281623840332, + "learning_rate": 0.0009797198272718053, + "loss": 7.5496, + "step": 1140800 + }, + { + "epoch": 4.647775868759096, + "grad_norm": 8.473072052001953, + "learning_rate": 0.0009793272630409943, + "loss": 7.5514, + "step": 1140900 + }, + { + "epoch": 4.6481832467824775, + "grad_norm": 5.101851463317871, + "learning_rate": 0.0009789347591281858, + "loss": 7.5946, + "step": 1141000 + }, + { + "epoch": 4.6481832467824775, + "eval_MaskedAccuracy": 0.5106380063732754, + "eval_loss": 1.6024672985076904, + "eval_runtime": 171.8719, + "eval_samples_per_second": 369.322, + "eval_steps_per_second": 1.443, + "step": 1141000 + }, + { + "epoch": 4.648590624805859, + "grad_norm": 7.18214750289917, + "learning_rate": 0.0009785423155488993, + "loss": 7.5712, + "step": 1141100 + }, + { + "epoch": 4.648998002829241, + "grad_norm": 74.3897705078125, + "learning_rate": 0.000978149932318649, + "loss": 7.5622, + "step": 1141200 + }, + { + "epoch": 4.649405380852622, + "grad_norm": 5.3525872230529785, + "learning_rate": 0.000977757609452949, + "loss": 7.5846, + "step": 1141300 + }, + { + "epoch": 4.649812758876004, + "grad_norm": 5.943066596984863, + "learning_rate": 0.0009773653469673082, + "loss": 7.5625, + "step": 1141400 + }, + { + "epoch": 4.650220136899385, + "grad_norm": 17.435598373413086, + "learning_rate": 0.0009769731448772353, + "loss": 7.5962, + "step": 1141500 + }, + { + "epoch": 4.650627514922766, + "grad_norm": 14.843661308288574, + "learning_rate": 0.0009765810031982362, + "loss": 7.5575, + "step": 1141600 + }, + { + "epoch": 4.651034892946147, + "grad_norm": 2.8244335651397705, + "learning_rate": 0.0009761889219458203, + "loss": 7.5549, + "step": 1141700 + }, + { + "epoch": 4.651442270969529, + "grad_norm": 5.403951168060303, + "learning_rate": 0.0009757969011354824, + "loss": 7.569, + "step": 1141800 + }, + { + "epoch": 4.65184964899291, + "grad_norm": 10.951919555664062, + "learning_rate": 0.0009754049407827225, + "loss": 7.5864, + "step": 1141900 + }, + { + "epoch": 4.652257027016292, + "grad_norm": 15.487794876098633, + "learning_rate": 0.000975013040903036, + "loss": 7.5902, + "step": 1142000 + }, + { + "epoch": 4.652257027016292, + "eval_MaskedAccuracy": 0.5107587389265058, + "eval_loss": 1.6009981632232666, + "eval_runtime": 165.3809, + "eval_samples_per_second": 383.817, + "eval_steps_per_second": 1.5, + "step": 1142000 + }, + { + "epoch": 4.6526644050396735, + "grad_norm": 3.882822275161743, + "learning_rate": 0.0009746212015119169, + "loss": 7.5798, + "step": 1142100 + }, + { + "epoch": 4.653071783063055, + "grad_norm": 4.652502536773682, + "learning_rate": 0.0009742294226248559, + "loss": 7.5497, + "step": 1142200 + }, + { + "epoch": 4.6534791610864366, + "grad_norm": 7.919519424438477, + "learning_rate": 0.0009738377042573433, + "loss": 7.5726, + "step": 1142300 + }, + { + "epoch": 4.653886539109818, + "grad_norm": 9.157454490661621, + "learning_rate": 0.0009734460464248658, + "loss": 7.5391, + "step": 1142400 + }, + { + "epoch": 4.6542939171332, + "grad_norm": 8.602246284484863, + "learning_rate": 0.0009730544491429075, + "loss": 7.5814, + "step": 1142500 + }, + { + "epoch": 4.654701295156581, + "grad_norm": 17.3488712310791, + "learning_rate": 0.0009726629124269508, + "loss": 7.5472, + "step": 1142600 + }, + { + "epoch": 4.655108673179962, + "grad_norm": 9.232991218566895, + "learning_rate": 0.0009722714362924737, + "loss": 7.5696, + "step": 1142700 + }, + { + "epoch": 4.655516051203344, + "grad_norm": 4.006420612335205, + "learning_rate": 0.0009718800207549548, + "loss": 7.5809, + "step": 1142800 + }, + { + "epoch": 4.655923429226725, + "grad_norm": 5.405423164367676, + "learning_rate": 0.0009714886658298674, + "loss": 7.5426, + "step": 1142900 + }, + { + "epoch": 4.656330807250106, + "grad_norm": 5.900599479675293, + "learning_rate": 0.000971097371532686, + "loss": 7.5743, + "step": 1143000 + }, + { + "epoch": 4.656330807250106, + "eval_MaskedAccuracy": 0.5108904182976085, + "eval_loss": 1.5938276052474976, + "eval_runtime": 203.4273, + "eval_samples_per_second": 312.033, + "eval_steps_per_second": 1.219, + "step": 1143000 + }, + { + "epoch": 4.656738185273488, + "grad_norm": 3.059312105178833, + "learning_rate": 0.0009707061378788781, + "loss": 7.587, + "step": 1143100 + }, + { + "epoch": 4.6571455632968695, + "grad_norm": 7.715422630310059, + "learning_rate": 0.0009703149648839125, + "loss": 7.566, + "step": 1143200 + }, + { + "epoch": 4.657552941320251, + "grad_norm": 2.4538686275482178, + "learning_rate": 0.0009699238525632545, + "loss": 7.5648, + "step": 1143300 + }, + { + "epoch": 4.6579603193436325, + "grad_norm": 6.942965030670166, + "learning_rate": 0.0009695328009323676, + "loss": 7.5429, + "step": 1143400 + }, + { + "epoch": 4.658367697367014, + "grad_norm": 7.37758731842041, + "learning_rate": 0.0009691418100067122, + "loss": 7.5582, + "step": 1143500 + }, + { + "epoch": 4.658775075390396, + "grad_norm": 4.007798194885254, + "learning_rate": 0.0009687508798017454, + "loss": 7.5776, + "step": 1143600 + }, + { + "epoch": 4.659182453413777, + "grad_norm": 13.702225685119629, + "learning_rate": 0.0009683600103329228, + "loss": 7.5836, + "step": 1143700 + }, + { + "epoch": 4.659589831437159, + "grad_norm": 4.413660049438477, + "learning_rate": 0.0009679692016156966, + "loss": 7.5519, + "step": 1143800 + }, + { + "epoch": 4.65999720946054, + "grad_norm": 5.619871616363525, + "learning_rate": 0.0009675784536655186, + "loss": 7.5697, + "step": 1143900 + }, + { + "epoch": 4.660404587483921, + "grad_norm": 10.72059440612793, + "learning_rate": 0.0009671877664978373, + "loss": 7.5397, + "step": 1144000 + }, + { + "epoch": 4.660404587483921, + "eval_MaskedAccuracy": 0.5106447879565417, + "eval_loss": 1.594344139099121, + "eval_runtime": 168.6792, + "eval_samples_per_second": 376.312, + "eval_steps_per_second": 1.47, + "step": 1144000 + }, + { + "epoch": 4.660811965507303, + "grad_norm": 9.93779468536377, + "learning_rate": 0.0009667971401280979, + "loss": 7.5475, + "step": 1144100 + }, + { + "epoch": 4.661219343530684, + "grad_norm": 4.302090644836426, + "learning_rate": 0.0009664065745717441, + "loss": 7.5715, + "step": 1144200 + }, + { + "epoch": 4.661626721554065, + "grad_norm": 13.303513526916504, + "learning_rate": 0.0009660160698442181, + "loss": 7.559, + "step": 1144300 + }, + { + "epoch": 4.662034099577447, + "grad_norm": 3.0725815296173096, + "learning_rate": 0.000965625625960958, + "loss": 7.5932, + "step": 1144400 + }, + { + "epoch": 4.6624414776008285, + "grad_norm": 11.4290132522583, + "learning_rate": 0.0009652352429373998, + "loss": 7.5794, + "step": 1144500 + }, + { + "epoch": 4.66284885562421, + "grad_norm": 6.4777350425720215, + "learning_rate": 0.0009648449207889775, + "loss": 7.5705, + "step": 1144600 + }, + { + "epoch": 4.663256233647592, + "grad_norm": 4.026091575622559, + "learning_rate": 0.0009644546595311234, + "loss": 7.5718, + "step": 1144700 + }, + { + "epoch": 4.663663611670973, + "grad_norm": 11.589993476867676, + "learning_rate": 0.0009640644591792654, + "loss": 7.5862, + "step": 1144800 + }, + { + "epoch": 4.664070989694355, + "grad_norm": 4.978834629058838, + "learning_rate": 0.0009636743197488318, + "loss": 7.5616, + "step": 1144900 + }, + { + "epoch": 4.664478367717736, + "grad_norm": 11.424694061279297, + "learning_rate": 0.0009632842412552456, + "loss": 7.5634, + "step": 1145000 + }, + { + "epoch": 4.664478367717736, + "eval_MaskedAccuracy": 0.5117445791108512, + "eval_loss": 1.592308521270752, + "eval_runtime": 185.3599, + "eval_samples_per_second": 342.447, + "eval_steps_per_second": 1.338, + "step": 1145000 + }, + { + "epoch": 4.664885745741118, + "grad_norm": 13.615307807922363, + "learning_rate": 0.0009628942237139287, + "loss": 7.5524, + "step": 1145100 + }, + { + "epoch": 4.665293123764499, + "grad_norm": 4.647578239440918, + "learning_rate": 0.0009625042671403015, + "loss": 7.5626, + "step": 1145200 + }, + { + "epoch": 4.66570050178788, + "grad_norm": 6.316812515258789, + "learning_rate": 0.0009621143715497798, + "loss": 7.6045, + "step": 1145300 + }, + { + "epoch": 4.666107879811261, + "grad_norm": 4.423739433288574, + "learning_rate": 0.0009617245369577794, + "loss": 7.5796, + "step": 1145400 + }, + { + "epoch": 4.666515257834643, + "grad_norm": 6.143904209136963, + "learning_rate": 0.0009613347633797118, + "loss": 7.5647, + "step": 1145500 + }, + { + "epoch": 4.6669226358580245, + "grad_norm": 5.285592555999756, + "learning_rate": 0.000960945050830987, + "loss": 7.5881, + "step": 1145600 + }, + { + "epoch": 4.667330013881406, + "grad_norm": 9.723875999450684, + "learning_rate": 0.0009605553993270131, + "loss": 7.5708, + "step": 1145700 + }, + { + "epoch": 4.6677373919047875, + "grad_norm": 6.489499568939209, + "learning_rate": 0.0009601658088831943, + "loss": 7.5466, + "step": 1145800 + }, + { + "epoch": 4.668144769928169, + "grad_norm": 9.560879707336426, + "learning_rate": 0.0009597762795149336, + "loss": 7.5744, + "step": 1145900 + }, + { + "epoch": 4.668552147951551, + "grad_norm": 4.059078693389893, + "learning_rate": 0.0009593868112376308, + "loss": 7.5617, + "step": 1146000 + }, + { + "epoch": 4.668552147951551, + "eval_MaskedAccuracy": 0.5113877713197927, + "eval_loss": 1.5960330963134766, + "eval_runtime": 177.4826, + "eval_samples_per_second": 357.646, + "eval_steps_per_second": 1.397, + "step": 1146000 + }, + { + "epoch": 4.668959525974932, + "grad_norm": 12.183576583862305, + "learning_rate": 0.0009589974040666838, + "loss": 7.5596, + "step": 1146100 + }, + { + "epoch": 4.669366903998314, + "grad_norm": 7.5042619705200195, + "learning_rate": 0.0009586080580174874, + "loss": 7.575, + "step": 1146200 + }, + { + "epoch": 4.669774282021695, + "grad_norm": 14.12924575805664, + "learning_rate": 0.0009582187731054356, + "loss": 7.5706, + "step": 1146300 + }, + { + "epoch": 4.670181660045077, + "grad_norm": 4.307675838470459, + "learning_rate": 0.0009578295493459185, + "loss": 7.5781, + "step": 1146400 + }, + { + "epoch": 4.670589038068458, + "grad_norm": 12.567525863647461, + "learning_rate": 0.0009574403867543238, + "loss": 7.5722, + "step": 1146500 + }, + { + "epoch": 4.670996416091839, + "grad_norm": 7.56950044631958, + "learning_rate": 0.000957051285346038, + "loss": 7.5686, + "step": 1146600 + }, + { + "epoch": 4.67140379411522, + "grad_norm": 12.223543167114258, + "learning_rate": 0.0009566622451364445, + "loss": 7.5831, + "step": 1146700 + }, + { + "epoch": 4.671811172138602, + "grad_norm": 12.852731704711914, + "learning_rate": 0.0009562732661409233, + "loss": 7.5545, + "step": 1146800 + }, + { + "epoch": 4.6722185501619835, + "grad_norm": 3.26822829246521, + "learning_rate": 0.0009558843483748528, + "loss": 7.5588, + "step": 1146900 + }, + { + "epoch": 4.672625928185365, + "grad_norm": 9.585270881652832, + "learning_rate": 0.000955495491853609, + "loss": 7.561, + "step": 1147000 + }, + { + "epoch": 4.672625928185365, + "eval_MaskedAccuracy": 0.511802373359456, + "eval_loss": 1.58987557888031, + "eval_runtime": 173.1924, + "eval_samples_per_second": 366.506, + "eval_steps_per_second": 1.432, + "step": 1147000 + }, + { + "epoch": 4.673033306208747, + "grad_norm": 5.222931861877441, + "learning_rate": 0.0009551066965925656, + "loss": 7.5802, + "step": 1147100 + }, + { + "epoch": 4.673440684232128, + "grad_norm": 14.629156112670898, + "learning_rate": 0.0009547179626070933, + "loss": 7.5598, + "step": 1147200 + }, + { + "epoch": 4.67384806225551, + "grad_norm": 22.34994125366211, + "learning_rate": 0.0009543292899125627, + "loss": 7.5542, + "step": 1147300 + }, + { + "epoch": 4.674255440278891, + "grad_norm": 10.149872779846191, + "learning_rate": 0.000953940678524338, + "loss": 7.5516, + "step": 1147400 + }, + { + "epoch": 4.674662818302273, + "grad_norm": 15.584869384765625, + "learning_rate": 0.0009535521284577845, + "loss": 7.5781, + "step": 1147500 + }, + { + "epoch": 4.675070196325654, + "grad_norm": 9.096953392028809, + "learning_rate": 0.0009531636397282633, + "loss": 7.5922, + "step": 1147600 + }, + { + "epoch": 4.675477574349035, + "grad_norm": 5.695037841796875, + "learning_rate": 0.0009527752123511332, + "loss": 7.5604, + "step": 1147700 + }, + { + "epoch": 4.675884952372417, + "grad_norm": 11.450030326843262, + "learning_rate": 0.0009523868463417511, + "loss": 7.5651, + "step": 1147800 + }, + { + "epoch": 4.676292330395798, + "grad_norm": 15.076812744140625, + "learning_rate": 0.0009519985417154695, + "loss": 7.5982, + "step": 1147900 + }, + { + "epoch": 4.6766997084191795, + "grad_norm": 5.078405380249023, + "learning_rate": 0.0009516102984876421, + "loss": 7.5564, + "step": 1148000 + }, + { + "epoch": 4.6766997084191795, + "eval_MaskedAccuracy": 0.5112680448618053, + "eval_loss": 1.6016606092453003, + "eval_runtime": 159.3213, + "eval_samples_per_second": 398.415, + "eval_steps_per_second": 1.557, + "step": 1148000 + }, + { + "epoch": 4.677107086442561, + "grad_norm": 9.016671180725098, + "learning_rate": 0.0009512221166736176, + "loss": 7.5621, + "step": 1148100 + }, + { + "epoch": 4.6775144644659425, + "grad_norm": 7.202791213989258, + "learning_rate": 0.0009508339962887438, + "loss": 7.5468, + "step": 1148200 + }, + { + "epoch": 4.677921842489324, + "grad_norm": 9.776394844055176, + "learning_rate": 0.0009504459373483638, + "loss": 7.5705, + "step": 1148300 + }, + { + "epoch": 4.678329220512706, + "grad_norm": 13.585295677185059, + "learning_rate": 0.0009500579398678206, + "loss": 7.5861, + "step": 1148400 + }, + { + "epoch": 4.678736598536087, + "grad_norm": 18.066389083862305, + "learning_rate": 0.0009496700038624534, + "loss": 7.5696, + "step": 1148500 + }, + { + "epoch": 4.679143976559469, + "grad_norm": 4.05338716506958, + "learning_rate": 0.0009492821293475995, + "loss": 7.5646, + "step": 1148600 + }, + { + "epoch": 4.67955135458285, + "grad_norm": 12.096548080444336, + "learning_rate": 0.0009488943163385934, + "loss": 7.5707, + "step": 1148700 + }, + { + "epoch": 4.679958732606232, + "grad_norm": 11.088159561157227, + "learning_rate": 0.0009485065648507686, + "loss": 7.5828, + "step": 1148800 + }, + { + "epoch": 4.680366110629613, + "grad_norm": 6.6414875984191895, + "learning_rate": 0.000948118874899453, + "loss": 7.58, + "step": 1148900 + }, + { + "epoch": 4.680773488652994, + "grad_norm": 5.62465763092041, + "learning_rate": 0.000947731246499975, + "loss": 7.5294, + "step": 1149000 + }, + { + "epoch": 4.680773488652994, + "eval_MaskedAccuracy": 0.5116329883426698, + "eval_loss": 1.5883326530456543, + "eval_runtime": 203.7743, + "eval_samples_per_second": 311.502, + "eval_steps_per_second": 1.217, + "step": 1149000 + }, + { + "epoch": 4.681180866676376, + "grad_norm": 3.951120138168335, + "learning_rate": 0.0009473436796676603, + "loss": 7.5658, + "step": 1149100 + }, + { + "epoch": 4.681588244699757, + "grad_norm": 8.422918319702148, + "learning_rate": 0.0009469561744178301, + "loss": 7.559, + "step": 1149200 + }, + { + "epoch": 4.6819956227231385, + "grad_norm": 6.362574577331543, + "learning_rate": 0.000946568730765806, + "loss": 7.5737, + "step": 1149300 + }, + { + "epoch": 4.68240300074652, + "grad_norm": 4.840132713317871, + "learning_rate": 0.0009461813487269046, + "loss": 7.5654, + "step": 1149400 + }, + { + "epoch": 4.682810378769902, + "grad_norm": 8.584992408752441, + "learning_rate": 0.0009457940283164404, + "loss": 7.5624, + "step": 1149500 + }, + { + "epoch": 4.683217756793283, + "grad_norm": 4.112203121185303, + "learning_rate": 0.0009454067695497283, + "loss": 7.566, + "step": 1149600 + }, + { + "epoch": 4.683625134816665, + "grad_norm": 3.2341792583465576, + "learning_rate": 0.0009450195724420789, + "loss": 7.5753, + "step": 1149700 + }, + { + "epoch": 4.684032512840046, + "grad_norm": 5.026054382324219, + "learning_rate": 0.0009446324370087991, + "loss": 7.5528, + "step": 1149800 + }, + { + "epoch": 4.684439890863428, + "grad_norm": 8.230510711669922, + "learning_rate": 0.0009442453632651933, + "loss": 7.5804, + "step": 1149900 + }, + { + "epoch": 4.684847268886809, + "grad_norm": 10.661288261413574, + "learning_rate": 0.0009438583512265662, + "loss": 7.5863, + "step": 1150000 + }, + { + "epoch": 4.684847268886809, + "eval_MaskedAccuracy": 0.5112104708678854, + "eval_loss": 1.59536612033844, + "eval_runtime": 154.144, + "eval_samples_per_second": 411.797, + "eval_steps_per_second": 1.609, + "step": 1150000 + }, + { + "epoch": 4.685254646910191, + "grad_norm": 4.505941867828369, + "learning_rate": 0.0009434714009082187, + "loss": 7.5788, + "step": 1150100 + }, + { + "epoch": 4.685662024933572, + "grad_norm": 4.621151447296143, + "learning_rate": 0.0009430845123254485, + "loss": 7.5662, + "step": 1150200 + }, + { + "epoch": 4.686069402956953, + "grad_norm": 13.552003860473633, + "learning_rate": 0.0009426976854935505, + "loss": 7.5705, + "step": 1150300 + }, + { + "epoch": 4.6864767809803345, + "grad_norm": 10.98857307434082, + "learning_rate": 0.0009423109204278197, + "loss": 7.5407, + "step": 1150400 + }, + { + "epoch": 4.686884159003716, + "grad_norm": 7.751584529876709, + "learning_rate": 0.0009419242171435458, + "loss": 7.5761, + "step": 1150500 + }, + { + "epoch": 4.6872915370270976, + "grad_norm": 5.739630222320557, + "learning_rate": 0.0009415375756560168, + "loss": 7.5842, + "step": 1150600 + }, + { + "epoch": 4.687698915050479, + "grad_norm": 4.1098408699035645, + "learning_rate": 0.0009411509959805195, + "loss": 7.5826, + "step": 1150700 + }, + { + "epoch": 4.688106293073861, + "grad_norm": 3.4388153553009033, + "learning_rate": 0.0009407644781323382, + "loss": 7.5715, + "step": 1150800 + }, + { + "epoch": 4.688513671097242, + "grad_norm": 3.127047061920166, + "learning_rate": 0.0009403780221267528, + "loss": 7.575, + "step": 1150900 + }, + { + "epoch": 4.688921049120624, + "grad_norm": 10.35447883605957, + "learning_rate": 0.0009399916279790423, + "loss": 7.5464, + "step": 1151000 + }, + { + "epoch": 4.688921049120624, + "eval_MaskedAccuracy": 0.5115344628783024, + "eval_loss": 1.5853630304336548, + "eval_runtime": 156.1249, + "eval_samples_per_second": 406.572, + "eval_steps_per_second": 1.588, + "step": 1151000 + }, + { + "epoch": 4.689328427144005, + "grad_norm": 3.8142848014831543, + "learning_rate": 0.0009396052957044819, + "loss": 7.5292, + "step": 1151100 + }, + { + "epoch": 4.689735805167387, + "grad_norm": 10.079751014709473, + "learning_rate": 0.0009392190253183472, + "loss": 7.5651, + "step": 1151200 + }, + { + "epoch": 4.690143183190768, + "grad_norm": 4.882888317108154, + "learning_rate": 0.000938832816835908, + "loss": 7.5667, + "step": 1151300 + }, + { + "epoch": 4.69055056121415, + "grad_norm": 4.196845054626465, + "learning_rate": 0.0009384466702724349, + "loss": 7.5767, + "step": 1151400 + }, + { + "epoch": 4.690957939237531, + "grad_norm": 7.795995712280273, + "learning_rate": 0.0009380605856431927, + "loss": 7.5496, + "step": 1151500 + }, + { + "epoch": 4.691365317260912, + "grad_norm": 4.899525165557861, + "learning_rate": 0.0009376745629634473, + "loss": 7.5885, + "step": 1151600 + }, + { + "epoch": 4.6917726952842935, + "grad_norm": 7.629162311553955, + "learning_rate": 0.0009372886022484589, + "loss": 7.5856, + "step": 1151700 + }, + { + "epoch": 4.692180073307675, + "grad_norm": 5.845236301422119, + "learning_rate": 0.0009369027035134867, + "loss": 7.5688, + "step": 1151800 + }, + { + "epoch": 4.692587451331057, + "grad_norm": 5.813826560974121, + "learning_rate": 0.0009365168667737876, + "loss": 7.5521, + "step": 1151900 + }, + { + "epoch": 4.692994829354438, + "grad_norm": 10.613510131835938, + "learning_rate": 0.0009361310920446154, + "loss": 7.5786, + "step": 1152000 + }, + { + "epoch": 4.692994829354438, + "eval_MaskedAccuracy": 0.5118310677127698, + "eval_loss": 1.5911270380020142, + "eval_runtime": 155.8643, + "eval_samples_per_second": 407.252, + "eval_steps_per_second": 1.591, + "step": 1152000 + }, + { + "epoch": 4.69340220737782, + "grad_norm": 4.008493900299072, + "learning_rate": 0.0009357453793412213, + "loss": 7.597, + "step": 1152100 + }, + { + "epoch": 4.693809585401201, + "grad_norm": 4.623472213745117, + "learning_rate": 0.0009353597286788566, + "loss": 7.5364, + "step": 1152200 + }, + { + "epoch": 4.694216963424583, + "grad_norm": 17.645742416381836, + "learning_rate": 0.000934974140072766, + "loss": 7.5699, + "step": 1152300 + }, + { + "epoch": 4.694624341447964, + "grad_norm": 9.060762405395508, + "learning_rate": 0.0009345886135381944, + "loss": 7.5748, + "step": 1152400 + }, + { + "epoch": 4.695031719471346, + "grad_norm": 6.78955602645874, + "learning_rate": 0.0009342031490903841, + "loss": 7.5464, + "step": 1152500 + }, + { + "epoch": 4.695439097494727, + "grad_norm": 3.801654815673828, + "learning_rate": 0.0009338177467445753, + "loss": 7.5737, + "step": 1152600 + }, + { + "epoch": 4.695846475518108, + "grad_norm": 8.123143196105957, + "learning_rate": 0.0009334324065160029, + "loss": 7.5737, + "step": 1152700 + }, + { + "epoch": 4.69625385354149, + "grad_norm": 5.954250335693359, + "learning_rate": 0.0009330471284199043, + "loss": 7.5386, + "step": 1152800 + }, + { + "epoch": 4.696661231564871, + "grad_norm": 16.418846130371094, + "learning_rate": 0.0009326619124715094, + "loss": 7.5939, + "step": 1152900 + }, + { + "epoch": 4.697068609588253, + "grad_norm": 7.963831901550293, + "learning_rate": 0.0009322767586860495, + "loss": 7.5794, + "step": 1153000 + }, + { + "epoch": 4.697068609588253, + "eval_MaskedAccuracy": 0.5114758930732738, + "eval_loss": 1.5934879779815674, + "eval_runtime": 180.4163, + "eval_samples_per_second": 351.831, + "eval_steps_per_second": 1.375, + "step": 1153000 + }, + { + "epoch": 4.697475987611634, + "grad_norm": 4.721909046173096, + "learning_rate": 0.0009318916670787493, + "loss": 7.5947, + "step": 1153100 + }, + { + "epoch": 4.697883365635016, + "grad_norm": 8.068775177001953, + "learning_rate": 0.0009315066376648359, + "loss": 7.5498, + "step": 1153200 + }, + { + "epoch": 4.698290743658397, + "grad_norm": 5.424630165100098, + "learning_rate": 0.0009311216704595312, + "loss": 7.5658, + "step": 1153300 + }, + { + "epoch": 4.698698121681779, + "grad_norm": 16.26315689086914, + "learning_rate": 0.0009307367654780546, + "loss": 7.5756, + "step": 1153400 + }, + { + "epoch": 4.69910549970516, + "grad_norm": 4.9227495193481445, + "learning_rate": 0.0009303519227356221, + "loss": 7.568, + "step": 1153500 + }, + { + "epoch": 4.699512877728542, + "grad_norm": 8.267171859741211, + "learning_rate": 0.0009299671422474505, + "loss": 7.562, + "step": 1153600 + }, + { + "epoch": 4.699920255751923, + "grad_norm": 5.523322105407715, + "learning_rate": 0.0009295824240287522, + "loss": 7.551, + "step": 1153700 + }, + { + "epoch": 4.700327633775305, + "grad_norm": 5.789429664611816, + "learning_rate": 0.0009291977680947362, + "loss": 7.5907, + "step": 1153800 + }, + { + "epoch": 4.700735011798686, + "grad_norm": 11.09715461730957, + "learning_rate": 0.0009288131744606112, + "loss": 7.5964, + "step": 1153900 + }, + { + "epoch": 4.701142389822067, + "grad_norm": 5.65620231628418, + "learning_rate": 0.000928428643141581, + "loss": 7.5622, + "step": 1154000 + }, + { + "epoch": 4.701142389822067, + "eval_MaskedAccuracy": 0.5111806981310292, + "eval_loss": 1.5976860523223877, + "eval_runtime": 154.3055, + "eval_samples_per_second": 411.366, + "eval_steps_per_second": 1.607, + "step": 1154000 + }, + { + "epoch": 4.701549767845449, + "grad_norm": 5.494576454162598, + "learning_rate": 0.0009280441741528481, + "loss": 7.5637, + "step": 1154100 + }, + { + "epoch": 4.70195714586883, + "grad_norm": 12.01253604888916, + "learning_rate": 0.0009276597675096129, + "loss": 7.5816, + "step": 1154200 + }, + { + "epoch": 4.702364523892212, + "grad_norm": 9.708366394042969, + "learning_rate": 0.0009272754232270741, + "loss": 7.5551, + "step": 1154300 + }, + { + "epoch": 4.702771901915593, + "grad_norm": 15.648822784423828, + "learning_rate": 0.0009268911413204264, + "loss": 7.5346, + "step": 1154400 + }, + { + "epoch": 4.703179279938975, + "grad_norm": 6.7168450355529785, + "learning_rate": 0.0009265069218048618, + "loss": 7.5566, + "step": 1154500 + }, + { + "epoch": 4.703586657962356, + "grad_norm": 7.411017417907715, + "learning_rate": 0.0009261227646955714, + "loss": 7.5763, + "step": 1154600 + }, + { + "epoch": 4.703994035985738, + "grad_norm": 6.240413188934326, + "learning_rate": 0.0009257386700077413, + "loss": 7.5579, + "step": 1154700 + }, + { + "epoch": 4.704401414009119, + "grad_norm": 4.57463264465332, + "learning_rate": 0.0009253546377565591, + "loss": 7.5935, + "step": 1154800 + }, + { + "epoch": 4.704808792032501, + "grad_norm": 3.5772671699523926, + "learning_rate": 0.000924970667957206, + "loss": 7.5578, + "step": 1154900 + }, + { + "epoch": 4.705216170055882, + "grad_norm": 4.515414714813232, + "learning_rate": 0.0009245867606248639, + "loss": 7.5838, + "step": 1155000 + }, + { + "epoch": 4.705216170055882, + "eval_MaskedAccuracy": 0.5115335460324503, + "eval_loss": 1.6010119915008545, + "eval_runtime": 153.3033, + "eval_samples_per_second": 414.055, + "eval_steps_per_second": 1.618, + "step": 1155000 + }, + { + "epoch": 4.705623548079264, + "grad_norm": 18.80776023864746, + "learning_rate": 0.0009242029157747076, + "loss": 7.5651, + "step": 1155100 + }, + { + "epoch": 4.706030926102645, + "grad_norm": 4.243402481079102, + "learning_rate": 0.0009238191334219159, + "loss": 7.5421, + "step": 1155200 + }, + { + "epoch": 4.706438304126026, + "grad_norm": 10.762371063232422, + "learning_rate": 0.000923435413581661, + "loss": 7.5476, + "step": 1155300 + }, + { + "epoch": 4.706845682149408, + "grad_norm": 8.20543384552002, + "learning_rate": 0.0009230517562691131, + "loss": 7.5626, + "step": 1155400 + }, + { + "epoch": 4.707253060172789, + "grad_norm": 6.052391052246094, + "learning_rate": 0.0009226681614994401, + "loss": 7.573, + "step": 1155500 + }, + { + "epoch": 4.707660438196171, + "grad_norm": 3.0920984745025635, + "learning_rate": 0.0009222846292878069, + "loss": 7.5171, + "step": 1155600 + }, + { + "epoch": 4.708067816219552, + "grad_norm": 13.645121574401855, + "learning_rate": 0.0009219011596493782, + "loss": 7.5465, + "step": 1155700 + }, + { + "epoch": 4.708475194242934, + "grad_norm": 5.219463348388672, + "learning_rate": 0.0009215177525993128, + "loss": 7.5827, + "step": 1155800 + }, + { + "epoch": 4.708882572266315, + "grad_norm": 6.038544654846191, + "learning_rate": 0.0009211344081527708, + "loss": 7.5518, + "step": 1155900 + }, + { + "epoch": 4.709289950289697, + "grad_norm": 11.335884094238281, + "learning_rate": 0.0009207511263249059, + "loss": 7.5398, + "step": 1156000 + }, + { + "epoch": 4.709289950289697, + "eval_MaskedAccuracy": 0.5115045317851677, + "eval_loss": 1.5966569185256958, + "eval_runtime": 168.76, + "eval_samples_per_second": 376.132, + "eval_steps_per_second": 1.47, + "step": 1156000 + }, + { + "epoch": 4.709697328313078, + "grad_norm": 11.417588233947754, + "learning_rate": 0.0009203679071308713, + "loss": 7.5605, + "step": 1156100 + }, + { + "epoch": 4.71010470633646, + "grad_norm": 9.42202377319336, + "learning_rate": 0.0009199847505858197, + "loss": 7.5569, + "step": 1156200 + }, + { + "epoch": 4.710512084359841, + "grad_norm": 13.6173677444458, + "learning_rate": 0.0009196016567048987, + "loss": 7.5665, + "step": 1156300 + }, + { + "epoch": 4.710919462383223, + "grad_norm": 5.46528434753418, + "learning_rate": 0.0009192186255032525, + "loss": 7.5827, + "step": 1156400 + }, + { + "epoch": 4.711326840406604, + "grad_norm": 9.322909355163574, + "learning_rate": 0.0009188356569960253, + "loss": 7.581, + "step": 1156500 + }, + { + "epoch": 4.711734218429985, + "grad_norm": 11.99737548828125, + "learning_rate": 0.000918452751198359, + "loss": 7.5692, + "step": 1156600 + }, + { + "epoch": 4.712141596453367, + "grad_norm": 13.795363426208496, + "learning_rate": 0.0009180699081253903, + "loss": 7.5569, + "step": 1156700 + }, + { + "epoch": 4.712548974476748, + "grad_norm": 6.006650924682617, + "learning_rate": 0.0009176871277922558, + "loss": 7.5583, + "step": 1156800 + }, + { + "epoch": 4.71295635250013, + "grad_norm": 5.746901512145996, + "learning_rate": 0.0009173044102140893, + "loss": 7.5764, + "step": 1156900 + }, + { + "epoch": 4.713363730523511, + "grad_norm": 6.087377071380615, + "learning_rate": 0.0009169217554060214, + "loss": 7.5724, + "step": 1157000 + }, + { + "epoch": 4.713363730523511, + "eval_MaskedAccuracy": 0.5118616903480597, + "eval_loss": 1.590957522392273, + "eval_runtime": 166.5876, + "eval_samples_per_second": 381.037, + "eval_steps_per_second": 1.489, + "step": 1157000 + }, + { + "epoch": 4.713771108546893, + "grad_norm": 5.474801540374756, + "learning_rate": 0.0009165391633831785, + "loss": 7.5729, + "step": 1157100 + }, + { + "epoch": 4.714178486570274, + "grad_norm": 6.61529541015625, + "learning_rate": 0.0009161566341606887, + "loss": 7.5782, + "step": 1157200 + }, + { + "epoch": 4.714585864593656, + "grad_norm": 8.765612602233887, + "learning_rate": 0.0009157741677536754, + "loss": 7.5509, + "step": 1157300 + }, + { + "epoch": 4.714993242617037, + "grad_norm": 3.340832471847534, + "learning_rate": 0.0009153917641772582, + "loss": 7.5786, + "step": 1157400 + }, + { + "epoch": 4.715400620640419, + "grad_norm": 3.8586692810058594, + "learning_rate": 0.0009150094234465567, + "loss": 7.5683, + "step": 1157500 + }, + { + "epoch": 4.7158079986638, + "grad_norm": 10.54784870147705, + "learning_rate": 0.000914627145576686, + "loss": 7.5576, + "step": 1157600 + }, + { + "epoch": 4.716215376687181, + "grad_norm": 8.375208854675293, + "learning_rate": 0.000914244930582762, + "loss": 7.5488, + "step": 1157700 + }, + { + "epoch": 4.7166227547105635, + "grad_norm": 8.476005554199219, + "learning_rate": 0.0009138627784798932, + "loss": 7.5585, + "step": 1157800 + }, + { + "epoch": 4.717030132733944, + "grad_norm": 6.5658721923828125, + "learning_rate": 0.0009134806892831892, + "loss": 7.5327, + "step": 1157900 + }, + { + "epoch": 4.717437510757326, + "grad_norm": 4.999338150024414, + "learning_rate": 0.0009130986630077549, + "loss": 7.5549, + "step": 1158000 + }, + { + "epoch": 4.717437510757326, + "eval_MaskedAccuracy": 0.5116032715326428, + "eval_loss": 1.596752405166626, + "eval_runtime": 218.3894, + "eval_samples_per_second": 290.655, + "eval_steps_per_second": 1.136, + "step": 1158000 + }, + { + "epoch": 4.717844888780707, + "grad_norm": 4.15541410446167, + "learning_rate": 0.0009127166996686937, + "loss": 7.5502, + "step": 1158100 + }, + { + "epoch": 4.718252266804089, + "grad_norm": 9.1082763671875, + "learning_rate": 0.0009123347992811084, + "loss": 7.5645, + "step": 1158200 + }, + { + "epoch": 4.71865964482747, + "grad_norm": 7.027690410614014, + "learning_rate": 0.0009119529618600974, + "loss": 7.5473, + "step": 1158300 + }, + { + "epoch": 4.719067022850852, + "grad_norm": 4.834759712219238, + "learning_rate": 0.0009115711874207551, + "loss": 7.5449, + "step": 1158400 + }, + { + "epoch": 4.719474400874233, + "grad_norm": 4.868383407592773, + "learning_rate": 0.0009111894759781747, + "loss": 7.5653, + "step": 1158500 + }, + { + "epoch": 4.719881778897615, + "grad_norm": 6.506927490234375, + "learning_rate": 0.00091080782754745, + "loss": 7.5646, + "step": 1158600 + }, + { + "epoch": 4.720289156920996, + "grad_norm": 4.405208587646484, + "learning_rate": 0.0009104262421436677, + "loss": 7.5832, + "step": 1158700 + }, + { + "epoch": 4.720696534944378, + "grad_norm": 8.431148529052734, + "learning_rate": 0.0009100447197819151, + "loss": 7.5627, + "step": 1158800 + }, + { + "epoch": 4.721103912967759, + "grad_norm": 6.002216815948486, + "learning_rate": 0.0009096632604772751, + "loss": 7.5574, + "step": 1158900 + }, + { + "epoch": 4.72151129099114, + "grad_norm": 4.2939229011535645, + "learning_rate": 0.0009092818642448285, + "loss": 7.5764, + "step": 1159000 + }, + { + "epoch": 4.72151129099114, + "eval_MaskedAccuracy": 0.5107660818319678, + "eval_loss": 1.598768949508667, + "eval_runtime": 165.1411, + "eval_samples_per_second": 384.374, + "eval_steps_per_second": 1.502, + "step": 1159000 + }, + { + "epoch": 4.7219186690145225, + "grad_norm": 4.288996696472168, + "learning_rate": 0.0009089005310996544, + "loss": 7.5438, + "step": 1159100 + }, + { + "epoch": 4.722326047037903, + "grad_norm": 5.802402496337891, + "learning_rate": 0.0009085192610568289, + "loss": 7.57, + "step": 1159200 + }, + { + "epoch": 4.722733425061285, + "grad_norm": 11.19207763671875, + "learning_rate": 0.0009081380541314263, + "loss": 7.5694, + "step": 1159300 + }, + { + "epoch": 4.723140803084666, + "grad_norm": 8.473098754882812, + "learning_rate": 0.0009077569103385164, + "loss": 7.5503, + "step": 1159400 + }, + { + "epoch": 4.723548181108048, + "grad_norm": 4.782020568847656, + "learning_rate": 0.0009073758296931687, + "loss": 7.5529, + "step": 1159500 + }, + { + "epoch": 4.723955559131429, + "grad_norm": 3.8732857704162598, + "learning_rate": 0.0009069948122104485, + "loss": 7.5557, + "step": 1159600 + }, + { + "epoch": 4.724362937154811, + "grad_norm": 5.217780113220215, + "learning_rate": 0.0009066138579054214, + "loss": 7.5645, + "step": 1159700 + }, + { + "epoch": 4.724770315178192, + "grad_norm": 7.695767402648926, + "learning_rate": 0.0009062329667931455, + "loss": 7.5688, + "step": 1159800 + }, + { + "epoch": 4.725177693201574, + "grad_norm": 13.585648536682129, + "learning_rate": 0.0009058521388886819, + "loss": 7.5559, + "step": 1159900 + }, + { + "epoch": 4.725585071224955, + "grad_norm": 5.548020362854004, + "learning_rate": 0.0009054713742070851, + "loss": 7.5506, + "step": 1160000 + }, + { + "epoch": 4.725585071224955, + "eval_MaskedAccuracy": 0.5112732356047921, + "eval_loss": 1.60223388671875, + "eval_runtime": 165.0537, + "eval_samples_per_second": 384.578, + "eval_steps_per_second": 1.503, + "step": 1160000 + }, + { + "epoch": 4.725992449248337, + "grad_norm": 3.6210856437683105, + "learning_rate": 0.00090509067276341, + "loss": 7.5474, + "step": 1160100 + }, + { + "epoch": 4.7263998272717185, + "grad_norm": 8.424483299255371, + "learning_rate": 0.0009047100345727088, + "loss": 7.5553, + "step": 1160200 + }, + { + "epoch": 4.726807205295099, + "grad_norm": 15.483039855957031, + "learning_rate": 0.0009043294596500279, + "loss": 7.5684, + "step": 1160300 + }, + { + "epoch": 4.727214583318481, + "grad_norm": 6.544163703918457, + "learning_rate": 0.0009039489480104152, + "loss": 7.5656, + "step": 1160400 + }, + { + "epoch": 4.727621961341862, + "grad_norm": 6.401214599609375, + "learning_rate": 0.0009035684996689137, + "loss": 7.5752, + "step": 1160500 + }, + { + "epoch": 4.728029339365244, + "grad_norm": 7.120314598083496, + "learning_rate": 0.000903188114640565, + "loss": 7.5552, + "step": 1160600 + }, + { + "epoch": 4.728436717388625, + "grad_norm": 11.215267181396484, + "learning_rate": 0.0009028077929404073, + "loss": 7.5178, + "step": 1160700 + }, + { + "epoch": 4.728844095412007, + "grad_norm": 8.733551025390625, + "learning_rate": 0.0009024275345834758, + "loss": 7.5551, + "step": 1160800 + }, + { + "epoch": 4.729251473435388, + "grad_norm": 6.027173042297363, + "learning_rate": 0.0009020473395848047, + "loss": 7.549, + "step": 1160900 + }, + { + "epoch": 4.72965885145877, + "grad_norm": 4.815136909484863, + "learning_rate": 0.0009016672079594246, + "loss": 7.5705, + "step": 1161000 + }, + { + "epoch": 4.72965885145877, + "eval_MaskedAccuracy": 0.5118587553350832, + "eval_loss": 1.596041202545166, + "eval_runtime": 173.4566, + "eval_samples_per_second": 365.947, + "eval_steps_per_second": 1.43, + "step": 1161000 + }, + { + "epoch": 4.730066229482151, + "grad_norm": 8.049763679504395, + "learning_rate": 0.0009012871397223662, + "loss": 7.5641, + "step": 1161100 + }, + { + "epoch": 4.730473607505533, + "grad_norm": 9.600907325744629, + "learning_rate": 0.0009009071348886541, + "loss": 7.5797, + "step": 1161200 + }, + { + "epoch": 4.7308809855289145, + "grad_norm": 3.3699300289154053, + "learning_rate": 0.0009005271934733123, + "loss": 7.5313, + "step": 1161300 + }, + { + "epoch": 4.731288363552296, + "grad_norm": 4.550024509429932, + "learning_rate": 0.0009001473154913624, + "loss": 7.5517, + "step": 1161400 + }, + { + "epoch": 4.7316957415756775, + "grad_norm": 17.960681915283203, + "learning_rate": 0.000899767500957822, + "loss": 7.5803, + "step": 1161500 + }, + { + "epoch": 4.732103119599058, + "grad_norm": 8.197354316711426, + "learning_rate": 0.0008993877498877079, + "loss": 7.5665, + "step": 1161600 + }, + { + "epoch": 4.73251049762244, + "grad_norm": 5.120255947113037, + "learning_rate": 0.0008990080622960325, + "loss": 7.5399, + "step": 1161700 + }, + { + "epoch": 4.732917875645821, + "grad_norm": 3.9899091720581055, + "learning_rate": 0.0008986284381978085, + "loss": 7.5394, + "step": 1161800 + }, + { + "epoch": 4.733325253669203, + "grad_norm": 12.039031982421875, + "learning_rate": 0.0008982488776080437, + "loss": 7.57, + "step": 1161900 + }, + { + "epoch": 4.733732631692584, + "grad_norm": 4.0215911865234375, + "learning_rate": 0.000897869380541744, + "loss": 7.5371, + "step": 1162000 + }, + { + "epoch": 4.733732631692584, + "eval_MaskedAccuracy": 0.5118360792934855, + "eval_loss": 1.5812571048736572, + "eval_runtime": 150.9453, + "eval_samples_per_second": 420.523, + "eval_steps_per_second": 1.643, + "step": 1162000 + }, + { + "epoch": 4.734140009715966, + "grad_norm": 10.553079605102539, + "learning_rate": 0.0008974899470139132, + "loss": 7.5778, + "step": 1162100 + }, + { + "epoch": 4.734547387739347, + "grad_norm": 3.1538913249969482, + "learning_rate": 0.0008971105770395535, + "loss": 7.5433, + "step": 1162200 + }, + { + "epoch": 4.734954765762729, + "grad_norm": 4.568304061889648, + "learning_rate": 0.000896731270633662, + "loss": 7.569, + "step": 1162300 + }, + { + "epoch": 4.73536214378611, + "grad_norm": 7.588132858276367, + "learning_rate": 0.0008963520278112351, + "loss": 7.5664, + "step": 1162400 + }, + { + "epoch": 4.735769521809492, + "grad_norm": 7.682785987854004, + "learning_rate": 0.0008959728485872658, + "loss": 7.5311, + "step": 1162500 + }, + { + "epoch": 4.7361768998328735, + "grad_norm": 6.86749267578125, + "learning_rate": 0.0008955937329767455, + "loss": 7.5475, + "step": 1162600 + }, + { + "epoch": 4.736584277856254, + "grad_norm": 12.38428020477295, + "learning_rate": 0.0008952146809946629, + "loss": 7.5966, + "step": 1162700 + }, + { + "epoch": 4.7369916558796366, + "grad_norm": 12.370417594909668, + "learning_rate": 0.0008948356926560034, + "loss": 7.5645, + "step": 1162800 + }, + { + "epoch": 4.737399033903017, + "grad_norm": 3.9510908126831055, + "learning_rate": 0.0008944567679757506, + "loss": 7.548, + "step": 1162900 + }, + { + "epoch": 4.737806411926399, + "grad_norm": 7.90590763092041, + "learning_rate": 0.0008940779069688855, + "loss": 7.5456, + "step": 1163000 + }, + { + "epoch": 4.737806411926399, + "eval_MaskedAccuracy": 0.5119326063557618, + "eval_loss": 1.586570382118225, + "eval_runtime": 154.9261, + "eval_samples_per_second": 409.718, + "eval_steps_per_second": 1.601, + "step": 1163000 + }, + { + "epoch": 4.73821378994978, + "grad_norm": 5.847414970397949, + "learning_rate": 0.000893699109650387, + "loss": 7.559, + "step": 1163100 + }, + { + "epoch": 4.738621167973162, + "grad_norm": 10.857290267944336, + "learning_rate": 0.0008933203760352303, + "loss": 7.5917, + "step": 1163200 + }, + { + "epoch": 4.739028545996543, + "grad_norm": 7.471684455871582, + "learning_rate": 0.0008929417061383891, + "loss": 7.5403, + "step": 1163300 + }, + { + "epoch": 4.739435924019925, + "grad_norm": 7.344918727874756, + "learning_rate": 0.0008925630999748348, + "loss": 7.5972, + "step": 1163400 + }, + { + "epoch": 4.739843302043306, + "grad_norm": 11.058645248413086, + "learning_rate": 0.0008921845575595341, + "loss": 7.5894, + "step": 1163500 + }, + { + "epoch": 4.740250680066688, + "grad_norm": 15.42564582824707, + "learning_rate": 0.0008918060789074541, + "loss": 7.5493, + "step": 1163600 + }, + { + "epoch": 4.7406580580900695, + "grad_norm": 12.14067554473877, + "learning_rate": 0.0008914276640335578, + "loss": 7.5389, + "step": 1163700 + }, + { + "epoch": 4.741065436113451, + "grad_norm": 9.472207069396973, + "learning_rate": 0.000891049312952807, + "loss": 7.5601, + "step": 1163800 + }, + { + "epoch": 4.7414728141368325, + "grad_norm": 10.546226501464844, + "learning_rate": 0.0008906710256801578, + "loss": 7.5863, + "step": 1163900 + }, + { + "epoch": 4.741880192160213, + "grad_norm": 6.95032262802124, + "learning_rate": 0.0008902928022305664, + "loss": 7.5769, + "step": 1164000 + }, + { + "epoch": 4.741880192160213, + "eval_MaskedAccuracy": 0.5117164427856976, + "eval_loss": 1.5969483852386475, + "eval_runtime": 170.8262, + "eval_samples_per_second": 371.582, + "eval_steps_per_second": 1.452, + "step": 1164000 + }, + { + "epoch": 4.742287570183595, + "grad_norm": 9.718832969665527, + "learning_rate": 0.0008899146426189865, + "loss": 7.5576, + "step": 1164100 + }, + { + "epoch": 4.742694948206976, + "grad_norm": 11.1882963180542, + "learning_rate": 0.0008895365468603693, + "loss": 7.5773, + "step": 1164200 + }, + { + "epoch": 4.743102326230358, + "grad_norm": 8.55040168762207, + "learning_rate": 0.0008891585149696628, + "loss": 7.5588, + "step": 1164300 + }, + { + "epoch": 4.743509704253739, + "grad_norm": 13.320935249328613, + "learning_rate": 0.0008887805469618123, + "loss": 7.5798, + "step": 1164400 + }, + { + "epoch": 4.743917082277121, + "grad_norm": 7.613979816436768, + "learning_rate": 0.0008884026428517607, + "loss": 7.5466, + "step": 1164500 + }, + { + "epoch": 4.744324460300502, + "grad_norm": 8.136162757873535, + "learning_rate": 0.0008880248026544489, + "loss": 7.59, + "step": 1164600 + }, + { + "epoch": 4.744731838323884, + "grad_norm": 8.828904151916504, + "learning_rate": 0.0008876470263848153, + "loss": 7.5592, + "step": 1164700 + }, + { + "epoch": 4.745139216347265, + "grad_norm": 5.564899444580078, + "learning_rate": 0.0008872693140577958, + "loss": 7.56, + "step": 1164800 + }, + { + "epoch": 4.745546594370647, + "grad_norm": 8.49870777130127, + "learning_rate": 0.0008868916656883212, + "loss": 7.567, + "step": 1164900 + }, + { + "epoch": 4.7459539723940285, + "grad_norm": 9.09511661529541, + "learning_rate": 0.000886514081291322, + "loss": 7.5508, + "step": 1165000 + }, + { + "epoch": 4.7459539723940285, + "eval_MaskedAccuracy": 0.5113661104288739, + "eval_loss": 1.6031959056854248, + "eval_runtime": 190.2801, + "eval_samples_per_second": 333.593, + "eval_steps_per_second": 1.303, + "step": 1165000 + }, + { + "epoch": 4.74636135041741, + "grad_norm": 7.04810905456543, + "learning_rate": 0.0008861365608817294, + "loss": 7.5461, + "step": 1165100 + }, + { + "epoch": 4.746768728440792, + "grad_norm": 6.270197868347168, + "learning_rate": 0.0008857591044744663, + "loss": 7.581, + "step": 1165200 + }, + { + "epoch": 4.747176106464172, + "grad_norm": 12.531331062316895, + "learning_rate": 0.0008853817120844558, + "loss": 7.5635, + "step": 1165300 + }, + { + "epoch": 4.747583484487554, + "grad_norm": 4.336494445800781, + "learning_rate": 0.000885004383726619, + "loss": 7.5655, + "step": 1165400 + }, + { + "epoch": 4.747990862510935, + "grad_norm": 5.909377574920654, + "learning_rate": 0.0008846271194158728, + "loss": 7.5881, + "step": 1165500 + }, + { + "epoch": 4.748398240534317, + "grad_norm": 5.464470386505127, + "learning_rate": 0.0008842499191671333, + "loss": 7.5237, + "step": 1165600 + }, + { + "epoch": 4.748805618557698, + "grad_norm": 6.150482177734375, + "learning_rate": 0.0008838727829953133, + "loss": 7.5772, + "step": 1165700 + }, + { + "epoch": 4.74921299658108, + "grad_norm": 18.878080368041992, + "learning_rate": 0.0008834957109153224, + "loss": 7.5897, + "step": 1165800 + }, + { + "epoch": 4.749620374604461, + "grad_norm": 5.610697269439697, + "learning_rate": 0.0008831187029420681, + "loss": 7.5657, + "step": 1165900 + }, + { + "epoch": 4.750027752627843, + "grad_norm": 10.221170425415039, + "learning_rate": 0.0008827417590904548, + "loss": 7.5678, + "step": 1166000 + }, + { + "epoch": 4.750027752627843, + "eval_MaskedAccuracy": 0.5113002197967617, + "eval_loss": 1.5943676233291626, + "eval_runtime": 164.9484, + "eval_samples_per_second": 384.823, + "eval_steps_per_second": 1.504, + "step": 1166000 + }, + { + "epoch": 4.7504351306512245, + "grad_norm": 8.151117324829102, + "learning_rate": 0.0008823648793753875, + "loss": 7.5369, + "step": 1166100 + }, + { + "epoch": 4.750842508674606, + "grad_norm": 3.9326717853546143, + "learning_rate": 0.0008819880638117646, + "loss": 7.5922, + "step": 1166200 + }, + { + "epoch": 4.7512498866979875, + "grad_norm": 6.830138206481934, + "learning_rate": 0.0008816113124144843, + "loss": 7.5527, + "step": 1166300 + }, + { + "epoch": 4.751657264721369, + "grad_norm": 19.084400177001953, + "learning_rate": 0.0008812346251984412, + "loss": 7.5512, + "step": 1166400 + }, + { + "epoch": 4.752064642744751, + "grad_norm": 11.66727066040039, + "learning_rate": 0.0008808580021785284, + "loss": 7.5515, + "step": 1166500 + }, + { + "epoch": 4.752472020768131, + "grad_norm": 4.68572998046875, + "learning_rate": 0.0008804814433696352, + "loss": 7.5802, + "step": 1166600 + }, + { + "epoch": 4.752879398791513, + "grad_norm": 15.044259071350098, + "learning_rate": 0.0008801049487866491, + "loss": 7.5799, + "step": 1166700 + }, + { + "epoch": 4.753286776814894, + "grad_norm": 4.469608783721924, + "learning_rate": 0.0008797285184444543, + "loss": 7.5697, + "step": 1166800 + }, + { + "epoch": 4.753694154838276, + "grad_norm": 10.37489128112793, + "learning_rate": 0.0008793521523579333, + "loss": 7.5539, + "step": 1166900 + }, + { + "epoch": 4.754101532861657, + "grad_norm": 5.180149078369141, + "learning_rate": 0.0008789758505419663, + "loss": 7.5689, + "step": 1167000 + }, + { + "epoch": 4.754101532861657, + "eval_MaskedAccuracy": 0.5114207717471436, + "eval_loss": 1.5994853973388672, + "eval_runtime": 162.4343, + "eval_samples_per_second": 390.78, + "eval_steps_per_second": 1.527, + "step": 1167000 + }, + { + "epoch": 4.754508910885039, + "grad_norm": 10.346702575683594, + "learning_rate": 0.000878599613011431, + "loss": 7.5567, + "step": 1167100 + }, + { + "epoch": 4.75491628890842, + "grad_norm": 8.587042808532715, + "learning_rate": 0.0008782234397812026, + "loss": 7.5207, + "step": 1167200 + }, + { + "epoch": 4.755323666931802, + "grad_norm": 3.1612582206726074, + "learning_rate": 0.0008778473308661511, + "loss": 7.5726, + "step": 1167300 + }, + { + "epoch": 4.7557310449551835, + "grad_norm": 4.133459568023682, + "learning_rate": 0.000877471286281146, + "loss": 7.5411, + "step": 1167400 + }, + { + "epoch": 4.756138422978565, + "grad_norm": 12.26366138458252, + "learning_rate": 0.0008770953060410577, + "loss": 7.5428, + "step": 1167500 + }, + { + "epoch": 4.756545801001947, + "grad_norm": 8.328948974609375, + "learning_rate": 0.0008767193901607479, + "loss": 7.5525, + "step": 1167600 + }, + { + "epoch": 4.756953179025327, + "grad_norm": 12.7047119140625, + "learning_rate": 0.0008763435386550798, + "loss": 7.5579, + "step": 1167700 + }, + { + "epoch": 4.75736055704871, + "grad_norm": 6.194194316864014, + "learning_rate": 0.0008759677515389119, + "loss": 7.5937, + "step": 1167800 + }, + { + "epoch": 4.75776793507209, + "grad_norm": 7.560429096221924, + "learning_rate": 0.000875592028827102, + "loss": 7.5618, + "step": 1167900 + }, + { + "epoch": 4.758175313095472, + "grad_norm": 11.472549438476562, + "learning_rate": 0.000875216370534502, + "loss": 7.5542, + "step": 1168000 + }, + { + "epoch": 4.758175313095472, + "eval_MaskedAccuracy": 0.5114655179978685, + "eval_loss": 1.5953867435455322, + "eval_runtime": 164.6842, + "eval_samples_per_second": 385.441, + "eval_steps_per_second": 1.506, + "step": 1168000 + }, + { + "epoch": 4.758582691118853, + "grad_norm": 5.689689636230469, + "learning_rate": 0.0008748407766759673, + "loss": 7.5692, + "step": 1168100 + }, + { + "epoch": 4.758990069142235, + "grad_norm": 5.820748805999756, + "learning_rate": 0.0008744652472663456, + "loss": 7.5568, + "step": 1168200 + }, + { + "epoch": 4.759397447165616, + "grad_norm": 4.162635326385498, + "learning_rate": 0.0008740897823204831, + "loss": 7.5429, + "step": 1168300 + }, + { + "epoch": 4.759804825188998, + "grad_norm": 8.76908016204834, + "learning_rate": 0.0008737143818532241, + "loss": 7.5525, + "step": 1168400 + }, + { + "epoch": 4.7602122032123795, + "grad_norm": 6.940735816955566, + "learning_rate": 0.0008733390458794099, + "loss": 7.6008, + "step": 1168500 + }, + { + "epoch": 4.760619581235761, + "grad_norm": 9.81445026397705, + "learning_rate": 0.0008729637744138795, + "loss": 7.5677, + "step": 1168600 + }, + { + "epoch": 4.7610269592591425, + "grad_norm": 3.841867446899414, + "learning_rate": 0.0008725885674714701, + "loss": 7.5302, + "step": 1168700 + }, + { + "epoch": 4.761434337282524, + "grad_norm": 6.942512512207031, + "learning_rate": 0.0008722134250670156, + "loss": 7.5518, + "step": 1168800 + }, + { + "epoch": 4.761841715305906, + "grad_norm": 6.805759906768799, + "learning_rate": 0.0008718383472153469, + "loss": 7.5466, + "step": 1168900 + }, + { + "epoch": 4.762249093329286, + "grad_norm": 7.043669700622559, + "learning_rate": 0.0008714633339312923, + "loss": 7.5593, + "step": 1169000 + }, + { + "epoch": 4.762249093329286, + "eval_MaskedAccuracy": 0.5117339740654038, + "eval_loss": 1.5920907258987427, + "eval_runtime": 199.7228, + "eval_samples_per_second": 317.82, + "eval_steps_per_second": 1.242, + "step": 1169000 + }, + { + "epoch": 4.762656471352668, + "grad_norm": 8.697935104370117, + "learning_rate": 0.000871088385229681, + "loss": 7.5533, + "step": 1169100 + }, + { + "epoch": 4.763063849376049, + "grad_norm": 8.470230102539062, + "learning_rate": 0.0008707135011253334, + "loss": 7.5727, + "step": 1169200 + }, + { + "epoch": 4.763471227399431, + "grad_norm": 10.522871971130371, + "learning_rate": 0.0008703386816330725, + "loss": 7.5232, + "step": 1169300 + }, + { + "epoch": 4.763878605422812, + "grad_norm": 6.342298984527588, + "learning_rate": 0.0008699639267677165, + "loss": 7.5681, + "step": 1169400 + }, + { + "epoch": 4.764285983446194, + "grad_norm": 9.163177490234375, + "learning_rate": 0.0008695892365440817, + "loss": 7.5368, + "step": 1169500 + }, + { + "epoch": 4.7646933614695754, + "grad_norm": 5.029087543487549, + "learning_rate": 0.0008692146109769804, + "loss": 7.5465, + "step": 1169600 + }, + { + "epoch": 4.765100739492957, + "grad_norm": 7.584436893463135, + "learning_rate": 0.0008688400500812244, + "loss": 7.5663, + "step": 1169700 + }, + { + "epoch": 4.7655081175163385, + "grad_norm": 5.353863716125488, + "learning_rate": 0.000868465553871622, + "loss": 7.5442, + "step": 1169800 + }, + { + "epoch": 4.76591549553972, + "grad_norm": 14.737866401672363, + "learning_rate": 0.0008680911223629781, + "loss": 7.5757, + "step": 1169900 + }, + { + "epoch": 4.766322873563102, + "grad_norm": 4.93510627746582, + "learning_rate": 0.0008677167555700973, + "loss": 7.573, + "step": 1170000 + }, + { + "epoch": 4.766322873563102, + "eval_MaskedAccuracy": 0.5113181159684853, + "eval_loss": 1.595564842224121, + "eval_runtime": 162.6779, + "eval_samples_per_second": 390.194, + "eval_steps_per_second": 1.524, + "step": 1170000 + }, + { + "epoch": 4.766730251586483, + "grad_norm": 14.309826850891113, + "learning_rate": 0.0008673424535077804, + "loss": 7.5814, + "step": 1170100 + }, + { + "epoch": 4.767137629609865, + "grad_norm": 3.418689727783203, + "learning_rate": 0.0008669682161908257, + "loss": 7.5516, + "step": 1170200 + }, + { + "epoch": 4.767545007633245, + "grad_norm": 16.931493759155273, + "learning_rate": 0.000866594043634028, + "loss": 7.5483, + "step": 1170300 + }, + { + "epoch": 4.767952385656627, + "grad_norm": 3.264801263809204, + "learning_rate": 0.0008662199358521804, + "loss": 7.5704, + "step": 1170400 + }, + { + "epoch": 4.768359763680008, + "grad_norm": 11.889373779296875, + "learning_rate": 0.0008658458928600735, + "loss": 7.5637, + "step": 1170500 + }, + { + "epoch": 4.76876714170339, + "grad_norm": 5.08151388168335, + "learning_rate": 0.0008654719146724953, + "loss": 7.5387, + "step": 1170600 + }, + { + "epoch": 4.769174519726771, + "grad_norm": 8.597216606140137, + "learning_rate": 0.0008650980013042308, + "loss": 7.573, + "step": 1170700 + }, + { + "epoch": 4.769581897750153, + "grad_norm": 4.913052558898926, + "learning_rate": 0.0008647241527700637, + "loss": 7.5568, + "step": 1170800 + }, + { + "epoch": 4.7699892757735345, + "grad_norm": 20.04241180419922, + "learning_rate": 0.0008643503690847726, + "loss": 7.5769, + "step": 1170900 + }, + { + "epoch": 4.770396653796916, + "grad_norm": 19.256046295166016, + "learning_rate": 0.000863976650263137, + "loss": 7.5777, + "step": 1171000 + }, + { + "epoch": 4.770396653796916, + "eval_MaskedAccuracy": 0.5114205840526601, + "eval_loss": 1.5949627161026, + "eval_runtime": 160.7449, + "eval_samples_per_second": 394.887, + "eval_steps_per_second": 1.543, + "step": 1171000 + }, + { + "epoch": 4.7708040318202976, + "grad_norm": 6.5651960372924805, + "learning_rate": 0.0008636029963199313, + "loss": 7.5617, + "step": 1171100 + }, + { + "epoch": 4.771211409843679, + "grad_norm": 4.102616786956787, + "learning_rate": 0.0008632294072699271, + "loss": 7.5564, + "step": 1171200 + }, + { + "epoch": 4.771618787867061, + "grad_norm": 9.836995124816895, + "learning_rate": 0.0008628558831278951, + "loss": 7.5346, + "step": 1171300 + }, + { + "epoch": 4.772026165890442, + "grad_norm": 6.117199897766113, + "learning_rate": 0.0008624824239086028, + "loss": 7.5615, + "step": 1171400 + }, + { + "epoch": 4.772433543913824, + "grad_norm": 8.702661514282227, + "learning_rate": 0.0008621090296268162, + "loss": 7.5579, + "step": 1171500 + }, + { + "epoch": 4.772840921937204, + "grad_norm": 8.160992622375488, + "learning_rate": 0.0008617357002972964, + "loss": 7.5548, + "step": 1171600 + }, + { + "epoch": 4.773248299960586, + "grad_norm": 4.512524604797363, + "learning_rate": 0.0008613624359348026, + "loss": 7.5602, + "step": 1171700 + }, + { + "epoch": 4.773655677983967, + "grad_norm": 6.5624895095825195, + "learning_rate": 0.0008609892365540935, + "loss": 7.567, + "step": 1171800 + }, + { + "epoch": 4.774063056007349, + "grad_norm": 7.824966907501221, + "learning_rate": 0.0008606161021699222, + "loss": 7.533, + "step": 1171900 + }, + { + "epoch": 4.7744704340307305, + "grad_norm": 8.146249771118164, + "learning_rate": 0.0008602430327970409, + "loss": 7.5275, + "step": 1172000 + }, + { + "epoch": 4.7744704340307305, + "eval_MaskedAccuracy": 0.511129574165486, + "eval_loss": 1.586852788925171, + "eval_runtime": 173.3589, + "eval_samples_per_second": 366.154, + "eval_steps_per_second": 1.431, + "step": 1172000 + }, + { + "epoch": 4.774877812054112, + "grad_norm": 9.132033348083496, + "learning_rate": 0.0008598700284501994, + "loss": 7.5793, + "step": 1172100 + }, + { + "epoch": 4.7752851900774935, + "grad_norm": 3.6502768993377686, + "learning_rate": 0.0008594970891441445, + "loss": 7.5517, + "step": 1172200 + }, + { + "epoch": 4.775692568100875, + "grad_norm": 4.238583564758301, + "learning_rate": 0.0008591242148936199, + "loss": 7.5552, + "step": 1172300 + }, + { + "epoch": 4.776099946124257, + "grad_norm": 3.451655626296997, + "learning_rate": 0.0008587514057133689, + "loss": 7.5518, + "step": 1172400 + }, + { + "epoch": 4.776507324147638, + "grad_norm": 3.534184694290161, + "learning_rate": 0.000858378661618129, + "loss": 7.5849, + "step": 1172500 + }, + { + "epoch": 4.77691470217102, + "grad_norm": 15.55631160736084, + "learning_rate": 0.0008580059826226376, + "loss": 7.5186, + "step": 1172600 + }, + { + "epoch": 4.7773220801944, + "grad_norm": 6.297276496887207, + "learning_rate": 0.0008576333687416277, + "loss": 7.5566, + "step": 1172700 + }, + { + "epoch": 4.777729458217783, + "grad_norm": 12.850409507751465, + "learning_rate": 0.000857260819989832, + "loss": 7.5354, + "step": 1172800 + }, + { + "epoch": 4.778136836241163, + "grad_norm": 6.520290374755859, + "learning_rate": 0.0008568883363819779, + "loss": 7.5494, + "step": 1172900 + }, + { + "epoch": 4.778544214264545, + "grad_norm": 4.348905086517334, + "learning_rate": 0.0008565159179327939, + "loss": 7.5546, + "step": 1173000 + }, + { + "epoch": 4.778544214264545, + "eval_MaskedAccuracy": 0.5115314028956122, + "eval_loss": 1.5912624597549438, + "eval_runtime": 185.6518, + "eval_samples_per_second": 341.909, + "eval_steps_per_second": 1.336, + "step": 1173000 + }, + { + "epoch": 4.778951592287926, + "grad_norm": 5.397449016571045, + "learning_rate": 0.0008561435646570022, + "loss": 7.5621, + "step": 1173100 + }, + { + "epoch": 4.779358970311308, + "grad_norm": 3.4884116649627686, + "learning_rate": 0.0008557712765693242, + "loss": 7.5594, + "step": 1173200 + }, + { + "epoch": 4.7797663483346895, + "grad_norm": 8.312586784362793, + "learning_rate": 0.0008553990536844777, + "loss": 7.5803, + "step": 1173300 + }, + { + "epoch": 4.780173726358071, + "grad_norm": 4.167957305908203, + "learning_rate": 0.0008550268960171794, + "loss": 7.584, + "step": 1173400 + }, + { + "epoch": 4.780581104381453, + "grad_norm": 5.039919376373291, + "learning_rate": 0.0008546548035821431, + "loss": 7.5604, + "step": 1173500 + }, + { + "epoch": 4.780988482404834, + "grad_norm": 10.528605461120605, + "learning_rate": 0.0008542827763940794, + "loss": 7.5642, + "step": 1173600 + }, + { + "epoch": 4.781395860428216, + "grad_norm": 16.156702041625977, + "learning_rate": 0.0008539108144676959, + "loss": 7.5513, + "step": 1173700 + }, + { + "epoch": 4.781803238451597, + "grad_norm": 5.934739589691162, + "learning_rate": 0.0008535389178176985, + "loss": 7.5693, + "step": 1173800 + }, + { + "epoch": 4.782210616474979, + "grad_norm": 6.528579235076904, + "learning_rate": 0.000853167086458789, + "loss": 7.5572, + "step": 1173900 + }, + { + "epoch": 4.782617994498359, + "grad_norm": 7.340646266937256, + "learning_rate": 0.0008527953204056709, + "loss": 7.5583, + "step": 1174000 + }, + { + "epoch": 4.782617994498359, + "eval_MaskedAccuracy": 0.5111816050317756, + "eval_loss": 1.5916887521743774, + "eval_runtime": 157.6226, + "eval_samples_per_second": 402.709, + "eval_steps_per_second": 1.573, + "step": 1174000 + }, + { + "epoch": 4.783025372521741, + "grad_norm": 5.891901969909668, + "learning_rate": 0.0008524236196730395, + "loss": 7.5695, + "step": 1174100 + }, + { + "epoch": 4.783432750545122, + "grad_norm": 7.876992702484131, + "learning_rate": 0.0008520519842755907, + "loss": 7.5495, + "step": 1174200 + }, + { + "epoch": 4.783840128568504, + "grad_norm": 4.458004951477051, + "learning_rate": 0.0008516804142280179, + "loss": 7.5643, + "step": 1174300 + }, + { + "epoch": 4.7842475065918855, + "grad_norm": 9.906769752502441, + "learning_rate": 0.0008513089095450113, + "loss": 7.5376, + "step": 1174400 + }, + { + "epoch": 4.784654884615267, + "grad_norm": 5.301359176635742, + "learning_rate": 0.0008509374702412575, + "loss": 7.5495, + "step": 1174500 + }, + { + "epoch": 4.7850622626386485, + "grad_norm": 8.92895221710205, + "learning_rate": 0.0008505660963314424, + "loss": 7.5627, + "step": 1174600 + }, + { + "epoch": 4.78546964066203, + "grad_norm": 6.8228068351745605, + "learning_rate": 0.0008501947878302477, + "loss": 7.5624, + "step": 1174700 + }, + { + "epoch": 4.785877018685412, + "grad_norm": 11.703251838684082, + "learning_rate": 0.0008498235447523541, + "loss": 7.5339, + "step": 1174800 + }, + { + "epoch": 4.786284396708793, + "grad_norm": 3.5254666805267334, + "learning_rate": 0.0008494523671124373, + "loss": 7.5439, + "step": 1174900 + }, + { + "epoch": 4.786691774732175, + "grad_norm": 12.059759140014648, + "learning_rate": 0.000849081254925174, + "loss": 7.5601, + "step": 1175000 + }, + { + "epoch": 4.786691774732175, + "eval_MaskedAccuracy": 0.5108074817477763, + "eval_loss": 1.5949333906173706, + "eval_runtime": 159.7461, + "eval_samples_per_second": 397.356, + "eval_steps_per_second": 1.552, + "step": 1175000 + }, + { + "epoch": 4.787099152755556, + "grad_norm": 7.086734294891357, + "learning_rate": 0.0008487102082052353, + "loss": 7.5566, + "step": 1175100 + }, + { + "epoch": 4.787506530778938, + "grad_norm": 7.3957672119140625, + "learning_rate": 0.00084833922696729, + "loss": 7.5542, + "step": 1175200 + }, + { + "epoch": 4.787913908802318, + "grad_norm": 8.284390449523926, + "learning_rate": 0.0008479683112260062, + "loss": 7.5498, + "step": 1175300 + }, + { + "epoch": 4.7883212868257, + "grad_norm": 3.2272274494171143, + "learning_rate": 0.0008475974609960468, + "loss": 7.5619, + "step": 1175400 + }, + { + "epoch": 4.788728664849081, + "grad_norm": 3.680732250213623, + "learning_rate": 0.0008472266762920747, + "loss": 7.5294, + "step": 1175500 + }, + { + "epoch": 4.789136042872463, + "grad_norm": 4.4533514976501465, + "learning_rate": 0.0008468559571287474, + "loss": 7.5329, + "step": 1175600 + }, + { + "epoch": 4.7895434208958445, + "grad_norm": 15.333439826965332, + "learning_rate": 0.0008464853035207225, + "loss": 7.5385, + "step": 1175700 + }, + { + "epoch": 4.789950798919226, + "grad_norm": 4.0509772300720215, + "learning_rate": 0.000846114715482654, + "loss": 7.5505, + "step": 1175800 + }, + { + "epoch": 4.790358176942608, + "grad_norm": 9.513604164123535, + "learning_rate": 0.0008457441930291938, + "loss": 7.5768, + "step": 1175900 + }, + { + "epoch": 4.790765554965989, + "grad_norm": 7.331367015838623, + "learning_rate": 0.0008453737361749897, + "loss": 7.5372, + "step": 1176000 + }, + { + "epoch": 4.790765554965989, + "eval_MaskedAccuracy": 0.5116188169980294, + "eval_loss": 1.589592695236206, + "eval_runtime": 162.4316, + "eval_samples_per_second": 390.786, + "eval_steps_per_second": 1.527, + "step": 1176000 + }, + { + "epoch": 4.791172932989371, + "grad_norm": 5.087259769439697, + "learning_rate": 0.0008450033449346889, + "loss": 7.553, + "step": 1176100 + }, + { + "epoch": 4.791580311012752, + "grad_norm": 11.546634674072266, + "learning_rate": 0.0008446330193229337, + "loss": 7.5676, + "step": 1176200 + }, + { + "epoch": 4.791987689036134, + "grad_norm": 4.462437629699707, + "learning_rate": 0.0008442627593543655, + "loss": 7.549, + "step": 1176300 + }, + { + "epoch": 4.792395067059515, + "grad_norm": 5.836212635040283, + "learning_rate": 0.000843892565043623, + "loss": 7.5656, + "step": 1176400 + }, + { + "epoch": 4.792802445082897, + "grad_norm": 16.149566650390625, + "learning_rate": 0.0008435224364053408, + "loss": 7.5743, + "step": 1176500 + }, + { + "epoch": 4.793209823106277, + "grad_norm": 5.329946994781494, + "learning_rate": 0.0008431523734541532, + "loss": 7.5559, + "step": 1176600 + }, + { + "epoch": 4.793617201129659, + "grad_norm": 3.8502087593078613, + "learning_rate": 0.0008427823762046909, + "loss": 7.5601, + "step": 1176700 + }, + { + "epoch": 4.7940245791530405, + "grad_norm": 6.400423049926758, + "learning_rate": 0.0008424124446715808, + "loss": 7.5447, + "step": 1176800 + }, + { + "epoch": 4.794431957176422, + "grad_norm": 3.2085399627685547, + "learning_rate": 0.0008420425788694498, + "loss": 7.5418, + "step": 1176900 + }, + { + "epoch": 4.7948393351998035, + "grad_norm": 3.724039077758789, + "learning_rate": 0.000841672778812919, + "loss": 7.5298, + "step": 1177000 + }, + { + "epoch": 4.7948393351998035, + "eval_MaskedAccuracy": 0.5119181182744716, + "eval_loss": 1.5904992818832397, + "eval_runtime": 159.9055, + "eval_samples_per_second": 396.959, + "eval_steps_per_second": 1.551, + "step": 1177000 + }, + { + "epoch": 4.795246713223185, + "grad_norm": 4.9031782150268555, + "learning_rate": 0.0008413030445166099, + "loss": 7.5699, + "step": 1177100 + }, + { + "epoch": 4.795654091246567, + "grad_norm": 10.650490760803223, + "learning_rate": 0.0008409333759951393, + "loss": 7.546, + "step": 1177200 + }, + { + "epoch": 4.796061469269948, + "grad_norm": 7.704545497894287, + "learning_rate": 0.0008405637732631219, + "loss": 7.5634, + "step": 1177300 + }, + { + "epoch": 4.79646884729333, + "grad_norm": 13.6079740524292, + "learning_rate": 0.0008401942363351713, + "loss": 7.5523, + "step": 1177400 + }, + { + "epoch": 4.796876225316711, + "grad_norm": 7.489874362945557, + "learning_rate": 0.0008398247652258953, + "loss": 7.5635, + "step": 1177500 + }, + { + "epoch": 4.797283603340093, + "grad_norm": 8.139029502868652, + "learning_rate": 0.0008394553599499032, + "loss": 7.5421, + "step": 1177600 + }, + { + "epoch": 4.797690981363473, + "grad_norm": 3.4920918941497803, + "learning_rate": 0.0008390860205217983, + "loss": 7.5678, + "step": 1177700 + }, + { + "epoch": 4.798098359386856, + "grad_norm": 9.2100248336792, + "learning_rate": 0.0008387167469561832, + "loss": 7.5353, + "step": 1177800 + }, + { + "epoch": 4.7985057374102364, + "grad_norm": 4.074262619018555, + "learning_rate": 0.0008383475392676562, + "loss": 7.5206, + "step": 1177900 + }, + { + "epoch": 4.798913115433618, + "grad_norm": 6.044760227203369, + "learning_rate": 0.0008379783974708151, + "loss": 7.5689, + "step": 1178000 + }, + { + "epoch": 4.798913115433618, + "eval_MaskedAccuracy": 0.5115592092361615, + "eval_loss": 1.5906074047088623, + "eval_runtime": 169.0923, + "eval_samples_per_second": 375.393, + "eval_steps_per_second": 1.467, + "step": 1178000 + }, + { + "epoch": 4.7993204934569995, + "grad_norm": 5.4311113357543945, + "learning_rate": 0.0008376093215802557, + "loss": 7.5713, + "step": 1178100 + }, + { + "epoch": 4.799727871480381, + "grad_norm": 13.861988067626953, + "learning_rate": 0.0008372403116105657, + "loss": 7.5553, + "step": 1178200 + }, + { + "epoch": 4.800135249503763, + "grad_norm": 12.188407897949219, + "learning_rate": 0.000836871367576337, + "loss": 7.5857, + "step": 1178300 + }, + { + "epoch": 4.800542627527144, + "grad_norm": 8.346395492553711, + "learning_rate": 0.0008365024894921546, + "loss": 7.5611, + "step": 1178400 + }, + { + "epoch": 4.800950005550526, + "grad_norm": 10.427240371704102, + "learning_rate": 0.0008361336773726031, + "loss": 7.5669, + "step": 1178500 + }, + { + "epoch": 4.801357383573907, + "grad_norm": 7.915243625640869, + "learning_rate": 0.0008357649312322624, + "loss": 7.5343, + "step": 1178600 + }, + { + "epoch": 4.801764761597289, + "grad_norm": 15.282696723937988, + "learning_rate": 0.0008353962510857115, + "loss": 7.5569, + "step": 1178700 + }, + { + "epoch": 4.80217213962067, + "grad_norm": 4.565001964569092, + "learning_rate": 0.000835027636947527, + "loss": 7.562, + "step": 1178800 + }, + { + "epoch": 4.802579517644052, + "grad_norm": 5.097818851470947, + "learning_rate": 0.0008346590888322795, + "loss": 7.5534, + "step": 1178900 + }, + { + "epoch": 4.802986895667432, + "grad_norm": 5.627564430236816, + "learning_rate": 0.0008342906067545435, + "loss": 7.5384, + "step": 1179000 + }, + { + "epoch": 4.802986895667432, + "eval_MaskedAccuracy": 0.5117527316966278, + "eval_loss": 1.585954189300537, + "eval_runtime": 172.4997, + "eval_samples_per_second": 367.977, + "eval_steps_per_second": 1.438, + "step": 1179000 + }, + { + "epoch": 4.803394273690814, + "grad_norm": 16.744068145751953, + "learning_rate": 0.000833922190728886, + "loss": 7.5589, + "step": 1179100 + }, + { + "epoch": 4.8038016517141955, + "grad_norm": 9.47870922088623, + "learning_rate": 0.0008335538407698721, + "loss": 7.5891, + "step": 1179200 + }, + { + "epoch": 4.804209029737577, + "grad_norm": 7.305385112762451, + "learning_rate": 0.0008331855568920641, + "loss": 7.5582, + "step": 1179300 + }, + { + "epoch": 4.8046164077609586, + "grad_norm": 10.57204532623291, + "learning_rate": 0.0008328173391100221, + "loss": 7.559, + "step": 1179400 + }, + { + "epoch": 4.80502378578434, + "grad_norm": 9.74145793914795, + "learning_rate": 0.0008324491874383042, + "loss": 7.5358, + "step": 1179500 + }, + { + "epoch": 4.805431163807722, + "grad_norm": 11.683330535888672, + "learning_rate": 0.0008320811018914656, + "loss": 7.5596, + "step": 1179600 + }, + { + "epoch": 4.805838541831103, + "grad_norm": 4.935939311981201, + "learning_rate": 0.0008317130824840579, + "loss": 7.5346, + "step": 1179700 + }, + { + "epoch": 4.806245919854485, + "grad_norm": 9.072016716003418, + "learning_rate": 0.0008313451292306317, + "loss": 7.5421, + "step": 1179800 + }, + { + "epoch": 4.806653297877866, + "grad_norm": 6.465701580047607, + "learning_rate": 0.0008309772421457329, + "loss": 7.5538, + "step": 1179900 + }, + { + "epoch": 4.807060675901248, + "grad_norm": 8.924321174621582, + "learning_rate": 0.0008306094212439082, + "loss": 7.576, + "step": 1180000 + }, + { + "epoch": 4.807060675901248, + "eval_MaskedAccuracy": 0.5117493672356868, + "eval_loss": 1.5885009765625, + "eval_runtime": 169.4175, + "eval_samples_per_second": 374.672, + "eval_steps_per_second": 1.464, + "step": 1180000 + }, + { + "epoch": 4.807468053924629, + "grad_norm": 11.632453918457031, + "learning_rate": 0.0008302416665396979, + "loss": 7.5657, + "step": 1180100 + }, + { + "epoch": 4.807875431948011, + "grad_norm": 5.263997554779053, + "learning_rate": 0.0008298739780476414, + "loss": 7.5603, + "step": 1180200 + }, + { + "epoch": 4.8082828099713915, + "grad_norm": 12.246635437011719, + "learning_rate": 0.0008295063557822768, + "loss": 7.5642, + "step": 1180300 + }, + { + "epoch": 4.808690187994773, + "grad_norm": 7.712202072143555, + "learning_rate": 0.0008291387997581372, + "loss": 7.5582, + "step": 1180400 + }, + { + "epoch": 4.8090975660181545, + "grad_norm": 7.902210712432861, + "learning_rate": 0.0008287713099897533, + "loss": 7.5842, + "step": 1180500 + }, + { + "epoch": 4.809504944041536, + "grad_norm": 10.562908172607422, + "learning_rate": 0.0008284038864916547, + "loss": 7.5277, + "step": 1180600 + }, + { + "epoch": 4.809912322064918, + "grad_norm": 7.974733829498291, + "learning_rate": 0.0008280365292783681, + "loss": 7.5559, + "step": 1180700 + }, + { + "epoch": 4.810319700088299, + "grad_norm": 6.211431980133057, + "learning_rate": 0.0008276692383644154, + "loss": 7.512, + "step": 1180800 + }, + { + "epoch": 4.810727078111681, + "grad_norm": 9.059982299804688, + "learning_rate": 0.00082730201376432, + "loss": 7.5429, + "step": 1180900 + }, + { + "epoch": 4.811134456135062, + "grad_norm": 3.615178346633911, + "learning_rate": 0.0008269348554925981, + "loss": 7.5495, + "step": 1181000 + }, + { + "epoch": 4.811134456135062, + "eval_MaskedAccuracy": 0.51130747334969, + "eval_loss": 1.5924112796783447, + "eval_runtime": 167.4774, + "eval_samples_per_second": 379.012, + "eval_steps_per_second": 1.481, + "step": 1181000 + }, + { + "epoch": 4.811541834158444, + "grad_norm": 3.9623985290527344, + "learning_rate": 0.0008265677635637671, + "loss": 7.5728, + "step": 1181100 + }, + { + "epoch": 4.811949212181825, + "grad_norm": 5.541474342346191, + "learning_rate": 0.0008262007379923382, + "loss": 7.5579, + "step": 1181200 + }, + { + "epoch": 4.812356590205207, + "grad_norm": 12.991304397583008, + "learning_rate": 0.0008258337787928237, + "loss": 7.5556, + "step": 1181300 + }, + { + "epoch": 4.812763968228588, + "grad_norm": 9.231416702270508, + "learning_rate": 0.0008254668859797306, + "loss": 7.5713, + "step": 1181400 + }, + { + "epoch": 4.81317134625197, + "grad_norm": 12.248048782348633, + "learning_rate": 0.0008251000595675641, + "loss": 7.5807, + "step": 1181500 + }, + { + "epoch": 4.8135787242753505, + "grad_norm": 4.622974395751953, + "learning_rate": 0.0008247332995708265, + "loss": 7.572, + "step": 1181600 + }, + { + "epoch": 4.813986102298732, + "grad_norm": 16.859704971313477, + "learning_rate": 0.0008243666060040183, + "loss": 7.5727, + "step": 1181700 + }, + { + "epoch": 4.814393480322114, + "grad_norm": 6.884955406188965, + "learning_rate": 0.0008239999788816374, + "loss": 7.5525, + "step": 1181800 + }, + { + "epoch": 4.814800858345495, + "grad_norm": 5.5208234786987305, + "learning_rate": 0.0008236334182181777, + "loss": 7.5362, + "step": 1181900 + }, + { + "epoch": 4.815208236368877, + "grad_norm": 6.944272518157959, + "learning_rate": 0.0008232669240281317, + "loss": 7.5922, + "step": 1182000 + }, + { + "epoch": 4.815208236368877, + "eval_MaskedAccuracy": 0.5108697522849652, + "eval_loss": 1.6041796207427979, + "eval_runtime": 161.7268, + "eval_samples_per_second": 392.489, + "eval_steps_per_second": 1.533, + "step": 1182000 + }, + { + "epoch": 4.815615614392258, + "grad_norm": 4.707577228546143, + "learning_rate": 0.0008229004963259892, + "loss": 7.5376, + "step": 1182100 + }, + { + "epoch": 4.81602299241564, + "grad_norm": 12.653528213500977, + "learning_rate": 0.0008225341351262367, + "loss": 7.5649, + "step": 1182200 + }, + { + "epoch": 4.816430370439021, + "grad_norm": 5.855892658233643, + "learning_rate": 0.0008221678404433578, + "loss": 7.5719, + "step": 1182300 + }, + { + "epoch": 4.816837748462403, + "grad_norm": 15.072898864746094, + "learning_rate": 0.0008218016122918338, + "loss": 7.5101, + "step": 1182400 + }, + { + "epoch": 4.817245126485784, + "grad_norm": 6.575567722320557, + "learning_rate": 0.0008214354506861457, + "loss": 7.5501, + "step": 1182500 + }, + { + "epoch": 4.817652504509166, + "grad_norm": 3.4672160148620605, + "learning_rate": 0.0008210693556407684, + "loss": 7.5534, + "step": 1182600 + }, + { + "epoch": 4.8180598825325465, + "grad_norm": 10.773113250732422, + "learning_rate": 0.0008207033271701755, + "loss": 7.5582, + "step": 1182700 + }, + { + "epoch": 4.818467260555929, + "grad_norm": 9.565873146057129, + "learning_rate": 0.0008203373652888373, + "loss": 7.5465, + "step": 1182800 + }, + { + "epoch": 4.8188746385793095, + "grad_norm": 7.289404392242432, + "learning_rate": 0.0008199714700112242, + "loss": 7.5607, + "step": 1182900 + }, + { + "epoch": 4.819282016602691, + "grad_norm": 3.710710287094116, + "learning_rate": 0.0008196056413518006, + "loss": 7.5357, + "step": 1183000 + }, + { + "epoch": 4.819282016602691, + "eval_MaskedAccuracy": 0.5118950024515713, + "eval_loss": 1.590827226638794, + "eval_runtime": 168.9435, + "eval_samples_per_second": 375.723, + "eval_steps_per_second": 1.468, + "step": 1183000 + }, + { + "epoch": 4.819689394626073, + "grad_norm": 3.4109690189361572, + "learning_rate": 0.0008192398793250301, + "loss": 7.5449, + "step": 1183100 + }, + { + "epoch": 4.820096772649454, + "grad_norm": 29.629152297973633, + "learning_rate": 0.0008188741839453739, + "loss": 7.5535, + "step": 1183200 + }, + { + "epoch": 4.820504150672836, + "grad_norm": 10.11011791229248, + "learning_rate": 0.0008185085552272896, + "loss": 7.4998, + "step": 1183300 + }, + { + "epoch": 4.820911528696217, + "grad_norm": 4.354499816894531, + "learning_rate": 0.0008181429931852319, + "loss": 7.542, + "step": 1183400 + }, + { + "epoch": 4.821318906719599, + "grad_norm": 6.408851623535156, + "learning_rate": 0.0008177774978336531, + "loss": 7.5473, + "step": 1183500 + }, + { + "epoch": 4.82172628474298, + "grad_norm": 16.92336654663086, + "learning_rate": 0.0008174120691870037, + "loss": 7.5295, + "step": 1183600 + }, + { + "epoch": 4.822133662766362, + "grad_norm": 16.88974952697754, + "learning_rate": 0.000817046707259731, + "loss": 7.5474, + "step": 1183700 + }, + { + "epoch": 4.822541040789743, + "grad_norm": 7.427070617675781, + "learning_rate": 0.0008166814120662792, + "loss": 7.5787, + "step": 1183800 + }, + { + "epoch": 4.822948418813125, + "grad_norm": 11.39084243774414, + "learning_rate": 0.0008163161836210915, + "loss": 7.5496, + "step": 1183900 + }, + { + "epoch": 4.8233557968365055, + "grad_norm": 6.5332441329956055, + "learning_rate": 0.0008159510219386075, + "loss": 7.5599, + "step": 1184000 + }, + { + "epoch": 4.8233557968365055, + "eval_MaskedAccuracy": 0.5120332026114306, + "eval_loss": 1.5897085666656494, + "eval_runtime": 164.398, + "eval_samples_per_second": 386.112, + "eval_steps_per_second": 1.509, + "step": 1184000 + }, + { + "epoch": 4.823763174859887, + "grad_norm": 7.744347095489502, + "learning_rate": 0.0008155859270332627, + "loss": 7.5517, + "step": 1184100 + }, + { + "epoch": 4.824170552883269, + "grad_norm": 10.261436462402344, + "learning_rate": 0.0008152208989194916, + "loss": 7.5532, + "step": 1184200 + }, + { + "epoch": 4.82457793090665, + "grad_norm": 5.4755754470825195, + "learning_rate": 0.0008148559376117268, + "loss": 7.5245, + "step": 1184300 + }, + { + "epoch": 4.824985308930032, + "grad_norm": 17.02247428894043, + "learning_rate": 0.000814491043124396, + "loss": 7.5803, + "step": 1184400 + }, + { + "epoch": 4.825392686953413, + "grad_norm": 9.922220230102539, + "learning_rate": 0.0008141262154719258, + "loss": 7.5631, + "step": 1184500 + }, + { + "epoch": 4.825800064976795, + "grad_norm": 5.512035369873047, + "learning_rate": 0.0008137614546687394, + "loss": 7.5538, + "step": 1184600 + }, + { + "epoch": 4.826207443000176, + "grad_norm": 6.123693943023682, + "learning_rate": 0.0008133967607292579, + "loss": 7.5373, + "step": 1184700 + }, + { + "epoch": 4.826614821023558, + "grad_norm": 10.376054763793945, + "learning_rate": 0.0008130321336679002, + "loss": 7.5408, + "step": 1184800 + }, + { + "epoch": 4.827022199046939, + "grad_norm": 7.109657287597656, + "learning_rate": 0.000812667573499081, + "loss": 7.5449, + "step": 1184900 + }, + { + "epoch": 4.827429577070321, + "grad_norm": 13.123220443725586, + "learning_rate": 0.0008123030802372152, + "loss": 7.5583, + "step": 1185000 + }, + { + "epoch": 4.827429577070321, + "eval_MaskedAccuracy": 0.5111163088078949, + "eval_loss": 1.6001918315887451, + "eval_runtime": 158.7699, + "eval_samples_per_second": 399.799, + "eval_steps_per_second": 1.562, + "step": 1185000 + }, + { + "epoch": 4.827836955093702, + "grad_norm": 5.369037628173828, + "learning_rate": 0.0008119386538967105, + "loss": 7.5462, + "step": 1185100 + }, + { + "epoch": 4.828244333117084, + "grad_norm": 8.000434875488281, + "learning_rate": 0.0008115742944919748, + "loss": 7.5668, + "step": 1185200 + }, + { + "epoch": 4.8286517111404645, + "grad_norm": 10.486366271972656, + "learning_rate": 0.0008112100020374161, + "loss": 7.5508, + "step": 1185300 + }, + { + "epoch": 4.829059089163846, + "grad_norm": 20.98023223876953, + "learning_rate": 0.000810845776547435, + "loss": 7.5701, + "step": 1185400 + }, + { + "epoch": 4.829466467187228, + "grad_norm": 7.4165730476379395, + "learning_rate": 0.0008104816180364309, + "loss": 7.5397, + "step": 1185500 + }, + { + "epoch": 4.829873845210609, + "grad_norm": 8.169611930847168, + "learning_rate": 0.0008101175265188017, + "loss": 7.5534, + "step": 1185600 + }, + { + "epoch": 4.830281223233991, + "grad_norm": 6.466330051422119, + "learning_rate": 0.000809753502008942, + "loss": 7.5308, + "step": 1185700 + }, + { + "epoch": 4.830688601257372, + "grad_norm": 8.122366905212402, + "learning_rate": 0.0008093895445212426, + "loss": 7.549, + "step": 1185800 + }, + { + "epoch": 4.831095979280754, + "grad_norm": 3.603215456008911, + "learning_rate": 0.000809025654070093, + "loss": 7.538, + "step": 1185900 + }, + { + "epoch": 4.831503357304135, + "grad_norm": 5.026655673980713, + "learning_rate": 0.0008086618306698808, + "loss": 7.5431, + "step": 1186000 + }, + { + "epoch": 4.831503357304135, + "eval_MaskedAccuracy": 0.5110371926782977, + "eval_loss": 1.5962096452713013, + "eval_runtime": 173.0103, + "eval_samples_per_second": 366.891, + "eval_steps_per_second": 1.433, + "step": 1186000 + }, + { + "epoch": 4.831910735327517, + "grad_norm": 4.054526329040527, + "learning_rate": 0.0008082980743349883, + "loss": 7.5448, + "step": 1186100 + }, + { + "epoch": 4.832318113350898, + "grad_norm": 7.328049182891846, + "learning_rate": 0.0008079343850797972, + "loss": 7.5691, + "step": 1186200 + }, + { + "epoch": 4.83272549137428, + "grad_norm": 5.259098529815674, + "learning_rate": 0.0008075707629186859, + "loss": 7.5568, + "step": 1186300 + }, + { + "epoch": 4.833132869397661, + "grad_norm": 9.088693618774414, + "learning_rate": 0.0008072072078660324, + "loss": 7.5279, + "step": 1186400 + }, + { + "epoch": 4.833540247421043, + "grad_norm": 8.539562225341797, + "learning_rate": 0.0008068437199362077, + "loss": 7.5686, + "step": 1186500 + }, + { + "epoch": 4.833947625444424, + "grad_norm": 18.32861328125, + "learning_rate": 0.0008064802991435834, + "loss": 7.5616, + "step": 1186600 + }, + { + "epoch": 4.834355003467805, + "grad_norm": 9.15188217163086, + "learning_rate": 0.0008061169455025269, + "loss": 7.557, + "step": 1186700 + }, + { + "epoch": 4.834762381491187, + "grad_norm": 7.382136344909668, + "learning_rate": 0.0008057536590274034, + "loss": 7.5468, + "step": 1186800 + }, + { + "epoch": 4.835169759514568, + "grad_norm": 16.520309448242188, + "learning_rate": 0.0008053904397325771, + "loss": 7.58, + "step": 1186900 + }, + { + "epoch": 4.83557713753795, + "grad_norm": 12.390761375427246, + "learning_rate": 0.0008050272876324063, + "loss": 7.5283, + "step": 1187000 + }, + { + "epoch": 4.83557713753795, + "eval_MaskedAccuracy": 0.5117377525810891, + "eval_loss": 1.5956392288208008, + "eval_runtime": 171.3553, + "eval_samples_per_second": 370.435, + "eval_steps_per_second": 1.447, + "step": 1187000 + }, + { + "epoch": 4.835984515561331, + "grad_norm": 20.251081466674805, + "learning_rate": 0.0008046642027412489, + "loss": 7.5667, + "step": 1187100 + }, + { + "epoch": 4.836391893584713, + "grad_norm": 5.811011791229248, + "learning_rate": 0.0008043011850734598, + "loss": 7.5397, + "step": 1187200 + }, + { + "epoch": 4.836799271608094, + "grad_norm": 6.2515339851379395, + "learning_rate": 0.0008039382346433905, + "loss": 7.5672, + "step": 1187300 + }, + { + "epoch": 4.837206649631476, + "grad_norm": 12.136992454528809, + "learning_rate": 0.0008035753514653918, + "loss": 7.517, + "step": 1187400 + }, + { + "epoch": 4.837614027654857, + "grad_norm": 7.822018146514893, + "learning_rate": 0.0008032125355538078, + "loss": 7.5217, + "step": 1187500 + }, + { + "epoch": 4.838021405678239, + "grad_norm": 9.882133483886719, + "learning_rate": 0.0008028497869229848, + "loss": 7.5685, + "step": 1187600 + }, + { + "epoch": 4.8384287837016196, + "grad_norm": 8.234831809997559, + "learning_rate": 0.0008024871055872637, + "loss": 7.5238, + "step": 1187700 + }, + { + "epoch": 4.838836161725002, + "grad_norm": 6.588048934936523, + "learning_rate": 0.000802124491560982, + "loss": 7.5805, + "step": 1187800 + }, + { + "epoch": 4.839243539748383, + "grad_norm": 4.4182353019714355, + "learning_rate": 0.000801761944858477, + "loss": 7.5356, + "step": 1187900 + }, + { + "epoch": 4.839650917771764, + "grad_norm": 10.374272346496582, + "learning_rate": 0.0008013994654940826, + "loss": 7.5774, + "step": 1188000 + }, + { + "epoch": 4.839650917771764, + "eval_MaskedAccuracy": 0.5114369197100814, + "eval_loss": 1.5909814834594727, + "eval_runtime": 165.4066, + "eval_samples_per_second": 383.757, + "eval_steps_per_second": 1.499, + "step": 1188000 + }, + { + "epoch": 4.840058295795146, + "grad_norm": 17.36933135986328, + "learning_rate": 0.0008010370534821281, + "loss": 7.5422, + "step": 1188100 + }, + { + "epoch": 4.840465673818527, + "grad_norm": 23.012807846069336, + "learning_rate": 0.0008006747088369422, + "loss": 7.5512, + "step": 1188200 + }, + { + "epoch": 4.840873051841909, + "grad_norm": 11.495627403259277, + "learning_rate": 0.0008003124315728504, + "loss": 7.5815, + "step": 1188300 + }, + { + "epoch": 4.84128042986529, + "grad_norm": 9.812355041503906, + "learning_rate": 0.0007999502217041759, + "loss": 7.5426, + "step": 1188400 + }, + { + "epoch": 4.841687807888672, + "grad_norm": 8.009356498718262, + "learning_rate": 0.0007995880792452387, + "loss": 7.5432, + "step": 1188500 + }, + { + "epoch": 4.842095185912053, + "grad_norm": 18.623714447021484, + "learning_rate": 0.0007992260042103557, + "loss": 7.5557, + "step": 1188600 + }, + { + "epoch": 4.842502563935435, + "grad_norm": 3.701794385910034, + "learning_rate": 0.000798863996613842, + "loss": 7.5489, + "step": 1188700 + }, + { + "epoch": 4.842909941958816, + "grad_norm": 5.531005859375, + "learning_rate": 0.0007985020564700094, + "loss": 7.563, + "step": 1188800 + }, + { + "epoch": 4.843317319982198, + "grad_norm": 4.462012767791748, + "learning_rate": 0.0007981401837931671, + "loss": 7.5386, + "step": 1188900 + }, + { + "epoch": 4.843724698005579, + "grad_norm": 5.2527875900268555, + "learning_rate": 0.000797778378597623, + "loss": 7.5612, + "step": 1189000 + }, + { + "epoch": 4.843724698005579, + "eval_MaskedAccuracy": 0.5116886053689556, + "eval_loss": 1.5937349796295166, + "eval_runtime": 163.6586, + "eval_samples_per_second": 387.856, + "eval_steps_per_second": 1.515, + "step": 1189000 + }, + { + "epoch": 4.84413207602896, + "grad_norm": 7.725368976593018, + "learning_rate": 0.0007974166408976809, + "loss": 7.5494, + "step": 1189100 + }, + { + "epoch": 4.844539454052342, + "grad_norm": 13.587323188781738, + "learning_rate": 0.0007970549707076406, + "loss": 7.5619, + "step": 1189200 + }, + { + "epoch": 4.844946832075723, + "grad_norm": 8.40877914428711, + "learning_rate": 0.0007966933680418032, + "loss": 7.5109, + "step": 1189300 + }, + { + "epoch": 4.845354210099105, + "grad_norm": 18.549039840698242, + "learning_rate": 0.0007963318329144626, + "loss": 7.5597, + "step": 1189400 + }, + { + "epoch": 4.845761588122486, + "grad_norm": 13.995420455932617, + "learning_rate": 0.0007959703653399143, + "loss": 7.5592, + "step": 1189500 + }, + { + "epoch": 4.846168966145868, + "grad_norm": 4.556239604949951, + "learning_rate": 0.0007956089653324473, + "loss": 7.5232, + "step": 1189600 + }, + { + "epoch": 4.846576344169249, + "grad_norm": 3.827169179916382, + "learning_rate": 0.00079524763290635, + "loss": 7.5391, + "step": 1189700 + }, + { + "epoch": 4.846983722192631, + "grad_norm": 10.173226356506348, + "learning_rate": 0.0007948863680759086, + "loss": 7.5328, + "step": 1189800 + }, + { + "epoch": 4.847391100216012, + "grad_norm": 12.815058708190918, + "learning_rate": 0.0007945251708554043, + "loss": 7.5205, + "step": 1189900 + }, + { + "epoch": 4.847798478239394, + "grad_norm": 5.204775333404541, + "learning_rate": 0.0007941640412591194, + "loss": 7.5524, + "step": 1190000 + }, + { + "epoch": 4.847798478239394, + "eval_MaskedAccuracy": 0.5115184180846889, + "eval_loss": 1.5997650623321533, + "eval_runtime": 165.1239, + "eval_samples_per_second": 384.414, + "eval_steps_per_second": 1.502, + "step": 1190000 + }, + { + "epoch": 4.8482058562627754, + "grad_norm": 16.823041915893555, + "learning_rate": 0.0007938029793013307, + "loss": 7.5503, + "step": 1190100 + }, + { + "epoch": 4.848613234286157, + "grad_norm": 4.9770026206970215, + "learning_rate": 0.0007934419849963117, + "loss": 7.5683, + "step": 1190200 + }, + { + "epoch": 4.849020612309538, + "grad_norm": 5.374384880065918, + "learning_rate": 0.0007930810583583345, + "loss": 7.5581, + "step": 1190300 + }, + { + "epoch": 4.849427990332919, + "grad_norm": 13.473366737365723, + "learning_rate": 0.0007927201994016704, + "loss": 7.5634, + "step": 1190400 + }, + { + "epoch": 4.849835368356301, + "grad_norm": 3.5089282989501953, + "learning_rate": 0.0007923594081405834, + "loss": 7.5612, + "step": 1190500 + }, + { + "epoch": 4.850242746379682, + "grad_norm": 3.3681108951568604, + "learning_rate": 0.0007919986845893393, + "loss": 7.5523, + "step": 1190600 + }, + { + "epoch": 4.850650124403064, + "grad_norm": 12.124752044677734, + "learning_rate": 0.0007916380287621984, + "loss": 7.5654, + "step": 1190700 + }, + { + "epoch": 4.851057502426445, + "grad_norm": 21.249267578125, + "learning_rate": 0.0007912774406734202, + "loss": 7.5434, + "step": 1190800 + }, + { + "epoch": 4.851464880449827, + "grad_norm": 6.28021240234375, + "learning_rate": 0.0007909169203372618, + "loss": 7.5697, + "step": 1190900 + }, + { + "epoch": 4.851872258473208, + "grad_norm": 7.58031702041626, + "learning_rate": 0.000790556467767974, + "loss": 7.5487, + "step": 1191000 + }, + { + "epoch": 4.851872258473208, + "eval_MaskedAccuracy": 0.5123831634801366, + "eval_loss": 1.5865329504013062, + "eval_runtime": 162.9624, + "eval_samples_per_second": 389.513, + "eval_steps_per_second": 1.522, + "step": 1191000 + }, + { + "epoch": 4.85227963649659, + "grad_norm": 17.498788833618164, + "learning_rate": 0.0007901960829798082, + "loss": 7.52, + "step": 1191100 + }, + { + "epoch": 4.852687014519971, + "grad_norm": 6.950149059295654, + "learning_rate": 0.0007898357659870131, + "loss": 7.5632, + "step": 1191200 + }, + { + "epoch": 4.853094392543353, + "grad_norm": 5.151035308837891, + "learning_rate": 0.0007894755168038332, + "loss": 7.5828, + "step": 1191300 + }, + { + "epoch": 4.8535017705667345, + "grad_norm": 4.0569610595703125, + "learning_rate": 0.0007891153354445123, + "loss": 7.5552, + "step": 1191400 + }, + { + "epoch": 4.853909148590116, + "grad_norm": 5.584034442901611, + "learning_rate": 0.0007887552219232886, + "loss": 7.5662, + "step": 1191500 + }, + { + "epoch": 4.854316526613497, + "grad_norm": 7.946394443511963, + "learning_rate": 0.0007883951762543998, + "loss": 7.5149, + "step": 1191600 + }, + { + "epoch": 4.854723904636878, + "grad_norm": 7.690493106842041, + "learning_rate": 0.0007880351984520812, + "loss": 7.5979, + "step": 1191700 + }, + { + "epoch": 4.85513128266026, + "grad_norm": 5.265111923217773, + "learning_rate": 0.0007876752885305632, + "loss": 7.5156, + "step": 1191800 + }, + { + "epoch": 4.855538660683641, + "grad_norm": 5.356286525726318, + "learning_rate": 0.0007873154465040766, + "loss": 7.5383, + "step": 1191900 + }, + { + "epoch": 4.855946038707023, + "grad_norm": 6.729825019836426, + "learning_rate": 0.000786955672386848, + "loss": 7.541, + "step": 1192000 + }, + { + "epoch": 4.855946038707023, + "eval_MaskedAccuracy": 0.5114614765556479, + "eval_loss": 1.6037479639053345, + "eval_runtime": 176.5018, + "eval_samples_per_second": 359.634, + "eval_steps_per_second": 1.405, + "step": 1192000 + }, + { + "epoch": 4.856353416730404, + "grad_norm": 16.198883056640625, + "learning_rate": 0.0007865959661930994, + "loss": 7.5375, + "step": 1192100 + }, + { + "epoch": 4.856760794753786, + "grad_norm": 5.049928188323975, + "learning_rate": 0.0007862363279370542, + "loss": 7.5553, + "step": 1192200 + }, + { + "epoch": 4.857168172777167, + "grad_norm": 12.35220718383789, + "learning_rate": 0.0007858767576329285, + "loss": 7.5294, + "step": 1192300 + }, + { + "epoch": 4.857575550800549, + "grad_norm": 16.83810806274414, + "learning_rate": 0.0007855172552949396, + "loss": 7.541, + "step": 1192400 + }, + { + "epoch": 4.8579829288239305, + "grad_norm": 12.581002235412598, + "learning_rate": 0.0007851578209372996, + "loss": 7.5417, + "step": 1192500 + }, + { + "epoch": 4.858390306847312, + "grad_norm": 4.008618354797363, + "learning_rate": 0.000784798454574219, + "loss": 7.5609, + "step": 1192600 + }, + { + "epoch": 4.858797684870693, + "grad_norm": 20.214204788208008, + "learning_rate": 0.0007844391562199052, + "loss": 7.5383, + "step": 1192700 + }, + { + "epoch": 4.859205062894075, + "grad_norm": 11.226751327514648, + "learning_rate": 0.0007840799258885661, + "loss": 7.5476, + "step": 1192800 + }, + { + "epoch": 4.859612440917456, + "grad_norm": 3.165907144546509, + "learning_rate": 0.0007837207635944005, + "loss": 7.5658, + "step": 1192900 + }, + { + "epoch": 4.860019818940837, + "grad_norm": 3.552816867828369, + "learning_rate": 0.0007833616693516094, + "loss": 7.5542, + "step": 1193000 + }, + { + "epoch": 4.860019818940837, + "eval_MaskedAccuracy": 0.5119116573491553, + "eval_loss": 1.5996180772781372, + "eval_runtime": 202.3496, + "eval_samples_per_second": 313.695, + "eval_steps_per_second": 1.226, + "step": 1193000 + }, + { + "epoch": 4.860427196964219, + "grad_norm": 7.732130527496338, + "learning_rate": 0.000783002643174389, + "loss": 7.5853, + "step": 1193100 + }, + { + "epoch": 4.8608345749876, + "grad_norm": 11.34261703491211, + "learning_rate": 0.0007826436850769341, + "loss": 7.542, + "step": 1193200 + }, + { + "epoch": 4.861241953010982, + "grad_norm": 4.273205280303955, + "learning_rate": 0.0007822847950734366, + "loss": 7.556, + "step": 1193300 + }, + { + "epoch": 4.861649331034363, + "grad_norm": 5.62993860244751, + "learning_rate": 0.0007819259731780844, + "loss": 7.5515, + "step": 1193400 + }, + { + "epoch": 4.862056709057745, + "grad_norm": 11.265458106994629, + "learning_rate": 0.0007815672194050635, + "loss": 7.5491, + "step": 1193500 + }, + { + "epoch": 4.862464087081126, + "grad_norm": 4.582005023956299, + "learning_rate": 0.0007812085337685592, + "loss": 7.5536, + "step": 1193600 + }, + { + "epoch": 4.862871465104508, + "grad_norm": 8.613621711730957, + "learning_rate": 0.0007808499162827504, + "loss": 7.5293, + "step": 1193700 + }, + { + "epoch": 4.8632788431278895, + "grad_norm": 6.918115139007568, + "learning_rate": 0.0007804913669618152, + "loss": 7.5194, + "step": 1193800 + }, + { + "epoch": 4.863686221151271, + "grad_norm": 8.471829414367676, + "learning_rate": 0.0007801328858199305, + "loss": 7.5394, + "step": 1193900 + }, + { + "epoch": 4.864093599174652, + "grad_norm": 9.388032913208008, + "learning_rate": 0.0007797744728712683, + "loss": 7.5661, + "step": 1194000 + }, + { + "epoch": 4.864093599174652, + "eval_MaskedAccuracy": 0.5120076719840594, + "eval_loss": 1.5941848754882812, + "eval_runtime": 165.3658, + "eval_samples_per_second": 383.852, + "eval_steps_per_second": 1.5, + "step": 1194000 + }, + { + "epoch": 4.864500977198033, + "grad_norm": 10.334595680236816, + "learning_rate": 0.0007794161281299983, + "loss": 7.5442, + "step": 1194100 + }, + { + "epoch": 4.864908355221415, + "grad_norm": 5.058508396148682, + "learning_rate": 0.0007790578516102866, + "loss": 7.5621, + "step": 1194200 + }, + { + "epoch": 4.865315733244796, + "grad_norm": 3.7669878005981445, + "learning_rate": 0.0007786996433263002, + "loss": 7.5869, + "step": 1194300 + }, + { + "epoch": 4.865723111268178, + "grad_norm": 8.707598686218262, + "learning_rate": 0.0007783415032921997, + "loss": 7.5463, + "step": 1194400 + }, + { + "epoch": 4.866130489291559, + "grad_norm": 12.151708602905273, + "learning_rate": 0.0007779834315221447, + "loss": 7.5517, + "step": 1194500 + }, + { + "epoch": 4.866537867314941, + "grad_norm": 10.467202186584473, + "learning_rate": 0.0007776254280302912, + "loss": 7.5374, + "step": 1194600 + }, + { + "epoch": 4.866945245338322, + "grad_norm": 7.354379653930664, + "learning_rate": 0.0007772674928307932, + "loss": 7.5671, + "step": 1194700 + }, + { + "epoch": 4.867352623361704, + "grad_norm": 25.011062622070312, + "learning_rate": 0.0007769096259378013, + "loss": 7.548, + "step": 1194800 + }, + { + "epoch": 4.8677600013850855, + "grad_norm": 5.9411845207214355, + "learning_rate": 0.0007765518273654639, + "loss": 7.5215, + "step": 1194900 + }, + { + "epoch": 4.868167379408467, + "grad_norm": 4.374491214752197, + "learning_rate": 0.0007761940971279279, + "loss": 7.5435, + "step": 1195000 + }, + { + "epoch": 4.868167379408467, + "eval_MaskedAccuracy": 0.51178161139941, + "eval_loss": 1.6017199754714966, + "eval_runtime": 167.3714, + "eval_samples_per_second": 379.252, + "eval_steps_per_second": 1.482, + "step": 1195000 + }, + { + "epoch": 4.8685747574318485, + "grad_norm": 21.570842742919922, + "learning_rate": 0.0007758364352393351, + "loss": 7.5455, + "step": 1195100 + }, + { + "epoch": 4.86898213545523, + "grad_norm": 7.334157466888428, + "learning_rate": 0.0007754788417138269, + "loss": 7.5505, + "step": 1195200 + }, + { + "epoch": 4.869389513478611, + "grad_norm": 5.550645351409912, + "learning_rate": 0.00077512131656554, + "loss": 7.5318, + "step": 1195300 + }, + { + "epoch": 4.869796891501992, + "grad_norm": 2.8154354095458984, + "learning_rate": 0.0007747638598086088, + "loss": 7.5582, + "step": 1195400 + }, + { + "epoch": 4.870204269525374, + "grad_norm": 4.538912773132324, + "learning_rate": 0.0007744064714571674, + "loss": 7.5342, + "step": 1195500 + }, + { + "epoch": 4.870611647548755, + "grad_norm": 6.24132776260376, + "learning_rate": 0.0007740491515253432, + "loss": 7.5108, + "step": 1195600 + }, + { + "epoch": 4.871019025572137, + "grad_norm": 8.308329582214355, + "learning_rate": 0.000773691900027263, + "loss": 7.5304, + "step": 1195700 + }, + { + "epoch": 4.871426403595518, + "grad_norm": 7.756369590759277, + "learning_rate": 0.0007733347169770532, + "loss": 7.5322, + "step": 1195800 + }, + { + "epoch": 4.8718337816189, + "grad_norm": 10.87598705291748, + "learning_rate": 0.0007729776023888328, + "loss": 7.5568, + "step": 1195900 + }, + { + "epoch": 4.872241159642281, + "grad_norm": 12.378133773803711, + "learning_rate": 0.0007726205562767216, + "loss": 7.5558, + "step": 1196000 + }, + { + "epoch": 4.872241159642281, + "eval_MaskedAccuracy": 0.5117830706125536, + "eval_loss": 1.602381944656372, + "eval_runtime": 164.3121, + "eval_samples_per_second": 386.314, + "eval_steps_per_second": 1.509, + "step": 1196000 + }, + { + "epoch": 4.872648537665663, + "grad_norm": 11.273324012756348, + "learning_rate": 0.000772263578654835, + "loss": 7.5612, + "step": 1196100 + }, + { + "epoch": 4.8730559156890445, + "grad_norm": 11.511683464050293, + "learning_rate": 0.0007719066695372866, + "loss": 7.5292, + "step": 1196200 + }, + { + "epoch": 4.873463293712426, + "grad_norm": 4.152893543243408, + "learning_rate": 0.0007715498289381868, + "loss": 7.5239, + "step": 1196300 + }, + { + "epoch": 4.873870671735808, + "grad_norm": 11.901667594909668, + "learning_rate": 0.0007711930568716437, + "loss": 7.5602, + "step": 1196400 + }, + { + "epoch": 4.874278049759189, + "grad_norm": 4.4524688720703125, + "learning_rate": 0.000770836353351762, + "loss": 7.5427, + "step": 1196500 + }, + { + "epoch": 4.87468542778257, + "grad_norm": 10.268949508666992, + "learning_rate": 0.0007704797183926442, + "loss": 7.5852, + "step": 1196600 + }, + { + "epoch": 4.875092805805951, + "grad_norm": 5.763948917388916, + "learning_rate": 0.0007701231520083893, + "loss": 7.5782, + "step": 1196700 + }, + { + "epoch": 4.875500183829333, + "grad_norm": 8.539776802062988, + "learning_rate": 0.0007697666542130953, + "loss": 7.5452, + "step": 1196800 + }, + { + "epoch": 4.875907561852714, + "grad_norm": 7.586759567260742, + "learning_rate": 0.0007694102250208563, + "loss": 7.5259, + "step": 1196900 + }, + { + "epoch": 4.876314939876096, + "grad_norm": 9.62762451171875, + "learning_rate": 0.0007690538644457642, + "loss": 7.5534, + "step": 1197000 + }, + { + "epoch": 4.876314939876096, + "eval_MaskedAccuracy": 0.5114081034475745, + "eval_loss": 1.598467469215393, + "eval_runtime": 173.6027, + "eval_samples_per_second": 365.639, + "eval_steps_per_second": 1.429, + "step": 1197000 + }, + { + "epoch": 4.876722317899477, + "grad_norm": 15.170089721679688, + "learning_rate": 0.0007686975725019069, + "loss": 7.5419, + "step": 1197100 + }, + { + "epoch": 4.877129695922859, + "grad_norm": 6.346302509307861, + "learning_rate": 0.0007683413492033714, + "loss": 7.548, + "step": 1197200 + }, + { + "epoch": 4.8775370739462405, + "grad_norm": 10.324873924255371, + "learning_rate": 0.00076798519456424, + "loss": 7.5457, + "step": 1197300 + }, + { + "epoch": 4.877944451969622, + "grad_norm": 9.471755981445312, + "learning_rate": 0.0007676291085985937, + "loss": 7.5584, + "step": 1197400 + }, + { + "epoch": 4.8783518299930035, + "grad_norm": 5.598387718200684, + "learning_rate": 0.000767273091320512, + "loss": 7.5556, + "step": 1197500 + }, + { + "epoch": 4.878759208016385, + "grad_norm": 8.985330581665039, + "learning_rate": 0.0007669171427440681, + "loss": 7.537, + "step": 1197600 + }, + { + "epoch": 4.879166586039766, + "grad_norm": 6.05348014831543, + "learning_rate": 0.0007665612628833352, + "loss": 7.5667, + "step": 1197700 + }, + { + "epoch": 4.879573964063148, + "grad_norm": 6.537703990936279, + "learning_rate": 0.0007662054517523839, + "loss": 7.5642, + "step": 1197800 + }, + { + "epoch": 4.879981342086529, + "grad_norm": 8.868982315063477, + "learning_rate": 0.0007658497093652816, + "loss": 7.5475, + "step": 1197900 + }, + { + "epoch": 4.88038872010991, + "grad_norm": 14.07412338256836, + "learning_rate": 0.0007654940357360914, + "loss": 7.5654, + "step": 1198000 + }, + { + "epoch": 4.88038872010991, + "eval_MaskedAccuracy": 0.5122425870288108, + "eval_loss": 1.5965555906295776, + "eval_runtime": 161.1757, + "eval_samples_per_second": 393.831, + "eval_steps_per_second": 1.539, + "step": 1198000 + }, + { + "epoch": 4.880796098133292, + "grad_norm": 12.023533821105957, + "learning_rate": 0.0007651384308788752, + "loss": 7.5583, + "step": 1198100 + }, + { + "epoch": 4.881203476156673, + "grad_norm": 10.829926490783691, + "learning_rate": 0.0007647828948076922, + "loss": 7.5655, + "step": 1198200 + }, + { + "epoch": 4.881610854180055, + "grad_norm": 6.019172668457031, + "learning_rate": 0.0007644274275365985, + "loss": 7.5322, + "step": 1198300 + }, + { + "epoch": 4.8820182322034364, + "grad_norm": 7.629240989685059, + "learning_rate": 0.0007640720290796481, + "loss": 7.5597, + "step": 1198400 + }, + { + "epoch": 4.882425610226818, + "grad_norm": 5.705124378204346, + "learning_rate": 0.0007637166994508922, + "loss": 7.5618, + "step": 1198500 + }, + { + "epoch": 4.8828329882501995, + "grad_norm": 16.90255355834961, + "learning_rate": 0.0007633614386643779, + "loss": 7.5585, + "step": 1198600 + }, + { + "epoch": 4.883240366273581, + "grad_norm": 12.051189422607422, + "learning_rate": 0.0007630062467341514, + "loss": 7.5594, + "step": 1198700 + }, + { + "epoch": 4.883647744296963, + "grad_norm": 10.131692886352539, + "learning_rate": 0.0007626511236742547, + "loss": 7.5269, + "step": 1198800 + }, + { + "epoch": 4.884055122320344, + "grad_norm": 5.140071392059326, + "learning_rate": 0.0007622960694987271, + "loss": 7.539, + "step": 1198900 + }, + { + "epoch": 4.884462500343725, + "grad_norm": 6.4076151847839355, + "learning_rate": 0.0007619410842216064, + "loss": 7.5137, + "step": 1199000 + }, + { + "epoch": 4.884462500343725, + "eval_MaskedAccuracy": 0.5120933543858173, + "eval_loss": 1.584120512008667, + "eval_runtime": 160.8011, + "eval_samples_per_second": 394.749, + "eval_steps_per_second": 1.542, + "step": 1199000 + }, + { + "epoch": 4.884869878367106, + "grad_norm": 3.249976873397827, + "learning_rate": 0.0007615861678569265, + "loss": 7.5453, + "step": 1199100 + }, + { + "epoch": 4.885277256390488, + "grad_norm": 10.001482009887695, + "learning_rate": 0.00076123132041872, + "loss": 7.5468, + "step": 1199200 + }, + { + "epoch": 4.885684634413869, + "grad_norm": 8.136263847351074, + "learning_rate": 0.0007608765419210159, + "loss": 7.5718, + "step": 1199300 + }, + { + "epoch": 4.886092012437251, + "grad_norm": 3.457944631576538, + "learning_rate": 0.0007605218323778406, + "loss": 7.5435, + "step": 1199400 + }, + { + "epoch": 4.886499390460632, + "grad_norm": 6.13539981842041, + "learning_rate": 0.000760167191803218, + "loss": 7.5457, + "step": 1199500 + }, + { + "epoch": 4.886906768484014, + "grad_norm": 19.31534767150879, + "learning_rate": 0.0007598126202111678, + "loss": 7.5543, + "step": 1199600 + }, + { + "epoch": 4.8873141465073955, + "grad_norm": 4.478299140930176, + "learning_rate": 0.0007594581176157088, + "loss": 7.5671, + "step": 1199700 + }, + { + "epoch": 4.887721524530777, + "grad_norm": 3.900815010070801, + "learning_rate": 0.0007591036840308561, + "loss": 7.5428, + "step": 1199800 + }, + { + "epoch": 4.8881289025541586, + "grad_norm": 3.649026393890381, + "learning_rate": 0.0007587493194706228, + "loss": 7.5303, + "step": 1199900 + }, + { + "epoch": 4.88853628057754, + "grad_norm": 4.455073356628418, + "learning_rate": 0.0007583950239490187, + "loss": 7.5484, + "step": 1200000 + }, + { + "epoch": 4.88853628057754, + "eval_MaskedAccuracy": 0.5117544590661958, + "eval_loss": 1.5857608318328857, + "eval_runtime": 162.1352, + "eval_samples_per_second": 391.5, + "eval_steps_per_second": 1.53, + "step": 1200000 + }, + { + "epoch": 4.888943658600922, + "grad_norm": 10.334717750549316, + "learning_rate": 0.0007580407974800497, + "loss": 7.5526, + "step": 1200100 + }, + { + "epoch": 4.889351036624303, + "grad_norm": 4.750763416290283, + "learning_rate": 0.0007576866400777221, + "loss": 7.5572, + "step": 1200200 + }, + { + "epoch": 4.889758414647684, + "grad_norm": 3.92103910446167, + "learning_rate": 0.000757332551756037, + "loss": 7.5074, + "step": 1200300 + }, + { + "epoch": 4.890165792671065, + "grad_norm": 4.871909141540527, + "learning_rate": 0.0007569785325289936, + "loss": 7.5334, + "step": 1200400 + }, + { + "epoch": 4.890573170694447, + "grad_norm": 3.145890474319458, + "learning_rate": 0.000756624582410587, + "loss": 7.5789, + "step": 1200500 + }, + { + "epoch": 4.890980548717828, + "grad_norm": 4.591555595397949, + "learning_rate": 0.0007562707014148116, + "loss": 7.5519, + "step": 1200600 + }, + { + "epoch": 4.89138792674121, + "grad_norm": 4.398261547088623, + "learning_rate": 0.0007559168895556579, + "loss": 7.5337, + "step": 1200700 + }, + { + "epoch": 4.8917953047645915, + "grad_norm": 3.0164170265197754, + "learning_rate": 0.0007555631468471149, + "loss": 7.5538, + "step": 1200800 + }, + { + "epoch": 4.892202682787973, + "grad_norm": 7.808837413787842, + "learning_rate": 0.000755209473303168, + "loss": 7.5553, + "step": 1200900 + }, + { + "epoch": 4.8926100608113545, + "grad_norm": 6.134701251983643, + "learning_rate": 0.0007548558689377991, + "loss": 7.5678, + "step": 1201000 + }, + { + "epoch": 4.8926100608113545, + "eval_MaskedAccuracy": 0.5118214447349616, + "eval_loss": 1.5958538055419922, + "eval_runtime": 165.0679, + "eval_samples_per_second": 384.545, + "eval_steps_per_second": 1.502, + "step": 1201000 + }, + { + "epoch": 4.893017438834736, + "grad_norm": 15.148693084716797, + "learning_rate": 0.0007545023337649872, + "loss": 7.556, + "step": 1201100 + }, + { + "epoch": 4.893424816858118, + "grad_norm": 6.748607158660889, + "learning_rate": 0.0007541488677987109, + "loss": 7.5368, + "step": 1201200 + }, + { + "epoch": 4.893832194881499, + "grad_norm": 17.395864486694336, + "learning_rate": 0.000753795471052944, + "loss": 7.5801, + "step": 1201300 + }, + { + "epoch": 4.894239572904881, + "grad_norm": 14.391432762145996, + "learning_rate": 0.0007534421435416585, + "loss": 7.5487, + "step": 1201400 + }, + { + "epoch": 4.894646950928262, + "grad_norm": 7.4796061515808105, + "learning_rate": 0.0007530888852788226, + "loss": 7.5542, + "step": 1201500 + }, + { + "epoch": 4.895054328951643, + "grad_norm": 9.131574630737305, + "learning_rate": 0.000752735696278403, + "loss": 7.552, + "step": 1201600 + }, + { + "epoch": 4.895461706975024, + "grad_norm": 14.780279159545898, + "learning_rate": 0.0007523825765543629, + "loss": 7.5478, + "step": 1201700 + }, + { + "epoch": 4.895869084998406, + "grad_norm": 3.968935251235962, + "learning_rate": 0.0007520295261206631, + "loss": 7.5934, + "step": 1201800 + }, + { + "epoch": 4.896276463021787, + "grad_norm": 11.925000190734863, + "learning_rate": 0.000751676544991263, + "loss": 7.5472, + "step": 1201900 + }, + { + "epoch": 4.896683841045169, + "grad_norm": 13.403853416442871, + "learning_rate": 0.0007513236331801159, + "loss": 7.5358, + "step": 1202000 + }, + { + "epoch": 4.896683841045169, + "eval_MaskedAccuracy": 0.511778153741451, + "eval_loss": 1.5913338661193848, + "eval_runtime": 312.7231, + "eval_samples_per_second": 202.978, + "eval_steps_per_second": 0.793, + "step": 1202000 + }, + { + "epoch": 4.8970912190685505, + "grad_norm": 18.489057540893555, + "learning_rate": 0.0007509707907011745, + "loss": 7.5609, + "step": 1202100 + }, + { + "epoch": 4.897498597091932, + "grad_norm": 6.835686206817627, + "learning_rate": 0.0007506180175683894, + "loss": 7.5582, + "step": 1202200 + }, + { + "epoch": 4.897905975115314, + "grad_norm": 4.83351469039917, + "learning_rate": 0.0007502653137957075, + "loss": 7.5397, + "step": 1202300 + }, + { + "epoch": 4.898313353138695, + "grad_norm": 17.712465286254883, + "learning_rate": 0.0007499126793970722, + "loss": 7.5395, + "step": 1202400 + }, + { + "epoch": 4.898720731162077, + "grad_norm": 7.643850803375244, + "learning_rate": 0.0007495601143864254, + "loss": 7.5416, + "step": 1202500 + }, + { + "epoch": 4.899128109185458, + "grad_norm": 3.950080156326294, + "learning_rate": 0.0007492076187777062, + "loss": 7.5261, + "step": 1202600 + }, + { + "epoch": 4.899535487208839, + "grad_norm": 6.754606246948242, + "learning_rate": 0.0007488551925848507, + "loss": 7.5333, + "step": 1202700 + }, + { + "epoch": 4.899942865232221, + "grad_norm": 13.595417976379395, + "learning_rate": 0.0007485028358217915, + "loss": 7.5228, + "step": 1202800 + }, + { + "epoch": 4.900350243255602, + "grad_norm": 3.7433109283447266, + "learning_rate": 0.0007481505485024593, + "loss": 7.5514, + "step": 1202900 + }, + { + "epoch": 4.900757621278983, + "grad_norm": 4.0549492835998535, + "learning_rate": 0.0007477983306407826, + "loss": 7.5417, + "step": 1203000 + }, + { + "epoch": 4.900757621278983, + "eval_MaskedAccuracy": 0.5117592686952326, + "eval_loss": 1.5817937850952148, + "eval_runtime": 186.8929, + "eval_samples_per_second": 339.638, + "eval_steps_per_second": 1.327, + "step": 1203000 + }, + { + "epoch": 4.901164999302365, + "grad_norm": 12.048553466796875, + "learning_rate": 0.0007474461822506855, + "loss": 7.5358, + "step": 1203100 + }, + { + "epoch": 4.9015723773257465, + "grad_norm": 3.9458818435668945, + "learning_rate": 0.0007470941033460917, + "loss": 7.5381, + "step": 1203200 + }, + { + "epoch": 4.901979755349128, + "grad_norm": 14.209798812866211, + "learning_rate": 0.0007467420939409201, + "loss": 7.5116, + "step": 1203300 + }, + { + "epoch": 4.9023871333725095, + "grad_norm": 3.4636948108673096, + "learning_rate": 0.0007463901540490868, + "loss": 7.5462, + "step": 1203400 + }, + { + "epoch": 4.902794511395891, + "grad_norm": 8.81868839263916, + "learning_rate": 0.0007460382836845061, + "loss": 7.5566, + "step": 1203500 + }, + { + "epoch": 4.903201889419273, + "grad_norm": 8.757772445678711, + "learning_rate": 0.0007456864828610907, + "loss": 7.5783, + "step": 1203600 + }, + { + "epoch": 4.903609267442654, + "grad_norm": 20.694164276123047, + "learning_rate": 0.0007453347515927477, + "loss": 7.555, + "step": 1203700 + }, + { + "epoch": 4.904016645466036, + "grad_norm": 11.508885383605957, + "learning_rate": 0.0007449830898933838, + "loss": 7.5654, + "step": 1203800 + }, + { + "epoch": 4.904424023489417, + "grad_norm": 4.550755500793457, + "learning_rate": 0.0007446314977769009, + "loss": 7.5359, + "step": 1203900 + }, + { + "epoch": 4.904831401512798, + "grad_norm": 8.318005561828613, + "learning_rate": 0.0007442799752571998, + "loss": 7.5477, + "step": 1204000 + }, + { + "epoch": 4.904831401512798, + "eval_MaskedAccuracy": 0.5113642599044101, + "eval_loss": 1.5978615283966064, + "eval_runtime": 165.3288, + "eval_samples_per_second": 383.938, + "eval_steps_per_second": 1.5, + "step": 1204000 + }, + { + "epoch": 4.905238779536179, + "grad_norm": 12.498735427856445, + "learning_rate": 0.0007439285223481781, + "loss": 7.5535, + "step": 1204100 + }, + { + "epoch": 4.905646157559561, + "grad_norm": 9.212076187133789, + "learning_rate": 0.0007435771390637318, + "loss": 7.532, + "step": 1204200 + }, + { + "epoch": 4.906053535582942, + "grad_norm": 18.942237854003906, + "learning_rate": 0.0007432258254177514, + "loss": 7.5348, + "step": 1204300 + }, + { + "epoch": 4.906460913606324, + "grad_norm": 13.009732246398926, + "learning_rate": 0.0007428745814241261, + "loss": 7.5604, + "step": 1204400 + }, + { + "epoch": 4.9068682916297055, + "grad_norm": 7.673408508300781, + "learning_rate": 0.0007425234070967434, + "loss": 7.5487, + "step": 1204500 + }, + { + "epoch": 4.907275669653087, + "grad_norm": 2.9380886554718018, + "learning_rate": 0.0007421723024494868, + "loss": 7.5483, + "step": 1204600 + }, + { + "epoch": 4.907683047676469, + "grad_norm": 5.434240818023682, + "learning_rate": 0.0007418212674962382, + "loss": 7.5482, + "step": 1204700 + }, + { + "epoch": 4.90809042569985, + "grad_norm": 8.399138450622559, + "learning_rate": 0.0007414703022508739, + "loss": 7.5417, + "step": 1204800 + }, + { + "epoch": 4.908497803723232, + "grad_norm": 4.983442783355713, + "learning_rate": 0.0007411194067272717, + "loss": 7.5382, + "step": 1204900 + }, + { + "epoch": 4.908905181746613, + "grad_norm": 4.560715198516846, + "learning_rate": 0.0007407685809393033, + "loss": 7.5431, + "step": 1205000 + }, + { + "epoch": 4.908905181746613, + "eval_MaskedAccuracy": 0.5120175489996054, + "eval_loss": 1.6002388000488281, + "eval_runtime": 166.2125, + "eval_samples_per_second": 381.897, + "eval_steps_per_second": 1.492, + "step": 1205000 + }, + { + "epoch": 4.909312559769995, + "grad_norm": 13.783533096313477, + "learning_rate": 0.0007404178249008389, + "loss": 7.5595, + "step": 1205100 + }, + { + "epoch": 4.909719937793376, + "grad_norm": 13.345892906188965, + "learning_rate": 0.0007400671386257446, + "loss": 7.5346, + "step": 1205200 + }, + { + "epoch": 4.910127315816757, + "grad_norm": 5.098935604095459, + "learning_rate": 0.0007397165221278855, + "loss": 7.5577, + "step": 1205300 + }, + { + "epoch": 4.910534693840138, + "grad_norm": 3.479382276535034, + "learning_rate": 0.0007393659754211242, + "loss": 7.5385, + "step": 1205400 + }, + { + "epoch": 4.91094207186352, + "grad_norm": 8.8301362991333, + "learning_rate": 0.0007390154985193192, + "loss": 7.5576, + "step": 1205500 + }, + { + "epoch": 4.9113494498869015, + "grad_norm": 5.703672885894775, + "learning_rate": 0.0007386650914363256, + "loss": 7.5512, + "step": 1205600 + }, + { + "epoch": 4.911756827910283, + "grad_norm": 8.167853355407715, + "learning_rate": 0.0007383147541859996, + "loss": 7.5703, + "step": 1205700 + }, + { + "epoch": 4.9121642059336645, + "grad_norm": 5.829007148742676, + "learning_rate": 0.0007379644867821892, + "loss": 7.5496, + "step": 1205800 + }, + { + "epoch": 4.912571583957046, + "grad_norm": 16.226884841918945, + "learning_rate": 0.0007376142892387445, + "loss": 7.5726, + "step": 1205900 + }, + { + "epoch": 4.912978961980428, + "grad_norm": 18.150897979736328, + "learning_rate": 0.0007372641615695092, + "loss": 7.5522, + "step": 1206000 + }, + { + "epoch": 4.912978961980428, + "eval_MaskedAccuracy": 0.5119257946600297, + "eval_loss": 1.587544560432434, + "eval_runtime": 159.4616, + "eval_samples_per_second": 398.064, + "eval_steps_per_second": 1.555, + "step": 1206000 + }, + { + "epoch": 4.913386340003809, + "grad_norm": 7.121440410614014, + "learning_rate": 0.0007369141037883261, + "loss": 7.5279, + "step": 1206100 + }, + { + "epoch": 4.913793718027191, + "grad_norm": 15.683066368103027, + "learning_rate": 0.000736564115909035, + "loss": 7.5256, + "step": 1206200 + }, + { + "epoch": 4.914201096050572, + "grad_norm": 10.379250526428223, + "learning_rate": 0.0007362141979454733, + "loss": 7.5627, + "step": 1206300 + }, + { + "epoch": 4.914608474073954, + "grad_norm": 11.896134376525879, + "learning_rate": 0.0007358643499114731, + "loss": 7.5345, + "step": 1206400 + }, + { + "epoch": 4.915015852097335, + "grad_norm": 4.112306594848633, + "learning_rate": 0.0007355145718208677, + "loss": 7.5472, + "step": 1206500 + }, + { + "epoch": 4.915423230120716, + "grad_norm": 4.409004211425781, + "learning_rate": 0.0007351648636874854, + "loss": 7.5803, + "step": 1206600 + }, + { + "epoch": 4.9158306081440974, + "grad_norm": 8.464868545532227, + "learning_rate": 0.0007348152255251507, + "loss": 7.505, + "step": 1206700 + }, + { + "epoch": 4.916237986167479, + "grad_norm": 7.594976902008057, + "learning_rate": 0.0007344656573476882, + "loss": 7.5448, + "step": 1206800 + }, + { + "epoch": 4.9166453641908605, + "grad_norm": 4.152006149291992, + "learning_rate": 0.0007341161591689183, + "loss": 7.5382, + "step": 1206900 + }, + { + "epoch": 4.917052742214242, + "grad_norm": 3.523061990737915, + "learning_rate": 0.0007337667310026584, + "loss": 7.5292, + "step": 1207000 + }, + { + "epoch": 4.917052742214242, + "eval_MaskedAccuracy": 0.5113573410134107, + "eval_loss": 1.6028958559036255, + "eval_runtime": 154.3865, + "eval_samples_per_second": 411.15, + "eval_steps_per_second": 1.606, + "step": 1207000 + }, + { + "epoch": 4.917460120237624, + "grad_norm": 4.540543556213379, + "learning_rate": 0.0007334173728627229, + "loss": 7.5733, + "step": 1207100 + }, + { + "epoch": 4.917867498261005, + "grad_norm": 6.582998275756836, + "learning_rate": 0.0007330680847629223, + "loss": 7.5597, + "step": 1207200 + }, + { + "epoch": 4.918274876284387, + "grad_norm": 4.1407904624938965, + "learning_rate": 0.0007327188667170672, + "loss": 7.5584, + "step": 1207300 + }, + { + "epoch": 4.918682254307768, + "grad_norm": 13.84073257446289, + "learning_rate": 0.0007323697187389647, + "loss": 7.5483, + "step": 1207400 + }, + { + "epoch": 4.91908963233115, + "grad_norm": 6.500828742980957, + "learning_rate": 0.0007320206408424183, + "loss": 7.5393, + "step": 1207500 + }, + { + "epoch": 4.919497010354531, + "grad_norm": 3.3657803535461426, + "learning_rate": 0.0007316716330412285, + "loss": 7.542, + "step": 1207600 + }, + { + "epoch": 4.919904388377912, + "grad_norm": 4.423755168914795, + "learning_rate": 0.000731322695349193, + "loss": 7.5362, + "step": 1207700 + }, + { + "epoch": 4.920311766401294, + "grad_norm": 3.881791353225708, + "learning_rate": 0.0007309738277801082, + "loss": 7.5344, + "step": 1207800 + }, + { + "epoch": 4.920719144424675, + "grad_norm": 10.870352745056152, + "learning_rate": 0.0007306250303477654, + "loss": 7.5518, + "step": 1207900 + }, + { + "epoch": 4.9211265224480565, + "grad_norm": 4.204110145568848, + "learning_rate": 0.0007302763030659553, + "loss": 7.5573, + "step": 1208000 + }, + { + "epoch": 4.9211265224480565, + "eval_MaskedAccuracy": 0.512164743597124, + "eval_loss": 1.5818017721176147, + "eval_runtime": 187.1196, + "eval_samples_per_second": 339.227, + "eval_steps_per_second": 1.325, + "step": 1208000 + }, + { + "epoch": 4.921533900471438, + "grad_norm": 12.669214248657227, + "learning_rate": 0.0007299276459484648, + "loss": 7.5223, + "step": 1208100 + }, + { + "epoch": 4.9219412784948195, + "grad_norm": 6.498908996582031, + "learning_rate": 0.0007295790590090782, + "loss": 7.5455, + "step": 1208200 + }, + { + "epoch": 4.922348656518201, + "grad_norm": 11.104379653930664, + "learning_rate": 0.0007292305422615774, + "loss": 7.5407, + "step": 1208300 + }, + { + "epoch": 4.922756034541583, + "grad_norm": 2.674159526824951, + "learning_rate": 0.0007288820957197401, + "loss": 7.5591, + "step": 1208400 + }, + { + "epoch": 4.923163412564964, + "grad_norm": 15.331785202026367, + "learning_rate": 0.000728533719397344, + "loss": 7.5599, + "step": 1208500 + }, + { + "epoch": 4.923570790588346, + "grad_norm": 8.375259399414062, + "learning_rate": 0.0007281854133081597, + "loss": 7.5597, + "step": 1208600 + }, + { + "epoch": 4.923978168611727, + "grad_norm": 11.889664649963379, + "learning_rate": 0.0007278371774659603, + "loss": 7.5819, + "step": 1208700 + }, + { + "epoch": 4.924385546635109, + "grad_norm": 7.2464141845703125, + "learning_rate": 0.0007274890118845125, + "loss": 7.578, + "step": 1208800 + }, + { + "epoch": 4.92479292465849, + "grad_norm": 4.1132659912109375, + "learning_rate": 0.0007271409165775803, + "loss": 7.5622, + "step": 1208900 + }, + { + "epoch": 4.925200302681871, + "grad_norm": 7.361584186553955, + "learning_rate": 0.0007267928915589258, + "loss": 7.5246, + "step": 1209000 + }, + { + "epoch": 4.925200302681871, + "eval_MaskedAccuracy": 0.5119248000298258, + "eval_loss": 1.5902751684188843, + "eval_runtime": 198.846, + "eval_samples_per_second": 319.222, + "eval_steps_per_second": 1.247, + "step": 1209000 + }, + { + "epoch": 4.9256076807052525, + "grad_norm": 8.776246070861816, + "learning_rate": 0.000726444936842309, + "loss": 7.5099, + "step": 1209100 + }, + { + "epoch": 4.926015058728634, + "grad_norm": 16.761333465576172, + "learning_rate": 0.0007260970524414868, + "loss": 7.5745, + "step": 1209200 + }, + { + "epoch": 4.9264224367520155, + "grad_norm": 7.601357936859131, + "learning_rate": 0.0007257492383702113, + "loss": 7.5531, + "step": 1209300 + }, + { + "epoch": 4.926829814775397, + "grad_norm": 3.0784542560577393, + "learning_rate": 0.0007254014946422347, + "loss": 7.5417, + "step": 1209400 + }, + { + "epoch": 4.927237192798779, + "grad_norm": 13.504761695861816, + "learning_rate": 0.0007250538212713058, + "loss": 7.5265, + "step": 1209500 + }, + { + "epoch": 4.92764457082216, + "grad_norm": 4.976802349090576, + "learning_rate": 0.0007247062182711677, + "loss": 7.5552, + "step": 1209600 + }, + { + "epoch": 4.928051948845542, + "grad_norm": 7.255842208862305, + "learning_rate": 0.000724358685655566, + "loss": 7.5497, + "step": 1209700 + }, + { + "epoch": 4.928459326868923, + "grad_norm": 6.7689528465271, + "learning_rate": 0.0007240112234382386, + "loss": 7.5878, + "step": 1209800 + }, + { + "epoch": 4.928866704892305, + "grad_norm": 3.824451208114624, + "learning_rate": 0.0007236638316329236, + "loss": 7.5322, + "step": 1209900 + }, + { + "epoch": 4.929274082915686, + "grad_norm": 9.546440124511719, + "learning_rate": 0.0007233165102533543, + "loss": 7.5591, + "step": 1210000 + }, + { + "epoch": 4.929274082915686, + "eval_MaskedAccuracy": 0.5115372377019847, + "eval_loss": 1.60365891456604, + "eval_runtime": 155.2988, + "eval_samples_per_second": 408.735, + "eval_steps_per_second": 1.597, + "step": 1210000 + }, + { + "epoch": 4.929681460939068, + "grad_norm": 5.6259002685546875, + "learning_rate": 0.0007229692593132624, + "loss": 7.5387, + "step": 1210100 + }, + { + "epoch": 4.930088838962449, + "grad_norm": 15.719690322875977, + "learning_rate": 0.0007226220788263771, + "loss": 7.5471, + "step": 1210200 + }, + { + "epoch": 4.93049621698583, + "grad_norm": 8.795516967773438, + "learning_rate": 0.0007222749688064227, + "loss": 7.5553, + "step": 1210300 + }, + { + "epoch": 4.9309035950092115, + "grad_norm": 5.547180652618408, + "learning_rate": 0.0007219279292671238, + "loss": 7.5224, + "step": 1210400 + }, + { + "epoch": 4.931310973032593, + "grad_norm": 3.618469476699829, + "learning_rate": 0.0007215809602222006, + "loss": 7.5578, + "step": 1210500 + }, + { + "epoch": 4.931718351055975, + "grad_norm": 9.542546272277832, + "learning_rate": 0.00072123406168537, + "loss": 7.5169, + "step": 1210600 + }, + { + "epoch": 4.932125729079356, + "grad_norm": 9.178125381469727, + "learning_rate": 0.0007208872336703475, + "loss": 7.5434, + "step": 1210700 + }, + { + "epoch": 4.932533107102738, + "grad_norm": 3.8660006523132324, + "learning_rate": 0.0007205404761908446, + "loss": 7.5454, + "step": 1210800 + }, + { + "epoch": 4.932940485126119, + "grad_norm": 6.547489166259766, + "learning_rate": 0.000720193789260571, + "loss": 7.5517, + "step": 1210900 + }, + { + "epoch": 4.933347863149501, + "grad_norm": 4.7753825187683105, + "learning_rate": 0.0007198471728932326, + "loss": 7.5577, + "step": 1211000 + }, + { + "epoch": 4.933347863149501, + "eval_MaskedAccuracy": 0.512251721091413, + "eval_loss": 1.5960654020309448, + "eval_runtime": 161.1451, + "eval_samples_per_second": 393.906, + "eval_steps_per_second": 1.539, + "step": 1211000 + }, + { + "epoch": 4.933755241172882, + "grad_norm": 12.791091918945312, + "learning_rate": 0.000719500627102533, + "loss": 7.5337, + "step": 1211100 + }, + { + "epoch": 4.934162619196264, + "grad_norm": 9.975597381591797, + "learning_rate": 0.0007191541519021734, + "loss": 7.5377, + "step": 1211200 + }, + { + "epoch": 4.934569997219645, + "grad_norm": 8.221304893493652, + "learning_rate": 0.0007188077473058509, + "loss": 7.5487, + "step": 1211300 + }, + { + "epoch": 4.934977375243027, + "grad_norm": 7.139200210571289, + "learning_rate": 0.0007184614133272621, + "loss": 7.5161, + "step": 1211400 + }, + { + "epoch": 4.935384753266408, + "grad_norm": 7.881173610687256, + "learning_rate": 0.0007181151499800989, + "loss": 7.545, + "step": 1211500 + }, + { + "epoch": 4.935792131289789, + "grad_norm": 13.266868591308594, + "learning_rate": 0.0007177689572780508, + "loss": 7.5327, + "step": 1211600 + }, + { + "epoch": 4.9361995093131705, + "grad_norm": 5.918792247772217, + "learning_rate": 0.0007174228352348036, + "loss": 7.5473, + "step": 1211700 + }, + { + "epoch": 4.936606887336552, + "grad_norm": 8.716567039489746, + "learning_rate": 0.0007170767838640428, + "loss": 7.5205, + "step": 1211800 + }, + { + "epoch": 4.937014265359934, + "grad_norm": 9.766985893249512, + "learning_rate": 0.0007167308031794493, + "loss": 7.5614, + "step": 1211900 + }, + { + "epoch": 4.937421643383315, + "grad_norm": 9.446882247924805, + "learning_rate": 0.0007163848931947004, + "loss": 7.5108, + "step": 1212000 + }, + { + "epoch": 4.937421643383315, + "eval_MaskedAccuracy": 0.5119367433537259, + "eval_loss": 1.5985194444656372, + "eval_runtime": 176.7689, + "eval_samples_per_second": 359.09, + "eval_steps_per_second": 1.403, + "step": 1212000 + }, + { + "epoch": 4.937829021406697, + "grad_norm": 5.008060455322266, + "learning_rate": 0.0007160390539234742, + "loss": 7.5541, + "step": 1212100 + }, + { + "epoch": 4.938236399430078, + "grad_norm": 6.689391136169434, + "learning_rate": 0.0007156932853794427, + "loss": 7.5419, + "step": 1212200 + }, + { + "epoch": 4.93864377745346, + "grad_norm": 7.030514717102051, + "learning_rate": 0.0007153475875762749, + "loss": 7.5473, + "step": 1212300 + }, + { + "epoch": 4.939051155476841, + "grad_norm": 9.665675163269043, + "learning_rate": 0.0007150019605276384, + "loss": 7.5458, + "step": 1212400 + }, + { + "epoch": 4.939458533500223, + "grad_norm": 6.615889549255371, + "learning_rate": 0.0007146564042471975, + "loss": 7.5769, + "step": 1212500 + }, + { + "epoch": 4.939865911523604, + "grad_norm": 17.90924644470215, + "learning_rate": 0.0007143109187486155, + "loss": 7.5709, + "step": 1212600 + }, + { + "epoch": 4.940273289546985, + "grad_norm": 9.423774719238281, + "learning_rate": 0.0007139655040455492, + "loss": 7.547, + "step": 1212700 + }, + { + "epoch": 4.940680667570367, + "grad_norm": 14.463005065917969, + "learning_rate": 0.0007136201601516552, + "loss": 7.5456, + "step": 1212800 + }, + { + "epoch": 4.941088045593748, + "grad_norm": 15.357586860656738, + "learning_rate": 0.0007132748870805885, + "loss": 7.5488, + "step": 1212900 + }, + { + "epoch": 4.94149542361713, + "grad_norm": 19.023876190185547, + "learning_rate": 0.0007129296848459989, + "loss": 7.5766, + "step": 1213000 + }, + { + "epoch": 4.94149542361713, + "eval_MaskedAccuracy": 0.5120620997588456, + "eval_loss": 1.595471739768982, + "eval_runtime": 168.1678, + "eval_samples_per_second": 377.456, + "eval_steps_per_second": 1.475, + "step": 1213000 + }, + { + "epoch": 4.941902801640511, + "grad_norm": 4.344493865966797, + "learning_rate": 0.0007125845534615327, + "loss": 7.5458, + "step": 1213100 + }, + { + "epoch": 4.942310179663893, + "grad_norm": 19.665414810180664, + "learning_rate": 0.000712239492940836, + "loss": 7.5566, + "step": 1213200 + }, + { + "epoch": 4.942717557687274, + "grad_norm": 12.394417762756348, + "learning_rate": 0.0007118945032975498, + "loss": 7.5318, + "step": 1213300 + }, + { + "epoch": 4.943124935710656, + "grad_norm": 8.70321273803711, + "learning_rate": 0.0007115495845453144, + "loss": 7.5444, + "step": 1213400 + }, + { + "epoch": 4.943532313734037, + "grad_norm": 7.687065124511719, + "learning_rate": 0.0007112047366977659, + "loss": 7.5605, + "step": 1213500 + }, + { + "epoch": 4.943939691757419, + "grad_norm": 8.121780395507812, + "learning_rate": 0.0007108599597685379, + "loss": 7.5398, + "step": 1213600 + }, + { + "epoch": 4.9443470697808, + "grad_norm": 3.469874382019043, + "learning_rate": 0.0007105152537712617, + "loss": 7.5462, + "step": 1213700 + }, + { + "epoch": 4.944754447804182, + "grad_norm": 12.128028869628906, + "learning_rate": 0.0007101706187195658, + "loss": 7.5294, + "step": 1213800 + }, + { + "epoch": 4.945161825827563, + "grad_norm": 18.595476150512695, + "learning_rate": 0.0007098260546270746, + "loss": 7.553, + "step": 1213900 + }, + { + "epoch": 4.945569203850944, + "grad_norm": 11.615880966186523, + "learning_rate": 0.0007094815615074105, + "loss": 7.5637, + "step": 1214000 + }, + { + "epoch": 4.945569203850944, + "eval_MaskedAccuracy": 0.5119515891990868, + "eval_loss": 1.5936355590820312, + "eval_runtime": 159.4551, + "eval_samples_per_second": 398.081, + "eval_steps_per_second": 1.555, + "step": 1214000 + }, + { + "epoch": 4.9459765818743255, + "grad_norm": 18.023178100585938, + "learning_rate": 0.0007091371393741934, + "loss": 7.5547, + "step": 1214100 + }, + { + "epoch": 4.946383959897707, + "grad_norm": 6.357263088226318, + "learning_rate": 0.0007087927882410394, + "loss": 7.5684, + "step": 1214200 + }, + { + "epoch": 4.946791337921089, + "grad_norm": 20.91889190673828, + "learning_rate": 0.0007084485081215631, + "loss": 7.5231, + "step": 1214300 + }, + { + "epoch": 4.94719871594447, + "grad_norm": 7.93168306350708, + "learning_rate": 0.0007081042990293762, + "loss": 7.5154, + "step": 1214400 + }, + { + "epoch": 4.947606093967852, + "grad_norm": 3.209022045135498, + "learning_rate": 0.000707760160978087, + "loss": 7.5569, + "step": 1214500 + }, + { + "epoch": 4.948013471991233, + "grad_norm": 5.462933540344238, + "learning_rate": 0.0007074160939813006, + "loss": 7.5551, + "step": 1214600 + }, + { + "epoch": 4.948420850014615, + "grad_norm": 7.854034900665283, + "learning_rate": 0.0007070720980526205, + "loss": 7.5412, + "step": 1214700 + }, + { + "epoch": 4.948828228037996, + "grad_norm": 5.540817737579346, + "learning_rate": 0.0007067281732056463, + "loss": 7.5218, + "step": 1214800 + }, + { + "epoch": 4.949235606061378, + "grad_norm": 5.173191070556641, + "learning_rate": 0.0007063843194539759, + "loss": 7.5735, + "step": 1214900 + }, + { + "epoch": 4.949642984084759, + "grad_norm": 7.646540641784668, + "learning_rate": 0.0007060405368112032, + "loss": 7.5173, + "step": 1215000 + }, + { + "epoch": 4.949642984084759, + "eval_MaskedAccuracy": 0.5120245841505193, + "eval_loss": 1.590340256690979, + "eval_runtime": 181.7862, + "eval_samples_per_second": 349.179, + "eval_steps_per_second": 1.364, + "step": 1215000 + }, + { + "epoch": 4.950050362108141, + "grad_norm": 5.17561149597168, + "learning_rate": 0.000705696825290919, + "loss": 7.563, + "step": 1215100 + }, + { + "epoch": 4.950457740131522, + "grad_norm": 5.196922302246094, + "learning_rate": 0.0007053531849067126, + "loss": 7.5658, + "step": 1215200 + }, + { + "epoch": 4.950865118154903, + "grad_norm": 9.764001846313477, + "learning_rate": 0.0007050096156721698, + "loss": 7.5574, + "step": 1215300 + }, + { + "epoch": 4.951272496178285, + "grad_norm": 5.893946647644043, + "learning_rate": 0.0007046661176008743, + "loss": 7.5621, + "step": 1215400 + }, + { + "epoch": 4.951679874201666, + "grad_norm": 4.240930557250977, + "learning_rate": 0.0007043226907064059, + "loss": 7.5274, + "step": 1215500 + }, + { + "epoch": 4.952087252225048, + "grad_norm": 5.049214839935303, + "learning_rate": 0.0007039793350023427, + "loss": 7.5195, + "step": 1215600 + }, + { + "epoch": 4.952494630248429, + "grad_norm": 7.266434192657471, + "learning_rate": 0.0007036360505022579, + "loss": 7.5359, + "step": 1215700 + }, + { + "epoch": 4.952902008271811, + "grad_norm": 8.328526496887207, + "learning_rate": 0.000703292837219725, + "loss": 7.5646, + "step": 1215800 + }, + { + "epoch": 4.953309386295192, + "grad_norm": 9.4588623046875, + "learning_rate": 0.0007029496951683128, + "loss": 7.5433, + "step": 1215900 + }, + { + "epoch": 4.953716764318574, + "grad_norm": 16.228639602661133, + "learning_rate": 0.0007026066243615868, + "loss": 7.5312, + "step": 1216000 + }, + { + "epoch": 4.953716764318574, + "eval_MaskedAccuracy": 0.5119462318759695, + "eval_loss": 1.586015224456787, + "eval_runtime": 157.8224, + "eval_samples_per_second": 402.199, + "eval_steps_per_second": 1.571, + "step": 1216000 + }, + { + "epoch": 4.954124142341955, + "grad_norm": 6.658722877502441, + "learning_rate": 0.0007022636248131115, + "loss": 7.5458, + "step": 1216100 + }, + { + "epoch": 4.954531520365337, + "grad_norm": 19.78124237060547, + "learning_rate": 0.0007019206965364466, + "loss": 7.5342, + "step": 1216200 + }, + { + "epoch": 4.954938898388718, + "grad_norm": 13.313947677612305, + "learning_rate": 0.0007015778395451491, + "loss": 7.5773, + "step": 1216300 + }, + { + "epoch": 4.9553462764121, + "grad_norm": 4.927038192749023, + "learning_rate": 0.000701235053852775, + "loss": 7.5592, + "step": 1216400 + }, + { + "epoch": 4.955753654435481, + "grad_norm": 9.97121810913086, + "learning_rate": 0.0007008923394728761, + "loss": 7.5425, + "step": 1216500 + }, + { + "epoch": 4.956161032458862, + "grad_norm": 16.75101661682129, + "learning_rate": 0.0007005496964190028, + "loss": 7.5683, + "step": 1216600 + }, + { + "epoch": 4.956568410482244, + "grad_norm": 10.632609367370605, + "learning_rate": 0.0007002071247047009, + "loss": 7.5703, + "step": 1216700 + }, + { + "epoch": 4.956975788505625, + "grad_norm": 9.779709815979004, + "learning_rate": 0.0006998646243435141, + "loss": 7.5214, + "step": 1216800 + }, + { + "epoch": 4.957383166529007, + "grad_norm": 6.6657795906066895, + "learning_rate": 0.0006995221953489826, + "loss": 7.5268, + "step": 1216900 + }, + { + "epoch": 4.957790544552388, + "grad_norm": 18.713905334472656, + "learning_rate": 0.0006991798377346457, + "loss": 7.5422, + "step": 1217000 + }, + { + "epoch": 4.957790544552388, + "eval_MaskedAccuracy": 0.5123652625997255, + "eval_loss": 1.5796376466751099, + "eval_runtime": 164.9988, + "eval_samples_per_second": 384.706, + "eval_steps_per_second": 1.503, + "step": 1217000 + }, + { + "epoch": 4.95819792257577, + "grad_norm": 10.145832061767578, + "learning_rate": 0.0006988375515140388, + "loss": 7.5367, + "step": 1217100 + }, + { + "epoch": 4.958605300599151, + "grad_norm": 17.383893966674805, + "learning_rate": 0.0006984953367006932, + "loss": 7.548, + "step": 1217200 + }, + { + "epoch": 4.959012678622533, + "grad_norm": 3.454498291015625, + "learning_rate": 0.0006981531933081381, + "loss": 7.5563, + "step": 1217300 + }, + { + "epoch": 4.959420056645914, + "grad_norm": 13.539298057556152, + "learning_rate": 0.0006978111213499014, + "loss": 7.5305, + "step": 1217400 + }, + { + "epoch": 4.959827434669296, + "grad_norm": 9.780884742736816, + "learning_rate": 0.0006974691208395057, + "loss": 7.5471, + "step": 1217500 + }, + { + "epoch": 4.960234812692677, + "grad_norm": 3.3330161571502686, + "learning_rate": 0.0006971271917904734, + "loss": 7.5407, + "step": 1217600 + }, + { + "epoch": 4.960642190716058, + "grad_norm": 5.288601398468018, + "learning_rate": 0.0006967853342163229, + "loss": 7.5269, + "step": 1217700 + }, + { + "epoch": 4.9610495687394405, + "grad_norm": 7.116894245147705, + "learning_rate": 0.0006964435481305688, + "loss": 7.547, + "step": 1217800 + }, + { + "epoch": 4.961456946762821, + "grad_norm": 6.527875900268555, + "learning_rate": 0.0006961018335467239, + "loss": 7.5503, + "step": 1217900 + }, + { + "epoch": 4.961864324786203, + "grad_norm": 6.521652698516846, + "learning_rate": 0.0006957601904782982, + "loss": 7.5476, + "step": 1218000 + }, + { + "epoch": 4.961864324786203, + "eval_MaskedAccuracy": 0.512317709785432, + "eval_loss": 1.587033748626709, + "eval_runtime": 198.291, + "eval_samples_per_second": 320.115, + "eval_steps_per_second": 1.251, + "step": 1218000 + }, + { + "epoch": 4.962271702809584, + "grad_norm": 5.053981304168701, + "learning_rate": 0.0006954186189387996, + "loss": 7.5563, + "step": 1218100 + }, + { + "epoch": 4.962679080832966, + "grad_norm": 8.57411003112793, + "learning_rate": 0.000695077118941731, + "loss": 7.5518, + "step": 1218200 + }, + { + "epoch": 4.963086458856347, + "grad_norm": 6.027956008911133, + "learning_rate": 0.0006947356905005938, + "loss": 7.5219, + "step": 1218300 + }, + { + "epoch": 4.963493836879729, + "grad_norm": 13.150764465332031, + "learning_rate": 0.0006943943336288865, + "loss": 7.5229, + "step": 1218400 + }, + { + "epoch": 4.96390121490311, + "grad_norm": 6.306985378265381, + "learning_rate": 0.0006940530483401045, + "loss": 7.5328, + "step": 1218500 + }, + { + "epoch": 4.964308592926492, + "grad_norm": 3.376652717590332, + "learning_rate": 0.0006937118346477413, + "loss": 7.5606, + "step": 1218600 + }, + { + "epoch": 4.964715970949873, + "grad_norm": 3.4649882316589355, + "learning_rate": 0.0006933706925652879, + "loss": 7.5298, + "step": 1218700 + }, + { + "epoch": 4.965123348973255, + "grad_norm": 4.103401184082031, + "learning_rate": 0.0006930296221062292, + "loss": 7.5757, + "step": 1218800 + }, + { + "epoch": 4.9655307269966364, + "grad_norm": 5.4330925941467285, + "learning_rate": 0.0006926886232840514, + "loss": 7.5406, + "step": 1218900 + }, + { + "epoch": 4.965938105020017, + "grad_norm": 5.642839431762695, + "learning_rate": 0.0006923476961122346, + "loss": 7.5665, + "step": 1219000 + }, + { + "epoch": 4.965938105020017, + "eval_MaskedAccuracy": 0.511894358141707, + "eval_loss": 1.5927015542984009, + "eval_runtime": 162.5161, + "eval_samples_per_second": 390.583, + "eval_steps_per_second": 1.526, + "step": 1219000 + }, + { + "epoch": 4.966345483043399, + "grad_norm": 4.599015712738037, + "learning_rate": 0.0006920068406042587, + "loss": 7.5584, + "step": 1219100 + }, + { + "epoch": 4.96675286106678, + "grad_norm": 4.394110679626465, + "learning_rate": 0.0006916660567735992, + "loss": 7.5543, + "step": 1219200 + }, + { + "epoch": 4.967160239090162, + "grad_norm": 6.67686128616333, + "learning_rate": 0.000691325344633728, + "loss": 7.5341, + "step": 1219300 + }, + { + "epoch": 4.967567617113543, + "grad_norm": 7.108152389526367, + "learning_rate": 0.0006909847041981161, + "loss": 7.539, + "step": 1219400 + }, + { + "epoch": 4.967974995136925, + "grad_norm": 10.382682800292969, + "learning_rate": 0.0006906441354802306, + "loss": 7.5425, + "step": 1219500 + }, + { + "epoch": 4.968382373160306, + "grad_norm": 9.100656509399414, + "learning_rate": 0.0006903036384935381, + "loss": 7.506, + "step": 1219600 + }, + { + "epoch": 4.968789751183688, + "grad_norm": 4.32513952255249, + "learning_rate": 0.000689963213251497, + "loss": 7.515, + "step": 1219700 + }, + { + "epoch": 4.969197129207069, + "grad_norm": 7.653156280517578, + "learning_rate": 0.0006896228597675673, + "loss": 7.5366, + "step": 1219800 + }, + { + "epoch": 4.969604507230451, + "grad_norm": 15.72667121887207, + "learning_rate": 0.0006892825780552056, + "loss": 7.5449, + "step": 1219900 + }, + { + "epoch": 4.970011885253832, + "grad_norm": 9.174826622009277, + "learning_rate": 0.0006889423681278641, + "loss": 7.5405, + "step": 1220000 + }, + { + "epoch": 4.970011885253832, + "eval_MaskedAccuracy": 0.512115965289989, + "eval_loss": 1.590442419052124, + "eval_runtime": 160.9373, + "eval_samples_per_second": 394.415, + "eval_steps_per_second": 1.541, + "step": 1220000 + }, + { + "epoch": 4.970419263277214, + "grad_norm": 6.958202362060547, + "learning_rate": 0.0006886022299989953, + "loss": 7.5262, + "step": 1220100 + }, + { + "epoch": 4.9708266413005955, + "grad_norm": 8.148906707763672, + "learning_rate": 0.0006882621636820443, + "loss": 7.5343, + "step": 1220200 + }, + { + "epoch": 4.971234019323976, + "grad_norm": 14.049768447875977, + "learning_rate": 0.0006879221691904563, + "loss": 7.5337, + "step": 1220300 + }, + { + "epoch": 4.971641397347358, + "grad_norm": 6.911771297454834, + "learning_rate": 0.0006875822465376738, + "loss": 7.5715, + "step": 1220400 + }, + { + "epoch": 4.972048775370739, + "grad_norm": 6.940609455108643, + "learning_rate": 0.0006872423957371355, + "loss": 7.5754, + "step": 1220500 + }, + { + "epoch": 4.972456153394121, + "grad_norm": 11.549186706542969, + "learning_rate": 0.000686902616802278, + "loss": 7.5271, + "step": 1220600 + }, + { + "epoch": 4.972863531417502, + "grad_norm": 3.5038044452667236, + "learning_rate": 0.0006865629097465335, + "loss": 7.529, + "step": 1220700 + }, + { + "epoch": 4.973270909440884, + "grad_norm": 8.903679847717285, + "learning_rate": 0.0006862232745833321, + "loss": 7.5604, + "step": 1220800 + }, + { + "epoch": 4.973678287464265, + "grad_norm": 19.229045867919922, + "learning_rate": 0.0006858837113261025, + "loss": 7.5344, + "step": 1220900 + }, + { + "epoch": 4.974085665487647, + "grad_norm": 9.70226001739502, + "learning_rate": 0.0006855442199882678, + "loss": 7.5518, + "step": 1221000 + }, + { + "epoch": 4.974085665487647, + "eval_MaskedAccuracy": 0.5120600921687889, + "eval_loss": 1.584570288658142, + "eval_runtime": 157.4361, + "eval_samples_per_second": 403.186, + "eval_steps_per_second": 1.575, + "step": 1221000 + }, + { + "epoch": 4.974493043511028, + "grad_norm": 13.518266677856445, + "learning_rate": 0.000685204800583253, + "loss": 7.553, + "step": 1221100 + }, + { + "epoch": 4.97490042153441, + "grad_norm": 13.211006164550781, + "learning_rate": 0.0006848654531244754, + "loss": 7.5281, + "step": 1221200 + }, + { + "epoch": 4.9753077995577915, + "grad_norm": 12.042580604553223, + "learning_rate": 0.0006845261776253507, + "loss": 7.5236, + "step": 1221300 + }, + { + "epoch": 4.975715177581173, + "grad_norm": 4.958207607269287, + "learning_rate": 0.0006841869740992928, + "loss": 7.5424, + "step": 1221400 + }, + { + "epoch": 4.9761225556045545, + "grad_norm": 3.994436740875244, + "learning_rate": 0.0006838478425597126, + "loss": 7.5442, + "step": 1221500 + }, + { + "epoch": 4.976529933627935, + "grad_norm": 8.007580757141113, + "learning_rate": 0.0006835087830200165, + "loss": 7.5554, + "step": 1221600 + }, + { + "epoch": 4.976937311651317, + "grad_norm": 4.834044933319092, + "learning_rate": 0.0006831697954936109, + "loss": 7.5528, + "step": 1221700 + }, + { + "epoch": 4.977344689674698, + "grad_norm": 7.90855073928833, + "learning_rate": 0.0006828308799938971, + "loss": 7.5414, + "step": 1221800 + }, + { + "epoch": 4.97775206769808, + "grad_norm": 7.272604465484619, + "learning_rate": 0.000682492036534274, + "loss": 7.5618, + "step": 1221900 + }, + { + "epoch": 4.978159445721461, + "grad_norm": 13.992048263549805, + "learning_rate": 0.000682153265128138, + "loss": 7.5439, + "step": 1222000 + }, + { + "epoch": 4.978159445721461, + "eval_MaskedAccuracy": 0.511423294216237, + "eval_loss": 1.590978980064392, + "eval_runtime": 180.8448, + "eval_samples_per_second": 350.997, + "eval_steps_per_second": 1.371, + "step": 1222000 + }, + { + "epoch": 4.978566823744843, + "grad_norm": 15.217825889587402, + "learning_rate": 0.0006818145657888822, + "loss": 7.5627, + "step": 1222100 + }, + { + "epoch": 4.978974201768224, + "grad_norm": 6.383872032165527, + "learning_rate": 0.0006814759385298989, + "loss": 7.5386, + "step": 1222200 + }, + { + "epoch": 4.979381579791606, + "grad_norm": 13.450451850891113, + "learning_rate": 0.0006811373833645746, + "loss": 7.5648, + "step": 1222300 + }, + { + "epoch": 4.979788957814987, + "grad_norm": 12.187459945678711, + "learning_rate": 0.0006807989003062939, + "loss": 7.5256, + "step": 1222400 + }, + { + "epoch": 4.980196335838369, + "grad_norm": 9.030609130859375, + "learning_rate": 0.0006804604893684383, + "loss": 7.5648, + "step": 1222500 + }, + { + "epoch": 4.9806037138617505, + "grad_norm": 6.992572784423828, + "learning_rate": 0.0006801221505643891, + "loss": 7.5238, + "step": 1222600 + }, + { + "epoch": 4.981011091885131, + "grad_norm": 3.9543697834014893, + "learning_rate": 0.0006797838839075205, + "loss": 7.5242, + "step": 1222700 + }, + { + "epoch": 4.981418469908514, + "grad_norm": 9.706334114074707, + "learning_rate": 0.0006794456894112076, + "loss": 7.5423, + "step": 1222800 + }, + { + "epoch": 4.981825847931894, + "grad_norm": 4.013082504272461, + "learning_rate": 0.0006791075670888206, + "loss": 7.5461, + "step": 1222900 + }, + { + "epoch": 4.982233225955276, + "grad_norm": 6.769832611083984, + "learning_rate": 0.0006787695169537265, + "loss": 7.518, + "step": 1223000 + }, + { + "epoch": 4.982233225955276, + "eval_MaskedAccuracy": 0.5118910308254481, + "eval_loss": 1.6035711765289307, + "eval_runtime": 172.2235, + "eval_samples_per_second": 368.568, + "eval_steps_per_second": 1.44, + "step": 1223000 + }, + { + "epoch": 4.982640603978657, + "grad_norm": 9.585030555725098, + "learning_rate": 0.0006784315390192912, + "loss": 7.5537, + "step": 1223100 + }, + { + "epoch": 4.983047982002039, + "grad_norm": 14.527358055114746, + "learning_rate": 0.0006780936332988761, + "loss": 7.5511, + "step": 1223200 + }, + { + "epoch": 4.98345536002542, + "grad_norm": 9.325721740722656, + "learning_rate": 0.0006777557998058404, + "loss": 7.5221, + "step": 1223300 + }, + { + "epoch": 4.983862738048802, + "grad_norm": 6.756707668304443, + "learning_rate": 0.0006774180385535401, + "loss": 7.5538, + "step": 1223400 + }, + { + "epoch": 4.984270116072183, + "grad_norm": 9.356431007385254, + "learning_rate": 0.0006770803495553293, + "loss": 7.5535, + "step": 1223500 + }, + { + "epoch": 4.984677494095565, + "grad_norm": 3.1493470668792725, + "learning_rate": 0.00067674273282456, + "loss": 7.5381, + "step": 1223600 + }, + { + "epoch": 4.9850848721189465, + "grad_norm": 6.802088737487793, + "learning_rate": 0.0006764051883745777, + "loss": 7.5455, + "step": 1223700 + }, + { + "epoch": 4.985492250142328, + "grad_norm": 18.060728073120117, + "learning_rate": 0.0006760677162187292, + "loss": 7.5534, + "step": 1223800 + }, + { + "epoch": 4.9858996281657095, + "grad_norm": 3.9281394481658936, + "learning_rate": 0.0006757303163703543, + "loss": 7.5359, + "step": 1223900 + }, + { + "epoch": 4.98630700618909, + "grad_norm": 12.208638191223145, + "learning_rate": 0.0006753929888427948, + "loss": 7.5467, + "step": 1224000 + }, + { + "epoch": 4.98630700618909, + "eval_MaskedAccuracy": 0.5127115042746659, + "eval_loss": 1.5897910594940186, + "eval_runtime": 161.2034, + "eval_samples_per_second": 393.763, + "eval_steps_per_second": 1.538, + "step": 1224000 + }, + { + "epoch": 4.986714384212472, + "grad_norm": 3.983851671218872, + "learning_rate": 0.000675055733649386, + "loss": 7.545, + "step": 1224100 + }, + { + "epoch": 4.987121762235853, + "grad_norm": 11.683621406555176, + "learning_rate": 0.000674718550803461, + "loss": 7.5659, + "step": 1224200 + }, + { + "epoch": 4.987529140259235, + "grad_norm": 5.003474712371826, + "learning_rate": 0.0006743814403183518, + "loss": 7.5639, + "step": 1224300 + }, + { + "epoch": 4.987936518282616, + "grad_norm": 13.809906005859375, + "learning_rate": 0.0006740444022073836, + "loss": 7.5556, + "step": 1224400 + }, + { + "epoch": 4.988343896305998, + "grad_norm": 12.487983703613281, + "learning_rate": 0.0006737074364838838, + "loss": 7.5397, + "step": 1224500 + }, + { + "epoch": 4.988751274329379, + "grad_norm": 11.203004837036133, + "learning_rate": 0.0006733705431611725, + "loss": 7.5272, + "step": 1224600 + }, + { + "epoch": 4.989158652352761, + "grad_norm": 5.616673946380615, + "learning_rate": 0.0006730337222525702, + "loss": 7.545, + "step": 1224700 + }, + { + "epoch": 4.989566030376142, + "grad_norm": 6.645087718963623, + "learning_rate": 0.0006726969737713924, + "loss": 7.5325, + "step": 1224800 + }, + { + "epoch": 4.989973408399524, + "grad_norm": 8.293774604797363, + "learning_rate": 0.0006723602977309536, + "loss": 7.5518, + "step": 1224900 + }, + { + "epoch": 4.9903807864229055, + "grad_norm": 7.347590923309326, + "learning_rate": 0.0006720236941445641, + "loss": 7.5345, + "step": 1225000 + }, + { + "epoch": 4.9903807864229055, + "eval_MaskedAccuracy": 0.5120031573281795, + "eval_loss": 1.5872365236282349, + "eval_runtime": 169.0849, + "eval_samples_per_second": 375.409, + "eval_steps_per_second": 1.467, + "step": 1225000 + }, + { + "epoch": 4.990788164446287, + "grad_norm": 6.247953414916992, + "learning_rate": 0.0006716871630255315, + "loss": 7.5583, + "step": 1225100 + }, + { + "epoch": 4.991195542469669, + "grad_norm": 9.45461654663086, + "learning_rate": 0.0006713507043871601, + "loss": 7.5196, + "step": 1225200 + }, + { + "epoch": 4.991602920493049, + "grad_norm": 9.74386215209961, + "learning_rate": 0.0006710143182427533, + "loss": 7.5245, + "step": 1225300 + }, + { + "epoch": 4.992010298516431, + "grad_norm": 15.22220230102539, + "learning_rate": 0.0006706780046056085, + "loss": 7.5144, + "step": 1225400 + }, + { + "epoch": 4.992417676539812, + "grad_norm": 12.491472244262695, + "learning_rate": 0.0006703417634890228, + "loss": 7.5216, + "step": 1225500 + }, + { + "epoch": 4.992825054563194, + "grad_norm": 13.283463478088379, + "learning_rate": 0.00067000559490629, + "loss": 7.5587, + "step": 1225600 + }, + { + "epoch": 4.993232432586575, + "grad_norm": 4.220707416534424, + "learning_rate": 0.0006696694988707003, + "loss": 7.5687, + "step": 1225700 + }, + { + "epoch": 4.993639810609957, + "grad_norm": 5.348223686218262, + "learning_rate": 0.0006693334753955413, + "loss": 7.5399, + "step": 1225800 + }, + { + "epoch": 4.994047188633338, + "grad_norm": 5.098475456237793, + "learning_rate": 0.0006689975244940982, + "loss": 7.5416, + "step": 1225900 + }, + { + "epoch": 4.99445456665672, + "grad_norm": 5.169957637786865, + "learning_rate": 0.000668661646179652, + "loss": 7.5448, + "step": 1226000 + }, + { + "epoch": 4.99445456665672, + "eval_MaskedAccuracy": 0.5121886745386547, + "eval_loss": 1.59209144115448, + "eval_runtime": 169.7167, + "eval_samples_per_second": 374.011, + "eval_steps_per_second": 1.461, + "step": 1226000 + }, + { + "epoch": 4.9948619446801015, + "grad_norm": 15.679362297058105, + "learning_rate": 0.0006683258404654823, + "loss": 7.5624, + "step": 1226100 + }, + { + "epoch": 4.995269322703483, + "grad_norm": 5.099567890167236, + "learning_rate": 0.0006679901073648651, + "loss": 7.5409, + "step": 1226200 + }, + { + "epoch": 4.9956767007268645, + "grad_norm": 18.295846939086914, + "learning_rate": 0.0006676544468910742, + "loss": 7.5332, + "step": 1226300 + }, + { + "epoch": 4.996084078750246, + "grad_norm": 3.887827157974243, + "learning_rate": 0.0006673188590573797, + "loss": 7.5142, + "step": 1226400 + }, + { + "epoch": 4.996491456773628, + "grad_norm": 4.531190395355225, + "learning_rate": 0.0006669833438770492, + "loss": 7.5377, + "step": 1226500 + }, + { + "epoch": 4.996898834797008, + "grad_norm": 4.338390350341797, + "learning_rate": 0.0006666479013633482, + "loss": 7.5642, + "step": 1226600 + }, + { + "epoch": 4.99730621282039, + "grad_norm": 6.9320197105407715, + "learning_rate": 0.0006663125315295379, + "loss": 7.5329, + "step": 1226700 + }, + { + "epoch": 4.997713590843771, + "grad_norm": 4.566051483154297, + "learning_rate": 0.0006659772343888761, + "loss": 7.5234, + "step": 1226800 + }, + { + "epoch": 4.998120968867153, + "grad_norm": 2.9808521270751953, + "learning_rate": 0.0006656420099546208, + "loss": 7.556, + "step": 1226900 + }, + { + "epoch": 4.998528346890534, + "grad_norm": 13.253824234008789, + "learning_rate": 0.0006653068582400242, + "loss": 7.5222, + "step": 1227000 + }, + { + "epoch": 4.998528346890534, + "eval_MaskedAccuracy": 0.5122886249159206, + "eval_loss": 1.5890278816223145, + "eval_runtime": 162.5801, + "eval_samples_per_second": 390.429, + "eval_steps_per_second": 1.525, + "step": 1227000 + }, + { + "epoch": 4.998935724913916, + "grad_norm": 11.345479965209961, + "learning_rate": 0.000664971779258337, + "loss": 7.5476, + "step": 1227100 + }, + { + "epoch": 4.9993431029372974, + "grad_norm": 3.7569751739501953, + "learning_rate": 0.0006646367730228067, + "loss": 7.5337, + "step": 1227200 + }, + { + "epoch": 4.999750480960679, + "grad_norm": 14.374944686889648, + "learning_rate": 0.0006643018395466777, + "loss": 7.5692, + "step": 1227300 + }, + { + "epoch": 5.0001578589840605, + "grad_norm": 5.712322235107422, + "learning_rate": 0.0006639669788431905, + "loss": 7.5522, + "step": 1227400 + }, + { + "epoch": 5.000565237007442, + "grad_norm": 4.443236827850342, + "learning_rate": 0.0006636321909255869, + "loss": 7.5616, + "step": 1227500 + }, + { + "epoch": 5.000972615030824, + "grad_norm": 8.470004081726074, + "learning_rate": 0.0006632974758071009, + "loss": 7.5996, + "step": 1227600 + }, + { + "epoch": 5.001379993054205, + "grad_norm": 5.000079154968262, + "learning_rate": 0.0006629628335009662, + "loss": 7.5359, + "step": 1227700 + }, + { + "epoch": 5.001787371077586, + "grad_norm": 9.659152030944824, + "learning_rate": 0.0006626282640204117, + "loss": 7.57, + "step": 1227800 + }, + { + "epoch": 5.002194749100967, + "grad_norm": 3.7673120498657227, + "learning_rate": 0.0006622937673786657, + "loss": 7.562, + "step": 1227900 + }, + { + "epoch": 5.002602127124349, + "grad_norm": 6.777065277099609, + "learning_rate": 0.000661959343588953, + "loss": 7.5599, + "step": 1228000 + }, + { + "epoch": 5.002602127124349, + "eval_MaskedAccuracy": 0.5117803751789692, + "eval_loss": 1.5943812131881714, + "eval_runtime": 149.2434, + "eval_samples_per_second": 425.319, + "eval_steps_per_second": 1.662, + "step": 1228000 + }, + { + "epoch": 5.00300950514773, + "grad_norm": 13.324762344360352, + "learning_rate": 0.0006616249926644944, + "loss": 7.5493, + "step": 1228100 + }, + { + "epoch": 5.003416883171112, + "grad_norm": 13.191681861877441, + "learning_rate": 0.0006612907146185087, + "loss": 7.5454, + "step": 1228200 + }, + { + "epoch": 5.003824261194493, + "grad_norm": 7.181677341461182, + "learning_rate": 0.0006609565094642122, + "loss": 7.5511, + "step": 1228300 + }, + { + "epoch": 5.004231639217875, + "grad_norm": 21.544034957885742, + "learning_rate": 0.0006606223772148165, + "loss": 7.5532, + "step": 1228400 + }, + { + "epoch": 5.0046390172412565, + "grad_norm": 10.261125564575195, + "learning_rate": 0.0006602883178835334, + "loss": 7.5641, + "step": 1228500 + }, + { + "epoch": 5.005046395264638, + "grad_norm": 6.927099704742432, + "learning_rate": 0.0006599543314835694, + "loss": 7.5241, + "step": 1228600 + }, + { + "epoch": 5.0054537732880195, + "grad_norm": 3.984013557434082, + "learning_rate": 0.0006596204180281294, + "loss": 7.5216, + "step": 1228700 + }, + { + "epoch": 5.005861151311401, + "grad_norm": 7.137518405914307, + "learning_rate": 0.000659286577530414, + "loss": 7.5325, + "step": 1228800 + }, + { + "epoch": 5.006268529334783, + "grad_norm": 8.60405158996582, + "learning_rate": 0.0006589528100036211, + "loss": 7.566, + "step": 1228900 + }, + { + "epoch": 5.006675907358164, + "grad_norm": 13.43557357788086, + "learning_rate": 0.0006586191154609472, + "loss": 7.5554, + "step": 1229000 + }, + { + "epoch": 5.006675907358164, + "eval_MaskedAccuracy": 0.5120016789837898, + "eval_loss": 1.5948749780654907, + "eval_runtime": 147.1504, + "eval_samples_per_second": 431.368, + "eval_steps_per_second": 1.685, + "step": 1229000 + }, + { + "epoch": 5.007083285381545, + "grad_norm": 9.201789855957031, + "learning_rate": 0.0006582854939155844, + "loss": 7.5459, + "step": 1229100 + }, + { + "epoch": 5.007490663404926, + "grad_norm": 19.81255340576172, + "learning_rate": 0.0006579519453807234, + "loss": 7.5338, + "step": 1229200 + }, + { + "epoch": 5.007898041428308, + "grad_norm": 4.990116596221924, + "learning_rate": 0.0006576184698695503, + "loss": 7.5349, + "step": 1229300 + }, + { + "epoch": 5.008305419451689, + "grad_norm": 9.439986228942871, + "learning_rate": 0.0006572850673952495, + "loss": 7.5722, + "step": 1229400 + }, + { + "epoch": 5.008712797475071, + "grad_norm": 4.5637006759643555, + "learning_rate": 0.0006569517379710025, + "loss": 7.5144, + "step": 1229500 + }, + { + "epoch": 5.0091201754984525, + "grad_norm": 13.385419845581055, + "learning_rate": 0.0006566184816099879, + "loss": 7.5402, + "step": 1229600 + }, + { + "epoch": 5.009527553521834, + "grad_norm": 8.353857040405273, + "learning_rate": 0.0006562852983253799, + "loss": 7.5562, + "step": 1229700 + }, + { + "epoch": 5.0099349315452155, + "grad_norm": 13.3303804397583, + "learning_rate": 0.0006559521881303521, + "loss": 7.5471, + "step": 1229800 + }, + { + "epoch": 5.010342309568597, + "grad_norm": 12.460258483886719, + "learning_rate": 0.0006556191510380728, + "loss": 7.5468, + "step": 1229900 + }, + { + "epoch": 5.010749687591979, + "grad_norm": 4.557527542114258, + "learning_rate": 0.00065528618706171, + "loss": 7.5539, + "step": 1230000 + }, + { + "epoch": 5.010749687591979, + "eval_MaskedAccuracy": 0.5120977751441654, + "eval_loss": 1.6034334897994995, + "eval_runtime": 147.2259, + "eval_samples_per_second": 431.147, + "eval_steps_per_second": 1.684, + "step": 1230000 + }, + { + "epoch": 5.01115706561536, + "grad_norm": 4.789175033569336, + "learning_rate": 0.0006549532962144281, + "loss": 7.5718, + "step": 1230100 + }, + { + "epoch": 5.011564443638742, + "grad_norm": 6.723941802978516, + "learning_rate": 0.0006546204785093867, + "loss": 7.5443, + "step": 1230200 + }, + { + "epoch": 5.011971821662122, + "grad_norm": 16.7134952545166, + "learning_rate": 0.0006542877339597431, + "loss": 7.5383, + "step": 1230300 + }, + { + "epoch": 5.012379199685504, + "grad_norm": 5.229317665100098, + "learning_rate": 0.0006539550625786548, + "loss": 7.5319, + "step": 1230400 + }, + { + "epoch": 5.012786577708885, + "grad_norm": 4.620895862579346, + "learning_rate": 0.0006536224643792727, + "loss": 7.5577, + "step": 1230500 + }, + { + "epoch": 5.013193955732267, + "grad_norm": 15.162534713745117, + "learning_rate": 0.0006532899393747466, + "loss": 7.5539, + "step": 1230600 + }, + { + "epoch": 5.013601333755648, + "grad_norm": 21.066532135009766, + "learning_rate": 0.0006529574875782229, + "loss": 7.5467, + "step": 1230700 + }, + { + "epoch": 5.01400871177903, + "grad_norm": 4.801368236541748, + "learning_rate": 0.0006526251090028454, + "loss": 7.5722, + "step": 1230800 + }, + { + "epoch": 5.0144160898024115, + "grad_norm": 15.411423683166504, + "learning_rate": 0.0006522928036617541, + "loss": 7.5509, + "step": 1230900 + }, + { + "epoch": 5.014823467825793, + "grad_norm": 16.93613624572754, + "learning_rate": 0.0006519605715680867, + "loss": 7.5444, + "step": 1231000 + }, + { + "epoch": 5.014823467825793, + "eval_MaskedAccuracy": 0.5123794414665871, + "eval_loss": 1.5937520265579224, + "eval_runtime": 150.3004, + "eval_samples_per_second": 422.328, + "eval_steps_per_second": 1.65, + "step": 1231000 + }, + { + "epoch": 5.015230845849175, + "grad_norm": 4.480098724365234, + "learning_rate": 0.0006516284127349796, + "loss": 7.5674, + "step": 1231100 + }, + { + "epoch": 5.015638223872556, + "grad_norm": 10.921262741088867, + "learning_rate": 0.000651296327175563, + "loss": 7.5748, + "step": 1231200 + }, + { + "epoch": 5.016045601895938, + "grad_norm": 5.700741291046143, + "learning_rate": 0.0006509643149029676, + "loss": 7.5292, + "step": 1231300 + }, + { + "epoch": 5.016452979919319, + "grad_norm": 12.017867088317871, + "learning_rate": 0.0006506323759303185, + "loss": 7.5319, + "step": 1231400 + }, + { + "epoch": 5.016860357942701, + "grad_norm": 4.3226094245910645, + "learning_rate": 0.0006503005102707396, + "loss": 7.5432, + "step": 1231500 + }, + { + "epoch": 5.017267735966081, + "grad_norm": 4.801712989807129, + "learning_rate": 0.0006499687179373513, + "loss": 7.552, + "step": 1231600 + }, + { + "epoch": 5.017675113989463, + "grad_norm": 11.267165184020996, + "learning_rate": 0.000649636998943271, + "loss": 7.5417, + "step": 1231700 + }, + { + "epoch": 5.018082492012844, + "grad_norm": 5.242594242095947, + "learning_rate": 0.0006493053533016136, + "loss": 7.5636, + "step": 1231800 + }, + { + "epoch": 5.018489870036226, + "grad_norm": 6.6134114265441895, + "learning_rate": 0.0006489737810254903, + "loss": 7.5765, + "step": 1231900 + }, + { + "epoch": 5.0188972480596075, + "grad_norm": 12.343679428100586, + "learning_rate": 0.0006486422821280105, + "loss": 7.54, + "step": 1232000 + }, + { + "epoch": 5.0188972480596075, + "eval_MaskedAccuracy": 0.5116733871540049, + "eval_loss": 1.592746615409851, + "eval_runtime": 149.8209, + "eval_samples_per_second": 423.679, + "eval_steps_per_second": 1.655, + "step": 1232000 + }, + { + "epoch": 5.019304626082989, + "grad_norm": 6.2411675453186035, + "learning_rate": 0.00064831085662228, + "loss": 7.5497, + "step": 1232100 + }, + { + "epoch": 5.0197120041063705, + "grad_norm": 14.235241889953613, + "learning_rate": 0.000647979504521401, + "loss": 7.5724, + "step": 1232200 + }, + { + "epoch": 5.020119382129752, + "grad_norm": 9.591054916381836, + "learning_rate": 0.0006476482258384742, + "loss": 7.5926, + "step": 1232300 + }, + { + "epoch": 5.020526760153134, + "grad_norm": 7.400914192199707, + "learning_rate": 0.0006473170205865959, + "loss": 7.5692, + "step": 1232400 + }, + { + "epoch": 5.020934138176515, + "grad_norm": 7.066140651702881, + "learning_rate": 0.0006469858887788625, + "loss": 7.5784, + "step": 1232500 + }, + { + "epoch": 5.021341516199897, + "grad_norm": 4.17603874206543, + "learning_rate": 0.0006466548304283648, + "loss": 7.57, + "step": 1232600 + }, + { + "epoch": 5.021748894223278, + "grad_norm": 4.827794075012207, + "learning_rate": 0.0006463238455481907, + "loss": 7.5599, + "step": 1232700 + }, + { + "epoch": 5.022156272246659, + "grad_norm": 3.1116693019866943, + "learning_rate": 0.0006459929341514248, + "loss": 7.5577, + "step": 1232800 + }, + { + "epoch": 5.02256365027004, + "grad_norm": 6.376616477966309, + "learning_rate": 0.0006456620962511512, + "loss": 7.5453, + "step": 1232900 + }, + { + "epoch": 5.022971028293422, + "grad_norm": 5.128245830535889, + "learning_rate": 0.00064533133186045, + "loss": 7.5689, + "step": 1233000 + }, + { + "epoch": 5.022971028293422, + "eval_MaskedAccuracy": 0.5119402879554316, + "eval_loss": 1.5978705883026123, + "eval_runtime": 151.5867, + "eval_samples_per_second": 418.744, + "eval_steps_per_second": 1.636, + "step": 1233000 + }, + { + "epoch": 5.023378406316803, + "grad_norm": 4.2856950759887695, + "learning_rate": 0.000645000640992397, + "loss": 7.5258, + "step": 1233100 + }, + { + "epoch": 5.023785784340185, + "grad_norm": 6.219287395477295, + "learning_rate": 0.000644670023660067, + "loss": 7.5731, + "step": 1233200 + }, + { + "epoch": 5.0241931623635665, + "grad_norm": 11.586651802062988, + "learning_rate": 0.00064433947987653, + "loss": 7.5728, + "step": 1233300 + }, + { + "epoch": 5.024600540386948, + "grad_norm": 15.761351585388184, + "learning_rate": 0.0006440090096548546, + "loss": 7.5649, + "step": 1233400 + }, + { + "epoch": 5.02500791841033, + "grad_norm": 10.314072608947754, + "learning_rate": 0.0006436786130081072, + "loss": 7.5498, + "step": 1233500 + }, + { + "epoch": 5.025415296433711, + "grad_norm": 11.7467041015625, + "learning_rate": 0.0006433482899493493, + "loss": 7.5529, + "step": 1233600 + }, + { + "epoch": 5.025822674457093, + "grad_norm": 9.369192123413086, + "learning_rate": 0.00064301804049164, + "loss": 7.5365, + "step": 1233700 + }, + { + "epoch": 5.026230052480474, + "grad_norm": 6.06901741027832, + "learning_rate": 0.0006426878646480359, + "loss": 7.5585, + "step": 1233800 + }, + { + "epoch": 5.026637430503856, + "grad_norm": 17.563222885131836, + "learning_rate": 0.0006423577624315919, + "loss": 7.5502, + "step": 1233900 + }, + { + "epoch": 5.027044808527236, + "grad_norm": 5.640834331512451, + "learning_rate": 0.0006420277338553568, + "loss": 7.5621, + "step": 1234000 + }, + { + "epoch": 5.027044808527236, + "eval_MaskedAccuracy": 0.5119214262047922, + "eval_loss": 1.594098448753357, + "eval_runtime": 171.365, + "eval_samples_per_second": 370.414, + "eval_steps_per_second": 1.447, + "step": 1234000 + }, + { + "epoch": 5.027452186550618, + "grad_norm": 4.507350444793701, + "learning_rate": 0.0006416977789323789, + "loss": 7.5617, + "step": 1234100 + }, + { + "epoch": 5.027859564573999, + "grad_norm": 7.678382873535156, + "learning_rate": 0.0006413678976757045, + "loss": 7.5864, + "step": 1234200 + }, + { + "epoch": 5.028266942597381, + "grad_norm": 11.929092407226562, + "learning_rate": 0.0006410380900983741, + "loss": 7.5457, + "step": 1234300 + }, + { + "epoch": 5.0286743206207625, + "grad_norm": 4.17978572845459, + "learning_rate": 0.0006407083562134268, + "loss": 7.5625, + "step": 1234400 + }, + { + "epoch": 5.029081698644144, + "grad_norm": 10.068114280700684, + "learning_rate": 0.0006403786960338991, + "loss": 7.5691, + "step": 1234500 + }, + { + "epoch": 5.0294890766675255, + "grad_norm": 13.511075019836426, + "learning_rate": 0.0006400491095728252, + "loss": 7.5597, + "step": 1234600 + }, + { + "epoch": 5.029896454690907, + "grad_norm": 11.481148719787598, + "learning_rate": 0.0006397195968432339, + "loss": 7.5419, + "step": 1234700 + }, + { + "epoch": 5.030303832714289, + "grad_norm": 6.723931789398193, + "learning_rate": 0.0006393901578581525, + "loss": 7.5497, + "step": 1234800 + }, + { + "epoch": 5.03071121073767, + "grad_norm": 7.834571838378906, + "learning_rate": 0.0006390607926306063, + "loss": 7.5659, + "step": 1234900 + }, + { + "epoch": 5.031118588761052, + "grad_norm": 8.159076690673828, + "learning_rate": 0.000638731501173617, + "loss": 7.541, + "step": 1235000 + }, + { + "epoch": 5.031118588761052, + "eval_MaskedAccuracy": 0.5121700753974395, + "eval_loss": 1.5960626602172852, + "eval_runtime": 154.5013, + "eval_samples_per_second": 410.844, + "eval_steps_per_second": 1.605, + "step": 1235000 + }, + { + "epoch": 5.031525966784433, + "grad_norm": 6.948030948638916, + "learning_rate": 0.0006384022835002031, + "loss": 7.5352, + "step": 1235100 + }, + { + "epoch": 5.031933344807815, + "grad_norm": 2.986198663711548, + "learning_rate": 0.0006380731396233794, + "loss": 7.5597, + "step": 1235200 + }, + { + "epoch": 5.032340722831195, + "grad_norm": 3.1850740909576416, + "learning_rate": 0.0006377440695561599, + "loss": 7.5361, + "step": 1235300 + }, + { + "epoch": 5.032748100854577, + "grad_norm": 12.844918251037598, + "learning_rate": 0.0006374150733115545, + "loss": 7.56, + "step": 1235400 + }, + { + "epoch": 5.0331554788779584, + "grad_norm": 5.665256977081299, + "learning_rate": 0.0006370861509025694, + "loss": 7.5411, + "step": 1235500 + }, + { + "epoch": 5.03356285690134, + "grad_norm": 2.9927115440368652, + "learning_rate": 0.0006367573023422098, + "loss": 7.5629, + "step": 1235600 + }, + { + "epoch": 5.0339702349247215, + "grad_norm": 6.249614715576172, + "learning_rate": 0.0006364285276434758, + "loss": 7.5761, + "step": 1235700 + }, + { + "epoch": 5.034377612948103, + "grad_norm": 13.659110069274902, + "learning_rate": 0.0006360998268193654, + "loss": 7.5576, + "step": 1235800 + }, + { + "epoch": 5.034784990971485, + "grad_norm": 4.137001991271973, + "learning_rate": 0.0006357711998828752, + "loss": 7.5726, + "step": 1235900 + }, + { + "epoch": 5.035192368994866, + "grad_norm": 4.4844770431518555, + "learning_rate": 0.0006354426468469963, + "loss": 7.5949, + "step": 1236000 + }, + { + "epoch": 5.035192368994866, + "eval_MaskedAccuracy": 0.5122887211273998, + "eval_loss": 1.5912740230560303, + "eval_runtime": 150.1366, + "eval_samples_per_second": 422.788, + "eval_steps_per_second": 1.652, + "step": 1236000 + }, + { + "epoch": 5.035599747018248, + "grad_norm": 7.55839729309082, + "learning_rate": 0.0006351141677247188, + "loss": 7.5687, + "step": 1236100 + }, + { + "epoch": 5.036007125041629, + "grad_norm": 4.499295234680176, + "learning_rate": 0.0006347857625290293, + "loss": 7.5633, + "step": 1236200 + }, + { + "epoch": 5.036414503065011, + "grad_norm": 13.397000312805176, + "learning_rate": 0.0006344574312729102, + "loss": 7.5499, + "step": 1236300 + }, + { + "epoch": 5.036821881088392, + "grad_norm": 5.78963565826416, + "learning_rate": 0.0006341291739693437, + "loss": 7.5459, + "step": 1236400 + }, + { + "epoch": 5.037229259111774, + "grad_norm": 11.74659252166748, + "learning_rate": 0.000633800990631307, + "loss": 7.5468, + "step": 1236500 + }, + { + "epoch": 5.037636637135154, + "grad_norm": 23.07303810119629, + "learning_rate": 0.0006334728812717758, + "loss": 7.5352, + "step": 1236600 + }, + { + "epoch": 5.038044015158536, + "grad_norm": 8.282074928283691, + "learning_rate": 0.000633144845903721, + "loss": 7.533, + "step": 1236700 + }, + { + "epoch": 5.0384513931819175, + "grad_norm": 11.630181312561035, + "learning_rate": 0.0006328168845401113, + "loss": 7.5395, + "step": 1236800 + }, + { + "epoch": 5.038858771205299, + "grad_norm": 11.355972290039062, + "learning_rate": 0.0006324889971939131, + "loss": 7.5454, + "step": 1236900 + }, + { + "epoch": 5.0392661492286805, + "grad_norm": 3.331683397293091, + "learning_rate": 0.0006321611838780903, + "loss": 7.5635, + "step": 1237000 + }, + { + "epoch": 5.0392661492286805, + "eval_MaskedAccuracy": 0.5126326265318845, + "eval_loss": 1.5927146673202515, + "eval_runtime": 160.3941, + "eval_samples_per_second": 395.75, + "eval_steps_per_second": 1.546, + "step": 1237000 + }, + { + "epoch": 5.039673527252062, + "grad_norm": 9.400132179260254, + "learning_rate": 0.0006318334446056021, + "loss": 7.5406, + "step": 1237100 + }, + { + "epoch": 5.040080905275444, + "grad_norm": 12.600625991821289, + "learning_rate": 0.0006315057793894067, + "loss": 7.5457, + "step": 1237200 + }, + { + "epoch": 5.040488283298825, + "grad_norm": 6.932764053344727, + "learning_rate": 0.0006311781882424574, + "loss": 7.5593, + "step": 1237300 + }, + { + "epoch": 5.040895661322207, + "grad_norm": 6.773071765899658, + "learning_rate": 0.0006308506711777069, + "loss": 7.5205, + "step": 1237400 + }, + { + "epoch": 5.041303039345588, + "grad_norm": 13.089978218078613, + "learning_rate": 0.0006305232282081035, + "loss": 7.5466, + "step": 1237500 + }, + { + "epoch": 5.04171041736897, + "grad_norm": 5.462027549743652, + "learning_rate": 0.0006301958593465915, + "loss": 7.5614, + "step": 1237600 + }, + { + "epoch": 5.042117795392351, + "grad_norm": 11.837106704711914, + "learning_rate": 0.0006298685646061147, + "loss": 7.5382, + "step": 1237700 + }, + { + "epoch": 5.042525173415732, + "grad_norm": 9.655179023742676, + "learning_rate": 0.0006295413439996122, + "loss": 7.552, + "step": 1237800 + }, + { + "epoch": 5.0429325514391135, + "grad_norm": 5.208213806152344, + "learning_rate": 0.0006292141975400214, + "loss": 7.5591, + "step": 1237900 + }, + { + "epoch": 5.043339929462495, + "grad_norm": 6.39457893371582, + "learning_rate": 0.0006288871252402751, + "loss": 7.5299, + "step": 1238000 + }, + { + "epoch": 5.043339929462495, + "eval_MaskedAccuracy": 0.5122286821516949, + "eval_loss": 1.5957825183868408, + "eval_runtime": 180.9774, + "eval_samples_per_second": 350.74, + "eval_steps_per_second": 1.37, + "step": 1238000 + }, + { + "epoch": 5.0437473074858765, + "grad_norm": 3.443537473678589, + "learning_rate": 0.000628560127113305, + "loss": 7.562, + "step": 1238100 + }, + { + "epoch": 5.044154685509258, + "grad_norm": 3.330414056777954, + "learning_rate": 0.0006282332031720396, + "loss": 7.5548, + "step": 1238200 + }, + { + "epoch": 5.04456206353264, + "grad_norm": 4.9159955978393555, + "learning_rate": 0.0006279063534294034, + "loss": 7.5721, + "step": 1238300 + }, + { + "epoch": 5.044969441556021, + "grad_norm": 3.3496997356414795, + "learning_rate": 0.0006275795778983178, + "loss": 7.5472, + "step": 1238400 + }, + { + "epoch": 5.045376819579403, + "grad_norm": 2.7423839569091797, + "learning_rate": 0.0006272528765917022, + "loss": 7.5708, + "step": 1238500 + }, + { + "epoch": 5.045784197602784, + "grad_norm": 8.792877197265625, + "learning_rate": 0.000626926249522474, + "loss": 7.5471, + "step": 1238600 + }, + { + "epoch": 5.046191575626166, + "grad_norm": 12.140045166015625, + "learning_rate": 0.0006265996967035461, + "loss": 7.5585, + "step": 1238700 + }, + { + "epoch": 5.046598953649547, + "grad_norm": 7.998610496520996, + "learning_rate": 0.0006262732181478278, + "loss": 7.5587, + "step": 1238800 + }, + { + "epoch": 5.047006331672929, + "grad_norm": 5.750330924987793, + "learning_rate": 0.000625946813868228, + "loss": 7.5774, + "step": 1238900 + }, + { + "epoch": 5.047413709696309, + "grad_norm": 8.81126880645752, + "learning_rate": 0.0006256204838776503, + "loss": 7.572, + "step": 1239000 + }, + { + "epoch": 5.047413709696309, + "eval_MaskedAccuracy": 0.5119281307959642, + "eval_loss": 1.5961500406265259, + "eval_runtime": 155.0172, + "eval_samples_per_second": 409.477, + "eval_steps_per_second": 1.6, + "step": 1239000 + }, + { + "epoch": 5.047821087719691, + "grad_norm": 11.096508979797363, + "learning_rate": 0.0006252942281889956, + "loss": 7.5435, + "step": 1239100 + }, + { + "epoch": 5.0482284657430725, + "grad_norm": 11.315113067626953, + "learning_rate": 0.0006249680468151646, + "loss": 7.5503, + "step": 1239200 + }, + { + "epoch": 5.048635843766454, + "grad_norm": 9.148561477661133, + "learning_rate": 0.0006246419397690517, + "loss": 7.5525, + "step": 1239300 + }, + { + "epoch": 5.049043221789836, + "grad_norm": 5.415726661682129, + "learning_rate": 0.0006243159070635498, + "loss": 7.5513, + "step": 1239400 + }, + { + "epoch": 5.049450599813217, + "grad_norm": 6.175321102142334, + "learning_rate": 0.0006239899487115492, + "loss": 7.5694, + "step": 1239500 + }, + { + "epoch": 5.049857977836599, + "grad_norm": 3.0635392665863037, + "learning_rate": 0.0006236640647259356, + "loss": 7.5822, + "step": 1239600 + }, + { + "epoch": 5.05026535585998, + "grad_norm": 6.307855129241943, + "learning_rate": 0.0006233382551195924, + "loss": 7.5385, + "step": 1239700 + }, + { + "epoch": 5.050672733883362, + "grad_norm": 9.618474960327148, + "learning_rate": 0.0006230125199054031, + "loss": 7.5466, + "step": 1239800 + }, + { + "epoch": 5.051080111906743, + "grad_norm": 4.653321266174316, + "learning_rate": 0.0006226868590962443, + "loss": 7.5483, + "step": 1239900 + }, + { + "epoch": 5.051487489930125, + "grad_norm": 10.856022834777832, + "learning_rate": 0.0006223612727049914, + "loss": 7.5544, + "step": 1240000 + }, + { + "epoch": 5.051487489930125, + "eval_MaskedAccuracy": 0.5123791608637853, + "eval_loss": 1.594712734222412, + "eval_runtime": 174.7706, + "eval_samples_per_second": 363.196, + "eval_steps_per_second": 1.419, + "step": 1240000 + }, + { + "epoch": 5.051894867953506, + "grad_norm": 6.180064678192139, + "learning_rate": 0.0006220357607445162, + "loss": 7.5663, + "step": 1240100 + }, + { + "epoch": 5.052302245976888, + "grad_norm": 21.003381729125977, + "learning_rate": 0.0006217103232276886, + "loss": 7.5411, + "step": 1240200 + }, + { + "epoch": 5.0527096240002685, + "grad_norm": 9.569164276123047, + "learning_rate": 0.0006213849601673735, + "loss": 7.5279, + "step": 1240300 + }, + { + "epoch": 5.05311700202365, + "grad_norm": 4.3968095779418945, + "learning_rate": 0.0006210596715764357, + "loss": 7.5777, + "step": 1240400 + }, + { + "epoch": 5.0535243800470315, + "grad_norm": 17.27248191833496, + "learning_rate": 0.0006207344574677353, + "loss": 7.544, + "step": 1240500 + }, + { + "epoch": 5.053931758070413, + "grad_norm": 3.057788848876953, + "learning_rate": 0.0006204093178541299, + "loss": 7.5487, + "step": 1240600 + }, + { + "epoch": 5.054339136093795, + "grad_norm": 17.172550201416016, + "learning_rate": 0.0006200842527484734, + "loss": 7.5459, + "step": 1240700 + }, + { + "epoch": 5.054746514117176, + "grad_norm": 16.662858963012695, + "learning_rate": 0.0006197592621636174, + "loss": 7.5822, + "step": 1240800 + }, + { + "epoch": 5.055153892140558, + "grad_norm": 12.877495765686035, + "learning_rate": 0.0006194343461124117, + "loss": 7.5713, + "step": 1240900 + }, + { + "epoch": 5.055561270163939, + "grad_norm": 4.151196002960205, + "learning_rate": 0.0006191095046076997, + "loss": 7.5532, + "step": 1241000 + }, + { + "epoch": 5.055561270163939, + "eval_MaskedAccuracy": 0.512082622485966, + "eval_loss": 1.5900789499282837, + "eval_runtime": 195.1468, + "eval_samples_per_second": 325.273, + "eval_steps_per_second": 1.271, + "step": 1241000 + }, + { + "epoch": 5.055968648187321, + "grad_norm": 6.3274760246276855, + "learning_rate": 0.0006187847376623259, + "loss": 7.5781, + "step": 1241100 + }, + { + "epoch": 5.056376026210702, + "grad_norm": 4.799252033233643, + "learning_rate": 0.0006184600452891298, + "loss": 7.5625, + "step": 1241200 + }, + { + "epoch": 5.056783404234084, + "grad_norm": 24.22987174987793, + "learning_rate": 0.0006181354275009471, + "loss": 7.5594, + "step": 1241300 + }, + { + "epoch": 5.057190782257465, + "grad_norm": 10.769682884216309, + "learning_rate": 0.0006178108843106137, + "loss": 7.5522, + "step": 1241400 + }, + { + "epoch": 5.057598160280847, + "grad_norm": 12.865377426147461, + "learning_rate": 0.0006174864157309583, + "loss": 7.5753, + "step": 1241500 + }, + { + "epoch": 5.0580055383042275, + "grad_norm": 8.817621231079102, + "learning_rate": 0.0006171620217748107, + "loss": 7.5563, + "step": 1241600 + }, + { + "epoch": 5.058412916327609, + "grad_norm": 12.362869262695312, + "learning_rate": 0.0006168377024549951, + "loss": 7.5794, + "step": 1241700 + }, + { + "epoch": 5.058820294350991, + "grad_norm": 8.096556663513184, + "learning_rate": 0.0006165134577843348, + "loss": 7.5728, + "step": 1241800 + }, + { + "epoch": 5.059227672374372, + "grad_norm": 9.449455261230469, + "learning_rate": 0.0006161892877756477, + "loss": 7.5657, + "step": 1241900 + }, + { + "epoch": 5.059635050397754, + "grad_norm": 10.60107707977295, + "learning_rate": 0.0006158651924417497, + "loss": 7.5518, + "step": 1242000 + }, + { + "epoch": 5.059635050397754, + "eval_MaskedAccuracy": 0.5116715651411367, + "eval_loss": 1.5865345001220703, + "eval_runtime": 156.4843, + "eval_samples_per_second": 405.638, + "eval_steps_per_second": 1.585, + "step": 1242000 + }, + { + "epoch": 5.060042428421135, + "grad_norm": 17.698286056518555, + "learning_rate": 0.000615541171795455, + "loss": 7.5416, + "step": 1242100 + }, + { + "epoch": 5.060449806444517, + "grad_norm": 5.095458984375, + "learning_rate": 0.0006152172258495724, + "loss": 7.5648, + "step": 1242200 + }, + { + "epoch": 5.060857184467898, + "grad_norm": 3.6847846508026123, + "learning_rate": 0.0006148933546169099, + "loss": 7.5441, + "step": 1242300 + }, + { + "epoch": 5.06126456249128, + "grad_norm": 13.084811210632324, + "learning_rate": 0.0006145695581102728, + "loss": 7.5808, + "step": 1242400 + }, + { + "epoch": 5.061671940514661, + "grad_norm": 4.228727340698242, + "learning_rate": 0.0006142458363424609, + "loss": 7.5572, + "step": 1242500 + }, + { + "epoch": 5.062079318538043, + "grad_norm": 16.473806381225586, + "learning_rate": 0.000613922189326274, + "loss": 7.5709, + "step": 1242600 + }, + { + "epoch": 5.062486696561424, + "grad_norm": 13.398612022399902, + "learning_rate": 0.000613598617074508, + "loss": 7.5613, + "step": 1242700 + }, + { + "epoch": 5.062894074584805, + "grad_norm": 5.265367031097412, + "learning_rate": 0.0006132751195999541, + "loss": 7.5626, + "step": 1242800 + }, + { + "epoch": 5.0633014526081865, + "grad_norm": 6.05662727355957, + "learning_rate": 0.0006129516969154025, + "loss": 7.5746, + "step": 1242900 + }, + { + "epoch": 5.063708830631568, + "grad_norm": 9.01705551147461, + "learning_rate": 0.0006126283490336391, + "loss": 7.5784, + "step": 1243000 + }, + { + "epoch": 5.063708830631568, + "eval_MaskedAccuracy": 0.5116898757757985, + "eval_loss": 1.5971190929412842, + "eval_runtime": 153.5325, + "eval_samples_per_second": 413.437, + "eval_steps_per_second": 1.615, + "step": 1243000 + }, + { + "epoch": 5.06411620865495, + "grad_norm": 8.705946922302246, + "learning_rate": 0.0006123050759674489, + "loss": 7.5422, + "step": 1243100 + }, + { + "epoch": 5.064523586678331, + "grad_norm": 5.019017696380615, + "learning_rate": 0.000611981877729611, + "loss": 7.5754, + "step": 1243200 + }, + { + "epoch": 5.064930964701713, + "grad_norm": 9.691917419433594, + "learning_rate": 0.0006116587543329046, + "loss": 7.58, + "step": 1243300 + }, + { + "epoch": 5.065338342725094, + "grad_norm": 4.409297466278076, + "learning_rate": 0.0006113357057901047, + "loss": 7.5852, + "step": 1243400 + }, + { + "epoch": 5.065745720748476, + "grad_norm": 5.675930976867676, + "learning_rate": 0.0006110127321139818, + "loss": 7.5649, + "step": 1243500 + }, + { + "epoch": 5.066153098771857, + "grad_norm": 4.158321857452393, + "learning_rate": 0.0006106898333173048, + "loss": 7.5891, + "step": 1243600 + }, + { + "epoch": 5.066560476795239, + "grad_norm": 11.193013191223145, + "learning_rate": 0.0006103670094128405, + "loss": 7.545, + "step": 1243700 + }, + { + "epoch": 5.06696785481862, + "grad_norm": 13.805326461791992, + "learning_rate": 0.0006100442604133508, + "loss": 7.557, + "step": 1243800 + }, + { + "epoch": 5.067375232842002, + "grad_norm": 14.540875434875488, + "learning_rate": 0.0006097215863315973, + "loss": 7.5654, + "step": 1243900 + }, + { + "epoch": 5.0677826108653825, + "grad_norm": 11.002042770385742, + "learning_rate": 0.000609398987180336, + "loss": 7.5549, + "step": 1244000 + }, + { + "epoch": 5.0677826108653825, + "eval_MaskedAccuracy": 0.5124008095192564, + "eval_loss": 1.582103967666626, + "eval_runtime": 160.788, + "eval_samples_per_second": 394.781, + "eval_steps_per_second": 1.542, + "step": 1244000 + }, + { + "epoch": 5.068189988888764, + "grad_norm": 23.186279296875, + "learning_rate": 0.0006090764629723212, + "loss": 7.5456, + "step": 1244100 + }, + { + "epoch": 5.068597366912146, + "grad_norm": 3.9477131366729736, + "learning_rate": 0.0006087540137203041, + "loss": 7.5322, + "step": 1244200 + }, + { + "epoch": 5.069004744935527, + "grad_norm": 5.8432135581970215, + "learning_rate": 0.0006084316394370321, + "loss": 7.5411, + "step": 1244300 + }, + { + "epoch": 5.069412122958909, + "grad_norm": 8.310843467712402, + "learning_rate": 0.000608109340135251, + "loss": 7.5811, + "step": 1244400 + }, + { + "epoch": 5.06981950098229, + "grad_norm": 4.559779167175293, + "learning_rate": 0.0006077871158277029, + "loss": 7.5606, + "step": 1244500 + }, + { + "epoch": 5.070226879005672, + "grad_norm": 5.4670796394348145, + "learning_rate": 0.0006074649665271281, + "loss": 7.5285, + "step": 1244600 + }, + { + "epoch": 5.070634257029053, + "grad_norm": 4.705342769622803, + "learning_rate": 0.0006071428922462611, + "loss": 7.5644, + "step": 1244700 + }, + { + "epoch": 5.071041635052435, + "grad_norm": 9.920463562011719, + "learning_rate": 0.000606820892997836, + "loss": 7.5439, + "step": 1244800 + }, + { + "epoch": 5.071449013075816, + "grad_norm": 7.15165376663208, + "learning_rate": 0.0006064989687945835, + "loss": 7.5577, + "step": 1244900 + }, + { + "epoch": 5.071856391099198, + "grad_norm": 5.455395698547363, + "learning_rate": 0.0006061771196492301, + "loss": 7.5656, + "step": 1245000 + }, + { + "epoch": 5.071856391099198, + "eval_MaskedAccuracy": 0.5121866050921413, + "eval_loss": 1.5907988548278809, + "eval_runtime": 166.2802, + "eval_samples_per_second": 381.741, + "eval_steps_per_second": 1.491, + "step": 1245000 + }, + { + "epoch": 5.072263769122579, + "grad_norm": 13.10548210144043, + "learning_rate": 0.0006058553455745019, + "loss": 7.5728, + "step": 1245100 + }, + { + "epoch": 5.072671147145961, + "grad_norm": 3.50596022605896, + "learning_rate": 0.0006055336465831184, + "loss": 7.5844, + "step": 1245200 + }, + { + "epoch": 5.0730785251693415, + "grad_norm": 11.356693267822266, + "learning_rate": 0.0006052120226877984, + "loss": 7.6003, + "step": 1245300 + }, + { + "epoch": 5.073485903192723, + "grad_norm": 5.499568939208984, + "learning_rate": 0.0006048904739012585, + "loss": 7.5822, + "step": 1245400 + }, + { + "epoch": 5.073893281216105, + "grad_norm": 12.636957168579102, + "learning_rate": 0.0006045690002362105, + "loss": 7.5753, + "step": 1245500 + }, + { + "epoch": 5.074300659239486, + "grad_norm": 4.892049789428711, + "learning_rate": 0.0006042476017053644, + "loss": 7.5464, + "step": 1245600 + }, + { + "epoch": 5.074708037262868, + "grad_norm": 13.68791675567627, + "learning_rate": 0.0006039262783214258, + "loss": 7.5666, + "step": 1245700 + }, + { + "epoch": 5.075115415286249, + "grad_norm": 17.178817749023438, + "learning_rate": 0.0006036050300971004, + "loss": 7.5651, + "step": 1245800 + }, + { + "epoch": 5.075522793309631, + "grad_norm": 8.208373069763184, + "learning_rate": 0.0006032838570450868, + "loss": 7.5787, + "step": 1245900 + }, + { + "epoch": 5.075930171333012, + "grad_norm": 9.169124603271484, + "learning_rate": 0.000602962759178084, + "loss": 7.5959, + "step": 1246000 + }, + { + "epoch": 5.075930171333012, + "eval_MaskedAccuracy": 0.5118789037473761, + "eval_loss": 1.5950679779052734, + "eval_runtime": 155.3492, + "eval_samples_per_second": 408.602, + "eval_steps_per_second": 1.596, + "step": 1246000 + }, + { + "epoch": 5.076337549356394, + "grad_norm": 9.100870132446289, + "learning_rate": 0.000602641736508786, + "loss": 7.5746, + "step": 1246100 + }, + { + "epoch": 5.076744927379775, + "grad_norm": 12.614434242248535, + "learning_rate": 0.000602320789049884, + "loss": 7.5438, + "step": 1246200 + }, + { + "epoch": 5.077152305403157, + "grad_norm": 10.327921867370605, + "learning_rate": 0.0006019999168140688, + "loss": 7.5708, + "step": 1246300 + }, + { + "epoch": 5.077559683426538, + "grad_norm": 11.155251502990723, + "learning_rate": 0.0006016791198140243, + "loss": 7.5773, + "step": 1246400 + }, + { + "epoch": 5.07796706144992, + "grad_norm": 17.12721824645996, + "learning_rate": 0.0006013583980624337, + "loss": 7.5309, + "step": 1246500 + }, + { + "epoch": 5.078374439473301, + "grad_norm": 13.454442024230957, + "learning_rate": 0.0006010377515719766, + "loss": 7.5724, + "step": 1246600 + }, + { + "epoch": 5.078781817496682, + "grad_norm": 5.540965557098389, + "learning_rate": 0.0006007171803553305, + "loss": 7.5745, + "step": 1246700 + }, + { + "epoch": 5.079189195520064, + "grad_norm": 3.1566126346588135, + "learning_rate": 0.000600396684425169, + "loss": 7.5758, + "step": 1246800 + }, + { + "epoch": 5.079596573543445, + "grad_norm": 4.9269256591796875, + "learning_rate": 0.000600076263794163, + "loss": 7.5842, + "step": 1246900 + }, + { + "epoch": 5.080003951566827, + "grad_norm": 4.288941383361816, + "learning_rate": 0.0005997559184749813, + "loss": 7.5766, + "step": 1247000 + }, + { + "epoch": 5.080003951566827, + "eval_MaskedAccuracy": 0.511492453496384, + "eval_loss": 1.5896881818771362, + "eval_runtime": 178.6538, + "eval_samples_per_second": 355.302, + "eval_steps_per_second": 1.388, + "step": 1247000 + }, + { + "epoch": 5.080411329590208, + "grad_norm": 4.120076656341553, + "learning_rate": 0.0005994356484802875, + "loss": 7.5974, + "step": 1247100 + }, + { + "epoch": 5.08081870761359, + "grad_norm": 4.301754474639893, + "learning_rate": 0.0005991154538227442, + "loss": 7.5655, + "step": 1247200 + }, + { + "epoch": 5.081226085636971, + "grad_norm": 5.2326340675354, + "learning_rate": 0.000598795334515011, + "loss": 7.5692, + "step": 1247300 + }, + { + "epoch": 5.081633463660353, + "grad_norm": 4.03275728225708, + "learning_rate": 0.0005984752905697425, + "loss": 7.5522, + "step": 1247400 + }, + { + "epoch": 5.082040841683734, + "grad_norm": 3.790022850036621, + "learning_rate": 0.0005981553219995932, + "loss": 7.5502, + "step": 1247500 + }, + { + "epoch": 5.082448219707116, + "grad_norm": 10.635153770446777, + "learning_rate": 0.0005978354288172125, + "loss": 7.555, + "step": 1247600 + }, + { + "epoch": 5.082855597730497, + "grad_norm": 20.811840057373047, + "learning_rate": 0.0005975156110352463, + "loss": 7.5726, + "step": 1247700 + }, + { + "epoch": 5.083262975753878, + "grad_norm": 13.617968559265137, + "learning_rate": 0.0005971958686663415, + "loss": 7.5655, + "step": 1247800 + }, + { + "epoch": 5.08367035377726, + "grad_norm": 13.507347106933594, + "learning_rate": 0.0005968762017231368, + "loss": 7.5567, + "step": 1247900 + }, + { + "epoch": 5.084077731800641, + "grad_norm": 10.461236953735352, + "learning_rate": 0.0005965566102182718, + "loss": 7.5358, + "step": 1248000 + }, + { + "epoch": 5.084077731800641, + "eval_MaskedAccuracy": 0.5126387374378459, + "eval_loss": 1.5889209508895874, + "eval_runtime": 167.4296, + "eval_samples_per_second": 379.121, + "eval_steps_per_second": 1.481, + "step": 1248000 + }, + { + "epoch": 5.084485109824023, + "grad_norm": 4.239201068878174, + "learning_rate": 0.0005962370941643803, + "loss": 7.5566, + "step": 1248100 + }, + { + "epoch": 5.084892487847404, + "grad_norm": 3.749708890914917, + "learning_rate": 0.0005959176535740955, + "loss": 7.5685, + "step": 1248200 + }, + { + "epoch": 5.085299865870786, + "grad_norm": 5.5102009773254395, + "learning_rate": 0.000595598288460047, + "loss": 7.5543, + "step": 1248300 + }, + { + "epoch": 5.085707243894167, + "grad_norm": 4.811374664306641, + "learning_rate": 0.0005952789988348602, + "loss": 7.5522, + "step": 1248400 + }, + { + "epoch": 5.086114621917549, + "grad_norm": 4.167360305786133, + "learning_rate": 0.0005949597847111571, + "loss": 7.5536, + "step": 1248500 + }, + { + "epoch": 5.08652199994093, + "grad_norm": 14.653417587280273, + "learning_rate": 0.0005946406461015605, + "loss": 7.5484, + "step": 1248600 + }, + { + "epoch": 5.086929377964312, + "grad_norm": 11.3723783493042, + "learning_rate": 0.0005943215830186857, + "loss": 7.5298, + "step": 1248700 + }, + { + "epoch": 5.087336755987693, + "grad_norm": 5.123376369476318, + "learning_rate": 0.0005940025954751483, + "loss": 7.5693, + "step": 1248800 + }, + { + "epoch": 5.087744134011075, + "grad_norm": 7.393428802490234, + "learning_rate": 0.0005936836834835579, + "loss": 7.5546, + "step": 1248900 + }, + { + "epoch": 5.088151512034456, + "grad_norm": 12.377934455871582, + "learning_rate": 0.0005933648470565252, + "loss": 7.5704, + "step": 1249000 + }, + { + "epoch": 5.088151512034456, + "eval_MaskedAccuracy": 0.5124602522649733, + "eval_loss": 1.5873277187347412, + "eval_runtime": 161.0087, + "eval_samples_per_second": 394.24, + "eval_steps_per_second": 1.54, + "step": 1249000 + }, + { + "epoch": 5.088558890057837, + "grad_norm": 5.779115676879883, + "learning_rate": 0.0005930460862066542, + "loss": 7.5507, + "step": 1249100 + }, + { + "epoch": 5.088966268081219, + "grad_norm": 3.380244255065918, + "learning_rate": 0.0005927274009465464, + "loss": 7.555, + "step": 1249200 + }, + { + "epoch": 5.0893736461046, + "grad_norm": 9.080333709716797, + "learning_rate": 0.0005924087912888035, + "loss": 7.5308, + "step": 1249300 + }, + { + "epoch": 5.089781024127982, + "grad_norm": 10.474891662597656, + "learning_rate": 0.0005920902572460193, + "loss": 7.5426, + "step": 1249400 + }, + { + "epoch": 5.090188402151363, + "grad_norm": 14.309752464294434, + "learning_rate": 0.000591771798830788, + "loss": 7.5422, + "step": 1249500 + }, + { + "epoch": 5.090595780174745, + "grad_norm": 4.935224533081055, + "learning_rate": 0.000591453416055701, + "loss": 7.579, + "step": 1249600 + }, + { + "epoch": 5.091003158198126, + "grad_norm": 7.429482936859131, + "learning_rate": 0.0005911351089333443, + "loss": 7.5559, + "step": 1249700 + }, + { + "epoch": 5.091410536221508, + "grad_norm": 24.867961883544922, + "learning_rate": 0.0005908168774763031, + "loss": 7.5151, + "step": 1249800 + }, + { + "epoch": 5.091817914244889, + "grad_norm": 10.574111938476562, + "learning_rate": 0.0005904987216971586, + "loss": 7.5641, + "step": 1249900 + }, + { + "epoch": 5.092225292268271, + "grad_norm": 3.2641782760620117, + "learning_rate": 0.0005901806416084893, + "loss": 7.5581, + "step": 1250000 + }, + { + "epoch": 5.092225292268271, + "eval_MaskedAccuracy": 0.5119926108642491, + "eval_loss": 1.5946589708328247, + "eval_runtime": 156.1855, + "eval_samples_per_second": 406.414, + "eval_steps_per_second": 1.588, + "step": 1250000 + }, + { + "epoch": 5.0926326702916525, + "grad_norm": 7.2194671630859375, + "learning_rate": 0.0005898626372228697, + "loss": 7.5324, + "step": 1250100 + }, + { + "epoch": 5.093040048315034, + "grad_norm": 3.0554192066192627, + "learning_rate": 0.000589544708552873, + "loss": 7.5478, + "step": 1250200 + }, + { + "epoch": 5.093447426338415, + "grad_norm": 8.473506927490234, + "learning_rate": 0.0005892268556110687, + "loss": 7.5416, + "step": 1250300 + }, + { + "epoch": 5.093854804361796, + "grad_norm": 3.3607919216156006, + "learning_rate": 0.0005889090784100231, + "loss": 7.5222, + "step": 1250400 + }, + { + "epoch": 5.094262182385178, + "grad_norm": 6.100803852081299, + "learning_rate": 0.0005885913769622999, + "loss": 7.5557, + "step": 1250500 + }, + { + "epoch": 5.094669560408559, + "grad_norm": 15.029216766357422, + "learning_rate": 0.0005882737512804591, + "loss": 7.5462, + "step": 1250600 + }, + { + "epoch": 5.095076938431941, + "grad_norm": 3.2186503410339355, + "learning_rate": 0.0005879562013770584, + "loss": 7.5586, + "step": 1250700 + }, + { + "epoch": 5.095484316455322, + "grad_norm": 4.313271999359131, + "learning_rate": 0.0005876387272646515, + "loss": 7.5492, + "step": 1250800 + }, + { + "epoch": 5.095891694478704, + "grad_norm": 6.743495941162109, + "learning_rate": 0.0005873213289557906, + "loss": 7.5544, + "step": 1250900 + }, + { + "epoch": 5.096299072502085, + "grad_norm": 5.610095977783203, + "learning_rate": 0.0005870040064630246, + "loss": 7.5321, + "step": 1251000 + }, + { + "epoch": 5.096299072502085, + "eval_MaskedAccuracy": 0.5120883219105775, + "eval_loss": 1.5959781408309937, + "eval_runtime": 155.8264, + "eval_samples_per_second": 407.351, + "eval_steps_per_second": 1.592, + "step": 1251000 + }, + { + "epoch": 5.096706450525467, + "grad_norm": 5.0802001953125, + "learning_rate": 0.0005866867597988978, + "loss": 7.5512, + "step": 1251100 + }, + { + "epoch": 5.097113828548848, + "grad_norm": 5.563141345977783, + "learning_rate": 0.0005863695889759535, + "loss": 7.5453, + "step": 1251200 + }, + { + "epoch": 5.09752120657223, + "grad_norm": 15.891023635864258, + "learning_rate": 0.0005860524940067311, + "loss": 7.5568, + "step": 1251300 + }, + { + "epoch": 5.0979285845956115, + "grad_norm": 21.714107513427734, + "learning_rate": 0.0005857354749037669, + "loss": 7.5554, + "step": 1251400 + }, + { + "epoch": 5.098335962618993, + "grad_norm": 12.382031440734863, + "learning_rate": 0.0005854185316795939, + "loss": 7.567, + "step": 1251500 + }, + { + "epoch": 5.098743340642374, + "grad_norm": 8.557860374450684, + "learning_rate": 0.0005851016643467436, + "loss": 7.561, + "step": 1251600 + }, + { + "epoch": 5.099150718665755, + "grad_norm": 8.319543838500977, + "learning_rate": 0.0005847848729177425, + "loss": 7.554, + "step": 1251700 + }, + { + "epoch": 5.099558096689137, + "grad_norm": 4.27390718460083, + "learning_rate": 0.0005844681574051157, + "loss": 7.5624, + "step": 1251800 + }, + { + "epoch": 5.099965474712518, + "grad_norm": 11.300516128540039, + "learning_rate": 0.000584151517821384, + "loss": 7.5306, + "step": 1251900 + }, + { + "epoch": 5.1003728527359, + "grad_norm": 4.060203552246094, + "learning_rate": 0.0005838349541790667, + "loss": 7.5262, + "step": 1252000 + }, + { + "epoch": 5.1003728527359, + "eval_MaskedAccuracy": 0.5128182618811351, + "eval_loss": 1.5957520008087158, + "eval_runtime": 156.8332, + "eval_samples_per_second": 404.736, + "eval_steps_per_second": 1.581, + "step": 1252000 + }, + { + "epoch": 5.100780230759281, + "grad_norm": 3.796227216720581, + "learning_rate": 0.0005835184664906784, + "loss": 7.5764, + "step": 1252100 + }, + { + "epoch": 5.101187608782663, + "grad_norm": 19.47611427307129, + "learning_rate": 0.0005832020547687317, + "loss": 7.5604, + "step": 1252200 + }, + { + "epoch": 5.101594986806044, + "grad_norm": 20.198137283325195, + "learning_rate": 0.0005828857190257373, + "loss": 7.5396, + "step": 1252300 + }, + { + "epoch": 5.102002364829426, + "grad_norm": 7.291891574859619, + "learning_rate": 0.0005825694592741993, + "loss": 7.5283, + "step": 1252400 + }, + { + "epoch": 5.1024097428528075, + "grad_norm": 3.155221939086914, + "learning_rate": 0.0005822532755266229, + "loss": 7.5515, + "step": 1252500 + }, + { + "epoch": 5.102817120876189, + "grad_norm": 13.736040115356445, + "learning_rate": 0.0005819371677955081, + "loss": 7.5659, + "step": 1252600 + }, + { + "epoch": 5.1032244988995705, + "grad_norm": 6.547802925109863, + "learning_rate": 0.0005816211360933521, + "loss": 7.5592, + "step": 1252700 + }, + { + "epoch": 5.103631876922951, + "grad_norm": 3.539252281188965, + "learning_rate": 0.0005813051804326502, + "loss": 7.5672, + "step": 1252800 + }, + { + "epoch": 5.104039254946333, + "grad_norm": 6.332305908203125, + "learning_rate": 0.0005809893008258931, + "loss": 7.5759, + "step": 1252900 + }, + { + "epoch": 5.104446632969714, + "grad_norm": 18.98788070678711, + "learning_rate": 0.0005806734972855694, + "loss": 7.5309, + "step": 1253000 + }, + { + "epoch": 5.104446632969714, + "eval_MaskedAccuracy": 0.5122840903033515, + "eval_loss": 1.603977084159851, + "eval_runtime": 198.8013, + "eval_samples_per_second": 319.294, + "eval_steps_per_second": 1.247, + "step": 1253000 + }, + { + "epoch": 5.104854010993096, + "grad_norm": 4.694102764129639, + "learning_rate": 0.0005803577698241637, + "loss": 7.5488, + "step": 1253100 + }, + { + "epoch": 5.105261389016477, + "grad_norm": 4.3892974853515625, + "learning_rate": 0.000580042118454159, + "loss": 7.5326, + "step": 1253200 + }, + { + "epoch": 5.105668767039859, + "grad_norm": 7.1691670417785645, + "learning_rate": 0.0005797265431880349, + "loss": 7.5478, + "step": 1253300 + }, + { + "epoch": 5.10607614506324, + "grad_norm": 6.850220680236816, + "learning_rate": 0.0005794110440382678, + "loss": 7.5936, + "step": 1253400 + }, + { + "epoch": 5.106483523086622, + "grad_norm": 8.848091125488281, + "learning_rate": 0.0005790956210173313, + "loss": 7.5667, + "step": 1253500 + }, + { + "epoch": 5.106890901110003, + "grad_norm": 4.3532233238220215, + "learning_rate": 0.0005787802741376956, + "loss": 7.5526, + "step": 1253600 + }, + { + "epoch": 5.107298279133385, + "grad_norm": 10.647968292236328, + "learning_rate": 0.0005784650034118275, + "loss": 7.5571, + "step": 1253700 + }, + { + "epoch": 5.1077056571567665, + "grad_norm": 9.989473342895508, + "learning_rate": 0.000578149808852191, + "loss": 7.5255, + "step": 1253800 + }, + { + "epoch": 5.108113035180148, + "grad_norm": 10.404735565185547, + "learning_rate": 0.0005778346904712488, + "loss": 7.5445, + "step": 1253900 + }, + { + "epoch": 5.108520413203529, + "grad_norm": 15.024141311645508, + "learning_rate": 0.0005775196482814576, + "loss": 7.5367, + "step": 1254000 + }, + { + "epoch": 5.108520413203529, + "eval_MaskedAccuracy": 0.5125342825745884, + "eval_loss": 1.58397376537323, + "eval_runtime": 152.9923, + "eval_samples_per_second": 414.897, + "eval_steps_per_second": 1.621, + "step": 1254000 + }, + { + "epoch": 5.10892779122691, + "grad_norm": 14.381766319274902, + "learning_rate": 0.0005772046822952739, + "loss": 7.5819, + "step": 1254100 + }, + { + "epoch": 5.109335169250292, + "grad_norm": 10.739044189453125, + "learning_rate": 0.000576889792525148, + "loss": 7.554, + "step": 1254200 + }, + { + "epoch": 5.109742547273673, + "grad_norm": 10.537728309631348, + "learning_rate": 0.0005765749789835326, + "loss": 7.5674, + "step": 1254300 + }, + { + "epoch": 5.110149925297055, + "grad_norm": 4.679098606109619, + "learning_rate": 0.000576260241682872, + "loss": 7.5419, + "step": 1254400 + }, + { + "epoch": 5.110557303320436, + "grad_norm": 3.553837299346924, + "learning_rate": 0.0005759455806356104, + "loss": 7.5387, + "step": 1254500 + }, + { + "epoch": 5.110964681343818, + "grad_norm": 13.702073097229004, + "learning_rate": 0.0005756309958541866, + "loss": 7.5506, + "step": 1254600 + }, + { + "epoch": 5.111372059367199, + "grad_norm": 5.067311763763428, + "learning_rate": 0.0005753164873510391, + "loss": 7.5482, + "step": 1254700 + }, + { + "epoch": 5.111779437390581, + "grad_norm": 6.317410469055176, + "learning_rate": 0.0005750020551386011, + "loss": 7.5415, + "step": 1254800 + }, + { + "epoch": 5.1121868154139625, + "grad_norm": 8.441106796264648, + "learning_rate": 0.0005746876992293042, + "loss": 7.5387, + "step": 1254900 + }, + { + "epoch": 5.112594193437344, + "grad_norm": 4.294591903686523, + "learning_rate": 0.0005743734196355764, + "loss": 7.5423, + "step": 1255000 + }, + { + "epoch": 5.112594193437344, + "eval_MaskedAccuracy": 0.5121129902226159, + "eval_loss": 1.5859158039093018, + "eval_runtime": 166.2729, + "eval_samples_per_second": 381.758, + "eval_steps_per_second": 1.492, + "step": 1255000 + }, + { + "epoch": 5.1130015714607255, + "grad_norm": 6.235103607177734, + "learning_rate": 0.0005740592163698434, + "loss": 7.528, + "step": 1255100 + }, + { + "epoch": 5.113408949484107, + "grad_norm": 4.567471027374268, + "learning_rate": 0.0005737450894445267, + "loss": 7.5364, + "step": 1255200 + }, + { + "epoch": 5.113816327507488, + "grad_norm": 6.386045932769775, + "learning_rate": 0.0005734310388720463, + "loss": 7.5939, + "step": 1255300 + }, + { + "epoch": 5.114223705530869, + "grad_norm": 5.731184482574463, + "learning_rate": 0.000573117064664818, + "loss": 7.5333, + "step": 1255400 + }, + { + "epoch": 5.114631083554251, + "grad_norm": 3.82441782951355, + "learning_rate": 0.0005728031668352545, + "loss": 7.5742, + "step": 1255500 + }, + { + "epoch": 5.115038461577632, + "grad_norm": 4.16675329208374, + "learning_rate": 0.000572489345395765, + "loss": 7.5424, + "step": 1255600 + }, + { + "epoch": 5.115445839601014, + "grad_norm": 10.363410949707031, + "learning_rate": 0.0005721756003587576, + "loss": 7.5379, + "step": 1255700 + }, + { + "epoch": 5.115853217624395, + "grad_norm": 13.491689682006836, + "learning_rate": 0.0005718619317366372, + "loss": 7.5668, + "step": 1255800 + }, + { + "epoch": 5.116260595647777, + "grad_norm": 21.5654239654541, + "learning_rate": 0.0005715483395418032, + "loss": 7.5626, + "step": 1255900 + }, + { + "epoch": 5.116667973671158, + "grad_norm": 14.647242546081543, + "learning_rate": 0.0005712348237866539, + "loss": 7.5572, + "step": 1256000 + }, + { + "epoch": 5.116667973671158, + "eval_MaskedAccuracy": 0.5119153561224999, + "eval_loss": 1.5987579822540283, + "eval_runtime": 167.6983, + "eval_samples_per_second": 378.513, + "eval_steps_per_second": 1.479, + "step": 1256000 + }, + { + "epoch": 5.11707535169454, + "grad_norm": 5.562261581420898, + "learning_rate": 0.000570921384483585, + "loss": 7.5398, + "step": 1256100 + }, + { + "epoch": 5.1174827297179215, + "grad_norm": 5.86845064163208, + "learning_rate": 0.0005706080216449879, + "loss": 7.562, + "step": 1256200 + }, + { + "epoch": 5.117890107741303, + "grad_norm": 5.843132495880127, + "learning_rate": 0.0005702947352832513, + "loss": 7.5208, + "step": 1256300 + }, + { + "epoch": 5.118297485764685, + "grad_norm": 9.1304931640625, + "learning_rate": 0.0005699815254107612, + "loss": 7.5627, + "step": 1256400 + }, + { + "epoch": 5.118704863788066, + "grad_norm": 9.829513549804688, + "learning_rate": 0.0005696683920399006, + "loss": 7.5289, + "step": 1256500 + }, + { + "epoch": 5.119112241811447, + "grad_norm": 3.751936435699463, + "learning_rate": 0.0005693553351830492, + "loss": 7.5478, + "step": 1256600 + }, + { + "epoch": 5.119519619834828, + "grad_norm": 20.435016632080078, + "learning_rate": 0.0005690423548525833, + "loss": 7.5411, + "step": 1256700 + }, + { + "epoch": 5.11992699785821, + "grad_norm": 5.691356658935547, + "learning_rate": 0.0005687294510608776, + "loss": 7.5367, + "step": 1256800 + }, + { + "epoch": 5.120334375881591, + "grad_norm": 6.463367938995361, + "learning_rate": 0.0005684166238203031, + "loss": 7.5915, + "step": 1256900 + }, + { + "epoch": 5.120741753904973, + "grad_norm": 4.4170379638671875, + "learning_rate": 0.0005681038731432271, + "loss": 7.536, + "step": 1257000 + }, + { + "epoch": 5.120741753904973, + "eval_MaskedAccuracy": 0.5127271519708295, + "eval_loss": 1.5867799520492554, + "eval_runtime": 169.8144, + "eval_samples_per_second": 373.796, + "eval_steps_per_second": 1.46, + "step": 1257000 + }, + { + "epoch": 5.121149131928354, + "grad_norm": 3.14304518699646, + "learning_rate": 0.0005677911990420141, + "loss": 7.5447, + "step": 1257100 + }, + { + "epoch": 5.121556509951736, + "grad_norm": 11.655447959899902, + "learning_rate": 0.0005674786015290261, + "loss": 7.5494, + "step": 1257200 + }, + { + "epoch": 5.1219638879751175, + "grad_norm": 5.352982044219971, + "learning_rate": 0.000567166080616622, + "loss": 7.5544, + "step": 1257300 + }, + { + "epoch": 5.122371265998499, + "grad_norm": 12.675518035888672, + "learning_rate": 0.0005668536363171567, + "loss": 7.5741, + "step": 1257400 + }, + { + "epoch": 5.1227786440218805, + "grad_norm": 11.002009391784668, + "learning_rate": 0.0005665412686429829, + "loss": 7.5242, + "step": 1257500 + }, + { + "epoch": 5.123186022045262, + "grad_norm": 4.125703811645508, + "learning_rate": 0.0005662289776064508, + "loss": 7.5405, + "step": 1257600 + }, + { + "epoch": 5.123593400068644, + "grad_norm": 5.825300693511963, + "learning_rate": 0.000565916763219906, + "loss": 7.5437, + "step": 1257700 + }, + { + "epoch": 5.124000778092024, + "grad_norm": 18.037317276000977, + "learning_rate": 0.0005656046254956921, + "loss": 7.5688, + "step": 1257800 + }, + { + "epoch": 5.124408156115406, + "grad_norm": 5.741120338439941, + "learning_rate": 0.0005652925644461503, + "loss": 7.5351, + "step": 1257900 + }, + { + "epoch": 5.124815534138787, + "grad_norm": 3.35128116607666, + "learning_rate": 0.0005649805800836175, + "loss": 7.5357, + "step": 1258000 + }, + { + "epoch": 5.124815534138787, + "eval_MaskedAccuracy": 0.5120024325681323, + "eval_loss": 1.593034029006958, + "eval_runtime": 180.2955, + "eval_samples_per_second": 352.066, + "eval_steps_per_second": 1.376, + "step": 1258000 + }, + { + "epoch": 5.125222912162169, + "grad_norm": 12.68496322631836, + "learning_rate": 0.0005646686724204281, + "loss": 7.5331, + "step": 1258100 + }, + { + "epoch": 5.12563029018555, + "grad_norm": 3.733407974243164, + "learning_rate": 0.0005643568414689143, + "loss": 7.56, + "step": 1258200 + }, + { + "epoch": 5.126037668208932, + "grad_norm": 8.2514009475708, + "learning_rate": 0.0005640450872414042, + "loss": 7.5143, + "step": 1258300 + }, + { + "epoch": 5.1264450462323135, + "grad_norm": 12.680123329162598, + "learning_rate": 0.0005637334097502226, + "loss": 7.5809, + "step": 1258400 + }, + { + "epoch": 5.126852424255695, + "grad_norm": 7.681098461151123, + "learning_rate": 0.0005634218090076914, + "loss": 7.5516, + "step": 1258500 + }, + { + "epoch": 5.1272598022790765, + "grad_norm": 8.842367172241211, + "learning_rate": 0.0005631102850261314, + "loss": 7.5352, + "step": 1258600 + }, + { + "epoch": 5.127667180302458, + "grad_norm": 12.64952278137207, + "learning_rate": 0.0005627988378178572, + "loss": 7.5737, + "step": 1258700 + }, + { + "epoch": 5.12807455832584, + "grad_norm": 7.7794718742370605, + "learning_rate": 0.0005624874673951821, + "loss": 7.5327, + "step": 1258800 + }, + { + "epoch": 5.128481936349221, + "grad_norm": 8.353389739990234, + "learning_rate": 0.0005621761737704163, + "loss": 7.5702, + "step": 1258900 + }, + { + "epoch": 5.128889314372602, + "grad_norm": 7.190156936645508, + "learning_rate": 0.0005618649569558682, + "loss": 7.5149, + "step": 1259000 + }, + { + "epoch": 5.128889314372602, + "eval_MaskedAccuracy": 0.5123441037784241, + "eval_loss": 1.5870732069015503, + "eval_runtime": 165.3347, + "eval_samples_per_second": 383.924, + "eval_steps_per_second": 1.5, + "step": 1259000 + }, + { + "epoch": 5.129296692395983, + "grad_norm": 4.980506420135498, + "learning_rate": 0.0005615538169638411, + "loss": 7.5461, + "step": 1259100 + }, + { + "epoch": 5.129704070419365, + "grad_norm": 3.817624807357788, + "learning_rate": 0.0005612427538066355, + "loss": 7.5517, + "step": 1259200 + }, + { + "epoch": 5.130111448442746, + "grad_norm": 14.45424747467041, + "learning_rate": 0.0005609317674965501, + "loss": 7.5436, + "step": 1259300 + }, + { + "epoch": 5.130518826466128, + "grad_norm": 13.708019256591797, + "learning_rate": 0.0005606208580458796, + "loss": 7.5498, + "step": 1259400 + }, + { + "epoch": 5.130926204489509, + "grad_norm": 8.454963684082031, + "learning_rate": 0.0005603100254669155, + "loss": 7.5532, + "step": 1259500 + }, + { + "epoch": 5.131333582512891, + "grad_norm": 8.445759773254395, + "learning_rate": 0.0005599992697719477, + "loss": 7.5479, + "step": 1259600 + }, + { + "epoch": 5.1317409605362725, + "grad_norm": 6.335297107696533, + "learning_rate": 0.00055968859097326, + "loss": 7.559, + "step": 1259700 + }, + { + "epoch": 5.132148338559654, + "grad_norm": 6.836855411529541, + "learning_rate": 0.0005593779890831371, + "loss": 7.5776, + "step": 1259800 + }, + { + "epoch": 5.1325557165830356, + "grad_norm": 5.868393421173096, + "learning_rate": 0.0005590674641138585, + "loss": 7.5654, + "step": 1259900 + }, + { + "epoch": 5.132963094606417, + "grad_norm": 9.056082725524902, + "learning_rate": 0.0005587570160777005, + "loss": 7.5385, + "step": 1260000 + }, + { + "epoch": 5.132963094606417, + "eval_MaskedAccuracy": 0.5123997891684868, + "eval_loss": 1.595545768737793, + "eval_runtime": 178.9284, + "eval_samples_per_second": 354.756, + "eval_steps_per_second": 1.386, + "step": 1260000 + }, + { + "epoch": 5.133370472629799, + "grad_norm": 5.885403633117676, + "learning_rate": 0.0005584466449869369, + "loss": 7.5392, + "step": 1260100 + }, + { + "epoch": 5.13377785065318, + "grad_norm": 4.102877616882324, + "learning_rate": 0.0005581363508538385, + "loss": 7.5559, + "step": 1260200 + }, + { + "epoch": 5.134185228676561, + "grad_norm": 5.089933395385742, + "learning_rate": 0.000557826133690672, + "loss": 7.5677, + "step": 1260300 + }, + { + "epoch": 5.134592606699942, + "grad_norm": 15.710661888122559, + "learning_rate": 0.000557515993509703, + "loss": 7.5531, + "step": 1260400 + }, + { + "epoch": 5.134999984723324, + "grad_norm": 4.206904411315918, + "learning_rate": 0.0005572059303231927, + "loss": 7.5491, + "step": 1260500 + }, + { + "epoch": 5.135407362746705, + "grad_norm": 12.060872077941895, + "learning_rate": 0.0005568959441433999, + "loss": 7.5683, + "step": 1260600 + }, + { + "epoch": 5.135814740770087, + "grad_norm": 2.5560874938964844, + "learning_rate": 0.00055658603498258, + "loss": 7.5888, + "step": 1260700 + }, + { + "epoch": 5.1362221187934685, + "grad_norm": 7.485671043395996, + "learning_rate": 0.000556276202852984, + "loss": 7.5713, + "step": 1260800 + }, + { + "epoch": 5.13662949681685, + "grad_norm": 18.603740692138672, + "learning_rate": 0.0005559664477668626, + "loss": 7.5626, + "step": 1260900 + }, + { + "epoch": 5.1370368748402315, + "grad_norm": 3.6558775901794434, + "learning_rate": 0.0005556567697364616, + "loss": 7.5661, + "step": 1261000 + }, + { + "epoch": 5.1370368748402315, + "eval_MaskedAccuracy": 0.512043810548891, + "eval_loss": 1.5838418006896973, + "eval_runtime": 180.6622, + "eval_samples_per_second": 351.352, + "eval_steps_per_second": 1.373, + "step": 1261000 + }, + { + "epoch": 5.137444252863613, + "grad_norm": 6.576967239379883, + "learning_rate": 0.0005553471687740244, + "loss": 7.5592, + "step": 1261100 + }, + { + "epoch": 5.137851630886995, + "grad_norm": 5.285487174987793, + "learning_rate": 0.0005550376448917914, + "loss": 7.5759, + "step": 1261200 + }, + { + "epoch": 5.138259008910376, + "grad_norm": 6.252453327178955, + "learning_rate": 0.0005547281981019993, + "loss": 7.5457, + "step": 1261300 + }, + { + "epoch": 5.138666386933758, + "grad_norm": 5.763672351837158, + "learning_rate": 0.0005544188284168824, + "loss": 7.5586, + "step": 1261400 + }, + { + "epoch": 5.139073764957139, + "grad_norm": 13.533401489257812, + "learning_rate": 0.0005541095358486723, + "loss": 7.5717, + "step": 1261500 + }, + { + "epoch": 5.13948114298052, + "grad_norm": 4.017519474029541, + "learning_rate": 0.0005538003204095959, + "loss": 7.5298, + "step": 1261600 + }, + { + "epoch": 5.139888521003901, + "grad_norm": 17.37210464477539, + "learning_rate": 0.0005534911821118786, + "loss": 7.5192, + "step": 1261700 + }, + { + "epoch": 5.140295899027283, + "grad_norm": 17.076250076293945, + "learning_rate": 0.0005531821209677424, + "loss": 7.5374, + "step": 1261800 + }, + { + "epoch": 5.140703277050664, + "grad_norm": 12.65649127960205, + "learning_rate": 0.000552873136989405, + "loss": 7.5443, + "step": 1261900 + }, + { + "epoch": 5.141110655074046, + "grad_norm": 4.699617385864258, + "learning_rate": 0.0005525642301890837, + "loss": 7.5673, + "step": 1262000 + }, + { + "epoch": 5.141110655074046, + "eval_MaskedAccuracy": 0.5128783282627524, + "eval_loss": 1.5865117311477661, + "eval_runtime": 153.5447, + "eval_samples_per_second": 413.404, + "eval_steps_per_second": 1.615, + "step": 1262000 + }, + { + "epoch": 5.1415180330974275, + "grad_norm": 17.365907669067383, + "learning_rate": 0.0005522554005789904, + "loss": 7.5619, + "step": 1262100 + }, + { + "epoch": 5.141925411120809, + "grad_norm": 8.379057884216309, + "learning_rate": 0.0005519466481713351, + "loss": 7.5665, + "step": 1262200 + }, + { + "epoch": 5.142332789144191, + "grad_norm": 14.656353950500488, + "learning_rate": 0.0005516379729783243, + "loss": 7.5485, + "step": 1262300 + }, + { + "epoch": 5.142740167167572, + "grad_norm": 9.875476837158203, + "learning_rate": 0.0005513293750121628, + "loss": 7.5678, + "step": 1262400 + }, + { + "epoch": 5.143147545190954, + "grad_norm": 2.759429454803467, + "learning_rate": 0.0005510208542850499, + "loss": 7.545, + "step": 1262500 + }, + { + "epoch": 5.143554923214335, + "grad_norm": 4.63169002532959, + "learning_rate": 0.0005507124108091826, + "loss": 7.5508, + "step": 1262600 + }, + { + "epoch": 5.143962301237717, + "grad_norm": 4.982854843139648, + "learning_rate": 0.0005504040445967567, + "loss": 7.56, + "step": 1262700 + }, + { + "epoch": 5.144369679261097, + "grad_norm": 11.580699920654297, + "learning_rate": 0.0005500957556599621, + "loss": 7.5589, + "step": 1262800 + }, + { + "epoch": 5.144777057284479, + "grad_norm": 9.614246368408203, + "learning_rate": 0.0005497875440109886, + "loss": 7.5468, + "step": 1262900 + }, + { + "epoch": 5.14518443530786, + "grad_norm": 8.304036140441895, + "learning_rate": 0.0005494794096620203, + "loss": 7.5644, + "step": 1263000 + }, + { + "epoch": 5.14518443530786, + "eval_MaskedAccuracy": 0.5125698403114065, + "eval_loss": 1.5937505960464478, + "eval_runtime": 166.4638, + "eval_samples_per_second": 381.32, + "eval_steps_per_second": 1.49, + "step": 1263000 + }, + { + "epoch": 5.145591813331242, + "grad_norm": 5.4799113273620605, + "learning_rate": 0.0005491713526252387, + "loss": 7.5838, + "step": 1263100 + }, + { + "epoch": 5.1459991913546235, + "grad_norm": 5.485556125640869, + "learning_rate": 0.0005488633729128249, + "loss": 7.5746, + "step": 1263200 + }, + { + "epoch": 5.146406569378005, + "grad_norm": 15.55025863647461, + "learning_rate": 0.000548555470536954, + "loss": 7.5516, + "step": 1263300 + }, + { + "epoch": 5.1468139474013865, + "grad_norm": 10.720710754394531, + "learning_rate": 0.0005482476455097989, + "loss": 7.5385, + "step": 1263400 + }, + { + "epoch": 5.147221325424768, + "grad_norm": 9.393437385559082, + "learning_rate": 0.0005479398978435298, + "loss": 7.577, + "step": 1263500 + }, + { + "epoch": 5.14762870344815, + "grad_norm": 9.32558822631836, + "learning_rate": 0.0005476322275503147, + "loss": 7.5602, + "step": 1263600 + }, + { + "epoch": 5.148036081471531, + "grad_norm": 4.0196309089660645, + "learning_rate": 0.0005473246346423159, + "loss": 7.5521, + "step": 1263700 + }, + { + "epoch": 5.148443459494913, + "grad_norm": 4.326145172119141, + "learning_rate": 0.0005470171191316944, + "loss": 7.5632, + "step": 1263800 + }, + { + "epoch": 5.148850837518294, + "grad_norm": 6.749875068664551, + "learning_rate": 0.0005467096810306082, + "loss": 7.5494, + "step": 1263900 + }, + { + "epoch": 5.149258215541675, + "grad_norm": 8.418333053588867, + "learning_rate": 0.0005464023203512122, + "loss": 7.5677, + "step": 1264000 + }, + { + "epoch": 5.149258215541675, + "eval_MaskedAccuracy": 0.5127325140598666, + "eval_loss": 1.5884439945220947, + "eval_runtime": 221.1288, + "eval_samples_per_second": 287.054, + "eval_steps_per_second": 1.122, + "step": 1264000 + }, + { + "epoch": 5.149665593565056, + "grad_norm": 5.534397125244141, + "learning_rate": 0.0005460950371056575, + "loss": 7.5374, + "step": 1264100 + }, + { + "epoch": 5.150072971588438, + "grad_norm": 15.326800346374512, + "learning_rate": 0.0005457878313060935, + "loss": 7.5759, + "step": 1264200 + }, + { + "epoch": 5.150480349611819, + "grad_norm": 11.39289665222168, + "learning_rate": 0.0005454807029646647, + "loss": 7.5524, + "step": 1264300 + }, + { + "epoch": 5.150887727635201, + "grad_norm": 3.969057083129883, + "learning_rate": 0.0005451736520935137, + "loss": 7.542, + "step": 1264400 + }, + { + "epoch": 5.1512951056585825, + "grad_norm": 15.005791664123535, + "learning_rate": 0.0005448666787047811, + "loss": 7.576, + "step": 1264500 + }, + { + "epoch": 5.151702483681964, + "grad_norm": 14.393976211547852, + "learning_rate": 0.0005445597828106016, + "loss": 7.5784, + "step": 1264600 + }, + { + "epoch": 5.152109861705346, + "grad_norm": 4.292541980743408, + "learning_rate": 0.0005442529644231091, + "loss": 7.5579, + "step": 1264700 + }, + { + "epoch": 5.152517239728727, + "grad_norm": 4.117619037628174, + "learning_rate": 0.0005439462235544333, + "loss": 7.574, + "step": 1264800 + }, + { + "epoch": 5.152924617752109, + "grad_norm": 5.055161476135254, + "learning_rate": 0.0005436395602167026, + "loss": 7.5709, + "step": 1264900 + }, + { + "epoch": 5.15333199577549, + "grad_norm": 3.9017088413238525, + "learning_rate": 0.0005433329744220398, + "loss": 7.5536, + "step": 1265000 + }, + { + "epoch": 5.15333199577549, + "eval_MaskedAccuracy": 0.5119708914343247, + "eval_loss": 1.5935159921646118, + "eval_runtime": 239.3234, + "eval_samples_per_second": 265.231, + "eval_steps_per_second": 1.036, + "step": 1265000 + }, + { + "epoch": 5.153739373798872, + "grad_norm": 5.433421611785889, + "learning_rate": 0.0005430264661825662, + "loss": 7.5654, + "step": 1265100 + }, + { + "epoch": 5.154146751822253, + "grad_norm": 10.552696228027344, + "learning_rate": 0.0005427200355103999, + "loss": 7.5633, + "step": 1265200 + }, + { + "epoch": 5.154554129845634, + "grad_norm": 5.230506896972656, + "learning_rate": 0.0005424136824176553, + "loss": 7.5879, + "step": 1265300 + }, + { + "epoch": 5.154961507869015, + "grad_norm": 5.859530448913574, + "learning_rate": 0.0005421074069164449, + "loss": 7.5353, + "step": 1265400 + }, + { + "epoch": 5.155368885892397, + "grad_norm": 3.4236106872558594, + "learning_rate": 0.0005418012090188764, + "loss": 7.5419, + "step": 1265500 + }, + { + "epoch": 5.1557762639157785, + "grad_norm": 11.217131614685059, + "learning_rate": 0.0005414950887370554, + "loss": 7.5294, + "step": 1265600 + }, + { + "epoch": 5.15618364193916, + "grad_norm": 6.577310085296631, + "learning_rate": 0.0005411890460830863, + "loss": 7.5612, + "step": 1265700 + }, + { + "epoch": 5.1565910199625415, + "grad_norm": 6.010001182556152, + "learning_rate": 0.0005408830810690677, + "loss": 7.576, + "step": 1265800 + }, + { + "epoch": 5.156998397985923, + "grad_norm": 9.843003273010254, + "learning_rate": 0.0005405771937070946, + "loss": 7.5373, + "step": 1265900 + }, + { + "epoch": 5.157405776009305, + "grad_norm": 5.558066368103027, + "learning_rate": 0.0005402713840092617, + "loss": 7.5682, + "step": 1266000 + }, + { + "epoch": 5.157405776009305, + "eval_MaskedAccuracy": 0.5126519099999005, + "eval_loss": 1.585870623588562, + "eval_runtime": 176.206, + "eval_samples_per_second": 360.237, + "eval_steps_per_second": 1.407, + "step": 1266000 + }, + { + "epoch": 5.157813154032686, + "grad_norm": 12.909948348999023, + "learning_rate": 0.0005399656519876595, + "loss": 7.5489, + "step": 1266100 + }, + { + "epoch": 5.158220532056068, + "grad_norm": 16.241851806640625, + "learning_rate": 0.000539659997654375, + "loss": 7.5387, + "step": 1266200 + }, + { + "epoch": 5.158627910079449, + "grad_norm": 5.339664459228516, + "learning_rate": 0.0005393544210214921, + "loss": 7.5225, + "step": 1266300 + }, + { + "epoch": 5.159035288102831, + "grad_norm": 17.13751792907715, + "learning_rate": 0.0005390489221010918, + "loss": 7.5019, + "step": 1266400 + }, + { + "epoch": 5.159442666126212, + "grad_norm": 4.725861549377441, + "learning_rate": 0.0005387435009052526, + "loss": 7.5521, + "step": 1266500 + }, + { + "epoch": 5.159850044149593, + "grad_norm": 5.74859094619751, + "learning_rate": 0.0005384381574460492, + "loss": 7.563, + "step": 1266600 + }, + { + "epoch": 5.1602574221729745, + "grad_norm": 15.772001266479492, + "learning_rate": 0.0005381328917355535, + "loss": 7.5229, + "step": 1266700 + }, + { + "epoch": 5.160664800196356, + "grad_norm": 9.625349998474121, + "learning_rate": 0.000537827703785833, + "loss": 7.5489, + "step": 1266800 + }, + { + "epoch": 5.1610721782197375, + "grad_norm": 5.832103729248047, + "learning_rate": 0.0005375225936089551, + "loss": 7.5486, + "step": 1266900 + }, + { + "epoch": 5.161479556243119, + "grad_norm": 3.081801652908325, + "learning_rate": 0.0005372175612169822, + "loss": 7.5265, + "step": 1267000 + }, + { + "epoch": 5.161479556243119, + "eval_MaskedAccuracy": 0.5128539098904568, + "eval_loss": 1.5957610607147217, + "eval_runtime": 163.7109, + "eval_samples_per_second": 387.732, + "eval_steps_per_second": 1.515, + "step": 1267000 + }, + { + "epoch": 5.161886934266501, + "grad_norm": 4.2837982177734375, + "learning_rate": 0.0005369126066219735, + "loss": 7.5596, + "step": 1267100 + }, + { + "epoch": 5.162294312289882, + "grad_norm": 5.668388843536377, + "learning_rate": 0.0005366077298359856, + "loss": 7.5594, + "step": 1267200 + }, + { + "epoch": 5.162701690313264, + "grad_norm": 18.58271598815918, + "learning_rate": 0.0005363029308710721, + "loss": 7.5077, + "step": 1267300 + }, + { + "epoch": 5.163109068336645, + "grad_norm": 16.35295867919922, + "learning_rate": 0.0005359982097392827, + "loss": 7.5415, + "step": 1267400 + }, + { + "epoch": 5.163516446360027, + "grad_norm": 21.181188583374023, + "learning_rate": 0.0005356935664526649, + "loss": 7.5489, + "step": 1267500 + }, + { + "epoch": 5.163923824383408, + "grad_norm": 3.084169864654541, + "learning_rate": 0.0005353890010232636, + "loss": 7.5617, + "step": 1267600 + }, + { + "epoch": 5.16433120240679, + "grad_norm": 17.424720764160156, + "learning_rate": 0.0005350845134631188, + "loss": 7.562, + "step": 1267700 + }, + { + "epoch": 5.16473858043017, + "grad_norm": 3.184140920639038, + "learning_rate": 0.0005347801037842686, + "loss": 7.5331, + "step": 1267800 + }, + { + "epoch": 5.165145958453552, + "grad_norm": 6.366086483001709, + "learning_rate": 0.0005344757719987482, + "loss": 7.5336, + "step": 1267900 + }, + { + "epoch": 5.1655533364769335, + "grad_norm": 9.245692253112793, + "learning_rate": 0.0005341715181185896, + "loss": 7.5413, + "step": 1268000 + }, + { + "epoch": 5.1655533364769335, + "eval_MaskedAccuracy": 0.512463367519968, + "eval_loss": 1.5966273546218872, + "eval_runtime": 171.4111, + "eval_samples_per_second": 370.314, + "eval_steps_per_second": 1.447, + "step": 1268000 + }, + { + "epoch": 5.165960714500315, + "grad_norm": 16.10322380065918, + "learning_rate": 0.0005338673421558221, + "loss": 7.5161, + "step": 1268100 + }, + { + "epoch": 5.1663680925236966, + "grad_norm": 3.6817374229431152, + "learning_rate": 0.000533563244122471, + "loss": 7.523, + "step": 1268200 + }, + { + "epoch": 5.166775470547078, + "grad_norm": 16.2663631439209, + "learning_rate": 0.0005332592240305583, + "loss": 7.5354, + "step": 1268300 + }, + { + "epoch": 5.16718284857046, + "grad_norm": 20.580501556396484, + "learning_rate": 0.0005329552818921049, + "loss": 7.5454, + "step": 1268400 + }, + { + "epoch": 5.167590226593841, + "grad_norm": 10.921297073364258, + "learning_rate": 0.0005326514177191262, + "loss": 7.525, + "step": 1268500 + }, + { + "epoch": 5.167997604617223, + "grad_norm": 2.968923568725586, + "learning_rate": 0.0005323476315236359, + "loss": 7.5435, + "step": 1268600 + }, + { + "epoch": 5.168404982640604, + "grad_norm": 9.835203170776367, + "learning_rate": 0.000532043923317644, + "loss": 7.5515, + "step": 1268700 + }, + { + "epoch": 5.168812360663986, + "grad_norm": 6.683423042297363, + "learning_rate": 0.000531740293113158, + "loss": 7.5591, + "step": 1268800 + }, + { + "epoch": 5.169219738687367, + "grad_norm": 13.926468849182129, + "learning_rate": 0.0005314367409221814, + "loss": 7.5541, + "step": 1268900 + }, + { + "epoch": 5.169627116710748, + "grad_norm": 4.112544059753418, + "learning_rate": 0.0005311332667567165, + "loss": 7.5419, + "step": 1269000 + }, + { + "epoch": 5.169627116710748, + "eval_MaskedAccuracy": 0.5119529905206833, + "eval_loss": 1.598028302192688, + "eval_runtime": 190.2852, + "eval_samples_per_second": 333.584, + "eval_steps_per_second": 1.303, + "step": 1269000 + }, + { + "epoch": 5.1700344947341295, + "grad_norm": 6.330892086029053, + "learning_rate": 0.0005308298706287607, + "loss": 7.579, + "step": 1269100 + }, + { + "epoch": 5.170441872757511, + "grad_norm": 11.34080696105957, + "learning_rate": 0.0005305265525503086, + "loss": 7.5401, + "step": 1269200 + }, + { + "epoch": 5.1708492507808925, + "grad_norm": 14.808900833129883, + "learning_rate": 0.000530223312533352, + "loss": 7.5109, + "step": 1269300 + }, + { + "epoch": 5.171256628804274, + "grad_norm": 19.919721603393555, + "learning_rate": 0.0005299201505898793, + "loss": 7.5375, + "step": 1269400 + }, + { + "epoch": 5.171664006827656, + "grad_norm": 13.870037078857422, + "learning_rate": 0.0005296170667318773, + "loss": 7.5282, + "step": 1269500 + }, + { + "epoch": 5.172071384851037, + "grad_norm": 8.070316314697266, + "learning_rate": 0.0005293140609713271, + "loss": 7.535, + "step": 1269600 + }, + { + "epoch": 5.172478762874419, + "grad_norm": 9.52051830291748, + "learning_rate": 0.0005290111333202091, + "loss": 7.561, + "step": 1269700 + }, + { + "epoch": 5.1728861408978, + "grad_norm": 19.949111938476562, + "learning_rate": 0.0005287082837904989, + "loss": 7.5589, + "step": 1269800 + }, + { + "epoch": 5.173293518921182, + "grad_norm": 6.0597944259643555, + "learning_rate": 0.0005284055123941703, + "loss": 7.5136, + "step": 1269900 + }, + { + "epoch": 5.173700896944563, + "grad_norm": 3.6505286693573, + "learning_rate": 0.0005281028191431927, + "loss": 7.5395, + "step": 1270000 + }, + { + "epoch": 5.173700896944563, + "eval_MaskedAccuracy": 0.5123596121515993, + "eval_loss": 1.5900706052780151, + "eval_runtime": 162.7414, + "eval_samples_per_second": 390.042, + "eval_steps_per_second": 1.524, + "step": 1270000 + }, + { + "epoch": 5.174108274967945, + "grad_norm": 10.080352783203125, + "learning_rate": 0.0005278002040495345, + "loss": 7.5478, + "step": 1270100 + }, + { + "epoch": 5.174515652991326, + "grad_norm": 6.672272205352783, + "learning_rate": 0.0005274976671251593, + "loss": 7.5676, + "step": 1270200 + }, + { + "epoch": 5.174923031014707, + "grad_norm": 10.528050422668457, + "learning_rate": 0.0005271952083820284, + "loss": 7.5677, + "step": 1270300 + }, + { + "epoch": 5.1753304090380885, + "grad_norm": 19.363574981689453, + "learning_rate": 0.0005268928278320986, + "loss": 7.5439, + "step": 1270400 + }, + { + "epoch": 5.17573778706147, + "grad_norm": 3.931839942932129, + "learning_rate": 0.0005265905254873242, + "loss": 7.5489, + "step": 1270500 + }, + { + "epoch": 5.176145165084852, + "grad_norm": 11.309305191040039, + "learning_rate": 0.0005262883013596573, + "loss": 7.5522, + "step": 1270600 + }, + { + "epoch": 5.176552543108233, + "grad_norm": 13.059709548950195, + "learning_rate": 0.0005259861554610477, + "loss": 7.5518, + "step": 1270700 + }, + { + "epoch": 5.176959921131615, + "grad_norm": 7.099419116973877, + "learning_rate": 0.0005256840878034389, + "loss": 7.5431, + "step": 1270800 + }, + { + "epoch": 5.177367299154996, + "grad_norm": 2.741917848587036, + "learning_rate": 0.0005253820983987747, + "loss": 7.5658, + "step": 1270900 + }, + { + "epoch": 5.177774677178378, + "grad_norm": 4.795065879821777, + "learning_rate": 0.0005250801872589931, + "loss": 7.5494, + "step": 1271000 + }, + { + "epoch": 5.177774677178378, + "eval_MaskedAccuracy": 0.5126933325976969, + "eval_loss": 1.5832537412643433, + "eval_runtime": 172.5992, + "eval_samples_per_second": 367.765, + "eval_steps_per_second": 1.437, + "step": 1271000 + }, + { + "epoch": 5.178182055201759, + "grad_norm": 7.33160400390625, + "learning_rate": 0.000524778354396032, + "loss": 7.5672, + "step": 1271100 + }, + { + "epoch": 5.178589433225141, + "grad_norm": 18.861373901367188, + "learning_rate": 0.0005244765998218236, + "loss": 7.5469, + "step": 1271200 + }, + { + "epoch": 5.178996811248522, + "grad_norm": 4.511181831359863, + "learning_rate": 0.0005241749235482972, + "loss": 7.5396, + "step": 1271300 + }, + { + "epoch": 5.179404189271904, + "grad_norm": 6.524885654449463, + "learning_rate": 0.0005238733255873815, + "loss": 7.5576, + "step": 1271400 + }, + { + "epoch": 5.179811567295285, + "grad_norm": 3.818176746368408, + "learning_rate": 0.0005235718059509989, + "loss": 7.5659, + "step": 1271500 + }, + { + "epoch": 5.180218945318666, + "grad_norm": 13.060426712036133, + "learning_rate": 0.00052327036465107, + "loss": 7.5366, + "step": 1271600 + }, + { + "epoch": 5.1806263233420475, + "grad_norm": 22.0394229888916, + "learning_rate": 0.0005229690016995128, + "loss": 7.5515, + "step": 1271700 + }, + { + "epoch": 5.181033701365429, + "grad_norm": 19.83888053894043, + "learning_rate": 0.0005226677171082413, + "loss": 7.5662, + "step": 1271800 + }, + { + "epoch": 5.181441079388811, + "grad_norm": 9.551691055297852, + "learning_rate": 0.0005223665108891677, + "loss": 7.5682, + "step": 1271900 + }, + { + "epoch": 5.181848457412192, + "grad_norm": 4.3937859535217285, + "learning_rate": 0.0005220653830541982, + "loss": 7.5552, + "step": 1272000 + }, + { + "epoch": 5.181848457412192, + "eval_MaskedAccuracy": 0.5124723849085815, + "eval_loss": 1.5935578346252441, + "eval_runtime": 174.9777, + "eval_samples_per_second": 362.766, + "eval_steps_per_second": 1.417, + "step": 1272000 + }, + { + "epoch": 5.182255835435574, + "grad_norm": 22.850011825561523, + "learning_rate": 0.0005217643336152411, + "loss": 7.5562, + "step": 1272100 + }, + { + "epoch": 5.182663213458955, + "grad_norm": 18.282371520996094, + "learning_rate": 0.0005214633625841967, + "loss": 7.5065, + "step": 1272200 + }, + { + "epoch": 5.183070591482337, + "grad_norm": 7.804836273193359, + "learning_rate": 0.0005211624699729641, + "loss": 7.544, + "step": 1272300 + }, + { + "epoch": 5.183477969505718, + "grad_norm": 4.658621311187744, + "learning_rate": 0.0005208616557934395, + "loss": 7.5487, + "step": 1272400 + }, + { + "epoch": 5.1838853475291, + "grad_norm": 5.6526947021484375, + "learning_rate": 0.0005205609200575151, + "loss": 7.5616, + "step": 1272500 + }, + { + "epoch": 5.184292725552481, + "grad_norm": 20.303049087524414, + "learning_rate": 0.0005202602627770817, + "loss": 7.5462, + "step": 1272600 + }, + { + "epoch": 5.184700103575863, + "grad_norm": 21.300710678100586, + "learning_rate": 0.0005199596839640245, + "loss": 7.5363, + "step": 1272700 + }, + { + "epoch": 5.1851074815992435, + "grad_norm": 8.422891616821289, + "learning_rate": 0.0005196591836302279, + "loss": 7.5784, + "step": 1272800 + }, + { + "epoch": 5.185514859622625, + "grad_norm": 15.43104076385498, + "learning_rate": 0.0005193587617875722, + "loss": 7.5381, + "step": 1272900 + }, + { + "epoch": 5.185922237646007, + "grad_norm": 9.114885330200195, + "learning_rate": 0.0005190584184479343, + "loss": 7.5534, + "step": 1273000 + }, + { + "epoch": 5.185922237646007, + "eval_MaskedAccuracy": 0.5126939014839413, + "eval_loss": 1.5904895067214966, + "eval_runtime": 173.6511, + "eval_samples_per_second": 365.538, + "eval_steps_per_second": 1.428, + "step": 1273000 + }, + { + "epoch": 5.186329615669388, + "grad_norm": 9.559250831604004, + "learning_rate": 0.0005187581536231892, + "loss": 7.5217, + "step": 1273100 + }, + { + "epoch": 5.18673699369277, + "grad_norm": 4.708173751831055, + "learning_rate": 0.0005184579673252074, + "loss": 7.5205, + "step": 1273200 + }, + { + "epoch": 5.187144371716151, + "grad_norm": 21.92316436767578, + "learning_rate": 0.0005181578595658565, + "loss": 7.5412, + "step": 1273300 + }, + { + "epoch": 5.187551749739533, + "grad_norm": 11.953930854797363, + "learning_rate": 0.000517857830357002, + "loss": 7.5612, + "step": 1273400 + }, + { + "epoch": 5.187959127762914, + "grad_norm": 17.054977416992188, + "learning_rate": 0.0005175578797105055, + "loss": 7.5511, + "step": 1273500 + }, + { + "epoch": 5.188366505786296, + "grad_norm": 9.827713012695312, + "learning_rate": 0.0005172580076382258, + "loss": 7.5188, + "step": 1273600 + }, + { + "epoch": 5.188773883809677, + "grad_norm": 20.34474754333496, + "learning_rate": 0.0005169582141520181, + "loss": 7.5589, + "step": 1273700 + }, + { + "epoch": 5.189181261833059, + "grad_norm": 7.054084777832031, + "learning_rate": 0.0005166584992637351, + "loss": 7.5555, + "step": 1273800 + }, + { + "epoch": 5.18958863985644, + "grad_norm": 7.799508094787598, + "learning_rate": 0.0005163588629852259, + "loss": 7.558, + "step": 1273900 + }, + { + "epoch": 5.189996017879821, + "grad_norm": 8.375802993774414, + "learning_rate": 0.0005160593053283368, + "loss": 7.52, + "step": 1274000 + }, + { + "epoch": 5.189996017879821, + "eval_MaskedAccuracy": 0.512114161487815, + "eval_loss": 1.5887385606765747, + "eval_runtime": 165.4391, + "eval_samples_per_second": 383.682, + "eval_steps_per_second": 1.499, + "step": 1274000 + }, + { + "epoch": 5.1904033959032025, + "grad_norm": 10.135929107666016, + "learning_rate": 0.0005157598263049106, + "loss": 7.543, + "step": 1274100 + }, + { + "epoch": 5.190810773926584, + "grad_norm": 12.61279582977295, + "learning_rate": 0.0005154604259267879, + "loss": 7.5402, + "step": 1274200 + }, + { + "epoch": 5.191218151949966, + "grad_norm": 8.79918384552002, + "learning_rate": 0.0005151611042058058, + "loss": 7.5575, + "step": 1274300 + }, + { + "epoch": 5.191625529973347, + "grad_norm": 16.091278076171875, + "learning_rate": 0.0005148618611537979, + "loss": 7.5296, + "step": 1274400 + }, + { + "epoch": 5.192032907996729, + "grad_norm": 16.06725311279297, + "learning_rate": 0.0005145626967825941, + "loss": 7.537, + "step": 1274500 + }, + { + "epoch": 5.19244028602011, + "grad_norm": 18.132741928100586, + "learning_rate": 0.0005142636111040232, + "loss": 7.5405, + "step": 1274600 + }, + { + "epoch": 5.192847664043492, + "grad_norm": 4.988110542297363, + "learning_rate": 0.0005139646041299082, + "loss": 7.533, + "step": 1274700 + }, + { + "epoch": 5.193255042066873, + "grad_norm": 5.989030838012695, + "learning_rate": 0.0005136656758720708, + "loss": 7.563, + "step": 1274800 + }, + { + "epoch": 5.193662420090255, + "grad_norm": 25.162853240966797, + "learning_rate": 0.0005133668263423302, + "loss": 7.5419, + "step": 1274900 + }, + { + "epoch": 5.194069798113636, + "grad_norm": 18.128734588623047, + "learning_rate": 0.0005130680555525008, + "loss": 7.534, + "step": 1275000 + }, + { + "epoch": 5.194069798113636, + "eval_MaskedAccuracy": 0.5121862551616991, + "eval_loss": 1.5897197723388672, + "eval_runtime": 160.1237, + "eval_samples_per_second": 396.419, + "eval_steps_per_second": 1.549, + "step": 1275000 + }, + { + "epoch": 5.194477176137018, + "grad_norm": 8.414715766906738, + "learning_rate": 0.0005127693635143956, + "loss": 7.5486, + "step": 1275100 + }, + { + "epoch": 5.194884554160399, + "grad_norm": 3.6837668418884277, + "learning_rate": 0.0005124707502398218, + "loss": 7.557, + "step": 1275200 + }, + { + "epoch": 5.19529193218378, + "grad_norm": 6.873579025268555, + "learning_rate": 0.0005121722157405866, + "loss": 7.5579, + "step": 1275300 + }, + { + "epoch": 5.195699310207162, + "grad_norm": 6.990813255310059, + "learning_rate": 0.0005118737600284916, + "loss": 7.5723, + "step": 1275400 + }, + { + "epoch": 5.196106688230543, + "grad_norm": 4.58314847946167, + "learning_rate": 0.0005115753831153366, + "loss": 7.5239, + "step": 1275500 + }, + { + "epoch": 5.196514066253925, + "grad_norm": 15.989585876464844, + "learning_rate": 0.0005112770850129188, + "loss": 7.5425, + "step": 1275600 + }, + { + "epoch": 5.196921444277306, + "grad_norm": 20.096433639526367, + "learning_rate": 0.000510978865733031, + "loss": 7.5467, + "step": 1275700 + }, + { + "epoch": 5.197328822300688, + "grad_norm": 10.62798023223877, + "learning_rate": 0.0005106807252874631, + "loss": 7.5466, + "step": 1275800 + }, + { + "epoch": 5.197736200324069, + "grad_norm": 21.502344131469727, + "learning_rate": 0.0005103826636880029, + "loss": 7.503, + "step": 1275900 + }, + { + "epoch": 5.198143578347451, + "grad_norm": 3.51676082611084, + "learning_rate": 0.000510084680946433, + "loss": 7.5731, + "step": 1276000 + }, + { + "epoch": 5.198143578347451, + "eval_MaskedAccuracy": 0.5123767147094139, + "eval_loss": 1.5917176008224487, + "eval_runtime": 171.0444, + "eval_samples_per_second": 371.108, + "eval_steps_per_second": 1.45, + "step": 1276000 + }, + { + "epoch": 5.198550956370832, + "grad_norm": 12.190217018127441, + "learning_rate": 0.0005097867770745358, + "loss": 7.5173, + "step": 1276100 + }, + { + "epoch": 5.198958334394214, + "grad_norm": 24.854997634887695, + "learning_rate": 0.0005094889520840889, + "loss": 7.5543, + "step": 1276200 + }, + { + "epoch": 5.199365712417595, + "grad_norm": 21.390878677368164, + "learning_rate": 0.0005091912059868668, + "loss": 7.5561, + "step": 1276300 + }, + { + "epoch": 5.199773090440977, + "grad_norm": 16.05355453491211, + "learning_rate": 0.0005088935387946397, + "loss": 7.5494, + "step": 1276400 + }, + { + "epoch": 5.200180468464358, + "grad_norm": 4.544560432434082, + "learning_rate": 0.0005085959505191776, + "loss": 7.5553, + "step": 1276500 + }, + { + "epoch": 5.200587846487739, + "grad_norm": 19.7420654296875, + "learning_rate": 0.0005082984411722452, + "loss": 7.5475, + "step": 1276600 + }, + { + "epoch": 5.200995224511121, + "grad_norm": 12.10345458984375, + "learning_rate": 0.0005080010107656047, + "loss": 7.5647, + "step": 1276700 + }, + { + "epoch": 5.201402602534502, + "grad_norm": 5.871239185333252, + "learning_rate": 0.0005077036593110144, + "loss": 7.5453, + "step": 1276800 + }, + { + "epoch": 5.201809980557884, + "grad_norm": 18.729658126831055, + "learning_rate": 0.0005074063868202314, + "loss": 7.537, + "step": 1276900 + }, + { + "epoch": 5.202217358581265, + "grad_norm": 6.017864227294922, + "learning_rate": 0.0005071091933050057, + "loss": 7.5214, + "step": 1277000 + }, + { + "epoch": 5.202217358581265, + "eval_MaskedAccuracy": 0.5128177817636642, + "eval_loss": 1.5880012512207031, + "eval_runtime": 173.3499, + "eval_samples_per_second": 366.173, + "eval_steps_per_second": 1.431, + "step": 1277000 + }, + { + "epoch": 5.202624736604647, + "grad_norm": 6.193304061889648, + "learning_rate": 0.0005068120787770904, + "loss": 7.562, + "step": 1277100 + }, + { + "epoch": 5.203032114628028, + "grad_norm": 18.89733123779297, + "learning_rate": 0.0005065150432482303, + "loss": 7.532, + "step": 1277200 + }, + { + "epoch": 5.20343949265141, + "grad_norm": 6.110525608062744, + "learning_rate": 0.0005062180867301693, + "loss": 7.5612, + "step": 1277300 + }, + { + "epoch": 5.203846870674791, + "grad_norm": 16.18763542175293, + "learning_rate": 0.000505921209234647, + "loss": 7.5379, + "step": 1277400 + }, + { + "epoch": 5.204254248698173, + "grad_norm": 18.676969528198242, + "learning_rate": 0.0005056244107734024, + "loss": 7.5401, + "step": 1277500 + }, + { + "epoch": 5.204661626721554, + "grad_norm": 14.559529304504395, + "learning_rate": 0.000505327691358168, + "loss": 7.5684, + "step": 1277600 + }, + { + "epoch": 5.205069004744936, + "grad_norm": 4.3110175132751465, + "learning_rate": 0.0005050310510006749, + "loss": 7.5665, + "step": 1277700 + }, + { + "epoch": 5.205476382768317, + "grad_norm": 4.289351463317871, + "learning_rate": 0.0005047344897126508, + "loss": 7.5634, + "step": 1277800 + }, + { + "epoch": 5.205883760791698, + "grad_norm": 4.672882556915283, + "learning_rate": 0.0005044380075058202, + "loss": 7.5554, + "step": 1277900 + }, + { + "epoch": 5.20629113881508, + "grad_norm": 15.918068885803223, + "learning_rate": 0.0005041416043919044, + "loss": 7.5788, + "step": 1278000 + }, + { + "epoch": 5.20629113881508, + "eval_MaskedAccuracy": 0.5121467039824147, + "eval_loss": 1.5961333513259888, + "eval_runtime": 164.6736, + "eval_samples_per_second": 385.465, + "eval_steps_per_second": 1.506, + "step": 1278000 + }, + { + "epoch": 5.206698516838461, + "grad_norm": 19.095203399658203, + "learning_rate": 0.0005038452803826239, + "loss": 7.5523, + "step": 1278100 + }, + { + "epoch": 5.207105894861843, + "grad_norm": 21.950645446777344, + "learning_rate": 0.0005035490354896918, + "loss": 7.5401, + "step": 1278200 + }, + { + "epoch": 5.207513272885224, + "grad_norm": 5.846652507781982, + "learning_rate": 0.0005032528697248206, + "loss": 7.5438, + "step": 1278300 + }, + { + "epoch": 5.207920650908606, + "grad_norm": 3.010402202606201, + "learning_rate": 0.0005029567830997202, + "loss": 7.5521, + "step": 1278400 + }, + { + "epoch": 5.208328028931987, + "grad_norm": 8.32205581665039, + "learning_rate": 0.0005026607756260952, + "loss": 7.5344, + "step": 1278500 + }, + { + "epoch": 5.208735406955369, + "grad_norm": 5.044190883636475, + "learning_rate": 0.000502364847315649, + "loss": 7.5528, + "step": 1278600 + }, + { + "epoch": 5.20914278497875, + "grad_norm": 9.99151611328125, + "learning_rate": 0.0005020689981800818, + "loss": 7.5759, + "step": 1278700 + }, + { + "epoch": 5.209550163002132, + "grad_norm": 9.568770408630371, + "learning_rate": 0.0005017732282310888, + "loss": 7.5386, + "step": 1278800 + }, + { + "epoch": 5.2099575410255134, + "grad_norm": 27.248090744018555, + "learning_rate": 0.0005014775374803644, + "loss": 7.5319, + "step": 1278900 + }, + { + "epoch": 5.210364919048894, + "grad_norm": 5.413463115692139, + "learning_rate": 0.0005011819259395981, + "loss": 7.537, + "step": 1279000 + }, + { + "epoch": 5.210364919048894, + "eval_MaskedAccuracy": 0.5122444609892658, + "eval_loss": 1.587311029434204, + "eval_runtime": 163.1407, + "eval_samples_per_second": 389.088, + "eval_steps_per_second": 1.52, + "step": 1279000 + }, + { + "epoch": 5.210772297072276, + "grad_norm": 11.685070991516113, + "learning_rate": 0.0005008863936204777, + "loss": 7.5437, + "step": 1279100 + }, + { + "epoch": 5.211179675095657, + "grad_norm": 7.2980265617370605, + "learning_rate": 0.0005005909405346861, + "loss": 7.5981, + "step": 1279200 + }, + { + "epoch": 5.211587053119039, + "grad_norm": 18.209922790527344, + "learning_rate": 0.0005002955666939064, + "loss": 7.545, + "step": 1279300 + }, + { + "epoch": 5.21199443114242, + "grad_norm": 12.128777503967285, + "learning_rate": 0.0005000002721098142, + "loss": 7.5637, + "step": 1279400 + }, + { + "epoch": 5.212401809165802, + "grad_norm": 4.433072566986084, + "learning_rate": 0.0004997050567940851, + "loss": 7.5295, + "step": 1279500 + }, + { + "epoch": 5.212809187189183, + "grad_norm": 10.994619369506836, + "learning_rate": 0.0004994099207583901, + "loss": 7.5315, + "step": 1279600 + }, + { + "epoch": 5.213216565212565, + "grad_norm": 13.443550109863281, + "learning_rate": 0.0004991148640143975, + "loss": 7.5605, + "step": 1279700 + }, + { + "epoch": 5.213623943235946, + "grad_norm": 16.893964767456055, + "learning_rate": 0.0004988198865737724, + "loss": 7.5842, + "step": 1279800 + }, + { + "epoch": 5.214031321259328, + "grad_norm": 9.369170188903809, + "learning_rate": 0.0004985249884481761, + "loss": 7.5525, + "step": 1279900 + }, + { + "epoch": 5.214438699282709, + "grad_norm": 9.552986145019531, + "learning_rate": 0.0004982301696492694, + "loss": 7.5436, + "step": 1280000 + }, + { + "epoch": 5.214438699282709, + "eval_MaskedAccuracy": 0.5130588358646386, + "eval_loss": 1.5795327425003052, + "eval_runtime": 169.1986, + "eval_samples_per_second": 375.157, + "eval_steps_per_second": 1.466, + "step": 1280000 + }, + { + "epoch": 5.214846077306091, + "grad_norm": 3.6762850284576416, + "learning_rate": 0.0004979354301887042, + "loss": 7.5163, + "step": 1280100 + }, + { + "epoch": 5.2152534553294725, + "grad_norm": 8.116147994995117, + "learning_rate": 0.0004976407700781358, + "loss": 7.5847, + "step": 1280200 + }, + { + "epoch": 5.215660833352853, + "grad_norm": 5.632050514221191, + "learning_rate": 0.0004973461893292147, + "loss": 7.4982, + "step": 1280300 + }, + { + "epoch": 5.216068211376235, + "grad_norm": 5.7624053955078125, + "learning_rate": 0.0004970516879535864, + "loss": 7.5721, + "step": 1280400 + }, + { + "epoch": 5.216475589399616, + "grad_norm": 6.228050231933594, + "learning_rate": 0.0004967572659628934, + "loss": 7.5744, + "step": 1280500 + }, + { + "epoch": 5.216882967422998, + "grad_norm": 3.5868136882781982, + "learning_rate": 0.0004964629233687771, + "loss": 7.5626, + "step": 1280600 + }, + { + "epoch": 5.217290345446379, + "grad_norm": 11.111696243286133, + "learning_rate": 0.000496168660182874, + "loss": 7.5391, + "step": 1280700 + }, + { + "epoch": 5.217697723469761, + "grad_norm": 4.965041160583496, + "learning_rate": 0.0004958744764168174, + "loss": 7.5492, + "step": 1280800 + }, + { + "epoch": 5.218105101493142, + "grad_norm": 12.747413635253906, + "learning_rate": 0.000495580372082238, + "loss": 7.5441, + "step": 1280900 + }, + { + "epoch": 5.218512479516524, + "grad_norm": 5.378708362579346, + "learning_rate": 0.0004952863471907636, + "loss": 7.5121, + "step": 1281000 + }, + { + "epoch": 5.218512479516524, + "eval_MaskedAccuracy": 0.5126325956417793, + "eval_loss": 1.593763828277588, + "eval_runtime": 176.2232, + "eval_samples_per_second": 360.202, + "eval_steps_per_second": 1.407, + "step": 1281000 + }, + { + "epoch": 5.218919857539905, + "grad_norm": 11.631937026977539, + "learning_rate": 0.0004949924017540182, + "loss": 7.5425, + "step": 1281100 + }, + { + "epoch": 5.219327235563287, + "grad_norm": 5.905684947967529, + "learning_rate": 0.0004946985357836244, + "loss": 7.5377, + "step": 1281200 + }, + { + "epoch": 5.2197346135866685, + "grad_norm": 5.273624420166016, + "learning_rate": 0.0004944047492911988, + "loss": 7.5476, + "step": 1281300 + }, + { + "epoch": 5.22014199161005, + "grad_norm": 9.432669639587402, + "learning_rate": 0.0004941110422883577, + "loss": 7.5253, + "step": 1281400 + }, + { + "epoch": 5.2205493696334315, + "grad_norm": 7.141916275024414, + "learning_rate": 0.0004938174147867119, + "loss": 7.5348, + "step": 1281500 + }, + { + "epoch": 5.220956747656812, + "grad_norm": 12.515483856201172, + "learning_rate": 0.0004935238667978699, + "loss": 7.535, + "step": 1281600 + }, + { + "epoch": 5.221364125680194, + "grad_norm": 4.881894588470459, + "learning_rate": 0.0004932303983334376, + "loss": 7.5415, + "step": 1281700 + }, + { + "epoch": 5.221771503703575, + "grad_norm": 3.7656407356262207, + "learning_rate": 0.0004929370094050173, + "loss": 7.5609, + "step": 1281800 + }, + { + "epoch": 5.222178881726957, + "grad_norm": 13.714067459106445, + "learning_rate": 0.0004926437000242073, + "loss": 7.5394, + "step": 1281900 + }, + { + "epoch": 5.222586259750338, + "grad_norm": 15.318659782409668, + "learning_rate": 0.0004923504702026056, + "loss": 7.5323, + "step": 1282000 + }, + { + "epoch": 5.222586259750338, + "eval_MaskedAccuracy": 0.51202640226235, + "eval_loss": 1.5868650674819946, + "eval_runtime": 169.942, + "eval_samples_per_second": 373.516, + "eval_steps_per_second": 1.459, + "step": 1282000 + }, + { + "epoch": 5.22299363777372, + "grad_norm": 4.545812129974365, + "learning_rate": 0.0004920573199518039, + "loss": 7.5595, + "step": 1282100 + }, + { + "epoch": 5.223401015797101, + "grad_norm": 18.41389274597168, + "learning_rate": 0.0004917642492833929, + "loss": 7.5457, + "step": 1282200 + }, + { + "epoch": 5.223808393820483, + "grad_norm": 5.827291011810303, + "learning_rate": 0.0004914712582089581, + "loss": 7.5457, + "step": 1282300 + }, + { + "epoch": 5.224215771843864, + "grad_norm": 13.710026741027832, + "learning_rate": 0.0004911783467400837, + "loss": 7.5297, + "step": 1282400 + }, + { + "epoch": 5.224623149867246, + "grad_norm": 7.869463920593262, + "learning_rate": 0.0004908855148883501, + "loss": 7.5423, + "step": 1282500 + }, + { + "epoch": 5.2250305278906275, + "grad_norm": 4.054448127746582, + "learning_rate": 0.000490592762665335, + "loss": 7.553, + "step": 1282600 + }, + { + "epoch": 5.225437905914009, + "grad_norm": 7.019258975982666, + "learning_rate": 0.0004903000900826111, + "loss": 7.5315, + "step": 1282700 + }, + { + "epoch": 5.22584528393739, + "grad_norm": 8.367408752441406, + "learning_rate": 0.0004900074971517503, + "loss": 7.5601, + "step": 1282800 + }, + { + "epoch": 5.226252661960771, + "grad_norm": 8.909741401672363, + "learning_rate": 0.00048971498388432, + "loss": 7.5328, + "step": 1282900 + }, + { + "epoch": 5.226660039984153, + "grad_norm": 10.273576736450195, + "learning_rate": 0.000489422550291885, + "loss": 7.5083, + "step": 1283000 + }, + { + "epoch": 5.226660039984153, + "eval_MaskedAccuracy": 0.5120344032540787, + "eval_loss": 1.5840531587600708, + "eval_runtime": 162.2577, + "eval_samples_per_second": 391.205, + "eval_steps_per_second": 1.528, + "step": 1283000 + }, + { + "epoch": 5.227067418007534, + "grad_norm": 10.882198333740234, + "learning_rate": 0.0004891301963860064, + "loss": 7.5474, + "step": 1283100 + }, + { + "epoch": 5.227474796030916, + "grad_norm": 10.444225311279297, + "learning_rate": 0.0004888379221782434, + "loss": 7.5588, + "step": 1283200 + }, + { + "epoch": 5.227882174054297, + "grad_norm": 20.341291427612305, + "learning_rate": 0.0004885457276801499, + "loss": 7.5335, + "step": 1283300 + }, + { + "epoch": 5.228289552077679, + "grad_norm": 12.155478477478027, + "learning_rate": 0.000488253612903279, + "loss": 7.5322, + "step": 1283400 + }, + { + "epoch": 5.22869693010106, + "grad_norm": 6.3683390617370605, + "learning_rate": 0.000487961577859181, + "loss": 7.5329, + "step": 1283500 + }, + { + "epoch": 5.229104308124442, + "grad_norm": 9.687567710876465, + "learning_rate": 0.00048766962255940034, + "loss": 7.5184, + "step": 1283600 + }, + { + "epoch": 5.2295116861478235, + "grad_norm": 15.411434173583984, + "learning_rate": 0.0004873777470154787, + "loss": 7.5815, + "step": 1283700 + }, + { + "epoch": 5.229919064171205, + "grad_norm": 2.9939393997192383, + "learning_rate": 0.00048708595123895566, + "loss": 7.5516, + "step": 1283800 + }, + { + "epoch": 5.2303264421945865, + "grad_norm": 8.011006355285645, + "learning_rate": 0.0004867942352413677, + "loss": 7.5448, + "step": 1283900 + }, + { + "epoch": 5.230733820217967, + "grad_norm": 12.934003829956055, + "learning_rate": 0.000486502599034248, + "loss": 7.5224, + "step": 1284000 + }, + { + "epoch": 5.230733820217967, + "eval_MaskedAccuracy": 0.5120633930333218, + "eval_loss": 1.5963746309280396, + "eval_runtime": 165.3475, + "eval_samples_per_second": 383.894, + "eval_steps_per_second": 1.5, + "step": 1284000 + }, + { + "epoch": 5.231141198241349, + "grad_norm": 8.110870361328125, + "learning_rate": 0.0004862110426291262, + "loss": 7.5334, + "step": 1284100 + }, + { + "epoch": 5.23154857626473, + "grad_norm": 28.927682876586914, + "learning_rate": 0.00048591956603752894, + "loss": 7.5333, + "step": 1284200 + }, + { + "epoch": 5.231955954288112, + "grad_norm": 18.001243591308594, + "learning_rate": 0.0004856281692709795, + "loss": 7.5285, + "step": 1284300 + }, + { + "epoch": 5.232363332311493, + "grad_norm": 11.77944564819336, + "learning_rate": 0.0004853368523409993, + "loss": 7.5394, + "step": 1284400 + }, + { + "epoch": 5.232770710334875, + "grad_norm": 11.754374504089355, + "learning_rate": 0.0004850456152591052, + "loss": 7.5542, + "step": 1284500 + }, + { + "epoch": 5.233178088358256, + "grad_norm": 4.447332382202148, + "learning_rate": 0.0004847544580368111, + "loss": 7.5223, + "step": 1284600 + }, + { + "epoch": 5.233585466381638, + "grad_norm": 12.266335487365723, + "learning_rate": 0.0004844633806856289, + "loss": 7.4934, + "step": 1284700 + }, + { + "epoch": 5.233992844405019, + "grad_norm": 4.861476421356201, + "learning_rate": 0.00048417238321706577, + "loss": 7.5464, + "step": 1284800 + }, + { + "epoch": 5.234400222428401, + "grad_norm": 3.598330497741699, + "learning_rate": 0.00048388146564262567, + "loss": 7.5578, + "step": 1284900 + }, + { + "epoch": 5.2348076004517825, + "grad_norm": 22.25257682800293, + "learning_rate": 0.00048359062797381157, + "loss": 7.52, + "step": 1285000 + }, + { + "epoch": 5.2348076004517825, + "eval_MaskedAccuracy": 0.5125263938568052, + "eval_loss": 1.5771960020065308, + "eval_runtime": 164.8982, + "eval_samples_per_second": 384.94, + "eval_steps_per_second": 1.504, + "step": 1285000 + }, + { + "epoch": 5.235214978475164, + "grad_norm": 14.142352104187012, + "learning_rate": 0.0004832998702221213, + "loss": 7.584, + "step": 1285100 + }, + { + "epoch": 5.235622356498546, + "grad_norm": 4.911906719207764, + "learning_rate": 0.00048300919239904966, + "loss": 7.5526, + "step": 1285200 + }, + { + "epoch": 5.236029734521926, + "grad_norm": 5.236055374145508, + "learning_rate": 0.00048271859451608933, + "loss": 7.5327, + "step": 1285300 + }, + { + "epoch": 5.236437112545308, + "grad_norm": 6.5616984367370605, + "learning_rate": 0.00048242807658472887, + "loss": 7.5313, + "step": 1285400 + }, + { + "epoch": 5.236844490568689, + "grad_norm": 14.643767356872559, + "learning_rate": 0.0004821376386164545, + "loss": 7.576, + "step": 1285500 + }, + { + "epoch": 5.237251868592071, + "grad_norm": 6.73957633972168, + "learning_rate": 0.0004818472806227475, + "loss": 7.543, + "step": 1285600 + }, + { + "epoch": 5.237659246615452, + "grad_norm": 10.866485595703125, + "learning_rate": 0.00048155700261508835, + "loss": 7.5103, + "step": 1285700 + }, + { + "epoch": 5.238066624638834, + "grad_norm": 4.343795299530029, + "learning_rate": 0.00048126680460495314, + "loss": 7.5332, + "step": 1285800 + }, + { + "epoch": 5.238474002662215, + "grad_norm": 12.848427772521973, + "learning_rate": 0.0004809766866038141, + "loss": 7.5352, + "step": 1285900 + }, + { + "epoch": 5.238881380685597, + "grad_norm": 6.552394866943359, + "learning_rate": 0.0004806866486231429, + "loss": 7.5298, + "step": 1286000 + }, + { + "epoch": 5.238881380685597, + "eval_MaskedAccuracy": 0.5122762140455446, + "eval_loss": 1.602086067199707, + "eval_runtime": 183.139, + "eval_samples_per_second": 346.6, + "eval_steps_per_second": 1.354, + "step": 1286000 + }, + { + "epoch": 5.2392887587089785, + "grad_norm": 3.267075538635254, + "learning_rate": 0.0004803966906744046, + "loss": 7.521, + "step": 1286100 + }, + { + "epoch": 5.23969613673236, + "grad_norm": 5.963367462158203, + "learning_rate": 0.00048010681276906397, + "loss": 7.5325, + "step": 1286200 + }, + { + "epoch": 5.2401035147557415, + "grad_norm": 7.868770599365234, + "learning_rate": 0.0004798170149185809, + "loss": 7.5412, + "step": 1286300 + }, + { + "epoch": 5.240510892779123, + "grad_norm": 6.230511665344238, + "learning_rate": 0.00047952729713441244, + "loss": 7.5466, + "step": 1286400 + }, + { + "epoch": 5.240918270802505, + "grad_norm": 21.375364303588867, + "learning_rate": 0.0004792376594280131, + "loss": 7.5623, + "step": 1286500 + }, + { + "epoch": 5.241325648825885, + "grad_norm": 5.332259654998779, + "learning_rate": 0.00047894810181083335, + "loss": 7.53, + "step": 1286600 + }, + { + "epoch": 5.241733026849267, + "grad_norm": 8.759510040283203, + "learning_rate": 0.0004786586242943215, + "loss": 7.5094, + "step": 1286700 + }, + { + "epoch": 5.242140404872648, + "grad_norm": 14.29955768585205, + "learning_rate": 0.0004783692268899217, + "loss": 7.5341, + "step": 1286800 + }, + { + "epoch": 5.24254778289603, + "grad_norm": 3.381981134414673, + "learning_rate": 0.000478079909609075, + "loss": 7.5583, + "step": 1286900 + }, + { + "epoch": 5.242955160919411, + "grad_norm": 15.60663890838623, + "learning_rate": 0.0004777906724632211, + "loss": 7.5218, + "step": 1287000 + }, + { + "epoch": 5.242955160919411, + "eval_MaskedAccuracy": 0.5127360467544204, + "eval_loss": 1.5800647735595703, + "eval_runtime": 164.4253, + "eval_samples_per_second": 386.048, + "eval_steps_per_second": 1.508, + "step": 1287000 + }, + { + "epoch": 5.243362538942793, + "grad_norm": 11.387564659118652, + "learning_rate": 0.0004775015154637939, + "loss": 7.5437, + "step": 1287100 + }, + { + "epoch": 5.2437699169661744, + "grad_norm": 6.099433898925781, + "learning_rate": 0.00047721243862222483, + "loss": 7.5736, + "step": 1287200 + }, + { + "epoch": 5.244177294989556, + "grad_norm": 5.325205326080322, + "learning_rate": 0.00047692344194994433, + "loss": 7.5478, + "step": 1287300 + }, + { + "epoch": 5.2445846730129375, + "grad_norm": 8.52285385131836, + "learning_rate": 0.0004766345254583771, + "loss": 7.5513, + "step": 1287400 + }, + { + "epoch": 5.244992051036319, + "grad_norm": 4.5920820236206055, + "learning_rate": 0.0004763456891589454, + "loss": 7.5396, + "step": 1287500 + }, + { + "epoch": 5.245399429059701, + "grad_norm": 5.701610565185547, + "learning_rate": 0.00047605693306306874, + "loss": 7.5753, + "step": 1287600 + }, + { + "epoch": 5.245806807083082, + "grad_norm": 9.378232955932617, + "learning_rate": 0.00047576825718216267, + "loss": 7.5534, + "step": 1287700 + }, + { + "epoch": 5.246214185106463, + "grad_norm": 3.1569764614105225, + "learning_rate": 0.0004754796615276406, + "loss": 7.5526, + "step": 1287800 + }, + { + "epoch": 5.246621563129844, + "grad_norm": 7.628342151641846, + "learning_rate": 0.0004751911461109123, + "loss": 7.565, + "step": 1287900 + }, + { + "epoch": 5.247028941153226, + "grad_norm": 9.264254570007324, + "learning_rate": 0.0004749027109433841, + "loss": 7.5454, + "step": 1288000 + }, + { + "epoch": 5.247028941153226, + "eval_MaskedAccuracy": 0.5126010382443479, + "eval_loss": 1.5994163751602173, + "eval_runtime": 179.8793, + "eval_samples_per_second": 352.881, + "eval_steps_per_second": 1.379, + "step": 1288000 + }, + { + "epoch": 5.247436319176607, + "grad_norm": 11.552118301391602, + "learning_rate": 0.0004746143560364597, + "loss": 7.5598, + "step": 1288100 + }, + { + "epoch": 5.247843697199989, + "grad_norm": 16.229381561279297, + "learning_rate": 0.0004743260814015388, + "loss": 7.5822, + "step": 1288200 + }, + { + "epoch": 5.24825107522337, + "grad_norm": 3.563021659851074, + "learning_rate": 0.0004740378870500186, + "loss": 7.5352, + "step": 1288300 + }, + { + "epoch": 5.248658453246752, + "grad_norm": 18.701969146728516, + "learning_rate": 0.0004737497729932932, + "loss": 7.4985, + "step": 1288400 + }, + { + "epoch": 5.2490658312701335, + "grad_norm": 12.15512752532959, + "learning_rate": 0.0004734617392427531, + "loss": 7.5343, + "step": 1288500 + }, + { + "epoch": 5.249473209293515, + "grad_norm": 10.886027336120605, + "learning_rate": 0.00047317378580978645, + "loss": 7.5496, + "step": 1288600 + }, + { + "epoch": 5.2498805873168966, + "grad_norm": 6.915746212005615, + "learning_rate": 0.00047288591270577655, + "loss": 7.5567, + "step": 1288700 + }, + { + "epoch": 5.250287965340278, + "grad_norm": 3.643834352493286, + "learning_rate": 0.00047259811994210477, + "loss": 7.5528, + "step": 1288800 + }, + { + "epoch": 5.25069534336366, + "grad_norm": 5.9770827293396, + "learning_rate": 0.00047231040753014893, + "loss": 7.5479, + "step": 1288900 + }, + { + "epoch": 5.25110272138704, + "grad_norm": 19.901004791259766, + "learning_rate": 0.00047202277548128504, + "loss": 7.5477, + "step": 1289000 + }, + { + "epoch": 5.25110272138704, + "eval_MaskedAccuracy": 0.512350450181298, + "eval_loss": 1.5840940475463867, + "eval_runtime": 170.3322, + "eval_samples_per_second": 372.66, + "eval_steps_per_second": 1.456, + "step": 1289000 + }, + { + "epoch": 5.251510099410422, + "grad_norm": 11.31248664855957, + "learning_rate": 0.000471735223806884, + "loss": 7.5632, + "step": 1289100 + }, + { + "epoch": 5.251917477433803, + "grad_norm": 10.792511940002441, + "learning_rate": 0.0004714477525183138, + "loss": 7.5247, + "step": 1289200 + }, + { + "epoch": 5.252324855457185, + "grad_norm": 6.50295877456665, + "learning_rate": 0.0004711603616269403, + "loss": 7.5642, + "step": 1289300 + }, + { + "epoch": 5.252732233480566, + "grad_norm": 5.041585445404053, + "learning_rate": 0.0004708730511441261, + "loss": 7.5536, + "step": 1289400 + }, + { + "epoch": 5.253139611503948, + "grad_norm": 3.0778822898864746, + "learning_rate": 0.00047058582108122903, + "loss": 7.5834, + "step": 1289500 + }, + { + "epoch": 5.2535469895273295, + "grad_norm": 7.7733659744262695, + "learning_rate": 0.00047029867144960475, + "loss": 7.5586, + "step": 1289600 + }, + { + "epoch": 5.253954367550711, + "grad_norm": 7.688708305358887, + "learning_rate": 0.00047001160226060705, + "loss": 7.5327, + "step": 1289700 + }, + { + "epoch": 5.2543617455740925, + "grad_norm": 16.57267189025879, + "learning_rate": 0.00046972461352558394, + "loss": 7.5596, + "step": 1289800 + }, + { + "epoch": 5.254769123597474, + "grad_norm": 10.274303436279297, + "learning_rate": 0.00046943770525588273, + "loss": 7.5482, + "step": 1289900 + }, + { + "epoch": 5.255176501620856, + "grad_norm": 8.531637191772461, + "learning_rate": 0.0004691508774628459, + "loss": 7.549, + "step": 1290000 + }, + { + "epoch": 5.255176501620856, + "eval_MaskedAccuracy": 0.5127083132934412, + "eval_loss": 1.5892359018325806, + "eval_runtime": 175.776, + "eval_samples_per_second": 361.119, + "eval_steps_per_second": 1.411, + "step": 1290000 + }, + { + "epoch": 5.255583879644237, + "grad_norm": 12.287857055664062, + "learning_rate": 0.0004688641301578133, + "loss": 7.5599, + "step": 1290100 + }, + { + "epoch": 5.255991257667619, + "grad_norm": 20.044239044189453, + "learning_rate": 0.00046857746335212205, + "loss": 7.5597, + "step": 1290200 + }, + { + "epoch": 5.256398635690999, + "grad_norm": 9.0589599609375, + "learning_rate": 0.0004682908770571044, + "loss": 7.5507, + "step": 1290300 + }, + { + "epoch": 5.256806013714381, + "grad_norm": 4.308457851409912, + "learning_rate": 0.00046800437128409196, + "loss": 7.537, + "step": 1290400 + }, + { + "epoch": 5.257213391737762, + "grad_norm": 10.643133163452148, + "learning_rate": 0.0004677179460444113, + "loss": 7.5506, + "step": 1290500 + }, + { + "epoch": 5.257620769761144, + "grad_norm": 4.991848945617676, + "learning_rate": 0.0004674316013493865, + "loss": 7.5256, + "step": 1290600 + }, + { + "epoch": 5.258028147784525, + "grad_norm": 14.143113136291504, + "learning_rate": 0.00046714533721033824, + "loss": 7.5438, + "step": 1290700 + }, + { + "epoch": 5.258435525807907, + "grad_norm": 5.053746700286865, + "learning_rate": 0.00046685915363858305, + "loss": 7.5349, + "step": 1290800 + }, + { + "epoch": 5.2588429038312885, + "grad_norm": 6.166320323944092, + "learning_rate": 0.0004665730506454373, + "loss": 7.5626, + "step": 1290900 + }, + { + "epoch": 5.25925028185467, + "grad_norm": 5.733420372009277, + "learning_rate": 0.00046628702824221116, + "loss": 7.5535, + "step": 1291000 + }, + { + "epoch": 5.25925028185467, + "eval_MaskedAccuracy": 0.5129774351274916, + "eval_loss": 1.5838022232055664, + "eval_runtime": 190.6573, + "eval_samples_per_second": 332.933, + "eval_steps_per_second": 1.301, + "step": 1291000 + }, + { + "epoch": 5.259657659878052, + "grad_norm": 18.24139976501465, + "learning_rate": 0.00046600108644021253, + "loss": 7.5392, + "step": 1291100 + }, + { + "epoch": 5.260065037901433, + "grad_norm": 7.882357597351074, + "learning_rate": 0.0004657152252507463, + "loss": 7.5146, + "step": 1291200 + }, + { + "epoch": 5.260472415924815, + "grad_norm": 4.560593605041504, + "learning_rate": 0.00046542944468511344, + "loss": 7.5257, + "step": 1291300 + }, + { + "epoch": 5.260879793948196, + "grad_norm": 4.275140285491943, + "learning_rate": 0.0004651437447546133, + "loss": 7.5555, + "step": 1291400 + }, + { + "epoch": 5.261287171971578, + "grad_norm": 14.538978576660156, + "learning_rate": 0.00046485812547054103, + "loss": 7.5422, + "step": 1291500 + }, + { + "epoch": 5.261694549994958, + "grad_norm": 16.678388595581055, + "learning_rate": 0.0004645725868441886, + "loss": 7.5793, + "step": 1291600 + }, + { + "epoch": 5.26210192801834, + "grad_norm": 9.275223731994629, + "learning_rate": 0.00046428712888684436, + "loss": 7.5627, + "step": 1291700 + }, + { + "epoch": 5.262509306041721, + "grad_norm": 5.30936861038208, + "learning_rate": 0.00046400175160979455, + "loss": 7.5381, + "step": 1291800 + }, + { + "epoch": 5.262916684065103, + "grad_norm": 6.1422600746154785, + "learning_rate": 0.0004637164550243218, + "loss": 7.5604, + "step": 1291900 + }, + { + "epoch": 5.2633240620884845, + "grad_norm": 6.921450614929199, + "learning_rate": 0.00046343123914170523, + "loss": 7.5401, + "step": 1292000 + }, + { + "epoch": 5.2633240620884845, + "eval_MaskedAccuracy": 0.5128384343371457, + "eval_loss": 1.581850528717041, + "eval_runtime": 189.9637, + "eval_samples_per_second": 334.148, + "eval_steps_per_second": 1.306, + "step": 1292000 + }, + { + "epoch": 5.263731440111866, + "grad_norm": 3.6345386505126953, + "learning_rate": 0.0004631461039732202, + "loss": 7.5429, + "step": 1292100 + }, + { + "epoch": 5.2641388181352475, + "grad_norm": 5.500032424926758, + "learning_rate": 0.00046286104953014123, + "loss": 7.5973, + "step": 1292200 + }, + { + "epoch": 5.264546196158629, + "grad_norm": 7.4409499168396, + "learning_rate": 0.0004625760758237363, + "loss": 7.5617, + "step": 1292300 + }, + { + "epoch": 5.264953574182011, + "grad_norm": 3.640127182006836, + "learning_rate": 0.00046229118286527266, + "loss": 7.546, + "step": 1292400 + }, + { + "epoch": 5.265360952205392, + "grad_norm": 3.7148308753967285, + "learning_rate": 0.00046200637066601366, + "loss": 7.5905, + "step": 1292500 + }, + { + "epoch": 5.265768330228774, + "grad_norm": 3.0423319339752197, + "learning_rate": 0.0004617216392372203, + "loss": 7.5471, + "step": 1292600 + }, + { + "epoch": 5.266175708252154, + "grad_norm": 21.06744956970215, + "learning_rate": 0.00046143698859014846, + "loss": 7.5274, + "step": 1292700 + }, + { + "epoch": 5.266583086275536, + "grad_norm": 4.4424848556518555, + "learning_rate": 0.00046115241873605244, + "loss": 7.5555, + "step": 1292800 + }, + { + "epoch": 5.266990464298917, + "grad_norm": 3.3005311489105225, + "learning_rate": 0.0004608679296861823, + "loss": 7.5234, + "step": 1292900 + }, + { + "epoch": 5.267397842322299, + "grad_norm": 9.025586128234863, + "learning_rate": 0.0004605835214517859, + "loss": 7.5342, + "step": 1293000 + }, + { + "epoch": 5.267397842322299, + "eval_MaskedAccuracy": 0.5121078317535273, + "eval_loss": 1.5936518907546997, + "eval_runtime": 182.2964, + "eval_samples_per_second": 348.202, + "eval_steps_per_second": 1.36, + "step": 1293000 + }, + { + "epoch": 5.26780522034568, + "grad_norm": 7.081292629241943, + "learning_rate": 0.0004602991940441072, + "loss": 7.5365, + "step": 1293100 + }, + { + "epoch": 5.268212598369062, + "grad_norm": 3.9597582817077637, + "learning_rate": 0.00046001494747438705, + "loss": 7.5489, + "step": 1293200 + }, + { + "epoch": 5.2686199763924435, + "grad_norm": 18.178199768066406, + "learning_rate": 0.0004597307817538632, + "loss": 7.5318, + "step": 1293300 + }, + { + "epoch": 5.269027354415825, + "grad_norm": 5.263289928436279, + "learning_rate": 0.0004594466968937704, + "loss": 7.5342, + "step": 1293400 + }, + { + "epoch": 5.269434732439207, + "grad_norm": 7.937551498413086, + "learning_rate": 0.00045916269290534025, + "loss": 7.5707, + "step": 1293500 + }, + { + "epoch": 5.269842110462588, + "grad_norm": 8.918220520019531, + "learning_rate": 0.0004588787697998007, + "loss": 7.5492, + "step": 1293600 + }, + { + "epoch": 5.27024948848597, + "grad_norm": 14.999495506286621, + "learning_rate": 0.00045859492758837676, + "loss": 7.5527, + "step": 1293700 + }, + { + "epoch": 5.270656866509351, + "grad_norm": 4.848311901092529, + "learning_rate": 0.0004583111662822896, + "loss": 7.5433, + "step": 1293800 + }, + { + "epoch": 5.271064244532733, + "grad_norm": 6.287750720977783, + "learning_rate": 0.000458027485892759, + "loss": 7.559, + "step": 1293900 + }, + { + "epoch": 5.271471622556113, + "grad_norm": 4.3590989112854, + "learning_rate": 0.000457743886431, + "loss": 7.5378, + "step": 1294000 + }, + { + "epoch": 5.271471622556113, + "eval_MaskedAccuracy": 0.5122752705209045, + "eval_loss": 1.5980956554412842, + "eval_runtime": 164.4552, + "eval_samples_per_second": 385.977, + "eval_steps_per_second": 1.508, + "step": 1294000 + }, + { + "epoch": 5.271879000579495, + "grad_norm": 6.657304286956787, + "learning_rate": 0.00045746036790822486, + "loss": 7.5515, + "step": 1294100 + }, + { + "epoch": 5.272286378602876, + "grad_norm": 4.288999080657959, + "learning_rate": 0.00045717693033564256, + "loss": 7.5395, + "step": 1294200 + }, + { + "epoch": 5.272693756626258, + "grad_norm": 4.8614888191223145, + "learning_rate": 0.0004568935737244582, + "loss": 7.5536, + "step": 1294300 + }, + { + "epoch": 5.2731011346496395, + "grad_norm": 9.658740043640137, + "learning_rate": 0.0004566102980858755, + "loss": 7.5664, + "step": 1294400 + }, + { + "epoch": 5.273508512673021, + "grad_norm": 17.376310348510742, + "learning_rate": 0.00045632710343109303, + "loss": 7.5423, + "step": 1294500 + }, + { + "epoch": 5.2739158906964025, + "grad_norm": 17.489919662475586, + "learning_rate": 0.00045604398977130707, + "loss": 7.5552, + "step": 1294600 + }, + { + "epoch": 5.274323268719784, + "grad_norm": 7.263301372528076, + "learning_rate": 0.0004557609571177107, + "loss": 7.5632, + "step": 1294700 + }, + { + "epoch": 5.274730646743166, + "grad_norm": 18.212284088134766, + "learning_rate": 0.00045547800548149307, + "loss": 7.5522, + "step": 1294800 + }, + { + "epoch": 5.275138024766547, + "grad_norm": 13.712687492370605, + "learning_rate": 0.00045519513487384244, + "loss": 7.5387, + "step": 1294900 + }, + { + "epoch": 5.275545402789929, + "grad_norm": 12.00277328491211, + "learning_rate": 0.00045491234530594147, + "loss": 7.5094, + "step": 1295000 + }, + { + "epoch": 5.275545402789929, + "eval_MaskedAccuracy": 0.5126189693204896, + "eval_loss": 1.583979606628418, + "eval_runtime": 177.2105, + "eval_samples_per_second": 358.195, + "eval_steps_per_second": 1.399, + "step": 1295000 + }, + { + "epoch": 5.27595278081331, + "grad_norm": 5.409238815307617, + "learning_rate": 0.00045462963678897, + "loss": 7.538, + "step": 1295100 + }, + { + "epoch": 5.276360158836692, + "grad_norm": 4.3116960525512695, + "learning_rate": 0.00045434700933410496, + "loss": 7.5659, + "step": 1295200 + }, + { + "epoch": 5.276767536860072, + "grad_norm": 4.7283806800842285, + "learning_rate": 0.00045406446295252035, + "loss": 7.4981, + "step": 1295300 + }, + { + "epoch": 5.277174914883454, + "grad_norm": 12.936450004577637, + "learning_rate": 0.0004537819976553864, + "loss": 7.5487, + "step": 1295400 + }, + { + "epoch": 5.2775822929068354, + "grad_norm": 3.9384841918945312, + "learning_rate": 0.00045349961345387077, + "loss": 7.4983, + "step": 1295500 + }, + { + "epoch": 5.277989670930217, + "grad_norm": 18.16901397705078, + "learning_rate": 0.0004532173103591378, + "loss": 7.5316, + "step": 1295600 + }, + { + "epoch": 5.2783970489535985, + "grad_norm": 13.225839614868164, + "learning_rate": 0.0004529350883823488, + "loss": 7.5077, + "step": 1295700 + }, + { + "epoch": 5.27880442697698, + "grad_norm": 12.580103874206543, + "learning_rate": 0.0004526529475346595, + "loss": 7.519, + "step": 1295800 + }, + { + "epoch": 5.279211805000362, + "grad_norm": 9.558238983154297, + "learning_rate": 0.00045237088782722657, + "loss": 7.5222, + "step": 1295900 + }, + { + "epoch": 5.279619183023743, + "grad_norm": 9.06231689453125, + "learning_rate": 0.00045208890927120104, + "loss": 7.5416, + "step": 1296000 + }, + { + "epoch": 5.279619183023743, + "eval_MaskedAccuracy": 0.5123287434314, + "eval_loss": 1.5975998640060425, + "eval_runtime": 176.2005, + "eval_samples_per_second": 360.249, + "eval_steps_per_second": 1.407, + "step": 1296000 + }, + { + "epoch": 5.280026561047125, + "grad_norm": 7.299903869628906, + "learning_rate": 0.00045180701187773077, + "loss": 7.5211, + "step": 1296100 + }, + { + "epoch": 5.280433939070506, + "grad_norm": 9.481529235839844, + "learning_rate": 0.0004515251956579602, + "loss": 7.5459, + "step": 1296200 + }, + { + "epoch": 5.280841317093888, + "grad_norm": 20.374595642089844, + "learning_rate": 0.00045124346062303195, + "loss": 7.5396, + "step": 1296300 + }, + { + "epoch": 5.281248695117269, + "grad_norm": 14.360052108764648, + "learning_rate": 0.0004509618067840833, + "loss": 7.5444, + "step": 1296400 + }, + { + "epoch": 5.281656073140651, + "grad_norm": 7.320785045623779, + "learning_rate": 0.0004506802341522505, + "loss": 7.5352, + "step": 1296500 + }, + { + "epoch": 5.282063451164031, + "grad_norm": 25.894439697265625, + "learning_rate": 0.0004503987427386654, + "loss": 7.5397, + "step": 1296600 + }, + { + "epoch": 5.282470829187413, + "grad_norm": 6.371466159820557, + "learning_rate": 0.00045011733255445683, + "loss": 7.5465, + "step": 1296700 + }, + { + "epoch": 5.2828782072107945, + "grad_norm": 16.651531219482422, + "learning_rate": 0.00044983600361075016, + "loss": 7.5721, + "step": 1296800 + }, + { + "epoch": 5.283285585234176, + "grad_norm": 6.61417818069458, + "learning_rate": 0.00044955475591866757, + "loss": 7.5564, + "step": 1296900 + }, + { + "epoch": 5.2836929632575576, + "grad_norm": 14.974919319152832, + "learning_rate": 0.00044927358948932945, + "loss": 7.5771, + "step": 1297000 + }, + { + "epoch": 5.2836929632575576, + "eval_MaskedAccuracy": 0.5125122033148928, + "eval_loss": 1.5861791372299194, + "eval_runtime": 158.5479, + "eval_samples_per_second": 400.359, + "eval_steps_per_second": 1.564, + "step": 1297000 + }, + { + "epoch": 5.284100341280939, + "grad_norm": 11.632669448852539, + "learning_rate": 0.0004489925043338509, + "loss": 7.5322, + "step": 1297100 + }, + { + "epoch": 5.284507719304321, + "grad_norm": 3.8050639629364014, + "learning_rate": 0.0004487115004633452, + "loss": 7.525, + "step": 1297200 + }, + { + "epoch": 5.284915097327702, + "grad_norm": 14.257918357849121, + "learning_rate": 0.00044843057788892165, + "loss": 7.5417, + "step": 1297300 + }, + { + "epoch": 5.285322475351084, + "grad_norm": 7.380938529968262, + "learning_rate": 0.0004481497366216868, + "loss": 7.5114, + "step": 1297400 + }, + { + "epoch": 5.285729853374465, + "grad_norm": 3.516423225402832, + "learning_rate": 0.0004478689766727435, + "loss": 7.5464, + "step": 1297500 + }, + { + "epoch": 5.286137231397847, + "grad_norm": 8.229839324951172, + "learning_rate": 0.0004475882980531919, + "loss": 7.5587, + "step": 1297600 + }, + { + "epoch": 5.286544609421227, + "grad_norm": 7.977880477905273, + "learning_rate": 0.00044730770077412875, + "loss": 7.5146, + "step": 1297700 + }, + { + "epoch": 5.286951987444609, + "grad_norm": 9.316288948059082, + "learning_rate": 0.0004470271848466471, + "loss": 7.4973, + "step": 1297800 + }, + { + "epoch": 5.2873593654679905, + "grad_norm": 7.418075084686279, + "learning_rate": 0.00044674675028183816, + "loss": 7.5378, + "step": 1297900 + }, + { + "epoch": 5.287766743491372, + "grad_norm": 13.113317489624023, + "learning_rate": 0.0004464663970907882, + "loss": 7.5471, + "step": 1298000 + }, + { + "epoch": 5.287766743491372, + "eval_MaskedAccuracy": 0.5122624171890003, + "eval_loss": 1.593355417251587, + "eval_runtime": 163.0497, + "eval_samples_per_second": 389.305, + "eval_steps_per_second": 1.521, + "step": 1298000 + }, + { + "epoch": 5.2881741215147535, + "grad_norm": 4.2268781661987305, + "learning_rate": 0.00044618612528458197, + "loss": 7.5563, + "step": 1298100 + }, + { + "epoch": 5.288581499538135, + "grad_norm": 3.9747097492218018, + "learning_rate": 0.00044590593487429876, + "loss": 7.5446, + "step": 1298200 + }, + { + "epoch": 5.288988877561517, + "grad_norm": 4.4901862144470215, + "learning_rate": 0.0004456258258710172, + "loss": 7.5498, + "step": 1298300 + }, + { + "epoch": 5.289396255584898, + "grad_norm": 15.346317291259766, + "learning_rate": 0.0004453457982858112, + "loss": 7.517, + "step": 1298400 + }, + { + "epoch": 5.28980363360828, + "grad_norm": 7.432754993438721, + "learning_rate": 0.0004450658521297512, + "loss": 7.5307, + "step": 1298500 + }, + { + "epoch": 5.290211011631661, + "grad_norm": 15.85821533203125, + "learning_rate": 0.0004447859874139055, + "loss": 7.5595, + "step": 1298600 + }, + { + "epoch": 5.290618389655043, + "grad_norm": 3.3669979572296143, + "learning_rate": 0.00044450620414933833, + "loss": 7.5576, + "step": 1298700 + }, + { + "epoch": 5.291025767678424, + "grad_norm": 8.374113082885742, + "learning_rate": 0.0004442265023471109, + "loss": 7.5313, + "step": 1298800 + }, + { + "epoch": 5.291433145701806, + "grad_norm": 13.105899810791016, + "learning_rate": 0.0004439468820182816, + "loss": 7.5872, + "step": 1298900 + }, + { + "epoch": 5.291840523725186, + "grad_norm": 5.814199447631836, + "learning_rate": 0.00044366734317390614, + "loss": 7.5416, + "step": 1299000 + }, + { + "epoch": 5.291840523725186, + "eval_MaskedAccuracy": 0.5128568836943566, + "eval_loss": 1.5909806489944458, + "eval_runtime": 157.3225, + "eval_samples_per_second": 403.477, + "eval_steps_per_second": 1.576, + "step": 1299000 + }, + { + "epoch": 5.292247901748568, + "grad_norm": 12.49307632446289, + "learning_rate": 0.00044338788582503507, + "loss": 7.5241, + "step": 1299100 + }, + { + "epoch": 5.2926552797719495, + "grad_norm": 11.495176315307617, + "learning_rate": 0.0004431085099827177, + "loss": 7.5259, + "step": 1299200 + }, + { + "epoch": 5.293062657795331, + "grad_norm": 18.209365844726562, + "learning_rate": 0.0004428292156579981, + "loss": 7.5491, + "step": 1299300 + }, + { + "epoch": 5.293470035818713, + "grad_norm": 15.301490783691406, + "learning_rate": 0.0004425500028619188, + "loss": 7.5406, + "step": 1299400 + }, + { + "epoch": 5.293877413842094, + "grad_norm": 15.960002899169922, + "learning_rate": 0.0004422708716055185, + "loss": 7.5214, + "step": 1299500 + }, + { + "epoch": 5.294284791865476, + "grad_norm": 12.808664321899414, + "learning_rate": 0.0004419918218998331, + "loss": 7.5618, + "step": 1299600 + }, + { + "epoch": 5.294692169888857, + "grad_norm": 17.56773567199707, + "learning_rate": 0.0004417128537558946, + "loss": 7.5269, + "step": 1299700 + }, + { + "epoch": 5.295099547912239, + "grad_norm": 16.31693458557129, + "learning_rate": 0.0004414339671847317, + "loss": 7.5218, + "step": 1299800 + }, + { + "epoch": 5.29550692593562, + "grad_norm": 9.28150749206543, + "learning_rate": 0.0004411551621973715, + "loss": 7.5507, + "step": 1299900 + }, + { + "epoch": 5.295914303959002, + "grad_norm": 4.1325249671936035, + "learning_rate": 0.00044087643880483604, + "loss": 7.5484, + "step": 1300000 + }, + { + "epoch": 5.295914303959002, + "eval_MaskedAccuracy": 0.5128927888927057, + "eval_loss": 1.5847995281219482, + "eval_runtime": 158.4338, + "eval_samples_per_second": 400.647, + "eval_steps_per_second": 1.565, + "step": 1300000 + }, + { + "epoch": 5.296321681982383, + "grad_norm": 8.101085662841797, + "learning_rate": 0.0004405977970181446, + "loss": 7.5637, + "step": 1300100 + }, + { + "epoch": 5.296729060005765, + "grad_norm": 7.415262222290039, + "learning_rate": 0.00044031923684831337, + "loss": 7.5345, + "step": 1300200 + }, + { + "epoch": 5.2971364380291455, + "grad_norm": 9.402379035949707, + "learning_rate": 0.0004400407583063554, + "loss": 7.5537, + "step": 1300300 + }, + { + "epoch": 5.297543816052527, + "grad_norm": 13.689456939697266, + "learning_rate": 0.0004397623614032804, + "loss": 7.559, + "step": 1300400 + }, + { + "epoch": 5.2979511940759085, + "grad_norm": 16.62010955810547, + "learning_rate": 0.00043948404615009484, + "loss": 7.5558, + "step": 1300500 + }, + { + "epoch": 5.29835857209929, + "grad_norm": 22.214908599853516, + "learning_rate": 0.00043920581255780237, + "loss": 7.5524, + "step": 1300600 + }, + { + "epoch": 5.298765950122672, + "grad_norm": 15.187567710876465, + "learning_rate": 0.0004389276606374029, + "loss": 7.5675, + "step": 1300700 + }, + { + "epoch": 5.299173328146053, + "grad_norm": 9.365309715270996, + "learning_rate": 0.0004386495903998932, + "loss": 7.5302, + "step": 1300800 + }, + { + "epoch": 5.299580706169435, + "grad_norm": 13.13938045501709, + "learning_rate": 0.0004383716018562663, + "loss": 7.5481, + "step": 1300900 + }, + { + "epoch": 5.299988084192816, + "grad_norm": 17.353683471679688, + "learning_rate": 0.00043809369501751324, + "loss": 7.5162, + "step": 1301000 + }, + { + "epoch": 5.299988084192816, + "eval_MaskedAccuracy": 0.5124460678659971, + "eval_loss": 1.5921168327331543, + "eval_runtime": 174.3072, + "eval_samples_per_second": 364.162, + "eval_steps_per_second": 1.423, + "step": 1301000 + }, + { + "epoch": 5.300395462216198, + "grad_norm": 5.176187515258789, + "learning_rate": 0.0004378158698946208, + "loss": 7.5158, + "step": 1301100 + }, + { + "epoch": 5.300802840239579, + "grad_norm": 29.1513614654541, + "learning_rate": 0.0004375381264985733, + "loss": 7.5413, + "step": 1301200 + }, + { + "epoch": 5.301210218262961, + "grad_norm": 8.285941123962402, + "learning_rate": 0.00043726046484035075, + "loss": 7.5578, + "step": 1301300 + }, + { + "epoch": 5.301617596286342, + "grad_norm": 20.85906410217285, + "learning_rate": 0.0004369828849309314, + "loss": 7.5582, + "step": 1301400 + }, + { + "epoch": 5.302024974309724, + "grad_norm": 9.398735046386719, + "learning_rate": 0.0004367053867812896, + "loss": 7.5808, + "step": 1301500 + }, + { + "epoch": 5.3024323523331045, + "grad_norm": 3.6359591484069824, + "learning_rate": 0.0004364279704023959, + "loss": 7.5516, + "step": 1301600 + }, + { + "epoch": 5.302839730356486, + "grad_norm": 9.873467445373535, + "learning_rate": 0.0004361506358052178, + "loss": 7.5499, + "step": 1301700 + }, + { + "epoch": 5.303247108379868, + "grad_norm": 10.540246963500977, + "learning_rate": 0.0004358733830007199, + "loss": 7.5431, + "step": 1301800 + }, + { + "epoch": 5.303654486403249, + "grad_norm": 14.299480438232422, + "learning_rate": 0.0004355962119998637, + "loss": 7.5704, + "step": 1301900 + }, + { + "epoch": 5.304061864426631, + "grad_norm": 4.484654426574707, + "learning_rate": 0.0004353191228136075, + "loss": 7.5667, + "step": 1302000 + }, + { + "epoch": 5.304061864426631, + "eval_MaskedAccuracy": 0.5124708999618712, + "eval_loss": 1.5887837409973145, + "eval_runtime": 160.61, + "eval_samples_per_second": 395.218, + "eval_steps_per_second": 1.544, + "step": 1302000 + }, + { + "epoch": 5.304469242450012, + "grad_norm": 17.846599578857422, + "learning_rate": 0.0004350421154529061, + "loss": 7.5863, + "step": 1302100 + }, + { + "epoch": 5.304876620473394, + "grad_norm": 7.631462574005127, + "learning_rate": 0.0004347651899287102, + "loss": 7.5704, + "step": 1302200 + }, + { + "epoch": 5.305283998496775, + "grad_norm": 7.09050989151001, + "learning_rate": 0.00043448834625196874, + "loss": 7.5547, + "step": 1302300 + }, + { + "epoch": 5.305691376520157, + "grad_norm": 13.030946731567383, + "learning_rate": 0.0004342115844336266, + "loss": 7.5227, + "step": 1302400 + }, + { + "epoch": 5.306098754543538, + "grad_norm": 19.24678611755371, + "learning_rate": 0.00043393490448462616, + "loss": 7.5469, + "step": 1302500 + }, + { + "epoch": 5.30650613256692, + "grad_norm": 12.20937728881836, + "learning_rate": 0.0004336583064159059, + "loss": 7.5525, + "step": 1302600 + }, + { + "epoch": 5.3069135105903005, + "grad_norm": 4.572175979614258, + "learning_rate": 0.0004333817902384007, + "loss": 7.5473, + "step": 1302700 + }, + { + "epoch": 5.307320888613682, + "grad_norm": 7.789766311645508, + "learning_rate": 0.0004331053559630435, + "loss": 7.5323, + "step": 1302800 + }, + { + "epoch": 5.3077282666370635, + "grad_norm": 7.178048610687256, + "learning_rate": 0.000432829003600763, + "loss": 7.5074, + "step": 1302900 + }, + { + "epoch": 5.308135644660445, + "grad_norm": 14.05922794342041, + "learning_rate": 0.0004325527331624844, + "loss": 7.5447, + "step": 1303000 + }, + { + "epoch": 5.308135644660445, + "eval_MaskedAccuracy": 0.5133053084697339, + "eval_loss": 1.5874871015548706, + "eval_runtime": 173.7811, + "eval_samples_per_second": 365.264, + "eval_steps_per_second": 1.427, + "step": 1303000 + }, + { + "epoch": 5.308543022683827, + "grad_norm": 11.715259552001953, + "learning_rate": 0.000432276544659131, + "loss": 7.5571, + "step": 1303100 + }, + { + "epoch": 5.308950400707208, + "grad_norm": 5.756912708282471, + "learning_rate": 0.0004320004381016209, + "loss": 7.5376, + "step": 1303200 + }, + { + "epoch": 5.30935777873059, + "grad_norm": 19.225601196289062, + "learning_rate": 0.00043172441350087116, + "loss": 7.5668, + "step": 1303300 + }, + { + "epoch": 5.309765156753971, + "grad_norm": 13.775434494018555, + "learning_rate": 0.00043144847086779356, + "loss": 7.5564, + "step": 1303400 + }, + { + "epoch": 5.310172534777353, + "grad_norm": 11.389219284057617, + "learning_rate": 0.00043117261021329814, + "loss": 7.5481, + "step": 1303500 + }, + { + "epoch": 5.310579912800734, + "grad_norm": 15.047318458557129, + "learning_rate": 0.00043089683154829133, + "loss": 7.5344, + "step": 1303600 + }, + { + "epoch": 5.310987290824116, + "grad_norm": 7.878559589385986, + "learning_rate": 0.0004306211348836748, + "loss": 7.5367, + "step": 1303700 + }, + { + "epoch": 5.311394668847497, + "grad_norm": 15.258581161499023, + "learning_rate": 0.00043034552023034985, + "loss": 7.5238, + "step": 1303800 + }, + { + "epoch": 5.311802046870879, + "grad_norm": 4.573291778564453, + "learning_rate": 0.00043006998759921263, + "loss": 7.538, + "step": 1303900 + }, + { + "epoch": 5.3122094248942595, + "grad_norm": 5.569906234741211, + "learning_rate": 0.00042979453700115654, + "loss": 7.5354, + "step": 1304000 + }, + { + "epoch": 5.3122094248942595, + "eval_MaskedAccuracy": 0.51245261577088, + "eval_loss": 1.5892739295959473, + "eval_runtime": 157.9994, + "eval_samples_per_second": 401.748, + "eval_steps_per_second": 1.57, + "step": 1304000 + }, + { + "epoch": 5.312616802917641, + "grad_norm": 25.85943603515625, + "learning_rate": 0.0004295191684470714, + "loss": 7.5452, + "step": 1304100 + }, + { + "epoch": 5.313024180941023, + "grad_norm": 8.31718635559082, + "learning_rate": 0.000429243881947844, + "loss": 7.5537, + "step": 1304200 + }, + { + "epoch": 5.313431558964404, + "grad_norm": 5.419006824493408, + "learning_rate": 0.0004289686775143576, + "loss": 7.5556, + "step": 1304300 + }, + { + "epoch": 5.313838936987786, + "grad_norm": 5.167078495025635, + "learning_rate": 0.000428693555157493, + "loss": 7.5566, + "step": 1304400 + }, + { + "epoch": 5.314246315011167, + "grad_norm": 6.955180644989014, + "learning_rate": 0.00042841851488812705, + "loss": 7.5608, + "step": 1304500 + }, + { + "epoch": 5.314653693034549, + "grad_norm": 15.398619651794434, + "learning_rate": 0.00042814355671713335, + "loss": 7.5092, + "step": 1304600 + }, + { + "epoch": 5.31506107105793, + "grad_norm": 4.185786247253418, + "learning_rate": 0.00042786868065538197, + "loss": 7.5604, + "step": 1304700 + }, + { + "epoch": 5.315468449081312, + "grad_norm": 10.055868148803711, + "learning_rate": 0.0004275938867137414, + "loss": 7.5345, + "step": 1304800 + }, + { + "epoch": 5.315875827104693, + "grad_norm": 6.670746803283691, + "learning_rate": 0.00042731917490307575, + "loss": 7.5433, + "step": 1304900 + }, + { + "epoch": 5.316283205128075, + "grad_norm": 10.293941497802734, + "learning_rate": 0.0004270445452342453, + "loss": 7.5425, + "step": 1305000 + }, + { + "epoch": 5.316283205128075, + "eval_MaskedAccuracy": 0.5132595393285279, + "eval_loss": 1.5809528827667236, + "eval_runtime": 170.4779, + "eval_samples_per_second": 372.342, + "eval_steps_per_second": 1.455, + "step": 1305000 + }, + { + "epoch": 5.316690583151456, + "grad_norm": 9.087493896484375, + "learning_rate": 0.0004267699977181075, + "loss": 7.5326, + "step": 1305100 + }, + { + "epoch": 5.317097961174838, + "grad_norm": 9.560127258300781, + "learning_rate": 0.0004264955323655171, + "loss": 7.52, + "step": 1305200 + }, + { + "epoch": 5.3175053391982186, + "grad_norm": 2.9714224338531494, + "learning_rate": 0.0004262211491873249, + "loss": 7.5339, + "step": 1305300 + }, + { + "epoch": 5.3179127172216, + "grad_norm": 12.012248039245605, + "learning_rate": 0.00042594684819437825, + "loss": 7.5305, + "step": 1305400 + }, + { + "epoch": 5.318320095244982, + "grad_norm": 22.136690139770508, + "learning_rate": 0.0004256726293975226, + "loss": 7.5278, + "step": 1305500 + }, + { + "epoch": 5.318727473268363, + "grad_norm": 11.67299747467041, + "learning_rate": 0.00042539849280759904, + "loss": 7.5272, + "step": 1305600 + }, + { + "epoch": 5.319134851291745, + "grad_norm": 18.251789093017578, + "learning_rate": 0.0004251244384354456, + "loss": 7.5241, + "step": 1305700 + }, + { + "epoch": 5.319542229315126, + "grad_norm": 11.959935188293457, + "learning_rate": 0.0004248504662918969, + "loss": 7.5191, + "step": 1305800 + }, + { + "epoch": 5.319949607338508, + "grad_norm": 3.573176145553589, + "learning_rate": 0.0004245765763877847, + "loss": 7.4944, + "step": 1305900 + }, + { + "epoch": 5.320356985361889, + "grad_norm": 12.186860084533691, + "learning_rate": 0.0004243027687339373, + "loss": 7.5227, + "step": 1306000 + }, + { + "epoch": 5.320356985361889, + "eval_MaskedAccuracy": 0.5127432091581579, + "eval_loss": 1.5990056991577148, + "eval_runtime": 175.7337, + "eval_samples_per_second": 361.206, + "eval_steps_per_second": 1.411, + "step": 1306000 + }, + { + "epoch": 5.320764363385271, + "grad_norm": 4.960899829864502, + "learning_rate": 0.0004240290433411798, + "loss": 7.5576, + "step": 1306100 + }, + { + "epoch": 5.321171741408652, + "grad_norm": 3.94295334815979, + "learning_rate": 0.00042375540022033403, + "loss": 7.5552, + "step": 1306200 + }, + { + "epoch": 5.321579119432034, + "grad_norm": 10.346899032592773, + "learning_rate": 0.00042348183938221843, + "loss": 7.5124, + "step": 1306300 + }, + { + "epoch": 5.321986497455415, + "grad_norm": 28.08512306213379, + "learning_rate": 0.0004232083608376485, + "loss": 7.5388, + "step": 1306400 + }, + { + "epoch": 5.322393875478797, + "grad_norm": 5.102529048919678, + "learning_rate": 0.00042293496459743673, + "loss": 7.5298, + "step": 1306500 + }, + { + "epoch": 5.322801253502178, + "grad_norm": 6.559290885925293, + "learning_rate": 0.0004226616506723909, + "loss": 7.5538, + "step": 1306600 + }, + { + "epoch": 5.323208631525559, + "grad_norm": 21.693456649780273, + "learning_rate": 0.0004223884190733167, + "loss": 7.5271, + "step": 1306700 + }, + { + "epoch": 5.323616009548941, + "grad_norm": 4.1509809494018555, + "learning_rate": 0.00042211526981101715, + "loss": 7.5492, + "step": 1306800 + }, + { + "epoch": 5.324023387572322, + "grad_norm": 10.603466987609863, + "learning_rate": 0.00042184220289629115, + "loss": 7.5463, + "step": 1306900 + }, + { + "epoch": 5.324430765595704, + "grad_norm": 5.296978950500488, + "learning_rate": 0.0004215692183399348, + "loss": 7.5322, + "step": 1307000 + }, + { + "epoch": 5.324430765595704, + "eval_MaskedAccuracy": 0.5130697215877362, + "eval_loss": 1.5880545377731323, + "eval_runtime": 171.6127, + "eval_samples_per_second": 369.879, + "eval_steps_per_second": 1.445, + "step": 1307000 + }, + { + "epoch": 5.324838143619085, + "grad_norm": 6.5792436599731445, + "learning_rate": 0.00042129631615274004, + "loss": 7.5571, + "step": 1307100 + }, + { + "epoch": 5.325245521642467, + "grad_norm": 5.165150165557861, + "learning_rate": 0.0004210234963454961, + "loss": 7.5089, + "step": 1307200 + }, + { + "epoch": 5.325652899665848, + "grad_norm": 16.115964889526367, + "learning_rate": 0.00042075075892898976, + "loss": 7.5474, + "step": 1307300 + }, + { + "epoch": 5.32606027768923, + "grad_norm": 9.52346420288086, + "learning_rate": 0.00042047810391400284, + "loss": 7.5395, + "step": 1307400 + }, + { + "epoch": 5.326467655712611, + "grad_norm": 18.871601104736328, + "learning_rate": 0.0004202055313113151, + "loss": 7.5501, + "step": 1307500 + }, + { + "epoch": 5.326875033735993, + "grad_norm": 19.175764083862305, + "learning_rate": 0.00041993304113170307, + "loss": 7.5676, + "step": 1307600 + }, + { + "epoch": 5.327282411759374, + "grad_norm": 26.013240814208984, + "learning_rate": 0.0004196606333859392, + "loss": 7.5426, + "step": 1307700 + }, + { + "epoch": 5.327689789782755, + "grad_norm": 8.625907897949219, + "learning_rate": 0.00041938830808479463, + "loss": 7.5382, + "step": 1307800 + }, + { + "epoch": 5.328097167806137, + "grad_norm": 5.5241618156433105, + "learning_rate": 0.00041911606523903437, + "loss": 7.5174, + "step": 1307900 + }, + { + "epoch": 5.328504545829518, + "grad_norm": 7.408745288848877, + "learning_rate": 0.0004188439048594226, + "loss": 7.5024, + "step": 1308000 + }, + { + "epoch": 5.328504545829518, + "eval_MaskedAccuracy": 0.5130457488440296, + "eval_loss": 1.5893456935882568, + "eval_runtime": 167.7813, + "eval_samples_per_second": 378.326, + "eval_steps_per_second": 1.478, + "step": 1308000 + }, + { + "epoch": 5.3289119238529, + "grad_norm": 20.802061080932617, + "learning_rate": 0.0004185718269567188, + "loss": 7.5287, + "step": 1308100 + }, + { + "epoch": 5.329319301876281, + "grad_norm": 13.224457740783691, + "learning_rate": 0.0004182998315416791, + "loss": 7.5293, + "step": 1308200 + }, + { + "epoch": 5.329726679899663, + "grad_norm": 13.797801971435547, + "learning_rate": 0.00041802791862505803, + "loss": 7.5389, + "step": 1308300 + }, + { + "epoch": 5.330134057923044, + "grad_norm": 10.780403137207031, + "learning_rate": 0.0004177560882176052, + "loss": 7.5695, + "step": 1308400 + }, + { + "epoch": 5.330541435946426, + "grad_norm": 24.98788070678711, + "learning_rate": 0.0004174843403300676, + "loss": 7.5097, + "step": 1308500 + }, + { + "epoch": 5.330948813969807, + "grad_norm": 17.747987747192383, + "learning_rate": 0.0004172126749731888, + "loss": 7.5208, + "step": 1308600 + }, + { + "epoch": 5.331356191993189, + "grad_norm": 16.77276039123535, + "learning_rate": 0.0004169410921577086, + "loss": 7.5197, + "step": 1308700 + }, + { + "epoch": 5.33176357001657, + "grad_norm": 16.676837921142578, + "learning_rate": 0.00041666959189436543, + "loss": 7.5426, + "step": 1308800 + }, + { + "epoch": 5.332170948039952, + "grad_norm": 19.803956985473633, + "learning_rate": 0.0004163981741938923, + "loss": 7.5245, + "step": 1308900 + }, + { + "epoch": 5.332578326063333, + "grad_norm": 17.46125030517578, + "learning_rate": 0.0004161268390670199, + "loss": 7.5737, + "step": 1309000 + }, + { + "epoch": 5.332578326063333, + "eval_MaskedAccuracy": 0.5129704649628153, + "eval_loss": 1.5894827842712402, + "eval_runtime": 161.9856, + "eval_samples_per_second": 391.862, + "eval_steps_per_second": 1.531, + "step": 1309000 + }, + { + "epoch": 5.332985704086714, + "grad_norm": 7.310243129730225, + "learning_rate": 0.00041585558652447593, + "loss": 7.5035, + "step": 1309100 + }, + { + "epoch": 5.333393082110096, + "grad_norm": 5.5625319480896, + "learning_rate": 0.0004155844165769841, + "loss": 7.5365, + "step": 1309200 + }, + { + "epoch": 5.333800460133477, + "grad_norm": 18.815752029418945, + "learning_rate": 0.00041531332923526497, + "loss": 7.509, + "step": 1309300 + }, + { + "epoch": 5.334207838156859, + "grad_norm": 13.04920482635498, + "learning_rate": 0.0004150423245100366, + "loss": 7.5357, + "step": 1309400 + }, + { + "epoch": 5.33461521618024, + "grad_norm": 5.087550640106201, + "learning_rate": 0.00041477140241201276, + "loss": 7.5255, + "step": 1309500 + }, + { + "epoch": 5.335022594203622, + "grad_norm": 5.4975056648254395, + "learning_rate": 0.0004145005629519049, + "loss": 7.5456, + "step": 1309600 + }, + { + "epoch": 5.335429972227003, + "grad_norm": 10.997625350952148, + "learning_rate": 0.00041422980614042065, + "loss": 7.5531, + "step": 1309700 + }, + { + "epoch": 5.335837350250385, + "grad_norm": 12.89289665222168, + "learning_rate": 0.0004139591319882645, + "loss": 7.5631, + "step": 1309800 + }, + { + "epoch": 5.336244728273766, + "grad_norm": 10.376225471496582, + "learning_rate": 0.0004136885405061371, + "loss": 7.5583, + "step": 1309900 + }, + { + "epoch": 5.336652106297148, + "grad_norm": 4.5756516456604, + "learning_rate": 0.0004134180317047375, + "loss": 7.5299, + "step": 1310000 + }, + { + "epoch": 5.336652106297148, + "eval_MaskedAccuracy": 0.5123939334447258, + "eval_loss": 1.5862807035446167, + "eval_runtime": 174.3439, + "eval_samples_per_second": 364.085, + "eval_steps_per_second": 1.422, + "step": 1310000 + }, + { + "epoch": 5.3370594843205295, + "grad_norm": 6.5817952156066895, + "learning_rate": 0.0004131476055947595, + "loss": 7.5295, + "step": 1310100 + }, + { + "epoch": 5.337466862343911, + "grad_norm": 9.626608848571777, + "learning_rate": 0.00041287726218689375, + "loss": 7.5487, + "step": 1310200 + }, + { + "epoch": 5.337874240367292, + "grad_norm": 13.104557991027832, + "learning_rate": 0.0004126070014918302, + "loss": 7.5388, + "step": 1310300 + }, + { + "epoch": 5.338281618390673, + "grad_norm": 10.869489669799805, + "learning_rate": 0.0004123368235202523, + "loss": 7.5224, + "step": 1310400 + }, + { + "epoch": 5.338688996414055, + "grad_norm": 8.736258506774902, + "learning_rate": 0.0004120667282828421, + "loss": 7.5527, + "step": 1310500 + }, + { + "epoch": 5.339096374437436, + "grad_norm": 17.685625076293945, + "learning_rate": 0.00041179671579027826, + "loss": 7.5197, + "step": 1310600 + }, + { + "epoch": 5.339503752460818, + "grad_norm": 23.2597713470459, + "learning_rate": 0.00041152678605323536, + "loss": 7.5461, + "step": 1310700 + }, + { + "epoch": 5.339911130484199, + "grad_norm": 5.2023162841796875, + "learning_rate": 0.0004112569390823852, + "loss": 7.5447, + "step": 1310800 + }, + { + "epoch": 5.340318508507581, + "grad_norm": 10.159354209899902, + "learning_rate": 0.0004109871748883967, + "loss": 7.5404, + "step": 1310900 + }, + { + "epoch": 5.340725886530962, + "grad_norm": 13.514359474182129, + "learning_rate": 0.00041071749348193447, + "loss": 7.5503, + "step": 1311000 + }, + { + "epoch": 5.340725886530962, + "eval_MaskedAccuracy": 0.5126926209186771, + "eval_loss": 1.5796821117401123, + "eval_runtime": 183.4886, + "eval_samples_per_second": 345.94, + "eval_steps_per_second": 1.352, + "step": 1311000 + }, + { + "epoch": 5.341133264554344, + "grad_norm": 8.904132843017578, + "learning_rate": 0.00041044789487366105, + "loss": 7.5385, + "step": 1311100 + }, + { + "epoch": 5.341540642577725, + "grad_norm": 20.313217163085938, + "learning_rate": 0.0004101783790742347, + "loss": 7.5527, + "step": 1311200 + }, + { + "epoch": 5.341948020601107, + "grad_norm": 17.716442108154297, + "learning_rate": 0.0004099089460943104, + "loss": 7.5334, + "step": 1311300 + }, + { + "epoch": 5.3423553986244885, + "grad_norm": 15.50544261932373, + "learning_rate": 0.0004096395959445412, + "loss": 7.5475, + "step": 1311400 + }, + { + "epoch": 5.34276277664787, + "grad_norm": 3.44032621383667, + "learning_rate": 0.00040937032863557567, + "loss": 7.542, + "step": 1311500 + }, + { + "epoch": 5.343170154671251, + "grad_norm": 4.819403171539307, + "learning_rate": 0.00040910114417805815, + "loss": 7.5638, + "step": 1311600 + }, + { + "epoch": 5.343577532694632, + "grad_norm": 12.281195640563965, + "learning_rate": 0.0004088320425826329, + "loss": 7.529, + "step": 1311700 + }, + { + "epoch": 5.343984910718014, + "grad_norm": 12.83141803741455, + "learning_rate": 0.00040856302385993814, + "loss": 7.5529, + "step": 1311800 + }, + { + "epoch": 5.344392288741395, + "grad_norm": 3.5825467109680176, + "learning_rate": 0.0004082940880206096, + "loss": 7.52, + "step": 1311900 + }, + { + "epoch": 5.344799666764777, + "grad_norm": 18.3764705657959, + "learning_rate": 0.00040802523507527984, + "loss": 7.5457, + "step": 1312000 + }, + { + "epoch": 5.344799666764777, + "eval_MaskedAccuracy": 0.5125034261440891, + "eval_loss": 1.5950565338134766, + "eval_runtime": 198.6051, + "eval_samples_per_second": 319.609, + "eval_steps_per_second": 1.249, + "step": 1312000 + }, + { + "epoch": 5.345207044788158, + "grad_norm": 8.003653526306152, + "learning_rate": 0.00040775646503457764, + "loss": 7.5668, + "step": 1312100 + }, + { + "epoch": 5.34561442281154, + "grad_norm": 13.133673667907715, + "learning_rate": 0.0004074877779091297, + "loss": 7.5334, + "step": 1312200 + }, + { + "epoch": 5.346021800834921, + "grad_norm": 6.1893486976623535, + "learning_rate": 0.00040721917370955814, + "loss": 7.5845, + "step": 1312300 + }, + { + "epoch": 5.346429178858303, + "grad_norm": 9.516192436218262, + "learning_rate": 0.00040695065244648204, + "loss": 7.5335, + "step": 1312400 + }, + { + "epoch": 5.3468365568816845, + "grad_norm": 21.823017120361328, + "learning_rate": 0.00040668221413051774, + "loss": 7.4975, + "step": 1312500 + }, + { + "epoch": 5.347243934905066, + "grad_norm": 12.48024845123291, + "learning_rate": 0.0004064138587722779, + "loss": 7.5477, + "step": 1312600 + }, + { + "epoch": 5.347651312928447, + "grad_norm": 14.158882141113281, + "learning_rate": 0.00040614558638237246, + "loss": 7.5063, + "step": 1312700 + }, + { + "epoch": 5.348058690951828, + "grad_norm": 7.386480808258057, + "learning_rate": 0.00040587739697140753, + "loss": 7.5617, + "step": 1312800 + }, + { + "epoch": 5.34846606897521, + "grad_norm": 8.702553749084473, + "learning_rate": 0.00040560929054998617, + "loss": 7.5234, + "step": 1312900 + }, + { + "epoch": 5.348873446998591, + "grad_norm": 13.563511848449707, + "learning_rate": 0.00040534126712870813, + "loss": 7.5658, + "step": 1313000 + }, + { + "epoch": 5.348873446998591, + "eval_MaskedAccuracy": 0.5124022373641511, + "eval_loss": 1.5936604738235474, + "eval_runtime": 183.3132, + "eval_samples_per_second": 346.271, + "eval_steps_per_second": 1.353, + "step": 1313000 + }, + { + "epoch": 5.349280825021973, + "grad_norm": 9.220017433166504, + "learning_rate": 0.00040507332671816873, + "loss": 7.559, + "step": 1313100 + }, + { + "epoch": 5.349688203045354, + "grad_norm": 10.695631980895996, + "learning_rate": 0.000404805469328962, + "loss": 7.5328, + "step": 1313200 + }, + { + "epoch": 5.350095581068736, + "grad_norm": 10.029516220092773, + "learning_rate": 0.00040453769497167803, + "loss": 7.5139, + "step": 1313300 + }, + { + "epoch": 5.350502959092117, + "grad_norm": 12.86770248413086, + "learning_rate": 0.0004042700036569035, + "loss": 7.5435, + "step": 1313400 + }, + { + "epoch": 5.350910337115499, + "grad_norm": 5.891438961029053, + "learning_rate": 0.00040400239539522084, + "loss": 7.5367, + "step": 1313500 + }, + { + "epoch": 5.35131771513888, + "grad_norm": 8.02744197845459, + "learning_rate": 0.0004037348701972099, + "loss": 7.5255, + "step": 1313600 + }, + { + "epoch": 5.351725093162262, + "grad_norm": 19.7856388092041, + "learning_rate": 0.00040346742807344845, + "loss": 7.5718, + "step": 1313700 + }, + { + "epoch": 5.3521324711856435, + "grad_norm": 9.17331600189209, + "learning_rate": 0.00040320006903450976, + "loss": 7.5278, + "step": 1313800 + }, + { + "epoch": 5.352539849209025, + "grad_norm": 5.395483016967773, + "learning_rate": 0.0004029327930909628, + "loss": 7.552, + "step": 1313900 + }, + { + "epoch": 5.352947227232406, + "grad_norm": 7.13218879699707, + "learning_rate": 0.0004026656002533755, + "loss": 7.5595, + "step": 1314000 + }, + { + "epoch": 5.352947227232406, + "eval_MaskedAccuracy": 0.5126087552568914, + "eval_loss": 1.5851060152053833, + "eval_runtime": 182.1563, + "eval_samples_per_second": 348.47, + "eval_steps_per_second": 1.361, + "step": 1314000 + }, + { + "epoch": 5.353354605255787, + "grad_norm": 13.182037353515625, + "learning_rate": 0.0004023984905323112, + "loss": 7.5605, + "step": 1314100 + }, + { + "epoch": 5.353761983279169, + "grad_norm": 12.281851768493652, + "learning_rate": 0.00040213146393833, + "loss": 7.5284, + "step": 1314200 + }, + { + "epoch": 5.35416936130255, + "grad_norm": 5.3217949867248535, + "learning_rate": 0.000401864520481989, + "loss": 7.5249, + "step": 1314300 + }, + { + "epoch": 5.354576739325932, + "grad_norm": 20.306541442871094, + "learning_rate": 0.0004015976601738416, + "loss": 7.5326, + "step": 1314400 + }, + { + "epoch": 5.354984117349313, + "grad_norm": 6.144830226898193, + "learning_rate": 0.0004013308830244386, + "loss": 7.5371, + "step": 1314500 + }, + { + "epoch": 5.355391495372695, + "grad_norm": 5.244483470916748, + "learning_rate": 0.00040106418904432725, + "loss": 7.5152, + "step": 1314600 + }, + { + "epoch": 5.355798873396076, + "grad_norm": 5.141518592834473, + "learning_rate": 0.00040079757824405083, + "loss": 7.5004, + "step": 1314700 + }, + { + "epoch": 5.356206251419458, + "grad_norm": 8.617745399475098, + "learning_rate": 0.00040053105063415055, + "loss": 7.5898, + "step": 1314800 + }, + { + "epoch": 5.3566136294428395, + "grad_norm": 6.266997337341309, + "learning_rate": 0.00040026460622516355, + "loss": 7.4993, + "step": 1314900 + }, + { + "epoch": 5.357021007466221, + "grad_norm": 22.494998931884766, + "learning_rate": 0.0003999982450276234, + "loss": 7.5457, + "step": 1315000 + }, + { + "epoch": 5.357021007466221, + "eval_MaskedAccuracy": 0.5129410097786962, + "eval_loss": 1.588871717453003, + "eval_runtime": 165.149, + "eval_samples_per_second": 384.356, + "eval_steps_per_second": 1.502, + "step": 1315000 + }, + { + "epoch": 5.3574283854896025, + "grad_norm": 23.0819091796875, + "learning_rate": 0.0003997319670520608, + "loss": 7.5326, + "step": 1315100 + }, + { + "epoch": 5.357835763512984, + "grad_norm": 22.44310760498047, + "learning_rate": 0.0003994657723090032, + "loss": 7.5444, + "step": 1315200 + }, + { + "epoch": 5.358243141536365, + "grad_norm": 8.22282886505127, + "learning_rate": 0.00039919966080897485, + "loss": 7.5606, + "step": 1315300 + }, + { + "epoch": 5.358650519559746, + "grad_norm": 20.609516143798828, + "learning_rate": 0.0003989336325624969, + "loss": 7.5357, + "step": 1315400 + }, + { + "epoch": 5.359057897583128, + "grad_norm": 6.432299613952637, + "learning_rate": 0.0003986676875800865, + "loss": 7.5586, + "step": 1315500 + }, + { + "epoch": 5.359465275606509, + "grad_norm": 6.51712703704834, + "learning_rate": 0.00039840182587225713, + "loss": 7.5492, + "step": 1315600 + }, + { + "epoch": 5.359872653629891, + "grad_norm": 8.8421630859375, + "learning_rate": 0.00039813604744952066, + "loss": 7.5369, + "step": 1315700 + }, + { + "epoch": 5.360280031653272, + "grad_norm": 4.876821994781494, + "learning_rate": 0.0003978703523223846, + "loss": 7.5407, + "step": 1315800 + }, + { + "epoch": 5.360687409676654, + "grad_norm": 9.427638053894043, + "learning_rate": 0.000397604740501353, + "loss": 7.5552, + "step": 1315900 + }, + { + "epoch": 5.3610947877000354, + "grad_norm": 12.400432586669922, + "learning_rate": 0.00039733921199692725, + "loss": 7.5291, + "step": 1316000 + }, + { + "epoch": 5.3610947877000354, + "eval_MaskedAccuracy": 0.5122422487512329, + "eval_loss": 1.597111701965332, + "eval_runtime": 194.252, + "eval_samples_per_second": 326.771, + "eval_steps_per_second": 1.277, + "step": 1316000 + }, + { + "epoch": 5.361502165723417, + "grad_norm": 18.319015502929688, + "learning_rate": 0.00039707376681960555, + "loss": 7.5306, + "step": 1316100 + }, + { + "epoch": 5.3619095437467985, + "grad_norm": 6.772778034210205, + "learning_rate": 0.0003968084049798815, + "loss": 7.5176, + "step": 1316200 + }, + { + "epoch": 5.36231692177018, + "grad_norm": 10.569714546203613, + "learning_rate": 0.00039654312648824665, + "loss": 7.52, + "step": 1316300 + }, + { + "epoch": 5.362724299793562, + "grad_norm": 12.517595291137695, + "learning_rate": 0.00039627793135518866, + "loss": 7.5064, + "step": 1316400 + }, + { + "epoch": 5.363131677816943, + "grad_norm": 8.718416213989258, + "learning_rate": 0.0003960128195911921, + "loss": 7.5677, + "step": 1316500 + }, + { + "epoch": 5.363539055840324, + "grad_norm": 18.29831314086914, + "learning_rate": 0.0003957477912067381, + "loss": 7.5475, + "step": 1316600 + }, + { + "epoch": 5.363946433863705, + "grad_norm": 19.1815242767334, + "learning_rate": 0.00039548284621230564, + "loss": 7.5555, + "step": 1316700 + }, + { + "epoch": 5.364353811887087, + "grad_norm": 8.209659576416016, + "learning_rate": 0.0003952179846183688, + "loss": 7.5113, + "step": 1316800 + }, + { + "epoch": 5.364761189910468, + "grad_norm": 2.8940536975860596, + "learning_rate": 0.00039495320643539913, + "loss": 7.5219, + "step": 1316900 + }, + { + "epoch": 5.36516856793385, + "grad_norm": 6.335155487060547, + "learning_rate": 0.0003946885116738643, + "loss": 7.5488, + "step": 1317000 + }, + { + "epoch": 5.36516856793385, + "eval_MaskedAccuracy": 0.5125834465282173, + "eval_loss": 1.5992813110351562, + "eval_runtime": 170.2243, + "eval_samples_per_second": 372.896, + "eval_steps_per_second": 1.457, + "step": 1317000 + }, + { + "epoch": 5.365575945957231, + "grad_norm": 3.8846523761749268, + "learning_rate": 0.0003944239003442292, + "loss": 7.5064, + "step": 1317100 + }, + { + "epoch": 5.365983323980613, + "grad_norm": 5.232937812805176, + "learning_rate": 0.0003941593724569558, + "loss": 7.5324, + "step": 1317200 + }, + { + "epoch": 5.3663907020039945, + "grad_norm": 12.589492797851562, + "learning_rate": 0.0003938949280225023, + "loss": 7.5335, + "step": 1317300 + }, + { + "epoch": 5.366798080027376, + "grad_norm": 28.246875762939453, + "learning_rate": 0.00039363056705132285, + "loss": 7.5515, + "step": 1317400 + }, + { + "epoch": 5.3672054580507575, + "grad_norm": 10.634908676147461, + "learning_rate": 0.00039336628955386925, + "loss": 7.5299, + "step": 1317500 + }, + { + "epoch": 5.367612836074139, + "grad_norm": 13.630500793457031, + "learning_rate": 0.00039310209554058943, + "loss": 7.5211, + "step": 1317600 + }, + { + "epoch": 5.36802021409752, + "grad_norm": 5.530129909515381, + "learning_rate": 0.0003928379850219299, + "loss": 7.5065, + "step": 1317700 + }, + { + "epoch": 5.368427592120901, + "grad_norm": 6.207881450653076, + "learning_rate": 0.0003925739580083312, + "loss": 7.5259, + "step": 1317800 + }, + { + "epoch": 5.368834970144283, + "grad_norm": 13.467637062072754, + "learning_rate": 0.0003923100145102327, + "loss": 7.5184, + "step": 1317900 + }, + { + "epoch": 5.369242348167664, + "grad_norm": 3.3274197578430176, + "learning_rate": 0.00039204615453806795, + "loss": 7.5238, + "step": 1318000 + }, + { + "epoch": 5.369242348167664, + "eval_MaskedAccuracy": 0.5131843151826292, + "eval_loss": 1.5855181217193604, + "eval_runtime": 175.5552, + "eval_samples_per_second": 361.573, + "eval_steps_per_second": 1.413, + "step": 1318000 + }, + { + "epoch": 5.369649726191046, + "grad_norm": 11.497682571411133, + "learning_rate": 0.00039178237810227033, + "loss": 7.5357, + "step": 1318100 + }, + { + "epoch": 5.370057104214427, + "grad_norm": 5.816246032714844, + "learning_rate": 0.0003915186852132668, + "loss": 7.5541, + "step": 1318200 + }, + { + "epoch": 5.370464482237809, + "grad_norm": 4.2336297035217285, + "learning_rate": 0.00039125507588148357, + "loss": 7.542, + "step": 1318300 + }, + { + "epoch": 5.3708718602611905, + "grad_norm": 8.212498664855957, + "learning_rate": 0.00039099155011734267, + "loss": 7.5107, + "step": 1318400 + }, + { + "epoch": 5.371279238284572, + "grad_norm": 3.440297842025757, + "learning_rate": 0.00039072810793126187, + "loss": 7.5382, + "step": 1318500 + }, + { + "epoch": 5.3716866163079535, + "grad_norm": 7.391533851623535, + "learning_rate": 0.00039046474933365717, + "loss": 7.5435, + "step": 1318600 + }, + { + "epoch": 5.372093994331335, + "grad_norm": 4.354636192321777, + "learning_rate": 0.00039020147433494, + "loss": 7.5329, + "step": 1318700 + }, + { + "epoch": 5.372501372354717, + "grad_norm": 29.759435653686523, + "learning_rate": 0.0003899382829455192, + "loss": 7.5259, + "step": 1318800 + }, + { + "epoch": 5.372908750378098, + "grad_norm": 5.569962024688721, + "learning_rate": 0.00038967517517580043, + "loss": 7.5521, + "step": 1318900 + }, + { + "epoch": 5.373316128401479, + "grad_norm": 2.95422101020813, + "learning_rate": 0.00038941215103618535, + "loss": 7.5121, + "step": 1319000 + }, + { + "epoch": 5.373316128401479, + "eval_MaskedAccuracy": 0.5129268518777247, + "eval_loss": 1.5870742797851562, + "eval_runtime": 202.1288, + "eval_samples_per_second": 314.037, + "eval_steps_per_second": 1.227, + "step": 1319000 + }, + { + "epoch": 5.37372350642486, + "grad_norm": 3.4544146060943604, + "learning_rate": 0.00038914921053707344, + "loss": 7.5696, + "step": 1319100 + }, + { + "epoch": 5.374130884448242, + "grad_norm": 4.413163185119629, + "learning_rate": 0.00038888635368885925, + "loss": 7.546, + "step": 1319200 + }, + { + "epoch": 5.374538262471623, + "grad_norm": 6.508971691131592, + "learning_rate": 0.0003886235805019351, + "loss": 7.5441, + "step": 1319300 + }, + { + "epoch": 5.374945640495005, + "grad_norm": 13.9869384765625, + "learning_rate": 0.00038836089098669065, + "loss": 7.519, + "step": 1319400 + }, + { + "epoch": 5.375353018518386, + "grad_norm": 4.619554042816162, + "learning_rate": 0.0003880982851535106, + "loss": 7.5285, + "step": 1319500 + }, + { + "epoch": 5.375760396541768, + "grad_norm": 17.703474044799805, + "learning_rate": 0.0003878357630127769, + "loss": 7.5462, + "step": 1319600 + }, + { + "epoch": 5.3761677745651495, + "grad_norm": 22.187644958496094, + "learning_rate": 0.00038757332457486893, + "loss": 7.5366, + "step": 1319700 + }, + { + "epoch": 5.376575152588531, + "grad_norm": 10.332889556884766, + "learning_rate": 0.00038731096985016207, + "loss": 7.5217, + "step": 1319800 + }, + { + "epoch": 5.376982530611913, + "grad_norm": 22.152936935424805, + "learning_rate": 0.0003870486988490289, + "loss": 7.5483, + "step": 1319900 + }, + { + "epoch": 5.377389908635294, + "grad_norm": 7.143672943115234, + "learning_rate": 0.0003867865115818383, + "loss": 7.5636, + "step": 1320000 + }, + { + "epoch": 5.377389908635294, + "eval_MaskedAccuracy": 0.5130267379412599, + "eval_loss": 1.5818291902542114, + "eval_runtime": 173.6924, + "eval_samples_per_second": 365.451, + "eval_steps_per_second": 1.428, + "step": 1320000 + }, + { + "epoch": 5.377797286658676, + "grad_norm": 17.466697692871094, + "learning_rate": 0.00038652440805895567, + "loss": 7.5344, + "step": 1320100 + }, + { + "epoch": 5.378204664682057, + "grad_norm": 3.5994436740875244, + "learning_rate": 0.0003862623882907438, + "loss": 7.5153, + "step": 1320200 + }, + { + "epoch": 5.378612042705438, + "grad_norm": 5.591103553771973, + "learning_rate": 0.0003860004522875611, + "loss": 7.5348, + "step": 1320300 + }, + { + "epoch": 5.379019420728819, + "grad_norm": 3.714869737625122, + "learning_rate": 0.0003857386000597642, + "loss": 7.5269, + "step": 1320400 + }, + { + "epoch": 5.379426798752201, + "grad_norm": 20.146677017211914, + "learning_rate": 0.0003854768316177052, + "loss": 7.5617, + "step": 1320500 + }, + { + "epoch": 5.379834176775582, + "grad_norm": 14.925349235534668, + "learning_rate": 0.00038521514697173286, + "loss": 7.5187, + "step": 1320600 + }, + { + "epoch": 5.380241554798964, + "grad_norm": 5.3701581954956055, + "learning_rate": 0.00038495354613219296, + "loss": 7.5165, + "step": 1320700 + }, + { + "epoch": 5.3806489328223455, + "grad_norm": 8.008158683776855, + "learning_rate": 0.0003846920291094284, + "loss": 7.5342, + "step": 1320800 + }, + { + "epoch": 5.381056310845727, + "grad_norm": 12.682861328125, + "learning_rate": 0.00038443059591377807, + "loss": 7.5345, + "step": 1320900 + }, + { + "epoch": 5.3814636888691085, + "grad_norm": 6.351812839508057, + "learning_rate": 0.00038416924655557767, + "loss": 7.5084, + "step": 1321000 + }, + { + "epoch": 5.3814636888691085, + "eval_MaskedAccuracy": 0.5129036387186814, + "eval_loss": 1.5889383554458618, + "eval_runtime": 165.3506, + "eval_samples_per_second": 383.887, + "eval_steps_per_second": 1.5, + "step": 1321000 + }, + { + "epoch": 5.38187106689249, + "grad_norm": 14.423798561096191, + "learning_rate": 0.0003839079810451603, + "loss": 7.5513, + "step": 1321100 + }, + { + "epoch": 5.382278444915872, + "grad_norm": 14.717927932739258, + "learning_rate": 0.00038364679939285483, + "loss": 7.5411, + "step": 1321200 + }, + { + "epoch": 5.382685822939253, + "grad_norm": 14.870012283325195, + "learning_rate": 0.0003833857016089872, + "loss": 7.5271, + "step": 1321300 + }, + { + "epoch": 5.383093200962635, + "grad_norm": 19.216617584228516, + "learning_rate": 0.0003831246877038793, + "loss": 7.568, + "step": 1321400 + }, + { + "epoch": 5.383500578986016, + "grad_norm": 4.725645065307617, + "learning_rate": 0.0003828637576878519, + "loss": 7.5502, + "step": 1321500 + }, + { + "epoch": 5.383907957009397, + "grad_norm": 3.9537508487701416, + "learning_rate": 0.00038260291157121945, + "loss": 7.5641, + "step": 1321600 + }, + { + "epoch": 5.384315335032778, + "grad_norm": 22.767629623413086, + "learning_rate": 0.0003823421493642955, + "loss": 7.5407, + "step": 1321700 + }, + { + "epoch": 5.38472271305616, + "grad_norm": 8.632490158081055, + "learning_rate": 0.00038208147107738927, + "loss": 7.5435, + "step": 1321800 + }, + { + "epoch": 5.385130091079541, + "grad_norm": 4.072729587554932, + "learning_rate": 0.00038182087672080706, + "loss": 7.5354, + "step": 1321900 + }, + { + "epoch": 5.385537469102923, + "grad_norm": 8.678680419921875, + "learning_rate": 0.0003815603663048508, + "loss": 7.5438, + "step": 1322000 + }, + { + "epoch": 5.385537469102923, + "eval_MaskedAccuracy": 0.512812164448944, + "eval_loss": 1.593866229057312, + "eval_runtime": 169.3801, + "eval_samples_per_second": 374.755, + "eval_steps_per_second": 1.464, + "step": 1322000 + }, + { + "epoch": 5.3859448471263045, + "grad_norm": 22.05284309387207, + "learning_rate": 0.0003812999398398203, + "loss": 7.52, + "step": 1322100 + }, + { + "epoch": 5.386352225149686, + "grad_norm": 3.8710389137268066, + "learning_rate": 0.00038103959733601155, + "loss": 7.5524, + "step": 1322200 + }, + { + "epoch": 5.386759603173068, + "grad_norm": 19.8353214263916, + "learning_rate": 0.0003807793388037175, + "loss": 7.5244, + "step": 1322300 + }, + { + "epoch": 5.387166981196449, + "grad_norm": 10.54450511932373, + "learning_rate": 0.0003805191642532271, + "loss": 7.5627, + "step": 1322400 + }, + { + "epoch": 5.387574359219831, + "grad_norm": 4.97101354598999, + "learning_rate": 0.0003802590736948262, + "loss": 7.5435, + "step": 1322500 + }, + { + "epoch": 5.387981737243212, + "grad_norm": 13.744295120239258, + "learning_rate": 0.00037999906713879865, + "loss": 7.5629, + "step": 1322600 + }, + { + "epoch": 5.388389115266593, + "grad_norm": 18.679704666137695, + "learning_rate": 0.000379739144595423, + "loss": 7.5163, + "step": 1322700 + }, + { + "epoch": 5.388796493289974, + "grad_norm": 13.629724502563477, + "learning_rate": 0.00037947930607497586, + "loss": 7.5317, + "step": 1322800 + }, + { + "epoch": 5.389203871313356, + "grad_norm": 27.627920150756836, + "learning_rate": 0.0003792195515877298, + "loss": 7.5291, + "step": 1322900 + }, + { + "epoch": 5.389611249336737, + "grad_norm": 13.59952449798584, + "learning_rate": 0.00037895988114395397, + "loss": 7.5524, + "step": 1323000 + }, + { + "epoch": 5.389611249336737, + "eval_MaskedAccuracy": 0.5128496255588495, + "eval_loss": 1.5980591773986816, + "eval_runtime": 169.592, + "eval_samples_per_second": 374.286, + "eval_steps_per_second": 1.462, + "step": 1323000 + }, + { + "epoch": 5.390018627360119, + "grad_norm": 9.709085464477539, + "learning_rate": 0.0003787002947539152, + "loss": 7.5235, + "step": 1323100 + }, + { + "epoch": 5.3904260053835005, + "grad_norm": 14.340139389038086, + "learning_rate": 0.0003784407924278759, + "loss": 7.5406, + "step": 1323200 + }, + { + "epoch": 5.390833383406882, + "grad_norm": 7.7949957847595215, + "learning_rate": 0.000378181374176096, + "loss": 7.5504, + "step": 1323300 + }, + { + "epoch": 5.3912407614302635, + "grad_norm": 17.28707504272461, + "learning_rate": 0.00037792204000883093, + "loss": 7.5238, + "step": 1323400 + }, + { + "epoch": 5.391648139453645, + "grad_norm": 7.211428642272949, + "learning_rate": 0.0003776627899363344, + "loss": 7.5173, + "step": 1323500 + }, + { + "epoch": 5.392055517477027, + "grad_norm": 13.542436599731445, + "learning_rate": 0.0003774036239688558, + "loss": 7.5393, + "step": 1323600 + }, + { + "epoch": 5.392462895500408, + "grad_norm": 25.06650161743164, + "learning_rate": 0.000377144542116641, + "loss": 7.537, + "step": 1323700 + }, + { + "epoch": 5.39287027352379, + "grad_norm": 17.492645263671875, + "learning_rate": 0.00037688554438993285, + "loss": 7.5252, + "step": 1323800 + }, + { + "epoch": 5.393277651547171, + "grad_norm": 8.449779510498047, + "learning_rate": 0.0003766266307989712, + "loss": 7.5459, + "step": 1323900 + }, + { + "epoch": 5.393685029570552, + "grad_norm": 8.3551664352417, + "learning_rate": 0.00037636780135399223, + "loss": 7.5279, + "step": 1324000 + }, + { + "epoch": 5.393685029570552, + "eval_MaskedAccuracy": 0.5134369896521969, + "eval_loss": 1.5781093835830688, + "eval_runtime": 172.788, + "eval_samples_per_second": 367.363, + "eval_steps_per_second": 1.435, + "step": 1324000 + }, + { + "epoch": 5.394092407593933, + "grad_norm": 13.627609252929688, + "learning_rate": 0.00037610905606522865, + "loss": 7.5107, + "step": 1324100 + }, + { + "epoch": 5.394499785617315, + "grad_norm": 6.587964057922363, + "learning_rate": 0.0003758503949429101, + "loss": 7.5158, + "step": 1324200 + }, + { + "epoch": 5.3949071636406964, + "grad_norm": 27.416866302490234, + "learning_rate": 0.0003755918179972632, + "loss": 7.536, + "step": 1324300 + }, + { + "epoch": 5.395314541664078, + "grad_norm": 8.633523941040039, + "learning_rate": 0.00037533332523851043, + "loss": 7.5323, + "step": 1324400 + }, + { + "epoch": 5.3957219196874595, + "grad_norm": 16.36502456665039, + "learning_rate": 0.0003750749166768711, + "loss": 7.5265, + "step": 1324500 + }, + { + "epoch": 5.396129297710841, + "grad_norm": 14.699762344360352, + "learning_rate": 0.0003748165923225625, + "loss": 7.5216, + "step": 1324600 + }, + { + "epoch": 5.396536675734223, + "grad_norm": 17.163429260253906, + "learning_rate": 0.00037455835218579657, + "loss": 7.5513, + "step": 1324700 + }, + { + "epoch": 5.396944053757604, + "grad_norm": 10.71406364440918, + "learning_rate": 0.0003743001962767837, + "loss": 7.548, + "step": 1324800 + }, + { + "epoch": 5.397351431780986, + "grad_norm": 11.657567024230957, + "learning_rate": 0.00037404212460572986, + "loss": 7.5181, + "step": 1324900 + }, + { + "epoch": 5.397758809804367, + "grad_norm": 15.483115196228027, + "learning_rate": 0.0003737841371828378, + "loss": 7.5, + "step": 1325000 + }, + { + "epoch": 5.397758809804367, + "eval_MaskedAccuracy": 0.5131526883538816, + "eval_loss": 1.5845218896865845, + "eval_runtime": 169.8272, + "eval_samples_per_second": 373.768, + "eval_steps_per_second": 1.46, + "step": 1325000 + }, + { + "epoch": 5.398166187827749, + "grad_norm": 4.039661884307861, + "learning_rate": 0.0003735262340183077, + "loss": 7.515, + "step": 1325100 + }, + { + "epoch": 5.39857356585113, + "grad_norm": 5.575165271759033, + "learning_rate": 0.00037326841512233555, + "loss": 7.5221, + "step": 1325200 + }, + { + "epoch": 5.398980943874511, + "grad_norm": 7.704547882080078, + "learning_rate": 0.00037301068050511396, + "loss": 7.5164, + "step": 1325300 + }, + { + "epoch": 5.399388321897892, + "grad_norm": 7.4974188804626465, + "learning_rate": 0.0003727530301768332, + "loss": 7.5379, + "step": 1325400 + }, + { + "epoch": 5.399795699921274, + "grad_norm": 9.015436172485352, + "learning_rate": 0.00037249546414767855, + "loss": 7.5171, + "step": 1325500 + }, + { + "epoch": 5.4002030779446555, + "grad_norm": 8.185972213745117, + "learning_rate": 0.0003722379824278342, + "loss": 7.545, + "step": 1325600 + }, + { + "epoch": 5.400610455968037, + "grad_norm": 13.704870223999023, + "learning_rate": 0.00037198058502747904, + "loss": 7.5315, + "step": 1325700 + }, + { + "epoch": 5.4010178339914185, + "grad_norm": 7.266266345977783, + "learning_rate": 0.0003717232719567897, + "loss": 7.549, + "step": 1325800 + }, + { + "epoch": 5.4014252120148, + "grad_norm": 4.926082611083984, + "learning_rate": 0.00037146604322593887, + "loss": 7.5379, + "step": 1325900 + }, + { + "epoch": 5.401832590038182, + "grad_norm": 21.833553314208984, + "learning_rate": 0.0003712088988450965, + "loss": 7.5369, + "step": 1326000 + }, + { + "epoch": 5.401832590038182, + "eval_MaskedAccuracy": 0.5133101537241762, + "eval_loss": 1.5868409872055054, + "eval_runtime": 178.0688, + "eval_samples_per_second": 356.469, + "eval_steps_per_second": 1.393, + "step": 1326000 + }, + { + "epoch": 5.402239968061563, + "grad_norm": 6.728618621826172, + "learning_rate": 0.00037095183882442844, + "loss": 7.5212, + "step": 1326100 + }, + { + "epoch": 5.402647346084945, + "grad_norm": 7.52299165725708, + "learning_rate": 0.00037069486317409755, + "loss": 7.5444, + "step": 1326200 + }, + { + "epoch": 5.403054724108326, + "grad_norm": 20.929426193237305, + "learning_rate": 0.000370437971904264, + "loss": 7.5385, + "step": 1326300 + }, + { + "epoch": 5.403462102131708, + "grad_norm": 18.907255172729492, + "learning_rate": 0.000370181165025084, + "loss": 7.5352, + "step": 1326400 + }, + { + "epoch": 5.403869480155089, + "grad_norm": 7.164705753326416, + "learning_rate": 0.0003699244425467094, + "loss": 7.5369, + "step": 1326500 + }, + { + "epoch": 5.40427685817847, + "grad_norm": 8.887213706970215, + "learning_rate": 0.0003696678044792916, + "loss": 7.5353, + "step": 1326600 + }, + { + "epoch": 5.4046842362018515, + "grad_norm": 25.259241104125977, + "learning_rate": 0.0003694112508329763, + "loss": 7.5614, + "step": 1326700 + }, + { + "epoch": 5.405091614225233, + "grad_norm": 9.49890422821045, + "learning_rate": 0.0003691547816179058, + "loss": 7.5566, + "step": 1326800 + }, + { + "epoch": 5.4054989922486145, + "grad_norm": 11.74758529663086, + "learning_rate": 0.0003688983968442207, + "loss": 7.5704, + "step": 1326900 + }, + { + "epoch": 5.405906370271996, + "grad_norm": 23.76457405090332, + "learning_rate": 0.0003686420965220563, + "loss": 7.5434, + "step": 1327000 + }, + { + "epoch": 5.405906370271996, + "eval_MaskedAccuracy": 0.5125708183474138, + "eval_loss": 1.5922502279281616, + "eval_runtime": 177.1038, + "eval_samples_per_second": 358.411, + "eval_steps_per_second": 1.4, + "step": 1327000 + }, + { + "epoch": 5.406313748295378, + "grad_norm": 8.575128555297852, + "learning_rate": 0.0003683858806615459, + "loss": 7.5408, + "step": 1327100 + }, + { + "epoch": 5.406721126318759, + "grad_norm": 14.53542423248291, + "learning_rate": 0.000368129749272819, + "loss": 7.5233, + "step": 1327200 + }, + { + "epoch": 5.407128504342141, + "grad_norm": 17.9052734375, + "learning_rate": 0.00036787370236600185, + "loss": 7.5332, + "step": 1327300 + }, + { + "epoch": 5.407535882365522, + "grad_norm": 4.918173789978027, + "learning_rate": 0.0003676177399512174, + "loss": 7.5047, + "step": 1327400 + }, + { + "epoch": 5.407943260388904, + "grad_norm": 7.67805290222168, + "learning_rate": 0.0003673618620385856, + "loss": 7.5447, + "step": 1327500 + }, + { + "epoch": 5.408350638412285, + "grad_norm": 7.62180757522583, + "learning_rate": 0.00036710606863822197, + "loss": 7.5551, + "step": 1327600 + }, + { + "epoch": 5.408758016435666, + "grad_norm": 10.066807746887207, + "learning_rate": 0.00036685035976024026, + "loss": 7.4909, + "step": 1327700 + }, + { + "epoch": 5.409165394459047, + "grad_norm": 14.012504577636719, + "learning_rate": 0.00036659473541474926, + "loss": 7.5455, + "step": 1327800 + }, + { + "epoch": 5.409572772482429, + "grad_norm": 18.850902557373047, + "learning_rate": 0.00036633919561185514, + "loss": 7.536, + "step": 1327900 + }, + { + "epoch": 5.4099801505058105, + "grad_norm": 7.739994049072266, + "learning_rate": 0.00036608374036166115, + "loss": 7.5027, + "step": 1328000 + }, + { + "epoch": 5.4099801505058105, + "eval_MaskedAccuracy": 0.5132354166429296, + "eval_loss": 1.581290602684021, + "eval_runtime": 175.2553, + "eval_samples_per_second": 362.192, + "eval_steps_per_second": 1.415, + "step": 1328000 + }, + { + "epoch": 5.410387528529192, + "grad_norm": 11.632523536682129, + "learning_rate": 0.00036582836967426603, + "loss": 7.5485, + "step": 1328100 + }, + { + "epoch": 5.410794906552574, + "grad_norm": 8.318305015563965, + "learning_rate": 0.000365573083559767, + "loss": 7.5681, + "step": 1328200 + }, + { + "epoch": 5.411202284575955, + "grad_norm": 8.158488273620605, + "learning_rate": 0.0003653178820282565, + "loss": 7.5407, + "step": 1328300 + }, + { + "epoch": 5.411609662599337, + "grad_norm": 3.6198344230651855, + "learning_rate": 0.00036506276508982367, + "loss": 7.5366, + "step": 1328400 + }, + { + "epoch": 5.412017040622718, + "grad_norm": 6.183091640472412, + "learning_rate": 0.00036480773275455493, + "loss": 7.5503, + "step": 1328500 + }, + { + "epoch": 5.4124244186461, + "grad_norm": 8.025540351867676, + "learning_rate": 0.0003645527850325331, + "loss": 7.5346, + "step": 1328600 + }, + { + "epoch": 5.412831796669481, + "grad_norm": 8.579042434692383, + "learning_rate": 0.00036429792193383735, + "loss": 7.5301, + "step": 1328700 + }, + { + "epoch": 5.413239174692863, + "grad_norm": 17.611928939819336, + "learning_rate": 0.0003640431434685441, + "loss": 7.5073, + "step": 1328800 + }, + { + "epoch": 5.413646552716244, + "grad_norm": 14.009963989257812, + "learning_rate": 0.0003637884496467265, + "loss": 7.52, + "step": 1328900 + }, + { + "epoch": 5.414053930739625, + "grad_norm": 2.990170955657959, + "learning_rate": 0.00036353384047845347, + "loss": 7.5237, + "step": 1329000 + }, + { + "epoch": 5.414053930739625, + "eval_MaskedAccuracy": 0.5129120832677998, + "eval_loss": 1.5914942026138306, + "eval_runtime": 172.7778, + "eval_samples_per_second": 367.385, + "eval_steps_per_second": 1.435, + "step": 1329000 + }, + { + "epoch": 5.4144613087630065, + "grad_norm": 13.124401092529297, + "learning_rate": 0.00036327931597379096, + "loss": 7.5323, + "step": 1329100 + }, + { + "epoch": 5.414868686786388, + "grad_norm": 7.4388251304626465, + "learning_rate": 0.0003630248761428018, + "loss": 7.5581, + "step": 1329200 + }, + { + "epoch": 5.4152760648097695, + "grad_norm": 21.128225326538086, + "learning_rate": 0.0003627705209955451, + "loss": 7.5452, + "step": 1329300 + }, + { + "epoch": 5.415683442833151, + "grad_norm": 6.522050857543945, + "learning_rate": 0.0003625162505420775, + "loss": 7.5497, + "step": 1329400 + }, + { + "epoch": 5.416090820856533, + "grad_norm": 5.448185443878174, + "learning_rate": 0.0003622620647924505, + "loss": 7.534, + "step": 1329500 + }, + { + "epoch": 5.416498198879914, + "grad_norm": 4.216091632843018, + "learning_rate": 0.00036200796375671475, + "loss": 7.5572, + "step": 1329600 + }, + { + "epoch": 5.416905576903296, + "grad_norm": 3.4206042289733887, + "learning_rate": 0.0003617539474449165, + "loss": 7.5092, + "step": 1329700 + }, + { + "epoch": 5.417312954926677, + "grad_norm": 5.928034782409668, + "learning_rate": 0.0003615000158670968, + "loss": 7.5278, + "step": 1329800 + }, + { + "epoch": 5.417720332950059, + "grad_norm": 4.560810565948486, + "learning_rate": 0.0003612461690332955, + "loss": 7.5227, + "step": 1329900 + }, + { + "epoch": 5.41812771097344, + "grad_norm": 8.247055053710938, + "learning_rate": 0.0003609924069535495, + "loss": 7.5479, + "step": 1330000 + }, + { + "epoch": 5.41812771097344, + "eval_MaskedAccuracy": 0.5127908292357723, + "eval_loss": 1.5952818393707275, + "eval_runtime": 176.3326, + "eval_samples_per_second": 359.979, + "eval_steps_per_second": 1.406, + "step": 1330000 + }, + { + "epoch": 5.418535088996822, + "grad_norm": 8.054097175598145, + "learning_rate": 0.00036073872963789057, + "loss": 7.5763, + "step": 1330100 + }, + { + "epoch": 5.418942467020203, + "grad_norm": 4.887831211090088, + "learning_rate": 0.00036048513709634737, + "loss": 7.5659, + "step": 1330200 + }, + { + "epoch": 5.419349845043584, + "grad_norm": 9.02286148071289, + "learning_rate": 0.00036023162933894714, + "loss": 7.5615, + "step": 1330300 + }, + { + "epoch": 5.4197572230669655, + "grad_norm": 9.907876968383789, + "learning_rate": 0.0003599782063757112, + "loss": 7.5057, + "step": 1330400 + }, + { + "epoch": 5.420164601090347, + "grad_norm": 15.303842544555664, + "learning_rate": 0.0003597248682166587, + "loss": 7.5373, + "step": 1330500 + }, + { + "epoch": 5.420571979113729, + "grad_norm": 7.663967609405518, + "learning_rate": 0.0003594716148718062, + "loss": 7.5297, + "step": 1330600 + }, + { + "epoch": 5.42097935713711, + "grad_norm": 19.221603393554688, + "learning_rate": 0.0003592184463511656, + "loss": 7.5419, + "step": 1330700 + }, + { + "epoch": 5.421386735160492, + "grad_norm": 14.464127540588379, + "learning_rate": 0.0003589653626647467, + "loss": 7.5402, + "step": 1330800 + }, + { + "epoch": 5.421794113183873, + "grad_norm": 15.968949317932129, + "learning_rate": 0.0003587123638225548, + "loss": 7.5249, + "step": 1330900 + }, + { + "epoch": 5.422201491207255, + "grad_norm": 12.885955810546875, + "learning_rate": 0.0003584594498345925, + "loss": 7.5548, + "step": 1331000 + }, + { + "epoch": 5.422201491207255, + "eval_MaskedAccuracy": 0.5131306422582581, + "eval_loss": 1.5863968133926392, + "eval_runtime": 179.8542, + "eval_samples_per_second": 352.93, + "eval_steps_per_second": 1.379, + "step": 1331000 + }, + { + "epoch": 5.422608869230636, + "grad_norm": 3.5550923347473145, + "learning_rate": 0.00035820662071085855, + "loss": 7.5337, + "step": 1331100 + }, + { + "epoch": 5.423016247254018, + "grad_norm": 27.15935707092285, + "learning_rate": 0.00035795387646134826, + "loss": 7.5472, + "step": 1331200 + }, + { + "epoch": 5.423423625277399, + "grad_norm": 6.2725090980529785, + "learning_rate": 0.0003577012170960547, + "loss": 7.5348, + "step": 1331300 + }, + { + "epoch": 5.423831003300781, + "grad_norm": 19.44696807861328, + "learning_rate": 0.0003574486426249667, + "loss": 7.5198, + "step": 1331400 + }, + { + "epoch": 5.424238381324162, + "grad_norm": 18.16598129272461, + "learning_rate": 0.0003571961530580688, + "loss": 7.5305, + "step": 1331500 + }, + { + "epoch": 5.424645759347543, + "grad_norm": 18.06147575378418, + "learning_rate": 0.000356943748405345, + "loss": 7.5041, + "step": 1331600 + }, + { + "epoch": 5.4250531373709245, + "grad_norm": 5.330445289611816, + "learning_rate": 0.0003566914286767737, + "loss": 7.5474, + "step": 1331700 + }, + { + "epoch": 5.425460515394306, + "grad_norm": 17.969959259033203, + "learning_rate": 0.00035643919388232985, + "loss": 7.5306, + "step": 1331800 + }, + { + "epoch": 5.425867893417688, + "grad_norm": 11.571163177490234, + "learning_rate": 0.00035618704403198666, + "loss": 7.5164, + "step": 1331900 + }, + { + "epoch": 5.426275271441069, + "grad_norm": 18.80744171142578, + "learning_rate": 0.0003559349791357114, + "loss": 7.5245, + "step": 1332000 + }, + { + "epoch": 5.426275271441069, + "eval_MaskedAccuracy": 0.5136524945248668, + "eval_loss": 1.5814142227172852, + "eval_runtime": 174.3108, + "eval_samples_per_second": 364.154, + "eval_steps_per_second": 1.423, + "step": 1332000 + }, + { + "epoch": 5.426682649464451, + "grad_norm": 8.176673889160156, + "learning_rate": 0.00035568299920347095, + "loss": 7.5307, + "step": 1332100 + }, + { + "epoch": 5.427090027487832, + "grad_norm": 3.8952736854553223, + "learning_rate": 0.00035543110424522727, + "loss": 7.5446, + "step": 1332200 + }, + { + "epoch": 5.427497405511214, + "grad_norm": 5.171550273895264, + "learning_rate": 0.00035517929427093846, + "loss": 7.5641, + "step": 1332300 + }, + { + "epoch": 5.427904783534595, + "grad_norm": 5.871838092803955, + "learning_rate": 0.0003549275692905596, + "loss": 7.5226, + "step": 1332400 + }, + { + "epoch": 5.428312161557977, + "grad_norm": 16.73641014099121, + "learning_rate": 0.0003546759293140439, + "loss": 7.5418, + "step": 1332500 + }, + { + "epoch": 5.428719539581358, + "grad_norm": 3.0405664443969727, + "learning_rate": 0.0003544243743513398, + "loss": 7.5329, + "step": 1332600 + }, + { + "epoch": 5.429126917604739, + "grad_norm": 14.436090469360352, + "learning_rate": 0.00035417290441239186, + "loss": 7.533, + "step": 1332700 + }, + { + "epoch": 5.4295342956281205, + "grad_norm": 10.883188247680664, + "learning_rate": 0.00035392151950714226, + "loss": 7.5392, + "step": 1332800 + }, + { + "epoch": 5.429941673651502, + "grad_norm": 12.811883926391602, + "learning_rate": 0.00035367021964552973, + "loss": 7.4935, + "step": 1332900 + }, + { + "epoch": 5.430349051674884, + "grad_norm": 4.134091854095459, + "learning_rate": 0.00035341900483748973, + "loss": 7.5361, + "step": 1333000 + }, + { + "epoch": 5.430349051674884, + "eval_MaskedAccuracy": 0.5130643101503585, + "eval_loss": 1.586102843284607, + "eval_runtime": 175.0351, + "eval_samples_per_second": 362.647, + "eval_steps_per_second": 1.417, + "step": 1333000 + }, + { + "epoch": 5.430756429698265, + "grad_norm": 22.417848587036133, + "learning_rate": 0.00035316787509295403, + "loss": 7.527, + "step": 1333100 + }, + { + "epoch": 5.431163807721647, + "grad_norm": 6.5187578201293945, + "learning_rate": 0.00035291683042185036, + "loss": 7.5296, + "step": 1333200 + }, + { + "epoch": 5.431571185745028, + "grad_norm": 19.404090881347656, + "learning_rate": 0.0003526658708341051, + "loss": 7.521, + "step": 1333300 + }, + { + "epoch": 5.43197856376841, + "grad_norm": 5.884498596191406, + "learning_rate": 0.00035241499633963883, + "loss": 7.5207, + "step": 1333400 + }, + { + "epoch": 5.432385941791791, + "grad_norm": 14.977967262268066, + "learning_rate": 0.00035216420694837085, + "loss": 7.5263, + "step": 1333500 + }, + { + "epoch": 5.432793319815173, + "grad_norm": 6.032514572143555, + "learning_rate": 0.0003519135026702161, + "loss": 7.5368, + "step": 1333600 + }, + { + "epoch": 5.433200697838554, + "grad_norm": 17.459970474243164, + "learning_rate": 0.00035166288351508627, + "loss": 7.5152, + "step": 1333700 + }, + { + "epoch": 5.433608075861936, + "grad_norm": 5.257842540740967, + "learning_rate": 0.00035141234949288955, + "loss": 7.5326, + "step": 1333800 + }, + { + "epoch": 5.434015453885317, + "grad_norm": 9.040240287780762, + "learning_rate": 0.00035116190061353086, + "loss": 7.5341, + "step": 1333900 + }, + { + "epoch": 5.434422831908698, + "grad_norm": 5.258974075317383, + "learning_rate": 0.0003509115368869117, + "loss": 7.5446, + "step": 1334000 + }, + { + "epoch": 5.434422831908698, + "eval_MaskedAccuracy": 0.5131344451019411, + "eval_loss": 1.5840940475463867, + "eval_runtime": 155.4309, + "eval_samples_per_second": 408.387, + "eval_steps_per_second": 1.596, + "step": 1334000 + }, + { + "epoch": 5.4348302099320795, + "grad_norm": 10.722369194030762, + "learning_rate": 0.00035066125832293026, + "loss": 7.5392, + "step": 1334100 + }, + { + "epoch": 5.435237587955461, + "grad_norm": 11.396703720092773, + "learning_rate": 0.00035041106493148187, + "loss": 7.5322, + "step": 1334200 + }, + { + "epoch": 5.435644965978843, + "grad_norm": 6.298478603363037, + "learning_rate": 0.0003501609567224579, + "loss": 7.5113, + "step": 1334300 + }, + { + "epoch": 5.436052344002224, + "grad_norm": 10.318392753601074, + "learning_rate": 0.00034991093370574536, + "loss": 7.5359, + "step": 1334400 + }, + { + "epoch": 5.436459722025606, + "grad_norm": 28.445934295654297, + "learning_rate": 0.00034966099589123006, + "loss": 7.545, + "step": 1334500 + }, + { + "epoch": 5.436867100048987, + "grad_norm": 6.695615768432617, + "learning_rate": 0.0003494111432887935, + "loss": 7.5127, + "step": 1334600 + }, + { + "epoch": 5.437274478072369, + "grad_norm": 22.194202423095703, + "learning_rate": 0.00034916137590831295, + "loss": 7.5225, + "step": 1334700 + }, + { + "epoch": 5.43768185609575, + "grad_norm": 15.1741361618042, + "learning_rate": 0.0003489116937596636, + "loss": 7.5247, + "step": 1334800 + }, + { + "epoch": 5.438089234119132, + "grad_norm": 2.9347405433654785, + "learning_rate": 0.0003486620968527162, + "loss": 7.5406, + "step": 1334900 + }, + { + "epoch": 5.438496612142513, + "grad_norm": 21.040611267089844, + "learning_rate": 0.00034841258519733957, + "loss": 7.5374, + "step": 1335000 + }, + { + "epoch": 5.438496612142513, + "eval_MaskedAccuracy": 0.5131448649528033, + "eval_loss": 1.5918980836868286, + "eval_runtime": 164.1497, + "eval_samples_per_second": 386.696, + "eval_steps_per_second": 1.511, + "step": 1335000 + }, + { + "epoch": 5.438903990165895, + "grad_norm": 3.5905072689056396, + "learning_rate": 0.00034816315880339764, + "loss": 7.5396, + "step": 1335100 + }, + { + "epoch": 5.439311368189276, + "grad_norm": 5.706892967224121, + "learning_rate": 0.00034791381768075155, + "loss": 7.538, + "step": 1335200 + }, + { + "epoch": 5.439718746212657, + "grad_norm": 10.50268268585205, + "learning_rate": 0.00034766456183925884, + "loss": 7.5207, + "step": 1335300 + }, + { + "epoch": 5.440126124236039, + "grad_norm": 18.290935516357422, + "learning_rate": 0.0003474153912887737, + "loss": 7.5295, + "step": 1335400 + }, + { + "epoch": 5.44053350225942, + "grad_norm": 12.509167671203613, + "learning_rate": 0.00034716630603914833, + "loss": 7.5343, + "step": 1335500 + }, + { + "epoch": 5.440940880282802, + "grad_norm": 7.119658946990967, + "learning_rate": 0.00034691730610023053, + "loss": 7.4918, + "step": 1335600 + }, + { + "epoch": 5.441348258306183, + "grad_norm": 2.9584312438964844, + "learning_rate": 0.0003466683914818631, + "loss": 7.5193, + "step": 1335700 + }, + { + "epoch": 5.441755636329565, + "grad_norm": 26.875782012939453, + "learning_rate": 0.00034641956219388773, + "loss": 7.5529, + "step": 1335800 + }, + { + "epoch": 5.442163014352946, + "grad_norm": 15.270933151245117, + "learning_rate": 0.0003461708182461422, + "loss": 7.5507, + "step": 1335900 + }, + { + "epoch": 5.442570392376328, + "grad_norm": 5.244901657104492, + "learning_rate": 0.00034592215964846017, + "loss": 7.5237, + "step": 1336000 + }, + { + "epoch": 5.442570392376328, + "eval_MaskedAccuracy": 0.5129465714972707, + "eval_loss": 1.5902272462844849, + "eval_runtime": 158.804, + "eval_samples_per_second": 399.713, + "eval_steps_per_second": 1.562, + "step": 1336000 + }, + { + "epoch": 5.442977770399709, + "grad_norm": 4.501123905181885, + "learning_rate": 0.0003456735864106727, + "loss": 7.5205, + "step": 1336100 + }, + { + "epoch": 5.443385148423091, + "grad_norm": 13.229872703552246, + "learning_rate": 0.0003454250985426075, + "loss": 7.5779, + "step": 1336200 + }, + { + "epoch": 5.443792526446472, + "grad_norm": 18.62929916381836, + "learning_rate": 0.0003451766960540884, + "loss": 7.515, + "step": 1336300 + }, + { + "epoch": 5.444199904469854, + "grad_norm": 9.902000427246094, + "learning_rate": 0.0003449283789549361, + "loss": 7.5232, + "step": 1336400 + }, + { + "epoch": 5.4446072824932354, + "grad_norm": 10.179045677185059, + "learning_rate": 0.0003446801472549677, + "loss": 7.5605, + "step": 1336500 + }, + { + "epoch": 5.445014660516616, + "grad_norm": 13.637742042541504, + "learning_rate": 0.0003444320009639968, + "loss": 7.5518, + "step": 1336600 + }, + { + "epoch": 5.445422038539998, + "grad_norm": 14.29509449005127, + "learning_rate": 0.000344183940091835, + "loss": 7.5242, + "step": 1336700 + }, + { + "epoch": 5.445829416563379, + "grad_norm": 10.59730339050293, + "learning_rate": 0.0003439359646482886, + "loss": 7.5534, + "step": 1336800 + }, + { + "epoch": 5.446236794586761, + "grad_norm": 9.562889099121094, + "learning_rate": 0.00034368807464316164, + "loss": 7.5266, + "step": 1336900 + }, + { + "epoch": 5.446644172610142, + "grad_norm": 19.545717239379883, + "learning_rate": 0.0003434402700862544, + "loss": 7.5524, + "step": 1337000 + }, + { + "epoch": 5.446644172610142, + "eval_MaskedAccuracy": 0.5134409661704566, + "eval_loss": 1.581125020980835, + "eval_runtime": 195.5915, + "eval_samples_per_second": 324.533, + "eval_steps_per_second": 1.268, + "step": 1337000 + }, + { + "epoch": 5.447051550633524, + "grad_norm": 12.941215515136719, + "learning_rate": 0.00034319255098736514, + "loss": 7.5372, + "step": 1337100 + }, + { + "epoch": 5.447458928656905, + "grad_norm": 14.04662036895752, + "learning_rate": 0.0003429449173562859, + "loss": 7.518, + "step": 1337200 + }, + { + "epoch": 5.447866306680287, + "grad_norm": 17.593795776367188, + "learning_rate": 0.00034269736920280764, + "loss": 7.541, + "step": 1337300 + }, + { + "epoch": 5.448273684703668, + "grad_norm": 5.950951099395752, + "learning_rate": 0.00034244990653671715, + "loss": 7.5448, + "step": 1337400 + }, + { + "epoch": 5.44868106272705, + "grad_norm": 6.166128635406494, + "learning_rate": 0.000342202529367798, + "loss": 7.5412, + "step": 1337500 + }, + { + "epoch": 5.449088440750431, + "grad_norm": 5.826472282409668, + "learning_rate": 0.0003419552377058305, + "loss": 7.5065, + "step": 1337600 + }, + { + "epoch": 5.449495818773812, + "grad_norm": 17.794090270996094, + "learning_rate": 0.0003417080315605913, + "loss": 7.535, + "step": 1337700 + }, + { + "epoch": 5.449903196797194, + "grad_norm": 10.942166328430176, + "learning_rate": 0.00034146091094185393, + "loss": 7.5487, + "step": 1337800 + }, + { + "epoch": 5.450310574820575, + "grad_norm": 11.414239883422852, + "learning_rate": 0.00034121387585938766, + "loss": 7.5501, + "step": 1337900 + }, + { + "epoch": 5.450717952843957, + "grad_norm": 3.393474817276001, + "learning_rate": 0.00034096692632296007, + "loss": 7.5384, + "step": 1338000 + }, + { + "epoch": 5.450717952843957, + "eval_MaskedAccuracy": 0.5130633760258678, + "eval_loss": 1.596960425376892, + "eval_runtime": 163.0323, + "eval_samples_per_second": 389.346, + "eval_steps_per_second": 1.521, + "step": 1338000 + }, + { + "epoch": 5.451125330867338, + "grad_norm": 23.581764221191406, + "learning_rate": 0.00034072006234233375, + "loss": 7.572, + "step": 1338100 + }, + { + "epoch": 5.45153270889072, + "grad_norm": 2.6095852851867676, + "learning_rate": 0.00034047328392726857, + "loss": 7.5234, + "step": 1338200 + }, + { + "epoch": 5.451940086914101, + "grad_norm": 10.084429740905762, + "learning_rate": 0.00034022659108752106, + "loss": 7.5147, + "step": 1338300 + }, + { + "epoch": 5.452347464937483, + "grad_norm": 3.5106518268585205, + "learning_rate": 0.00033997998383284404, + "loss": 7.5262, + "step": 1338400 + }, + { + "epoch": 5.452754842960864, + "grad_norm": 13.30595588684082, + "learning_rate": 0.0003397334621729879, + "loss": 7.5234, + "step": 1338500 + }, + { + "epoch": 5.453162220984246, + "grad_norm": 15.96445083618164, + "learning_rate": 0.0003394870261176992, + "loss": 7.5267, + "step": 1338600 + }, + { + "epoch": 5.453569599007627, + "grad_norm": 10.994308471679688, + "learning_rate": 0.00033924067567671996, + "loss": 7.5092, + "step": 1338700 + }, + { + "epoch": 5.453976977031009, + "grad_norm": 17.248014450073242, + "learning_rate": 0.0003389944108597901, + "loss": 7.5469, + "step": 1338800 + }, + { + "epoch": 5.4543843550543905, + "grad_norm": 2.9977152347564697, + "learning_rate": 0.0003387482316766457, + "loss": 7.5468, + "step": 1338900 + }, + { + "epoch": 5.454791733077771, + "grad_norm": 3.5361149311065674, + "learning_rate": 0.0003385021381370198, + "loss": 7.5357, + "step": 1339000 + }, + { + "epoch": 5.454791733077771, + "eval_MaskedAccuracy": 0.5134931230086092, + "eval_loss": 1.5927139520645142, + "eval_runtime": 176.011, + "eval_samples_per_second": 360.637, + "eval_steps_per_second": 1.409, + "step": 1339000 + }, + { + "epoch": 5.455199111101153, + "grad_norm": 5.578920841217041, + "learning_rate": 0.00033825613025064176, + "loss": 7.5595, + "step": 1339100 + }, + { + "epoch": 5.455606489124534, + "grad_norm": 7.370936870574951, + "learning_rate": 0.00033801020802723777, + "loss": 7.5487, + "step": 1339200 + }, + { + "epoch": 5.456013867147916, + "grad_norm": 14.652971267700195, + "learning_rate": 0.00033776437147653, + "loss": 7.5554, + "step": 1339300 + }, + { + "epoch": 5.456421245171297, + "grad_norm": 6.171076774597168, + "learning_rate": 0.00033751862060823725, + "loss": 7.5181, + "step": 1339400 + }, + { + "epoch": 5.456828623194679, + "grad_norm": 3.990514039993286, + "learning_rate": 0.000337272955432076, + "loss": 7.5047, + "step": 1339500 + }, + { + "epoch": 5.45723600121806, + "grad_norm": 6.304797172546387, + "learning_rate": 0.0003370273759577589, + "loss": 7.5187, + "step": 1339600 + }, + { + "epoch": 5.457643379241442, + "grad_norm": 11.207032203674316, + "learning_rate": 0.00033678188219499503, + "loss": 7.4948, + "step": 1339700 + }, + { + "epoch": 5.458050757264823, + "grad_norm": 8.533391952514648, + "learning_rate": 0.0003365364741534893, + "loss": 7.5275, + "step": 1339800 + }, + { + "epoch": 5.458458135288205, + "grad_norm": 4.139130115509033, + "learning_rate": 0.0003362911518429445, + "loss": 7.532, + "step": 1339900 + }, + { + "epoch": 5.458865513311586, + "grad_norm": 6.50508451461792, + "learning_rate": 0.00033604591527305953, + "loss": 7.4986, + "step": 1340000 + }, + { + "epoch": 5.458865513311586, + "eval_MaskedAccuracy": 0.512827248144668, + "eval_loss": 1.5947595834732056, + "eval_runtime": 181.6673, + "eval_samples_per_second": 349.408, + "eval_steps_per_second": 1.365, + "step": 1340000 + }, + { + "epoch": 5.459272891334968, + "grad_norm": 10.115270614624023, + "learning_rate": 0.0003358007644535302, + "loss": 7.5262, + "step": 1340100 + }, + { + "epoch": 5.4596802693583495, + "grad_norm": 21.376296997070312, + "learning_rate": 0.00033555569939404845, + "loss": 7.5373, + "step": 1340200 + }, + { + "epoch": 5.46008764738173, + "grad_norm": 19.60698127746582, + "learning_rate": 0.00033531072010430237, + "loss": 7.5546, + "step": 1340300 + }, + { + "epoch": 5.460495025405112, + "grad_norm": 15.67002010345459, + "learning_rate": 0.00033506582659397757, + "loss": 7.5188, + "step": 1340400 + }, + { + "epoch": 5.460902403428493, + "grad_norm": 4.400627136230469, + "learning_rate": 0.00033482101887275694, + "loss": 7.5232, + "step": 1340500 + }, + { + "epoch": 5.461309781451875, + "grad_norm": 6.973476886749268, + "learning_rate": 0.0003345762969503184, + "loss": 7.5498, + "step": 1340600 + }, + { + "epoch": 5.461717159475256, + "grad_norm": 12.007425308227539, + "learning_rate": 0.0003343316608363366, + "loss": 7.5003, + "step": 1340700 + }, + { + "epoch": 5.462124537498638, + "grad_norm": 12.796014785766602, + "learning_rate": 0.00033408711054048443, + "loss": 7.5597, + "step": 1340800 + }, + { + "epoch": 5.462531915522019, + "grad_norm": 16.323322296142578, + "learning_rate": 0.00033384264607242987, + "loss": 7.5058, + "step": 1340900 + }, + { + "epoch": 5.462939293545401, + "grad_norm": 19.669464111328125, + "learning_rate": 0.00033359826744183696, + "loss": 7.5373, + "step": 1341000 + }, + { + "epoch": 5.462939293545401, + "eval_MaskedAccuracy": 0.5132753471187822, + "eval_loss": 1.5841223001480103, + "eval_runtime": 189.2486, + "eval_samples_per_second": 335.411, + "eval_steps_per_second": 1.31, + "step": 1341000 + }, + { + "epoch": 5.463346671568782, + "grad_norm": 11.022880554199219, + "learning_rate": 0.00033335397465836833, + "loss": 7.5201, + "step": 1341100 + }, + { + "epoch": 5.463754049592164, + "grad_norm": 14.854443550109863, + "learning_rate": 0.00033310976773168183, + "loss": 7.5231, + "step": 1341200 + }, + { + "epoch": 5.4641614276155455, + "grad_norm": 8.195154190063477, + "learning_rate": 0.00033286564667143234, + "loss": 7.5346, + "step": 1341300 + }, + { + "epoch": 5.464568805638927, + "grad_norm": 5.181168556213379, + "learning_rate": 0.0003326216114872715, + "loss": 7.5321, + "step": 1341400 + }, + { + "epoch": 5.4649761836623085, + "grad_norm": 13.370020866394043, + "learning_rate": 0.0003323776621888467, + "loss": 7.5162, + "step": 1341500 + }, + { + "epoch": 5.465383561685689, + "grad_norm": 31.435993194580078, + "learning_rate": 0.00033213379878580315, + "loss": 7.5352, + "step": 1341600 + }, + { + "epoch": 5.465790939709071, + "grad_norm": 3.4957010746002197, + "learning_rate": 0.00033189002128778154, + "loss": 7.4939, + "step": 1341700 + }, + { + "epoch": 5.466198317732452, + "grad_norm": 9.091877937316895, + "learning_rate": 0.0003316463297044193, + "loss": 7.5208, + "step": 1341800 + }, + { + "epoch": 5.466605695755834, + "grad_norm": 8.437361717224121, + "learning_rate": 0.0003314027240453526, + "loss": 7.5717, + "step": 1341900 + }, + { + "epoch": 5.467013073779215, + "grad_norm": 10.271653175354004, + "learning_rate": 0.00033115920432021123, + "loss": 7.5145, + "step": 1342000 + }, + { + "epoch": 5.467013073779215, + "eval_MaskedAccuracy": 0.5129229004959125, + "eval_loss": 1.5858826637268066, + "eval_runtime": 151.4268, + "eval_samples_per_second": 419.186, + "eval_steps_per_second": 1.638, + "step": 1342000 + }, + { + "epoch": 5.467420451802597, + "grad_norm": 6.712301731109619, + "learning_rate": 0.0003309157705386229, + "loss": 7.5501, + "step": 1342100 + }, + { + "epoch": 5.467827829825978, + "grad_norm": 5.213397026062012, + "learning_rate": 0.00033067242271021206, + "loss": 7.4886, + "step": 1342200 + }, + { + "epoch": 5.46823520784936, + "grad_norm": 23.394397735595703, + "learning_rate": 0.0003304291608445994, + "loss": 7.5303, + "step": 1342300 + }, + { + "epoch": 5.468642585872741, + "grad_norm": 13.85262393951416, + "learning_rate": 0.00033018598495140216, + "loss": 7.5308, + "step": 1342400 + }, + { + "epoch": 5.469049963896123, + "grad_norm": 4.306583404541016, + "learning_rate": 0.00032994289504023453, + "loss": 7.5401, + "step": 1342500 + }, + { + "epoch": 5.4694573419195045, + "grad_norm": 21.96116065979004, + "learning_rate": 0.00032969989112070774, + "loss": 7.5259, + "step": 1342600 + }, + { + "epoch": 5.469864719942885, + "grad_norm": 11.738424301147461, + "learning_rate": 0.0003294569732024284, + "loss": 7.5423, + "step": 1342700 + }, + { + "epoch": 5.470272097966267, + "grad_norm": 16.211389541625977, + "learning_rate": 0.0003292141412950006, + "loss": 7.5346, + "step": 1342800 + }, + { + "epoch": 5.470679475989648, + "grad_norm": 13.16845417022705, + "learning_rate": 0.00032897139540802474, + "loss": 7.5355, + "step": 1342900 + }, + { + "epoch": 5.47108685401303, + "grad_norm": 4.776648998260498, + "learning_rate": 0.0003287287355510974, + "loss": 7.517, + "step": 1343000 + }, + { + "epoch": 5.47108685401303, + "eval_MaskedAccuracy": 0.5125800603311347, + "eval_loss": 1.594915509223938, + "eval_runtime": 157.1588, + "eval_samples_per_second": 403.897, + "eval_steps_per_second": 1.578, + "step": 1343000 + }, + { + "epoch": 5.471494232036411, + "grad_norm": 4.625259876251221, + "learning_rate": 0.000328486161733813, + "loss": 7.5365, + "step": 1343100 + }, + { + "epoch": 5.471901610059793, + "grad_norm": 5.41280460357666, + "learning_rate": 0.0003282436739657615, + "loss": 7.5352, + "step": 1343200 + }, + { + "epoch": 5.472308988083174, + "grad_norm": 12.514579772949219, + "learning_rate": 0.0003280012722565299, + "loss": 7.531, + "step": 1343300 + }, + { + "epoch": 5.472716366106556, + "grad_norm": 13.552356719970703, + "learning_rate": 0.0003277589566157006, + "loss": 7.5604, + "step": 1343400 + }, + { + "epoch": 5.473123744129937, + "grad_norm": 13.721244812011719, + "learning_rate": 0.0003275167270528554, + "loss": 7.5526, + "step": 1343500 + }, + { + "epoch": 5.473531122153319, + "grad_norm": 11.623786926269531, + "learning_rate": 0.00032727458357756985, + "loss": 7.5298, + "step": 1343600 + }, + { + "epoch": 5.4739385001767005, + "grad_norm": 16.389251708984375, + "learning_rate": 0.00032703252619941737, + "loss": 7.5368, + "step": 1343700 + }, + { + "epoch": 5.474345878200082, + "grad_norm": 8.868416786193848, + "learning_rate": 0.00032679055492796744, + "loss": 7.5088, + "step": 1343800 + }, + { + "epoch": 5.4747532562234635, + "grad_norm": 3.096803903579712, + "learning_rate": 0.0003265486697727869, + "loss": 7.5462, + "step": 1343900 + }, + { + "epoch": 5.475160634246844, + "grad_norm": 21.188514709472656, + "learning_rate": 0.0003263068707434388, + "loss": 7.5692, + "step": 1344000 + }, + { + "epoch": 5.475160634246844, + "eval_MaskedAccuracy": 0.5128599134834798, + "eval_loss": 1.588036298751831, + "eval_runtime": 159.4299, + "eval_samples_per_second": 398.144, + "eval_steps_per_second": 1.556, + "step": 1344000 + }, + { + "epoch": 5.475568012270226, + "grad_norm": 3.874368906021118, + "learning_rate": 0.0003260651578494824, + "loss": 7.5354, + "step": 1344100 + }, + { + "epoch": 5.475975390293607, + "grad_norm": 19.807144165039062, + "learning_rate": 0.00032582353110047424, + "loss": 7.5309, + "step": 1344200 + }, + { + "epoch": 5.476382768316989, + "grad_norm": 6.514845848083496, + "learning_rate": 0.00032558199050596574, + "loss": 7.5299, + "step": 1344300 + }, + { + "epoch": 5.47679014634037, + "grad_norm": 12.320228576660156, + "learning_rate": 0.0003253405360755088, + "loss": 7.5375, + "step": 1344400 + }, + { + "epoch": 5.477197524363752, + "grad_norm": 4.334640026092529, + "learning_rate": 0.0003250991678186477, + "loss": 7.496, + "step": 1344500 + }, + { + "epoch": 5.477604902387133, + "grad_norm": 18.309972763061523, + "learning_rate": 0.0003248578857449255, + "loss": 7.5116, + "step": 1344600 + }, + { + "epoch": 5.478012280410515, + "grad_norm": 3.038743019104004, + "learning_rate": 0.0003246166898638813, + "loss": 7.5474, + "step": 1344700 + }, + { + "epoch": 5.4784196584338964, + "grad_norm": 5.224091529846191, + "learning_rate": 0.000324375580185051, + "loss": 7.5399, + "step": 1344800 + }, + { + "epoch": 5.478827036457278, + "grad_norm": 11.377028465270996, + "learning_rate": 0.00032413455671796655, + "loss": 7.524, + "step": 1344900 + }, + { + "epoch": 5.4792344144806595, + "grad_norm": 4.814363956451416, + "learning_rate": 0.0003238936194721577, + "loss": 7.5648, + "step": 1345000 + }, + { + "epoch": 5.4792344144806595, + "eval_MaskedAccuracy": 0.512634868952215, + "eval_loss": 1.5925509929656982, + "eval_runtime": 157.3071, + "eval_samples_per_second": 403.516, + "eval_steps_per_second": 1.577, + "step": 1345000 + }, + { + "epoch": 5.479641792504041, + "grad_norm": 9.213382720947266, + "learning_rate": 0.00032365276845714927, + "loss": 7.5141, + "step": 1345100 + }, + { + "epoch": 5.480049170527423, + "grad_norm": 11.836997032165527, + "learning_rate": 0.0003234120036824633, + "loss": 7.522, + "step": 1345200 + }, + { + "epoch": 5.480456548550803, + "grad_norm": 10.100482940673828, + "learning_rate": 0.0003231713251576189, + "loss": 7.5438, + "step": 1345300 + }, + { + "epoch": 5.480863926574185, + "grad_norm": 6.054364204406738, + "learning_rate": 0.00032293073289213145, + "loss": 7.5527, + "step": 1345400 + }, + { + "epoch": 5.481271304597566, + "grad_norm": 3.237656593322754, + "learning_rate": 0.0003226902268955121, + "loss": 7.4976, + "step": 1345500 + }, + { + "epoch": 5.481678682620948, + "grad_norm": 4.5124101638793945, + "learning_rate": 0.0003224498071772703, + "loss": 7.538, + "step": 1345600 + }, + { + "epoch": 5.482086060644329, + "grad_norm": 12.343074798583984, + "learning_rate": 0.00032220947374691025, + "loss": 7.4971, + "step": 1345700 + }, + { + "epoch": 5.482493438667711, + "grad_norm": 4.376140594482422, + "learning_rate": 0.0003219692266139342, + "loss": 7.4965, + "step": 1345800 + }, + { + "epoch": 5.482900816691092, + "grad_norm": 4.290546894073486, + "learning_rate": 0.0003217290657878396, + "loss": 7.5243, + "step": 1345900 + }, + { + "epoch": 5.483308194714474, + "grad_norm": 12.027759552001953, + "learning_rate": 0.0003214889912781226, + "loss": 7.5142, + "step": 1346000 + }, + { + "epoch": 5.483308194714474, + "eval_MaskedAccuracy": 0.5125970006508773, + "eval_loss": 1.5909134149551392, + "eval_runtime": 171.889, + "eval_samples_per_second": 369.285, + "eval_steps_per_second": 1.443, + "step": 1346000 + }, + { + "epoch": 5.4837155727378555, + "grad_norm": 3.6816511154174805, + "learning_rate": 0.0003212490030942748, + "loss": 7.5326, + "step": 1346100 + }, + { + "epoch": 5.484122950761237, + "grad_norm": 5.182373523712158, + "learning_rate": 0.00032100910124578284, + "loss": 7.5531, + "step": 1346200 + }, + { + "epoch": 5.4845303287846185, + "grad_norm": 12.852604866027832, + "learning_rate": 0.00032076928574213204, + "loss": 7.5406, + "step": 1346300 + }, + { + "epoch": 5.484937706808, + "grad_norm": 15.884415626525879, + "learning_rate": 0.000320529556592803, + "loss": 7.5342, + "step": 1346400 + }, + { + "epoch": 5.485345084831382, + "grad_norm": 18.99367904663086, + "learning_rate": 0.00032028991380727423, + "loss": 7.5479, + "step": 1346500 + }, + { + "epoch": 5.485752462854762, + "grad_norm": 13.396120071411133, + "learning_rate": 0.00032005035739501994, + "loss": 7.5298, + "step": 1346600 + }, + { + "epoch": 5.486159840878144, + "grad_norm": 15.541460037231445, + "learning_rate": 0.00031981088736551095, + "loss": 7.5432, + "step": 1346700 + }, + { + "epoch": 5.486567218901525, + "grad_norm": 5.564704895019531, + "learning_rate": 0.0003195715037282145, + "loss": 7.5431, + "step": 1346800 + }, + { + "epoch": 5.486974596924907, + "grad_norm": 15.751948356628418, + "learning_rate": 0.0003193322064925953, + "loss": 7.516, + "step": 1346900 + }, + { + "epoch": 5.487381974948288, + "grad_norm": 16.45070457458496, + "learning_rate": 0.00031909299566811364, + "loss": 7.5259, + "step": 1347000 + }, + { + "epoch": 5.487381974948288, + "eval_MaskedAccuracy": 0.5127089098006233, + "eval_loss": 1.5921605825424194, + "eval_runtime": 161.6336, + "eval_samples_per_second": 392.715, + "eval_steps_per_second": 1.534, + "step": 1347000 + }, + { + "epoch": 5.48778935297167, + "grad_norm": 16.948579788208008, + "learning_rate": 0.000318853871264227, + "loss": 7.4926, + "step": 1347100 + }, + { + "epoch": 5.4881967309950515, + "grad_norm": 4.223629951477051, + "learning_rate": 0.0003186148332903895, + "loss": 7.5477, + "step": 1347200 + }, + { + "epoch": 5.488604109018433, + "grad_norm": 4.958531856536865, + "learning_rate": 0.00031837588175605047, + "loss": 7.5472, + "step": 1347300 + }, + { + "epoch": 5.4890114870418145, + "grad_norm": 3.535315990447998, + "learning_rate": 0.0003181370166706577, + "loss": 7.535, + "step": 1347400 + }, + { + "epoch": 5.489418865065196, + "grad_norm": 14.249555587768555, + "learning_rate": 0.00031789823804365553, + "loss": 7.5191, + "step": 1347500 + }, + { + "epoch": 5.489826243088578, + "grad_norm": 3.990600347518921, + "learning_rate": 0.0003176595458844832, + "loss": 7.5293, + "step": 1347600 + }, + { + "epoch": 5.490233621111958, + "grad_norm": 9.97349739074707, + "learning_rate": 0.0003174209402025778, + "loss": 7.5425, + "step": 1347700 + }, + { + "epoch": 5.49064099913534, + "grad_norm": 5.024134635925293, + "learning_rate": 0.00031718242100737274, + "loss": 7.5202, + "step": 1347800 + }, + { + "epoch": 5.491048377158721, + "grad_norm": 10.962465286254883, + "learning_rate": 0.00031694398830829786, + "loss": 7.5292, + "step": 1347900 + }, + { + "epoch": 5.491455755182103, + "grad_norm": 9.160889625549316, + "learning_rate": 0.0003167056421147788, + "loss": 7.5487, + "step": 1348000 + }, + { + "epoch": 5.491455755182103, + "eval_MaskedAccuracy": 0.5124560960872646, + "eval_loss": 1.5988479852676392, + "eval_runtime": 159.5393, + "eval_samples_per_second": 397.871, + "eval_steps_per_second": 1.554, + "step": 1348000 + }, + { + "epoch": 5.491863133205484, + "grad_norm": 7.978877067565918, + "learning_rate": 0.00031646738243624046, + "loss": 7.5229, + "step": 1348100 + }, + { + "epoch": 5.492270511228866, + "grad_norm": 14.206701278686523, + "learning_rate": 0.0003162292092821014, + "loss": 7.5523, + "step": 1348200 + }, + { + "epoch": 5.492677889252247, + "grad_norm": 9.733171463012695, + "learning_rate": 0.0003159911226617778, + "loss": 7.5467, + "step": 1348300 + }, + { + "epoch": 5.493085267275629, + "grad_norm": 14.264520645141602, + "learning_rate": 0.00031575312258468296, + "loss": 7.528, + "step": 1348400 + }, + { + "epoch": 5.4934926452990105, + "grad_norm": 4.115435600280762, + "learning_rate": 0.0003155152090602261, + "loss": 7.5435, + "step": 1348500 + }, + { + "epoch": 5.493900023322392, + "grad_norm": 9.625788688659668, + "learning_rate": 0.0003152773820978131, + "loss": 7.5394, + "step": 1348600 + }, + { + "epoch": 5.494307401345774, + "grad_norm": 8.65286636352539, + "learning_rate": 0.0003150396417068466, + "loss": 7.5411, + "step": 1348700 + }, + { + "epoch": 5.494714779369155, + "grad_norm": 5.142483711242676, + "learning_rate": 0.0003148019878967261, + "loss": 7.5598, + "step": 1348800 + }, + { + "epoch": 5.495122157392537, + "grad_norm": 10.273721694946289, + "learning_rate": 0.0003145644206768466, + "loss": 7.5265, + "step": 1348900 + }, + { + "epoch": 5.495529535415917, + "grad_norm": 9.339920997619629, + "learning_rate": 0.0003143269400566013, + "loss": 7.5208, + "step": 1349000 + }, + { + "epoch": 5.495529535415917, + "eval_MaskedAccuracy": 0.5130556723890132, + "eval_loss": 1.5973339080810547, + "eval_runtime": 164.6236, + "eval_samples_per_second": 385.583, + "eval_steps_per_second": 1.506, + "step": 1349000 + }, + { + "epoch": 5.495936913439299, + "grad_norm": 4.450901985168457, + "learning_rate": 0.00031408954604537854, + "loss": 7.5344, + "step": 1349100 + }, + { + "epoch": 5.49634429146268, + "grad_norm": 10.155784606933594, + "learning_rate": 0.0003138522386525638, + "loss": 7.5323, + "step": 1349200 + }, + { + "epoch": 5.496751669486062, + "grad_norm": 15.838692665100098, + "learning_rate": 0.0003136150178875386, + "loss": 7.5505, + "step": 1349300 + }, + { + "epoch": 5.497159047509443, + "grad_norm": 5.904865264892578, + "learning_rate": 0.0003133778837596825, + "loss": 7.5431, + "step": 1349400 + }, + { + "epoch": 5.497566425532825, + "grad_norm": 9.229758262634277, + "learning_rate": 0.00031314083627837034, + "loss": 7.5248, + "step": 1349500 + }, + { + "epoch": 5.4979738035562065, + "grad_norm": 17.848678588867188, + "learning_rate": 0.00031290387545297336, + "loss": 7.5719, + "step": 1349600 + }, + { + "epoch": 5.498381181579588, + "grad_norm": 16.38506507873535, + "learning_rate": 0.00031266700129286055, + "loss": 7.5813, + "step": 1349700 + }, + { + "epoch": 5.4987885596029695, + "grad_norm": 9.744184494018555, + "learning_rate": 0.00031243021380739707, + "loss": 7.5311, + "step": 1349800 + }, + { + "epoch": 5.499195937626351, + "grad_norm": 6.805414199829102, + "learning_rate": 0.0003121935130059439, + "loss": 7.5201, + "step": 1349900 + }, + { + "epoch": 5.499603315649733, + "grad_norm": 15.914021492004395, + "learning_rate": 0.000311956898897859, + "loss": 7.5276, + "step": 1350000 + }, + { + "epoch": 5.499603315649733, + "eval_MaskedAccuracy": 0.5128690428060072, + "eval_loss": 1.5992703437805176, + "eval_runtime": 159.9813, + "eval_samples_per_second": 396.771, + "eval_steps_per_second": 1.55, + "step": 1350000 + }, + { + "epoch": 5.500010693673114, + "grad_norm": 14.615961074829102, + "learning_rate": 0.000311720371492497, + "loss": 7.5482, + "step": 1350100 + }, + { + "epoch": 5.500418071696496, + "grad_norm": 15.448691368103027, + "learning_rate": 0.0003114839307992093, + "loss": 7.5288, + "step": 1350200 + }, + { + "epoch": 5.500825449719876, + "grad_norm": 2.8489630222320557, + "learning_rate": 0.00031124757682734366, + "loss": 7.5294, + "step": 1350300 + }, + { + "epoch": 5.501232827743258, + "grad_norm": 4.686595439910889, + "learning_rate": 0.0003110113095862444, + "loss": 7.5541, + "step": 1350400 + }, + { + "epoch": 5.501640205766639, + "grad_norm": 4.597413063049316, + "learning_rate": 0.00031077512908525226, + "loss": 7.5222, + "step": 1350500 + }, + { + "epoch": 5.502047583790021, + "grad_norm": 9.691584587097168, + "learning_rate": 0.0003105390353337044, + "loss": 7.5245, + "step": 1350600 + }, + { + "epoch": 5.502454961813402, + "grad_norm": 11.693702697753906, + "learning_rate": 0.0003103030283409355, + "loss": 7.5349, + "step": 1350700 + }, + { + "epoch": 5.502862339836784, + "grad_norm": 5.796513557434082, + "learning_rate": 0.0003100671081162762, + "loss": 7.5517, + "step": 1350800 + }, + { + "epoch": 5.5032697178601655, + "grad_norm": 9.841246604919434, + "learning_rate": 0.00030983127466905285, + "loss": 7.5717, + "step": 1350900 + }, + { + "epoch": 5.503677095883547, + "grad_norm": 21.618650436401367, + "learning_rate": 0.0003095955280085905, + "loss": 7.5008, + "step": 1351000 + }, + { + "epoch": 5.503677095883547, + "eval_MaskedAccuracy": 0.5133096525036167, + "eval_loss": 1.5842745304107666, + "eval_runtime": 154.528, + "eval_samples_per_second": 410.773, + "eval_steps_per_second": 1.605, + "step": 1351000 + }, + { + "epoch": 5.504084473906929, + "grad_norm": 13.956299781799316, + "learning_rate": 0.0003093598681442084, + "loss": 7.5189, + "step": 1351100 + }, + { + "epoch": 5.50449185193031, + "grad_norm": 6.590954303741455, + "learning_rate": 0.00030912429508522376, + "loss": 7.5171, + "step": 1351200 + }, + { + "epoch": 5.504899229953692, + "grad_norm": 4.142152309417725, + "learning_rate": 0.0003088888088409498, + "loss": 7.5613, + "step": 1351300 + }, + { + "epoch": 5.505306607977072, + "grad_norm": 3.0190186500549316, + "learning_rate": 0.00030865340942069715, + "loss": 7.5455, + "step": 1351400 + }, + { + "epoch": 5.505713986000455, + "grad_norm": 9.084418296813965, + "learning_rate": 0.00030841809683377215, + "loss": 7.508, + "step": 1351500 + }, + { + "epoch": 5.506121364023835, + "grad_norm": 10.555480003356934, + "learning_rate": 0.00030818287108947773, + "loss": 7.5278, + "step": 1351600 + }, + { + "epoch": 5.506528742047217, + "grad_norm": 3.6321165561676025, + "learning_rate": 0.0003079477321971144, + "loss": 7.5506, + "step": 1351700 + }, + { + "epoch": 5.506936120070598, + "grad_norm": 8.326172828674316, + "learning_rate": 0.0003077126801659775, + "loss": 7.5182, + "step": 1351800 + }, + { + "epoch": 5.50734349809398, + "grad_norm": 6.167407989501953, + "learning_rate": 0.00030747771500536027, + "loss": 7.4815, + "step": 1351900 + }, + { + "epoch": 5.5077508761173615, + "grad_norm": 6.568759441375732, + "learning_rate": 0.00030724283672455205, + "loss": 7.5345, + "step": 1352000 + }, + { + "epoch": 5.5077508761173615, + "eval_MaskedAccuracy": 0.513475285674313, + "eval_loss": 1.5810248851776123, + "eval_runtime": 153.0557, + "eval_samples_per_second": 414.725, + "eval_steps_per_second": 1.62, + "step": 1352000 + }, + { + "epoch": 5.508158254140743, + "grad_norm": 4.821125030517578, + "learning_rate": 0.00030700804533283937, + "loss": 7.5277, + "step": 1352100 + }, + { + "epoch": 5.5085656321641245, + "grad_norm": 8.832671165466309, + "learning_rate": 0.0003067733408395036, + "loss": 7.5149, + "step": 1352200 + }, + { + "epoch": 5.508973010187506, + "grad_norm": 16.641756057739258, + "learning_rate": 0.0003065387232538247, + "loss": 7.552, + "step": 1352300 + }, + { + "epoch": 5.509380388210888, + "grad_norm": 5.799544334411621, + "learning_rate": 0.0003063041925850786, + "loss": 7.4953, + "step": 1352400 + }, + { + "epoch": 5.509787766234269, + "grad_norm": 16.126590728759766, + "learning_rate": 0.0003060697488425371, + "loss": 7.5339, + "step": 1352500 + }, + { + "epoch": 5.510195144257651, + "grad_norm": 4.758677005767822, + "learning_rate": 0.00030583539203546926, + "loss": 7.5393, + "step": 1352600 + }, + { + "epoch": 5.510602522281031, + "grad_norm": 9.810537338256836, + "learning_rate": 0.0003056011221731403, + "loss": 7.5374, + "step": 1352700 + }, + { + "epoch": 5.511009900304413, + "grad_norm": 5.438594341278076, + "learning_rate": 0.0003053669392648126, + "loss": 7.5487, + "step": 1352800 + }, + { + "epoch": 5.511417278327794, + "grad_norm": 3.1264357566833496, + "learning_rate": 0.0003051328433197442, + "loss": 7.5228, + "step": 1352900 + }, + { + "epoch": 5.511824656351176, + "grad_norm": 5.903149127960205, + "learning_rate": 0.0003048988343471898, + "loss": 7.5106, + "step": 1353000 + }, + { + "epoch": 5.511824656351176, + "eval_MaskedAccuracy": 0.5130156749728397, + "eval_loss": 1.593740463256836, + "eval_runtime": 150.775, + "eval_samples_per_second": 420.998, + "eval_steps_per_second": 1.645, + "step": 1353000 + }, + { + "epoch": 5.512232034374557, + "grad_norm": 4.0846757888793945, + "learning_rate": 0.00030466491235640186, + "loss": 7.5441, + "step": 1353100 + }, + { + "epoch": 5.512639412397939, + "grad_norm": 4.778141975402832, + "learning_rate": 0.00030443107735662797, + "loss": 7.5308, + "step": 1353200 + }, + { + "epoch": 5.5130467904213205, + "grad_norm": 5.338253974914551, + "learning_rate": 0.0003041973293571125, + "loss": 7.5573, + "step": 1353300 + }, + { + "epoch": 5.513454168444702, + "grad_norm": 4.985092639923096, + "learning_rate": 0.0003039636683670981, + "loss": 7.5476, + "step": 1353400 + }, + { + "epoch": 5.513861546468084, + "grad_norm": 13.974929809570312, + "learning_rate": 0.0003037300943958216, + "loss": 7.5322, + "step": 1353500 + }, + { + "epoch": 5.514268924491465, + "grad_norm": 22.895973205566406, + "learning_rate": 0.00030349660745251764, + "loss": 7.5185, + "step": 1353600 + }, + { + "epoch": 5.514676302514847, + "grad_norm": 2.834916353225708, + "learning_rate": 0.0003032632075464173, + "loss": 7.5345, + "step": 1353700 + }, + { + "epoch": 5.515083680538228, + "grad_norm": 8.161876678466797, + "learning_rate": 0.00030302989468674804, + "loss": 7.521, + "step": 1353800 + }, + { + "epoch": 5.51549105856161, + "grad_norm": 19.494930267333984, + "learning_rate": 0.000302796668882734, + "loss": 7.5327, + "step": 1353900 + }, + { + "epoch": 5.51589843658499, + "grad_norm": 19.22008514404297, + "learning_rate": 0.0003025635301435959, + "loss": 7.5419, + "step": 1354000 + }, + { + "epoch": 5.51589843658499, + "eval_MaskedAccuracy": 0.5136081749109903, + "eval_loss": 1.5779005289077759, + "eval_runtime": 159.4313, + "eval_samples_per_second": 398.14, + "eval_steps_per_second": 1.556, + "step": 1354000 + }, + { + "epoch": 5.516305814608372, + "grad_norm": 17.956283569335938, + "learning_rate": 0.00030233047847855085, + "loss": 7.5323, + "step": 1354100 + }, + { + "epoch": 5.516713192631753, + "grad_norm": 9.527266502380371, + "learning_rate": 0.0003020975138968126, + "loss": 7.5419, + "step": 1354200 + }, + { + "epoch": 5.517120570655135, + "grad_norm": 5.125702381134033, + "learning_rate": 0.00030186463640759177, + "loss": 7.5371, + "step": 1354300 + }, + { + "epoch": 5.5175279486785165, + "grad_norm": 11.493252754211426, + "learning_rate": 0.0003016318460200952, + "loss": 7.5557, + "step": 1354400 + }, + { + "epoch": 5.517935326701898, + "grad_norm": 12.407708168029785, + "learning_rate": 0.0003013991427435262, + "loss": 7.4964, + "step": 1354500 + }, + { + "epoch": 5.5183427047252795, + "grad_norm": 5.3618974685668945, + "learning_rate": 0.00030116652658708455, + "loss": 7.5349, + "step": 1354600 + }, + { + "epoch": 5.518750082748661, + "grad_norm": 10.639982223510742, + "learning_rate": 0.0003009339975599669, + "loss": 7.5238, + "step": 1354700 + }, + { + "epoch": 5.519157460772043, + "grad_norm": 6.850088596343994, + "learning_rate": 0.00030070155567136646, + "loss": 7.5272, + "step": 1354800 + }, + { + "epoch": 5.519564838795424, + "grad_norm": 5.3476128578186035, + "learning_rate": 0.00030046920093047304, + "loss": 7.5304, + "step": 1354900 + }, + { + "epoch": 5.519972216818806, + "grad_norm": 38.98796844482422, + "learning_rate": 0.0003002369333464727, + "loss": 7.5189, + "step": 1355000 + }, + { + "epoch": 5.519972216818806, + "eval_MaskedAccuracy": 0.5131691850320736, + "eval_loss": 1.588597297668457, + "eval_runtime": 179.1814, + "eval_samples_per_second": 354.255, + "eval_steps_per_second": 1.384, + "step": 1355000 + }, + { + "epoch": 5.520379594842187, + "grad_norm": 14.150798797607422, + "learning_rate": 0.000300004752928548, + "loss": 7.5311, + "step": 1355100 + }, + { + "epoch": 5.520786972865569, + "grad_norm": 17.762676239013672, + "learning_rate": 0.00029977265968587907, + "loss": 7.5403, + "step": 1355200 + }, + { + "epoch": 5.521194350888949, + "grad_norm": 11.598910331726074, + "learning_rate": 0.00029954065362764154, + "loss": 7.5492, + "step": 1355300 + }, + { + "epoch": 5.521601728912331, + "grad_norm": 13.182674407958984, + "learning_rate": 0.00029930873476300725, + "loss": 7.5275, + "step": 1355400 + }, + { + "epoch": 5.5220091069357125, + "grad_norm": 13.883099555969238, + "learning_rate": 0.00029907690310114514, + "loss": 7.5435, + "step": 1355500 + }, + { + "epoch": 5.522416484959094, + "grad_norm": 8.441153526306152, + "learning_rate": 0.0002988451586512213, + "loss": 7.5111, + "step": 1355600 + }, + { + "epoch": 5.5228238629824755, + "grad_norm": 11.882668495178223, + "learning_rate": 0.000298613501422398, + "loss": 7.5076, + "step": 1355700 + }, + { + "epoch": 5.523231241005857, + "grad_norm": 25.98122215270996, + "learning_rate": 0.00029838193142383375, + "loss": 7.5322, + "step": 1355800 + }, + { + "epoch": 5.523638619029239, + "grad_norm": 11.909917831420898, + "learning_rate": 0.0002981504486646836, + "loss": 7.5161, + "step": 1355900 + }, + { + "epoch": 5.52404599705262, + "grad_norm": 13.028166770935059, + "learning_rate": 0.00029791905315409826, + "loss": 7.5113, + "step": 1356000 + }, + { + "epoch": 5.52404599705262, + "eval_MaskedAccuracy": 0.5132002872127008, + "eval_loss": 1.586454153060913, + "eval_runtime": 156.1492, + "eval_samples_per_second": 406.509, + "eval_steps_per_second": 1.588, + "step": 1356000 + }, + { + "epoch": 5.524453375076002, + "grad_norm": 5.624743938446045, + "learning_rate": 0.0002976877449012282, + "loss": 7.5335, + "step": 1356100 + }, + { + "epoch": 5.524860753099383, + "grad_norm": 7.9186530113220215, + "learning_rate": 0.00029745652391521653, + "loss": 7.5268, + "step": 1356200 + }, + { + "epoch": 5.525268131122765, + "grad_norm": 4.483052730560303, + "learning_rate": 0.0002972253902052055, + "loss": 7.5209, + "step": 1356300 + }, + { + "epoch": 5.525675509146145, + "grad_norm": 4.488603591918945, + "learning_rate": 0.0002969943437803331, + "loss": 7.5226, + "step": 1356400 + }, + { + "epoch": 5.526082887169528, + "grad_norm": 4.400208473205566, + "learning_rate": 0.0002967633846497328, + "loss": 7.5213, + "step": 1356500 + }, + { + "epoch": 5.526490265192908, + "grad_norm": 3.501458168029785, + "learning_rate": 0.00029653251282253694, + "loss": 7.5381, + "step": 1356600 + }, + { + "epoch": 5.52689764321629, + "grad_norm": 3.340562582015991, + "learning_rate": 0.0002963017283078713, + "loss": 7.5285, + "step": 1356700 + }, + { + "epoch": 5.5273050212396715, + "grad_norm": 5.059596538543701, + "learning_rate": 0.00029607103111486266, + "loss": 7.5223, + "step": 1356800 + }, + { + "epoch": 5.527712399263053, + "grad_norm": 3.181023120880127, + "learning_rate": 0.00029584042125262974, + "loss": 7.5276, + "step": 1356900 + }, + { + "epoch": 5.5281197772864346, + "grad_norm": 15.770975112915039, + "learning_rate": 0.00029560989873029077, + "loss": 7.5245, + "step": 1357000 + }, + { + "epoch": 5.5281197772864346, + "eval_MaskedAccuracy": 0.51299556931503, + "eval_loss": 1.5948057174682617, + "eval_runtime": 153.1605, + "eval_samples_per_second": 414.441, + "eval_steps_per_second": 1.619, + "step": 1357000 + }, + { + "epoch": 5.528527155309816, + "grad_norm": 5.648419380187988, + "learning_rate": 0.00029537946355695895, + "loss": 7.5298, + "step": 1357100 + }, + { + "epoch": 5.528934533333198, + "grad_norm": 27.536808013916016, + "learning_rate": 0.0002951491157417453, + "loss": 7.5066, + "step": 1357200 + }, + { + "epoch": 5.529341911356579, + "grad_norm": 4.438647270202637, + "learning_rate": 0.0002949188552937557, + "loss": 7.5134, + "step": 1357300 + }, + { + "epoch": 5.529749289379961, + "grad_norm": 10.924798965454102, + "learning_rate": 0.00029468868222209423, + "loss": 7.477, + "step": 1357400 + }, + { + "epoch": 5.530156667403342, + "grad_norm": 3.520021438598633, + "learning_rate": 0.0002944585965358611, + "loss": 7.5538, + "step": 1357500 + }, + { + "epoch": 5.530564045426724, + "grad_norm": 11.51445484161377, + "learning_rate": 0.0002942285982441522, + "loss": 7.533, + "step": 1357600 + }, + { + "epoch": 5.530971423450104, + "grad_norm": 5.935987949371338, + "learning_rate": 0.00029399868735606075, + "loss": 7.5299, + "step": 1357700 + }, + { + "epoch": 5.531378801473486, + "grad_norm": 9.059341430664062, + "learning_rate": 0.00029376886388067644, + "loss": 7.5327, + "step": 1357800 + }, + { + "epoch": 5.5317861794968675, + "grad_norm": 17.713075637817383, + "learning_rate": 0.0002935391278270857, + "loss": 7.5326, + "step": 1357900 + }, + { + "epoch": 5.532193557520249, + "grad_norm": 4.2288947105407715, + "learning_rate": 0.00029330947920437074, + "loss": 7.533, + "step": 1358000 + }, + { + "epoch": 5.532193557520249, + "eval_MaskedAccuracy": 0.5129954861303083, + "eval_loss": 1.5838136672973633, + "eval_runtime": 156.0729, + "eval_samples_per_second": 406.707, + "eval_steps_per_second": 1.589, + "step": 1358000 + }, + { + "epoch": 5.5326009355436305, + "grad_norm": 12.103240966796875, + "learning_rate": 0.0002930799180216107, + "loss": 7.5148, + "step": 1358100 + }, + { + "epoch": 5.533008313567012, + "grad_norm": 3.613372325897217, + "learning_rate": 0.00029285044428788186, + "loss": 7.5113, + "step": 1358200 + }, + { + "epoch": 5.533415691590394, + "grad_norm": 11.579863548278809, + "learning_rate": 0.00029262105801225653, + "loss": 7.5425, + "step": 1358300 + }, + { + "epoch": 5.533823069613775, + "grad_norm": 5.354294300079346, + "learning_rate": 0.00029239175920380353, + "loss": 7.5302, + "step": 1358400 + }, + { + "epoch": 5.534230447637157, + "grad_norm": 4.83122444152832, + "learning_rate": 0.00029216254787158783, + "loss": 7.5372, + "step": 1358500 + }, + { + "epoch": 5.534637825660538, + "grad_norm": 19.561399459838867, + "learning_rate": 0.0002919334240246718, + "loss": 7.5163, + "step": 1358600 + }, + { + "epoch": 5.53504520368392, + "grad_norm": 22.466936111450195, + "learning_rate": 0.00029170438767211327, + "loss": 7.54, + "step": 1358700 + }, + { + "epoch": 5.535452581707301, + "grad_norm": 5.240617752075195, + "learning_rate": 0.00029147543882296865, + "loss": 7.5319, + "step": 1358800 + }, + { + "epoch": 5.535859959730683, + "grad_norm": 5.821611404418945, + "learning_rate": 0.0002912465774862884, + "loss": 7.5175, + "step": 1358900 + }, + { + "epoch": 5.536267337754063, + "grad_norm": 26.9658260345459, + "learning_rate": 0.00029101780367112094, + "loss": 7.5253, + "step": 1359000 + }, + { + "epoch": 5.536267337754063, + "eval_MaskedAccuracy": 0.5132573493254089, + "eval_loss": 1.5926933288574219, + "eval_runtime": 163.301, + "eval_samples_per_second": 388.706, + "eval_steps_per_second": 1.519, + "step": 1359000 + }, + { + "epoch": 5.536674715777445, + "grad_norm": 4.119058609008789, + "learning_rate": 0.00029078911738651106, + "loss": 7.5531, + "step": 1359100 + }, + { + "epoch": 5.5370820938008265, + "grad_norm": 3.332658529281616, + "learning_rate": 0.0002905605186414999, + "loss": 7.5236, + "step": 1359200 + }, + { + "epoch": 5.537489471824208, + "grad_norm": 8.222732543945312, + "learning_rate": 0.00029033200744512473, + "loss": 7.5292, + "step": 1359300 + }, + { + "epoch": 5.53789684984759, + "grad_norm": 11.296479225158691, + "learning_rate": 0.0002901035838064204, + "loss": 7.5207, + "step": 1359400 + }, + { + "epoch": 5.538304227870971, + "grad_norm": 12.594161033630371, + "learning_rate": 0.00028987524773441744, + "loss": 7.5036, + "step": 1359500 + }, + { + "epoch": 5.538711605894353, + "grad_norm": 8.876920700073242, + "learning_rate": 0.00028964699923814336, + "loss": 7.5404, + "step": 1359600 + }, + { + "epoch": 5.539118983917734, + "grad_norm": 4.410617828369141, + "learning_rate": 0.00028941883832662177, + "loss": 7.525, + "step": 1359700 + }, + { + "epoch": 5.539526361941116, + "grad_norm": 6.2033538818359375, + "learning_rate": 0.00028919076500887394, + "loss": 7.5347, + "step": 1359800 + }, + { + "epoch": 5.539933739964497, + "grad_norm": 9.682522773742676, + "learning_rate": 0.0002889627792939154, + "loss": 7.5422, + "step": 1359900 + }, + { + "epoch": 5.540341117987879, + "grad_norm": 18.145429611206055, + "learning_rate": 0.0002887348811907606, + "loss": 7.5459, + "step": 1360000 + }, + { + "epoch": 5.540341117987879, + "eval_MaskedAccuracy": 0.5131748225935678, + "eval_loss": 1.59002685546875, + "eval_runtime": 153.5122, + "eval_samples_per_second": 413.492, + "eval_steps_per_second": 1.616, + "step": 1360000 + }, + { + "epoch": 5.54074849601126, + "grad_norm": 18.38419532775879, + "learning_rate": 0.0002885070707084194, + "loss": 7.5203, + "step": 1360100 + }, + { + "epoch": 5.541155874034642, + "grad_norm": 14.297173500061035, + "learning_rate": 0.00028827934785589807, + "loss": 7.5336, + "step": 1360200 + }, + { + "epoch": 5.5415632520580225, + "grad_norm": 6.6692633628845215, + "learning_rate": 0.0002880517126422007, + "loss": 7.5225, + "step": 1360300 + }, + { + "epoch": 5.541970630081404, + "grad_norm": 11.438497543334961, + "learning_rate": 0.0002878241650763257, + "loss": 7.5433, + "step": 1360400 + }, + { + "epoch": 5.5423780081047855, + "grad_norm": 3.0007681846618652, + "learning_rate": 0.0002875967051672699, + "loss": 7.5249, + "step": 1360500 + }, + { + "epoch": 5.542785386128167, + "grad_norm": 4.182516098022461, + "learning_rate": 0.00028736933292402515, + "loss": 7.5555, + "step": 1360600 + }, + { + "epoch": 5.543192764151549, + "grad_norm": 7.271773338317871, + "learning_rate": 0.00028714204835558077, + "loss": 7.4932, + "step": 1360700 + }, + { + "epoch": 5.54360014217493, + "grad_norm": 9.270909309387207, + "learning_rate": 0.00028691485147092435, + "loss": 7.4946, + "step": 1360800 + }, + { + "epoch": 5.544007520198312, + "grad_norm": 5.700077533721924, + "learning_rate": 0.00028668774227903716, + "loss": 7.5015, + "step": 1360900 + }, + { + "epoch": 5.544414898221693, + "grad_norm": 30.443313598632812, + "learning_rate": 0.0002864607207888979, + "loss": 7.5364, + "step": 1361000 + }, + { + "epoch": 5.544414898221693, + "eval_MaskedAccuracy": 0.5132442484916322, + "eval_loss": 1.5818790197372437, + "eval_runtime": 150.6778, + "eval_samples_per_second": 421.27, + "eval_steps_per_second": 1.646, + "step": 1361000 + }, + { + "epoch": 5.544822276245075, + "grad_norm": 10.397770881652832, + "learning_rate": 0.00028623378700948156, + "loss": 7.4893, + "step": 1361100 + }, + { + "epoch": 5.545229654268456, + "grad_norm": 7.334549427032471, + "learning_rate": 0.00028600694094976076, + "loss": 7.4905, + "step": 1361200 + }, + { + "epoch": 5.545637032291838, + "grad_norm": 8.588781356811523, + "learning_rate": 0.0002857801826187033, + "loss": 7.4941, + "step": 1361300 + }, + { + "epoch": 5.546044410315218, + "grad_norm": 16.37862777709961, + "learning_rate": 0.0002855535120252745, + "loss": 7.5173, + "step": 1361400 + }, + { + "epoch": 5.546451788338601, + "grad_norm": 5.2927165031433105, + "learning_rate": 0.0002853269291784358, + "loss": 7.4858, + "step": 1361500 + }, + { + "epoch": 5.5468591663619815, + "grad_norm": 3.8727829456329346, + "learning_rate": 0.00028510043408714567, + "loss": 7.519, + "step": 1361600 + }, + { + "epoch": 5.547266544385363, + "grad_norm": 10.189196586608887, + "learning_rate": 0.0002848740267603579, + "loss": 7.5252, + "step": 1361700 + }, + { + "epoch": 5.547673922408745, + "grad_norm": 7.4850029945373535, + "learning_rate": 0.00028464770720702397, + "loss": 7.5186, + "step": 1361800 + }, + { + "epoch": 5.548081300432126, + "grad_norm": 5.111363887786865, + "learning_rate": 0.0002844214754360912, + "loss": 7.5465, + "step": 1361900 + }, + { + "epoch": 5.548488678455508, + "grad_norm": 8.511821746826172, + "learning_rate": 0.00028419533145650455, + "loss": 7.5218, + "step": 1362000 + }, + { + "epoch": 5.548488678455508, + "eval_MaskedAccuracy": 0.5126793560835217, + "eval_loss": 1.594093680381775, + "eval_runtime": 163.6107, + "eval_samples_per_second": 387.97, + "eval_steps_per_second": 1.516, + "step": 1362000 + }, + { + "epoch": 5.548896056478889, + "grad_norm": 10.877079963684082, + "learning_rate": 0.0002839692752772038, + "loss": 7.545, + "step": 1362100 + }, + { + "epoch": 5.549303434502271, + "grad_norm": 3.9173827171325684, + "learning_rate": 0.00028374330690712654, + "loss": 7.5129, + "step": 1362200 + }, + { + "epoch": 5.549710812525652, + "grad_norm": 5.803036212921143, + "learning_rate": 0.0002835174263552065, + "loss": 7.5282, + "step": 1362300 + }, + { + "epoch": 5.550118190549034, + "grad_norm": 4.023270606994629, + "learning_rate": 0.0002832916336303739, + "loss": 7.5377, + "step": 1362400 + }, + { + "epoch": 5.550525568572415, + "grad_norm": 7.85737943649292, + "learning_rate": 0.00028306592874155565, + "loss": 7.524, + "step": 1362500 + }, + { + "epoch": 5.550932946595797, + "grad_norm": 5.284960746765137, + "learning_rate": 0.000282840311697675, + "loss": 7.5251, + "step": 1362600 + }, + { + "epoch": 5.5513403246191775, + "grad_norm": 7.164961338043213, + "learning_rate": 0.0002826147825076513, + "loss": 7.5379, + "step": 1362700 + }, + { + "epoch": 5.551747702642559, + "grad_norm": 2.8433749675750732, + "learning_rate": 0.0002823893411804015, + "loss": 7.5438, + "step": 1362800 + }, + { + "epoch": 5.5521550806659405, + "grad_norm": 4.6245341300964355, + "learning_rate": 0.0002821639877248384, + "loss": 7.5236, + "step": 1362900 + }, + { + "epoch": 5.552562458689322, + "grad_norm": 6.554775238037109, + "learning_rate": 0.0002819387221498716, + "loss": 7.5347, + "step": 1363000 + }, + { + "epoch": 5.552562458689322, + "eval_MaskedAccuracy": 0.5128569088679895, + "eval_loss": 1.594756007194519, + "eval_runtime": 186.1972, + "eval_samples_per_second": 340.907, + "eval_steps_per_second": 1.332, + "step": 1363000 + }, + { + "epoch": 5.552969836712704, + "grad_norm": 4.0478668212890625, + "learning_rate": 0.00028171354446440683, + "loss": 7.5135, + "step": 1363100 + }, + { + "epoch": 5.553377214736085, + "grad_norm": 29.875524520874023, + "learning_rate": 0.0002814884546773468, + "loss": 7.5379, + "step": 1363200 + }, + { + "epoch": 5.553784592759467, + "grad_norm": 6.030183792114258, + "learning_rate": 0.00028126345279759024, + "loss": 7.5234, + "step": 1363300 + }, + { + "epoch": 5.554191970782848, + "grad_norm": 8.036584854125977, + "learning_rate": 0.00028103853883403273, + "loss": 7.5175, + "step": 1363400 + }, + { + "epoch": 5.55459934880623, + "grad_norm": 20.940343856811523, + "learning_rate": 0.000280813712795566, + "loss": 7.5369, + "step": 1363500 + }, + { + "epoch": 5.555006726829611, + "grad_norm": 3.3316187858581543, + "learning_rate": 0.00028058897469107955, + "loss": 7.5088, + "step": 1363600 + }, + { + "epoch": 5.555414104852993, + "grad_norm": 18.483346939086914, + "learning_rate": 0.00028036432452945736, + "loss": 7.5237, + "step": 1363700 + }, + { + "epoch": 5.555821482876374, + "grad_norm": 7.354155540466309, + "learning_rate": 0.00028013976231958153, + "loss": 7.5373, + "step": 1363800 + }, + { + "epoch": 5.556228860899756, + "grad_norm": 28.18296241760254, + "learning_rate": 0.0002799152880703307, + "loss": 7.5265, + "step": 1363900 + }, + { + "epoch": 5.5566362389231365, + "grad_norm": 16.456531524658203, + "learning_rate": 0.00027969090179057903, + "loss": 7.5101, + "step": 1364000 + }, + { + "epoch": 5.5566362389231365, + "eval_MaskedAccuracy": 0.5128969486337993, + "eval_loss": 1.5997278690338135, + "eval_runtime": 174.6059, + "eval_samples_per_second": 363.539, + "eval_steps_per_second": 1.42, + "step": 1364000 + }, + { + "epoch": 5.557043616946518, + "grad_norm": 3.2516138553619385, + "learning_rate": 0.00027946660348919793, + "loss": 7.5338, + "step": 1364100 + }, + { + "epoch": 5.5574509949699, + "grad_norm": 14.053657531738281, + "learning_rate": 0.0002792423931750547, + "loss": 7.5362, + "step": 1364200 + }, + { + "epoch": 5.557858372993281, + "grad_norm": 15.374994277954102, + "learning_rate": 0.0002790182708570141, + "loss": 7.5311, + "step": 1364300 + }, + { + "epoch": 5.558265751016663, + "grad_norm": 9.491127967834473, + "learning_rate": 0.0002787942365439364, + "loss": 7.4937, + "step": 1364400 + }, + { + "epoch": 5.558673129040044, + "grad_norm": 16.13951873779297, + "learning_rate": 0.0002785702902446793, + "loss": 7.5448, + "step": 1364500 + }, + { + "epoch": 5.559080507063426, + "grad_norm": 15.348613739013672, + "learning_rate": 0.00027834643196809655, + "loss": 7.5298, + "step": 1364600 + }, + { + "epoch": 5.559487885086807, + "grad_norm": 5.019101619720459, + "learning_rate": 0.00027812266172303737, + "loss": 7.5188, + "step": 1364700 + }, + { + "epoch": 5.559895263110189, + "grad_norm": 11.431594848632812, + "learning_rate": 0.0002778989795183505, + "loss": 7.529, + "step": 1364800 + }, + { + "epoch": 5.56030264113357, + "grad_norm": 12.806009292602539, + "learning_rate": 0.0002776753853628783, + "loss": 7.4886, + "step": 1364900 + }, + { + "epoch": 5.560710019156952, + "grad_norm": 20.750797271728516, + "learning_rate": 0.0002774518792654607, + "loss": 7.5241, + "step": 1365000 + }, + { + "epoch": 5.560710019156952, + "eval_MaskedAccuracy": 0.5137996287967581, + "eval_loss": 1.578283429145813, + "eval_runtime": 172.6123, + "eval_samples_per_second": 367.737, + "eval_steps_per_second": 1.437, + "step": 1365000 + }, + { + "epoch": 5.561117397180333, + "grad_norm": 10.64334774017334, + "learning_rate": 0.0002772284612349343, + "loss": 7.5076, + "step": 1365100 + }, + { + "epoch": 5.561524775203715, + "grad_norm": 9.669614791870117, + "learning_rate": 0.0002770051312801313, + "loss": 7.5107, + "step": 1365200 + }, + { + "epoch": 5.5619321532270956, + "grad_norm": 18.724956512451172, + "learning_rate": 0.00027678188940988193, + "loss": 7.5083, + "step": 1365300 + }, + { + "epoch": 5.562339531250477, + "grad_norm": 4.32308292388916, + "learning_rate": 0.00027655873563301164, + "loss": 7.509, + "step": 1365400 + }, + { + "epoch": 5.562746909273859, + "grad_norm": 23.26671028137207, + "learning_rate": 0.00027633566995834325, + "loss": 7.4798, + "step": 1365500 + }, + { + "epoch": 5.56315428729724, + "grad_norm": 6.140720367431641, + "learning_rate": 0.0002761126923946954, + "loss": 7.5345, + "step": 1365600 + }, + { + "epoch": 5.563561665320622, + "grad_norm": 8.759008407592773, + "learning_rate": 0.00027588980295088403, + "loss": 7.5419, + "step": 1365700 + }, + { + "epoch": 5.563969043344003, + "grad_norm": 4.176877021789551, + "learning_rate": 0.0002756670016357203, + "loss": 7.515, + "step": 1365800 + }, + { + "epoch": 5.564376421367385, + "grad_norm": 3.5711328983306885, + "learning_rate": 0.000275444288458014, + "loss": 7.5399, + "step": 1365900 + }, + { + "epoch": 5.564783799390766, + "grad_norm": 14.740789413452148, + "learning_rate": 0.00027522166342656935, + "loss": 7.5484, + "step": 1366000 + }, + { + "epoch": 5.564783799390766, + "eval_MaskedAccuracy": 0.5134230409177121, + "eval_loss": 1.5808014869689941, + "eval_runtime": 177.2087, + "eval_samples_per_second": 358.199, + "eval_steps_per_second": 1.399, + "step": 1366000 + }, + { + "epoch": 5.565191177414148, + "grad_norm": 19.410436630249023, + "learning_rate": 0.0002749991265501877, + "loss": 7.5297, + "step": 1366100 + }, + { + "epoch": 5.565598555437529, + "grad_norm": 12.808557510375977, + "learning_rate": 0.0002747766778376676, + "loss": 7.5098, + "step": 1366200 + }, + { + "epoch": 5.566005933460911, + "grad_norm": 11.51230239868164, + "learning_rate": 0.00027455431729780314, + "loss": 7.5165, + "step": 1366300 + }, + { + "epoch": 5.5664133114842915, + "grad_norm": 9.870705604553223, + "learning_rate": 0.0002743320449393862, + "loss": 7.5126, + "step": 1366400 + }, + { + "epoch": 5.566820689507674, + "grad_norm": 10.526515007019043, + "learning_rate": 0.00027410986077120434, + "loss": 7.5364, + "step": 1366500 + }, + { + "epoch": 5.567228067531055, + "grad_norm": 4.075404167175293, + "learning_rate": 0.00027388776480204017, + "loss": 7.5229, + "step": 1366600 + }, + { + "epoch": 5.567635445554436, + "grad_norm": 6.086880207061768, + "learning_rate": 0.0002736657570406758, + "loss": 7.5175, + "step": 1366700 + }, + { + "epoch": 5.568042823577818, + "grad_norm": 11.606575965881348, + "learning_rate": 0.0002734438374958877, + "loss": 7.5473, + "step": 1366800 + }, + { + "epoch": 5.568450201601199, + "grad_norm": 6.996771335601807, + "learning_rate": 0.00027322200617645035, + "loss": 7.54, + "step": 1366900 + }, + { + "epoch": 5.568857579624581, + "grad_norm": 8.935172080993652, + "learning_rate": 0.00027300026309113375, + "loss": 7.5155, + "step": 1367000 + }, + { + "epoch": 5.568857579624581, + "eval_MaskedAccuracy": 0.5129643316707965, + "eval_loss": 1.5887912511825562, + "eval_runtime": 168.8896, + "eval_samples_per_second": 375.843, + "eval_steps_per_second": 1.468, + "step": 1367000 + }, + { + "epoch": 5.569264957647962, + "grad_norm": 14.77796459197998, + "learning_rate": 0.0002727786082487044, + "loss": 7.5348, + "step": 1367100 + }, + { + "epoch": 5.569672335671344, + "grad_norm": 5.776971817016602, + "learning_rate": 0.0002725570416579253, + "loss": 7.4868, + "step": 1367200 + }, + { + "epoch": 5.570079713694725, + "grad_norm": 3.8741743564605713, + "learning_rate": 0.0002723355633275565, + "loss": 7.527, + "step": 1367300 + }, + { + "epoch": 5.570487091718107, + "grad_norm": 17.044811248779297, + "learning_rate": 0.00027211417326635396, + "loss": 7.5317, + "step": 1367400 + }, + { + "epoch": 5.570894469741488, + "grad_norm": 3.809858798980713, + "learning_rate": 0.0002718928714830707, + "loss": 7.5072, + "step": 1367500 + }, + { + "epoch": 5.57130184776487, + "grad_norm": 16.063383102416992, + "learning_rate": 0.0002716716579864553, + "loss": 7.5465, + "step": 1367600 + }, + { + "epoch": 5.571709225788251, + "grad_norm": 18.389799118041992, + "learning_rate": 0.00027145053278525413, + "loss": 7.5387, + "step": 1367700 + }, + { + "epoch": 5.572116603811632, + "grad_norm": 2.9917666912078857, + "learning_rate": 0.00027122949588820913, + "loss": 7.5321, + "step": 1367800 + }, + { + "epoch": 5.572523981835014, + "grad_norm": 4.532822608947754, + "learning_rate": 0.000271008547304059, + "loss": 7.5473, + "step": 1367900 + }, + { + "epoch": 5.572931359858395, + "grad_norm": 6.056094169616699, + "learning_rate": 0.00027078768704153915, + "loss": 7.4983, + "step": 1368000 + }, + { + "epoch": 5.572931359858395, + "eval_MaskedAccuracy": 0.5129696851753682, + "eval_loss": 1.5889724493026733, + "eval_runtime": 180.1956, + "eval_samples_per_second": 352.262, + "eval_steps_per_second": 1.376, + "step": 1368000 + }, + { + "epoch": 5.573338737881777, + "grad_norm": 4.154176235198975, + "learning_rate": 0.00027056691510938163, + "loss": 7.547, + "step": 1368100 + }, + { + "epoch": 5.573746115905158, + "grad_norm": 19.629858016967773, + "learning_rate": 0.00027034623151631456, + "loss": 7.5552, + "step": 1368200 + }, + { + "epoch": 5.57415349392854, + "grad_norm": 6.804897308349609, + "learning_rate": 0.00027012563627106304, + "loss": 7.5076, + "step": 1368300 + }, + { + "epoch": 5.574560871951921, + "grad_norm": 3.384577512741089, + "learning_rate": 0.00026990512938234785, + "loss": 7.5544, + "step": 1368400 + }, + { + "epoch": 5.574968249975303, + "grad_norm": 9.741466522216797, + "learning_rate": 0.00026968471085888707, + "loss": 7.5361, + "step": 1368500 + }, + { + "epoch": 5.575375627998684, + "grad_norm": 7.53199577331543, + "learning_rate": 0.00026946438070939464, + "loss": 7.5592, + "step": 1368600 + }, + { + "epoch": 5.575783006022066, + "grad_norm": 5.295625686645508, + "learning_rate": 0.00026924413894258245, + "loss": 7.5428, + "step": 1368700 + }, + { + "epoch": 5.576190384045447, + "grad_norm": 4.960668563842773, + "learning_rate": 0.0002690239855671566, + "loss": 7.5395, + "step": 1368800 + }, + { + "epoch": 5.576597762068829, + "grad_norm": 3.418715000152588, + "learning_rate": 0.0002688039205918214, + "loss": 7.5282, + "step": 1368900 + }, + { + "epoch": 5.57700514009221, + "grad_norm": 7.8770976066589355, + "learning_rate": 0.00026858394402527736, + "loss": 7.5509, + "step": 1369000 + }, + { + "epoch": 5.57700514009221, + "eval_MaskedAccuracy": 0.5129721241031361, + "eval_loss": 1.5839240550994873, + "eval_runtime": 176.2656, + "eval_samples_per_second": 360.116, + "eval_steps_per_second": 1.407, + "step": 1369000 + }, + { + "epoch": 5.577412518115591, + "grad_norm": 3.534111261367798, + "learning_rate": 0.000268364055876221, + "loss": 7.502, + "step": 1369100 + }, + { + "epoch": 5.577819896138973, + "grad_norm": 5.81339168548584, + "learning_rate": 0.0002681442561533461, + "loss": 7.5105, + "step": 1369200 + }, + { + "epoch": 5.578227274162354, + "grad_norm": 7.141969203948975, + "learning_rate": 0.0002679245448653422, + "loss": 7.5268, + "step": 1369300 + }, + { + "epoch": 5.578634652185736, + "grad_norm": 5.7050018310546875, + "learning_rate": 0.0002677049220208963, + "loss": 7.5165, + "step": 1369400 + }, + { + "epoch": 5.579042030209117, + "grad_norm": 3.8554022312164307, + "learning_rate": 0.00026748538762869025, + "loss": 7.5172, + "step": 1369500 + }, + { + "epoch": 5.579449408232499, + "grad_norm": 5.558248996734619, + "learning_rate": 0.00026726594169740374, + "loss": 7.5197, + "step": 1369600 + }, + { + "epoch": 5.57985678625588, + "grad_norm": 4.008612155914307, + "learning_rate": 0.0002670465842357129, + "loss": 7.5379, + "step": 1369700 + }, + { + "epoch": 5.580264164279262, + "grad_norm": 2.9142096042633057, + "learning_rate": 0.00026682731525229023, + "loss": 7.5163, + "step": 1369800 + }, + { + "epoch": 5.580671542302643, + "grad_norm": 12.786030769348145, + "learning_rate": 0.00026660813475580427, + "loss": 7.504, + "step": 1369900 + }, + { + "epoch": 5.581078920326025, + "grad_norm": 23.525224685668945, + "learning_rate": 0.0002663890427549205, + "loss": 7.5243, + "step": 1370000 + }, + { + "epoch": 5.581078920326025, + "eval_MaskedAccuracy": 0.5129621062762936, + "eval_loss": 1.5875533819198608, + "eval_runtime": 175.4683, + "eval_samples_per_second": 361.752, + "eval_steps_per_second": 1.413, + "step": 1370000 + }, + { + "epoch": 5.5814862983494065, + "grad_norm": 4.683876037597656, + "learning_rate": 0.000266170039258301, + "loss": 7.5268, + "step": 1370100 + }, + { + "epoch": 5.581893676372788, + "grad_norm": 7.082616806030273, + "learning_rate": 0.00026595112427460363, + "loss": 7.5114, + "step": 1370200 + }, + { + "epoch": 5.582301054396169, + "grad_norm": 17.532716751098633, + "learning_rate": 0.0002657322978124838, + "loss": 7.5402, + "step": 1370300 + }, + { + "epoch": 5.58270843241955, + "grad_norm": 7.782750129699707, + "learning_rate": 0.00026551355988059283, + "loss": 7.5323, + "step": 1370400 + }, + { + "epoch": 5.583115810442932, + "grad_norm": 13.5018892288208, + "learning_rate": 0.00026529491048757846, + "loss": 7.5541, + "step": 1370500 + }, + { + "epoch": 5.583523188466313, + "grad_norm": 13.190478324890137, + "learning_rate": 0.0002650763496420836, + "loss": 7.5248, + "step": 1370600 + }, + { + "epoch": 5.583930566489695, + "grad_norm": 33.09577560424805, + "learning_rate": 0.00026485787735275145, + "loss": 7.5121, + "step": 1370700 + }, + { + "epoch": 5.584337944513076, + "grad_norm": 12.808554649353027, + "learning_rate": 0.00026463949362821924, + "loss": 7.515, + "step": 1370800 + }, + { + "epoch": 5.584745322536458, + "grad_norm": 15.17452621459961, + "learning_rate": 0.00026442119847712014, + "loss": 7.5279, + "step": 1370900 + }, + { + "epoch": 5.585152700559839, + "grad_norm": 4.880710601806641, + "learning_rate": 0.0002642029919080844, + "loss": 7.5566, + "step": 1371000 + }, + { + "epoch": 5.585152700559839, + "eval_MaskedAccuracy": 0.5137202102691227, + "eval_loss": 1.5842161178588867, + "eval_runtime": 159.5518, + "eval_samples_per_second": 397.839, + "eval_steps_per_second": 1.554, + "step": 1371000 + }, + { + "epoch": 5.585560078583221, + "grad_norm": 4.810088157653809, + "learning_rate": 0.0002639848739297388, + "loss": 7.5377, + "step": 1371100 + }, + { + "epoch": 5.585967456606602, + "grad_norm": 27.958778381347656, + "learning_rate": 0.0002637668445507071, + "loss": 7.539, + "step": 1371200 + }, + { + "epoch": 5.586374834629984, + "grad_norm": 15.798528671264648, + "learning_rate": 0.00026354890377960856, + "loss": 7.5078, + "step": 1371300 + }, + { + "epoch": 5.586782212653365, + "grad_norm": 23.275903701782227, + "learning_rate": 0.0002633310516250602, + "loss": 7.5168, + "step": 1371400 + }, + { + "epoch": 5.587189590676747, + "grad_norm": 15.988101959228516, + "learning_rate": 0.0002631132880956747, + "loss": 7.4983, + "step": 1371500 + }, + { + "epoch": 5.587596968700128, + "grad_norm": 11.174032211303711, + "learning_rate": 0.00026289561320006117, + "loss": 7.5152, + "step": 1371600 + }, + { + "epoch": 5.588004346723509, + "grad_norm": 6.301385402679443, + "learning_rate": 0.00026267802694682636, + "loss": 7.5195, + "step": 1371700 + }, + { + "epoch": 5.588411724746891, + "grad_norm": 16.60630989074707, + "learning_rate": 0.00026246052934457154, + "loss": 7.5452, + "step": 1371800 + }, + { + "epoch": 5.588819102770272, + "grad_norm": 5.048940658569336, + "learning_rate": 0.0002622431204018957, + "loss": 7.5028, + "step": 1371900 + }, + { + "epoch": 5.589226480793654, + "grad_norm": 12.170727729797363, + "learning_rate": 0.00026202580012739456, + "loss": 7.5008, + "step": 1372000 + }, + { + "epoch": 5.589226480793654, + "eval_MaskedAccuracy": 0.513522222905867, + "eval_loss": 1.5896488428115845, + "eval_runtime": 194.5768, + "eval_samples_per_second": 326.226, + "eval_steps_per_second": 1.275, + "step": 1372000 + }, + { + "epoch": 5.589633858817035, + "grad_norm": 6.673583030700684, + "learning_rate": 0.0002618085685296599, + "loss": 7.5521, + "step": 1372100 + }, + { + "epoch": 5.590041236840417, + "grad_norm": 18.239015579223633, + "learning_rate": 0.00026159142561728017, + "loss": 7.5297, + "step": 1372200 + }, + { + "epoch": 5.590448614863798, + "grad_norm": 4.859915733337402, + "learning_rate": 0.0002613743713988397, + "loss": 7.5231, + "step": 1372300 + }, + { + "epoch": 5.59085599288718, + "grad_norm": 12.0966157913208, + "learning_rate": 0.0002611574058829198, + "loss": 7.554, + "step": 1372400 + }, + { + "epoch": 5.5912633709105615, + "grad_norm": 19.52140235900879, + "learning_rate": 0.0002609405290780991, + "loss": 7.5323, + "step": 1372500 + }, + { + "epoch": 5.591670748933943, + "grad_norm": 7.564475059509277, + "learning_rate": 0.00026072374099295057, + "loss": 7.5306, + "step": 1372600 + }, + { + "epoch": 5.592078126957324, + "grad_norm": 10.323042869567871, + "learning_rate": 0.00026050704163604627, + "loss": 7.5441, + "step": 1372700 + }, + { + "epoch": 5.592485504980705, + "grad_norm": 20.985734939575195, + "learning_rate": 0.0002602904310159531, + "loss": 7.5266, + "step": 1372800 + }, + { + "epoch": 5.592892883004087, + "grad_norm": 6.601797580718994, + "learning_rate": 0.00026007390914123434, + "loss": 7.5029, + "step": 1372900 + }, + { + "epoch": 5.593300261027468, + "grad_norm": 3.467031955718994, + "learning_rate": 0.00025985747602045114, + "loss": 7.5091, + "step": 1373000 + }, + { + "epoch": 5.593300261027468, + "eval_MaskedAccuracy": 0.51330253407165, + "eval_loss": 1.5910897254943848, + "eval_runtime": 180.8446, + "eval_samples_per_second": 350.998, + "eval_steps_per_second": 1.371, + "step": 1373000 + }, + { + "epoch": 5.59370763905085, + "grad_norm": 11.178659439086914, + "learning_rate": 0.0002596411316621598, + "loss": 7.53, + "step": 1373100 + }, + { + "epoch": 5.594115017074231, + "grad_norm": 4.633257865905762, + "learning_rate": 0.00025942487607491353, + "loss": 7.5179, + "step": 1373200 + }, + { + "epoch": 5.594522395097613, + "grad_norm": 23.054452896118164, + "learning_rate": 0.0002592087092672618, + "loss": 7.5005, + "step": 1373300 + }, + { + "epoch": 5.594929773120994, + "grad_norm": 9.557489395141602, + "learning_rate": 0.000258992631247751, + "loss": 7.5183, + "step": 1373400 + }, + { + "epoch": 5.595337151144376, + "grad_norm": 16.355045318603516, + "learning_rate": 0.00025877664202492433, + "loss": 7.5329, + "step": 1373500 + }, + { + "epoch": 5.595744529167757, + "grad_norm": 17.38948631286621, + "learning_rate": 0.00025856074160732017, + "loss": 7.4872, + "step": 1373600 + }, + { + "epoch": 5.596151907191139, + "grad_norm": 8.638805389404297, + "learning_rate": 0.00025834493000347487, + "loss": 7.5224, + "step": 1373700 + }, + { + "epoch": 5.5965592852145205, + "grad_norm": 12.201508522033691, + "learning_rate": 0.0002581292072219205, + "loss": 7.5543, + "step": 1373800 + }, + { + "epoch": 5.596966663237902, + "grad_norm": 4.477809906005859, + "learning_rate": 0.0002579135732711859, + "loss": 7.5445, + "step": 1373900 + }, + { + "epoch": 5.597374041261283, + "grad_norm": 3.688843011856079, + "learning_rate": 0.00025769802815979595, + "loss": 7.5382, + "step": 1374000 + }, + { + "epoch": 5.597374041261283, + "eval_MaskedAccuracy": 0.5132077913466938, + "eval_loss": 1.5854042768478394, + "eval_runtime": 157.7601, + "eval_samples_per_second": 402.358, + "eval_steps_per_second": 1.572, + "step": 1374000 + }, + { + "epoch": 5.597781419284664, + "grad_norm": 5.155825138092041, + "learning_rate": 0.00025748257189627243, + "loss": 7.5234, + "step": 1374100 + }, + { + "epoch": 5.598188797308046, + "grad_norm": 17.257415771484375, + "learning_rate": 0.00025726720448913324, + "loss": 7.508, + "step": 1374200 + }, + { + "epoch": 5.598596175331427, + "grad_norm": 6.936742305755615, + "learning_rate": 0.0002570519259468934, + "loss": 7.5381, + "step": 1374300 + }, + { + "epoch": 5.599003553354809, + "grad_norm": 7.681789398193359, + "learning_rate": 0.0002568367362780638, + "loss": 7.5254, + "step": 1374400 + }, + { + "epoch": 5.59941093137819, + "grad_norm": 10.017396926879883, + "learning_rate": 0.00025662163549115146, + "loss": 7.5129, + "step": 1374500 + }, + { + "epoch": 5.599818309401572, + "grad_norm": 3.6523165702819824, + "learning_rate": 0.0002564066235946601, + "loss": 7.5331, + "step": 1374600 + }, + { + "epoch": 5.600225687424953, + "grad_norm": 12.0110502243042, + "learning_rate": 0.0002561917005970929, + "loss": 7.5079, + "step": 1374700 + }, + { + "epoch": 5.600633065448335, + "grad_norm": 2.950870990753174, + "learning_rate": 0.0002559768665069448, + "loss": 7.5354, + "step": 1374800 + }, + { + "epoch": 5.6010404434717165, + "grad_norm": 9.420726776123047, + "learning_rate": 0.0002557621213327099, + "loss": 7.5336, + "step": 1374900 + }, + { + "epoch": 5.601447821495098, + "grad_norm": 3.371947765350342, + "learning_rate": 0.00025554746508287847, + "loss": 7.5341, + "step": 1375000 + }, + { + "epoch": 5.601447821495098, + "eval_MaskedAccuracy": 0.5132219491246219, + "eval_loss": 1.587613821029663, + "eval_runtime": 157.1488, + "eval_samples_per_second": 403.923, + "eval_steps_per_second": 1.578, + "step": 1375000 + }, + { + "epoch": 5.6018551995184795, + "grad_norm": 7.334743022918701, + "learning_rate": 0.00025533289776593645, + "loss": 7.5165, + "step": 1375100 + }, + { + "epoch": 5.602262577541861, + "grad_norm": 12.500086784362793, + "learning_rate": 0.0002551184193903668, + "loss": 7.4944, + "step": 1375200 + }, + { + "epoch": 5.602669955565242, + "grad_norm": 10.016807556152344, + "learning_rate": 0.0002549040299646498, + "loss": 7.5259, + "step": 1375300 + }, + { + "epoch": 5.603077333588623, + "grad_norm": 5.238979339599609, + "learning_rate": 0.00025468972949726045, + "loss": 7.5177, + "step": 1375400 + }, + { + "epoch": 5.603484711612005, + "grad_norm": 2.421661376953125, + "learning_rate": 0.00025447551799667186, + "loss": 7.549, + "step": 1375500 + }, + { + "epoch": 5.603892089635386, + "grad_norm": 10.061813354492188, + "learning_rate": 0.0002542613954713525, + "loss": 7.5242, + "step": 1375600 + }, + { + "epoch": 5.604299467658768, + "grad_norm": 7.082102298736572, + "learning_rate": 0.0002540473619297686, + "loss": 7.5315, + "step": 1375700 + }, + { + "epoch": 5.604706845682149, + "grad_norm": 4.16160774230957, + "learning_rate": 0.0002538334173803819, + "loss": 7.515, + "step": 1375800 + }, + { + "epoch": 5.605114223705531, + "grad_norm": 3.8698904514312744, + "learning_rate": 0.0002536195618316509, + "loss": 7.5255, + "step": 1375900 + }, + { + "epoch": 5.6055216017289125, + "grad_norm": 3.6110451221466064, + "learning_rate": 0.0002534057952920291, + "loss": 7.5211, + "step": 1376000 + }, + { + "epoch": 5.6055216017289125, + "eval_MaskedAccuracy": 0.5130806544962152, + "eval_loss": 1.5900852680206299, + "eval_runtime": 433.231, + "eval_samples_per_second": 146.518, + "eval_steps_per_second": 0.572, + "step": 1376000 + }, + { + "epoch": 5.605928979752294, + "grad_norm": 9.983222007751465, + "learning_rate": 0.0002531921177699691, + "loss": 7.538, + "step": 1376100 + }, + { + "epoch": 5.6063363577756755, + "grad_norm": 2.765918016433716, + "learning_rate": 0.0002529785292739179, + "loss": 7.5385, + "step": 1376200 + }, + { + "epoch": 5.606743735799057, + "grad_norm": 15.373408317565918, + "learning_rate": 0.0002527650298123204, + "loss": 7.5486, + "step": 1376300 + }, + { + "epoch": 5.607151113822438, + "grad_norm": 5.65013313293457, + "learning_rate": 0.00025255161939361686, + "loss": 7.514, + "step": 1376400 + }, + { + "epoch": 5.60755849184582, + "grad_norm": 12.013772964477539, + "learning_rate": 0.00025233829802624394, + "loss": 7.5298, + "step": 1376500 + }, + { + "epoch": 5.607965869869201, + "grad_norm": 4.139561176300049, + "learning_rate": 0.0002521250657186372, + "loss": 7.5365, + "step": 1376600 + }, + { + "epoch": 5.608373247892582, + "grad_norm": 3.9420621395111084, + "learning_rate": 0.00025191192247922594, + "loss": 7.5278, + "step": 1376700 + }, + { + "epoch": 5.608780625915964, + "grad_norm": 4.168811798095703, + "learning_rate": 0.0002516988683164364, + "loss": 7.5323, + "step": 1376800 + }, + { + "epoch": 5.609188003939345, + "grad_norm": 3.7715203762054443, + "learning_rate": 0.00025148590323869214, + "loss": 7.5225, + "step": 1376900 + }, + { + "epoch": 5.609595381962727, + "grad_norm": 8.349024772644043, + "learning_rate": 0.00025127302725441313, + "loss": 7.5179, + "step": 1377000 + }, + { + "epoch": 5.609595381962727, + "eval_MaskedAccuracy": 0.513069277780445, + "eval_loss": 1.5956621170043945, + "eval_runtime": 193.6876, + "eval_samples_per_second": 327.724, + "eval_steps_per_second": 1.28, + "step": 1377000 + }, + { + "epoch": 5.610002759986108, + "grad_norm": 4.570725917816162, + "learning_rate": 0.0002510602403720149, + "loss": 7.5396, + "step": 1377100 + }, + { + "epoch": 5.61041013800949, + "grad_norm": 3.7946763038635254, + "learning_rate": 0.00025084754259991005, + "loss": 7.5329, + "step": 1377200 + }, + { + "epoch": 5.6108175160328715, + "grad_norm": 30.738468170166016, + "learning_rate": 0.000250634933946508, + "loss": 7.5269, + "step": 1377300 + }, + { + "epoch": 5.611224894056253, + "grad_norm": 7.892461776733398, + "learning_rate": 0.0002504224144202143, + "loss": 7.5212, + "step": 1377400 + }, + { + "epoch": 5.6116322720796346, + "grad_norm": 17.25091552734375, + "learning_rate": 0.00025020998402943095, + "loss": 7.5222, + "step": 1377500 + }, + { + "epoch": 5.612039650103016, + "grad_norm": 14.010287284851074, + "learning_rate": 0.0002499976427825564, + "loss": 7.5463, + "step": 1377600 + }, + { + "epoch": 5.612447028126397, + "grad_norm": 15.319863319396973, + "learning_rate": 0.00024978539068798596, + "loss": 7.5458, + "step": 1377700 + }, + { + "epoch": 5.612854406149778, + "grad_norm": 6.391556262969971, + "learning_rate": 0.0002495732277541104, + "loss": 7.5139, + "step": 1377800 + }, + { + "epoch": 5.61326178417316, + "grad_norm": 11.232641220092773, + "learning_rate": 0.00024936115398931764, + "loss": 7.5126, + "step": 1377900 + }, + { + "epoch": 5.613669162196541, + "grad_norm": 13.635534286499023, + "learning_rate": 0.0002491491694019927, + "loss": 7.5191, + "step": 1378000 + }, + { + "epoch": 5.613669162196541, + "eval_MaskedAccuracy": 0.5130278719767443, + "eval_loss": 1.5895746946334839, + "eval_runtime": 189.5981, + "eval_samples_per_second": 334.792, + "eval_steps_per_second": 1.308, + "step": 1378000 + }, + { + "epoch": 5.614076540219923, + "grad_norm": 11.969842910766602, + "learning_rate": 0.00024893727400051603, + "loss": 7.5269, + "step": 1378100 + }, + { + "epoch": 5.614483918243304, + "grad_norm": 14.331548690795898, + "learning_rate": 0.00024872546779326516, + "loss": 7.5179, + "step": 1378200 + }, + { + "epoch": 5.614891296266686, + "grad_norm": 17.87909698486328, + "learning_rate": 0.00024851375078861445, + "loss": 7.5224, + "step": 1378300 + }, + { + "epoch": 5.6152986742900675, + "grad_norm": 7.2406907081604, + "learning_rate": 0.00024830212299493324, + "loss": 7.4959, + "step": 1378400 + }, + { + "epoch": 5.615706052313449, + "grad_norm": 18.738910675048828, + "learning_rate": 0.0002480905844205876, + "loss": 7.5388, + "step": 1378500 + }, + { + "epoch": 5.6161134303368305, + "grad_norm": 7.424437046051025, + "learning_rate": 0.00024787913507394357, + "loss": 7.5473, + "step": 1378600 + }, + { + "epoch": 5.616520808360212, + "grad_norm": 9.2700777053833, + "learning_rate": 0.00024766777496335876, + "loss": 7.5144, + "step": 1378700 + }, + { + "epoch": 5.616928186383594, + "grad_norm": 9.531607627868652, + "learning_rate": 0.0002474565040971902, + "loss": 7.5488, + "step": 1378800 + }, + { + "epoch": 5.617335564406975, + "grad_norm": 3.692396879196167, + "learning_rate": 0.00024724532248379005, + "loss": 7.5224, + "step": 1378900 + }, + { + "epoch": 5.617742942430356, + "grad_norm": 4.1397271156311035, + "learning_rate": 0.00024703423013150796, + "loss": 7.5122, + "step": 1379000 + }, + { + "epoch": 5.617742942430356, + "eval_MaskedAccuracy": 0.5126794755519473, + "eval_loss": 1.5880870819091797, + "eval_runtime": 165.4931, + "eval_samples_per_second": 383.557, + "eval_steps_per_second": 1.499, + "step": 1379000 + }, + { + "epoch": 5.618150320453737, + "grad_norm": 7.830399990081787, + "learning_rate": 0.00024682322704868917, + "loss": 7.5619, + "step": 1379100 + }, + { + "epoch": 5.618557698477119, + "grad_norm": 6.649756908416748, + "learning_rate": 0.00024661231324367536, + "loss": 7.545, + "step": 1379200 + }, + { + "epoch": 5.6189650765005, + "grad_norm": 8.903409957885742, + "learning_rate": 0.00024640148872480557, + "loss": 7.5419, + "step": 1379300 + }, + { + "epoch": 5.619372454523882, + "grad_norm": 19.530412673950195, + "learning_rate": 0.00024619075350041487, + "loss": 7.5328, + "step": 1379400 + }, + { + "epoch": 5.619779832547263, + "grad_norm": 3.8053641319274902, + "learning_rate": 0.0002459801075788348, + "loss": 7.5338, + "step": 1379500 + }, + { + "epoch": 5.620187210570645, + "grad_norm": 2.8859376907348633, + "learning_rate": 0.000245769550968393, + "loss": 7.4906, + "step": 1379600 + }, + { + "epoch": 5.6205945885940265, + "grad_norm": 7.343949317932129, + "learning_rate": 0.00024555908367741413, + "loss": 7.513, + "step": 1379700 + }, + { + "epoch": 5.621001966617408, + "grad_norm": 9.09073257446289, + "learning_rate": 0.000245348705714219, + "loss": 7.517, + "step": 1379800 + }, + { + "epoch": 5.62140934464079, + "grad_norm": 9.085687637329102, + "learning_rate": 0.00024513841708712514, + "loss": 7.5531, + "step": 1379900 + }, + { + "epoch": 5.621816722664171, + "grad_norm": 6.09578800201416, + "learning_rate": 0.00024492821780444613, + "loss": 7.5081, + "step": 1380000 + }, + { + "epoch": 5.621816722664171, + "eval_MaskedAccuracy": 0.5126112671805607, + "eval_loss": 1.5826091766357422, + "eval_runtime": 156.9029, + "eval_samples_per_second": 404.556, + "eval_steps_per_second": 1.581, + "step": 1380000 + }, + { + "epoch": 5.622224100687553, + "grad_norm": 5.202302932739258, + "learning_rate": 0.0002447181078744922, + "loss": 7.5039, + "step": 1380100 + }, + { + "epoch": 5.622631478710934, + "grad_norm": 6.4186482429504395, + "learning_rate": 0.0002445080873055705, + "loss": 7.5086, + "step": 1380200 + }, + { + "epoch": 5.623038856734315, + "grad_norm": 6.249260425567627, + "learning_rate": 0.0002442981561059843, + "loss": 7.5084, + "step": 1380300 + }, + { + "epoch": 5.623446234757696, + "grad_norm": 25.378747940063477, + "learning_rate": 0.00024408831428403244, + "loss": 7.4975, + "step": 1380400 + }, + { + "epoch": 5.623853612781078, + "grad_norm": 6.305699825286865, + "learning_rate": 0.00024387856184801042, + "loss": 7.5652, + "step": 1380500 + }, + { + "epoch": 5.624260990804459, + "grad_norm": 16.556028366088867, + "learning_rate": 0.0002436688988062125, + "loss": 7.5275, + "step": 1380600 + }, + { + "epoch": 5.624668368827841, + "grad_norm": 8.81778335571289, + "learning_rate": 0.00024345932516692744, + "loss": 7.5607, + "step": 1380700 + }, + { + "epoch": 5.6250757468512225, + "grad_norm": 16.383466720581055, + "learning_rate": 0.00024324984093844108, + "loss": 7.5306, + "step": 1380800 + }, + { + "epoch": 5.625483124874604, + "grad_norm": 16.238239288330078, + "learning_rate": 0.00024304044612903487, + "loss": 7.5024, + "step": 1380900 + }, + { + "epoch": 5.6258905028979855, + "grad_norm": 9.679112434387207, + "learning_rate": 0.00024283114074698773, + "loss": 7.5448, + "step": 1381000 + }, + { + "epoch": 5.6258905028979855, + "eval_MaskedAccuracy": 0.5131659569757718, + "eval_loss": 1.5812820196151733, + "eval_runtime": 158.3051, + "eval_samples_per_second": 400.973, + "eval_steps_per_second": 1.567, + "step": 1381000 + }, + { + "epoch": 5.626297880921367, + "grad_norm": 7.569660186767578, + "learning_rate": 0.0002426219248005747, + "loss": 7.5287, + "step": 1381100 + }, + { + "epoch": 5.626705258944749, + "grad_norm": 24.3963680267334, + "learning_rate": 0.00024241279829806685, + "loss": 7.5214, + "step": 1381200 + }, + { + "epoch": 5.62711263696813, + "grad_norm": 14.51697063446045, + "learning_rate": 0.00024220376124773225, + "loss": 7.5592, + "step": 1381300 + }, + { + "epoch": 5.627520014991511, + "grad_norm": 6.675385475158691, + "learning_rate": 0.00024199481365783535, + "loss": 7.5481, + "step": 1381400 + }, + { + "epoch": 5.627927393014893, + "grad_norm": 7.06484842300415, + "learning_rate": 0.00024178595553663704, + "loss": 7.4988, + "step": 1381500 + }, + { + "epoch": 5.628334771038274, + "grad_norm": 8.65516185760498, + "learning_rate": 0.0002415771868923942, + "loss": 7.5275, + "step": 1381600 + }, + { + "epoch": 5.628742149061655, + "grad_norm": 10.394768714904785, + "learning_rate": 0.0002413685077333604, + "loss": 7.5179, + "step": 1381700 + }, + { + "epoch": 5.629149527085037, + "grad_norm": 24.104524612426758, + "learning_rate": 0.00024115991806778638, + "loss": 7.5205, + "step": 1381800 + }, + { + "epoch": 5.629556905108418, + "grad_norm": 19.68671417236328, + "learning_rate": 0.00024095141790391903, + "loss": 7.5044, + "step": 1381900 + }, + { + "epoch": 5.6299642831318, + "grad_norm": 8.048469543457031, + "learning_rate": 0.00024074300725000108, + "loss": 7.514, + "step": 1382000 + }, + { + "epoch": 5.6299642831318, + "eval_MaskedAccuracy": 0.5139233389983908, + "eval_loss": 1.5746879577636719, + "eval_runtime": 157.8807, + "eval_samples_per_second": 402.05, + "eval_steps_per_second": 1.571, + "step": 1382000 + }, + { + "epoch": 5.6303716611551815, + "grad_norm": 3.5602033138275146, + "learning_rate": 0.00024053468611427194, + "loss": 7.5481, + "step": 1382100 + }, + { + "epoch": 5.630779039178563, + "grad_norm": 5.874471664428711, + "learning_rate": 0.00024032645450496788, + "loss": 7.5237, + "step": 1382200 + }, + { + "epoch": 5.631186417201945, + "grad_norm": 17.948442459106445, + "learning_rate": 0.00024011831243032125, + "loss": 7.5058, + "step": 1382300 + }, + { + "epoch": 5.631593795225326, + "grad_norm": 3.3917994499206543, + "learning_rate": 0.0002399102598985611, + "loss": 7.497, + "step": 1382400 + }, + { + "epoch": 5.632001173248708, + "grad_norm": 4.5032830238342285, + "learning_rate": 0.00023970229691791284, + "loss": 7.5504, + "step": 1382500 + }, + { + "epoch": 5.632408551272089, + "grad_norm": 13.444269180297852, + "learning_rate": 0.00023949442349659885, + "loss": 7.5114, + "step": 1382600 + }, + { + "epoch": 5.63281592929547, + "grad_norm": 22.66585922241211, + "learning_rate": 0.00023928663964283714, + "loss": 7.534, + "step": 1382700 + }, + { + "epoch": 5.633223307318851, + "grad_norm": 20.383052825927734, + "learning_rate": 0.00023907894536484236, + "loss": 7.548, + "step": 1382800 + }, + { + "epoch": 5.633630685342233, + "grad_norm": 2.88193416595459, + "learning_rate": 0.00023887134067082606, + "loss": 7.5167, + "step": 1382900 + }, + { + "epoch": 5.634038063365614, + "grad_norm": 13.426657676696777, + "learning_rate": 0.00023866382556899545, + "loss": 7.5217, + "step": 1383000 + }, + { + "epoch": 5.634038063365614, + "eval_MaskedAccuracy": 0.5131976539834424, + "eval_loss": 1.5839974880218506, + "eval_runtime": 445.6287, + "eval_samples_per_second": 142.441, + "eval_steps_per_second": 0.557, + "step": 1383000 + }, + { + "epoch": 5.634445441388996, + "grad_norm": 2.9683921337127686, + "learning_rate": 0.00023845640006755502, + "loss": 7.52, + "step": 1383100 + }, + { + "epoch": 5.6348528194123775, + "grad_norm": 13.596526145935059, + "learning_rate": 0.0002382490641747058, + "loss": 7.5181, + "step": 1383200 + }, + { + "epoch": 5.635260197435759, + "grad_norm": 17.86842155456543, + "learning_rate": 0.00023804181789864416, + "loss": 7.5123, + "step": 1383300 + }, + { + "epoch": 5.6356675754591405, + "grad_norm": 3.347900867462158, + "learning_rate": 0.00023783466124756416, + "loss": 7.5022, + "step": 1383400 + }, + { + "epoch": 5.636074953482522, + "grad_norm": 13.34920883178711, + "learning_rate": 0.00023762759422965576, + "loss": 7.4968, + "step": 1383500 + }, + { + "epoch": 5.636482331505904, + "grad_norm": 3.989408493041992, + "learning_rate": 0.00023742061685310518, + "loss": 7.5358, + "step": 1383600 + }, + { + "epoch": 5.636889709529285, + "grad_norm": 7.5562028884887695, + "learning_rate": 0.0002372137291260952, + "loss": 7.5279, + "step": 1383700 + }, + { + "epoch": 5.637297087552667, + "grad_norm": 26.748794555664062, + "learning_rate": 0.00023700693105680555, + "loss": 7.5495, + "step": 1383800 + }, + { + "epoch": 5.637704465576048, + "grad_norm": 20.54895782470703, + "learning_rate": 0.00023680022265341207, + "loss": 7.5356, + "step": 1383900 + }, + { + "epoch": 5.638111843599429, + "grad_norm": 13.316396713256836, + "learning_rate": 0.00023659360392408734, + "loss": 7.5386, + "step": 1384000 + }, + { + "epoch": 5.638111843599429, + "eval_MaskedAccuracy": 0.5128924797401614, + "eval_loss": 1.5910016298294067, + "eval_runtime": 154.0194, + "eval_samples_per_second": 412.13, + "eval_steps_per_second": 1.61, + "step": 1384000 + }, + { + "epoch": 5.63851922162281, + "grad_norm": 15.915701866149902, + "learning_rate": 0.0002363870748769995, + "loss": 7.5486, + "step": 1384100 + }, + { + "epoch": 5.638926599646192, + "grad_norm": 9.00930404663086, + "learning_rate": 0.0002361806355203137, + "loss": 7.5278, + "step": 1384200 + }, + { + "epoch": 5.6393339776695734, + "grad_norm": 3.5040266513824463, + "learning_rate": 0.00023597428586219238, + "loss": 7.4803, + "step": 1384300 + }, + { + "epoch": 5.639741355692955, + "grad_norm": 15.426980972290039, + "learning_rate": 0.00023576802591079282, + "loss": 7.5592, + "step": 1384400 + }, + { + "epoch": 5.6401487337163365, + "grad_norm": 3.2443149089813232, + "learning_rate": 0.00023556185567426936, + "loss": 7.5452, + "step": 1384500 + }, + { + "epoch": 5.640556111739718, + "grad_norm": 12.899211883544922, + "learning_rate": 0.00023535577516077416, + "loss": 7.5314, + "step": 1384600 + }, + { + "epoch": 5.6409634897631, + "grad_norm": 18.656158447265625, + "learning_rate": 0.00023514978437845372, + "loss": 7.5093, + "step": 1384700 + }, + { + "epoch": 5.641370867786481, + "grad_norm": 7.391315937042236, + "learning_rate": 0.0002349438833354528, + "loss": 7.5285, + "step": 1384800 + }, + { + "epoch": 5.641778245809863, + "grad_norm": 20.103830337524414, + "learning_rate": 0.00023473807203991124, + "loss": 7.5072, + "step": 1384900 + }, + { + "epoch": 5.642185623833244, + "grad_norm": 4.780278205871582, + "learning_rate": 0.0002345323504999661, + "loss": 7.5332, + "step": 1385000 + }, + { + "epoch": 5.642185623833244, + "eval_MaskedAccuracy": 0.5127545823469942, + "eval_loss": 1.5882014036178589, + "eval_runtime": 166.3087, + "eval_samples_per_second": 381.676, + "eval_steps_per_second": 1.491, + "step": 1385000 + }, + { + "epoch": 5.642593001856626, + "grad_norm": 13.922639846801758, + "learning_rate": 0.00023432671872374995, + "loss": 7.5407, + "step": 1385100 + }, + { + "epoch": 5.643000379880007, + "grad_norm": 5.8329010009765625, + "learning_rate": 0.00023412117671939335, + "loss": 7.5332, + "step": 1385200 + }, + { + "epoch": 5.643407757903388, + "grad_norm": 6.772289276123047, + "learning_rate": 0.00023391572449502207, + "loss": 7.5326, + "step": 1385300 + }, + { + "epoch": 5.643815135926769, + "grad_norm": 7.5275115966796875, + "learning_rate": 0.00023371036205875896, + "loss": 7.5267, + "step": 1385400 + }, + { + "epoch": 5.644222513950151, + "grad_norm": 8.430474281311035, + "learning_rate": 0.00023350508941872292, + "loss": 7.527, + "step": 1385500 + }, + { + "epoch": 5.6446298919735325, + "grad_norm": 5.418252944946289, + "learning_rate": 0.00023329990658302953, + "loss": 7.5444, + "step": 1385600 + }, + { + "epoch": 5.645037269996914, + "grad_norm": 9.187230110168457, + "learning_rate": 0.0002330948135597908, + "loss": 7.5594, + "step": 1385700 + }, + { + "epoch": 5.6454446480202956, + "grad_norm": 5.811594009399414, + "learning_rate": 0.00023288981035711476, + "loss": 7.5045, + "step": 1385800 + }, + { + "epoch": 5.645852026043677, + "grad_norm": 9.740527153015137, + "learning_rate": 0.00023268489698310686, + "loss": 7.5262, + "step": 1385900 + }, + { + "epoch": 5.646259404067059, + "grad_norm": 10.950613975524902, + "learning_rate": 0.00023248007344586785, + "loss": 7.5199, + "step": 1386000 + }, + { + "epoch": 5.646259404067059, + "eval_MaskedAccuracy": 0.5131919074783816, + "eval_loss": 1.582820177078247, + "eval_runtime": 154.2318, + "eval_samples_per_second": 411.562, + "eval_steps_per_second": 1.608, + "step": 1386000 + }, + { + "epoch": 5.64666678209044, + "grad_norm": 15.783064842224121, + "learning_rate": 0.00023227533975349614, + "loss": 7.5395, + "step": 1386100 + }, + { + "epoch": 5.647074160113822, + "grad_norm": 15.486825942993164, + "learning_rate": 0.00023207069591408527, + "loss": 7.5392, + "step": 1386200 + }, + { + "epoch": 5.647481538137203, + "grad_norm": 13.45315933227539, + "learning_rate": 0.00023186614193572652, + "loss": 7.527, + "step": 1386300 + }, + { + "epoch": 5.647888916160584, + "grad_norm": 22.64778709411621, + "learning_rate": 0.00023166167782650706, + "loss": 7.5207, + "step": 1386400 + }, + { + "epoch": 5.648296294183966, + "grad_norm": 5.565525531768799, + "learning_rate": 0.00023145730359450939, + "loss": 7.5063, + "step": 1386500 + }, + { + "epoch": 5.648703672207347, + "grad_norm": 11.855425834655762, + "learning_rate": 0.00023125301924781488, + "loss": 7.5222, + "step": 1386600 + }, + { + "epoch": 5.6491110502307285, + "grad_norm": 6.1692304611206055, + "learning_rate": 0.000231048824794499, + "loss": 7.5396, + "step": 1386700 + }, + { + "epoch": 5.64951842825411, + "grad_norm": 14.76517105102539, + "learning_rate": 0.00023084472024263516, + "loss": 7.5008, + "step": 1386800 + }, + { + "epoch": 5.6499258062774915, + "grad_norm": 8.494549751281738, + "learning_rate": 0.00023064070560029277, + "loss": 7.555, + "step": 1386900 + }, + { + "epoch": 5.650333184300873, + "grad_norm": 4.44266939163208, + "learning_rate": 0.00023043678087553737, + "loss": 7.5118, + "step": 1387000 + }, + { + "epoch": 5.650333184300873, + "eval_MaskedAccuracy": 0.5126597984216629, + "eval_loss": 1.5913329124450684, + "eval_runtime": 161.6774, + "eval_samples_per_second": 392.609, + "eval_steps_per_second": 1.534, + "step": 1387000 + }, + { + "epoch": 5.650740562324255, + "grad_norm": 3.5420591831207275, + "learning_rate": 0.0002302329460764315, + "loss": 7.5164, + "step": 1387100 + }, + { + "epoch": 5.651147940347636, + "grad_norm": 3.3939616680145264, + "learning_rate": 0.00023002920121103343, + "loss": 7.4813, + "step": 1387200 + }, + { + "epoch": 5.651555318371018, + "grad_norm": 17.22228240966797, + "learning_rate": 0.0002298255462873982, + "loss": 7.5138, + "step": 1387300 + }, + { + "epoch": 5.651962696394399, + "grad_norm": 7.133488655090332, + "learning_rate": 0.0002296219813135776, + "loss": 7.5223, + "step": 1387400 + }, + { + "epoch": 5.652370074417781, + "grad_norm": 10.650275230407715, + "learning_rate": 0.00022941850629761977, + "loss": 7.5269, + "step": 1387500 + }, + { + "epoch": 5.652777452441162, + "grad_norm": 9.090660095214844, + "learning_rate": 0.00022921512124756893, + "loss": 7.512, + "step": 1387600 + }, + { + "epoch": 5.653184830464543, + "grad_norm": 5.821437835693359, + "learning_rate": 0.0002290118261714661, + "loss": 7.5196, + "step": 1387700 + }, + { + "epoch": 5.653592208487924, + "grad_norm": 5.73036527633667, + "learning_rate": 0.00022880862107734873, + "loss": 7.5401, + "step": 1387800 + }, + { + "epoch": 5.653999586511306, + "grad_norm": 16.467464447021484, + "learning_rate": 0.0002286055059732504, + "loss": 7.518, + "step": 1387900 + }, + { + "epoch": 5.6544069645346875, + "grad_norm": 3.790493965148926, + "learning_rate": 0.00022840248086720166, + "loss": 7.53, + "step": 1388000 + }, + { + "epoch": 5.6544069645346875, + "eval_MaskedAccuracy": 0.5129876134496691, + "eval_loss": 1.5923545360565186, + "eval_runtime": 159.6839, + "eval_samples_per_second": 397.51, + "eval_steps_per_second": 1.553, + "step": 1388000 + }, + { + "epoch": 5.654814342558069, + "grad_norm": 7.764084339141846, + "learning_rate": 0.00022819954576722883, + "loss": 7.5189, + "step": 1388100 + }, + { + "epoch": 5.655221720581451, + "grad_norm": 5.0881476402282715, + "learning_rate": 0.00022799670068135514, + "loss": 7.5257, + "step": 1388200 + }, + { + "epoch": 5.655629098604832, + "grad_norm": 11.76297664642334, + "learning_rate": 0.0002277939456176003, + "loss": 7.5083, + "step": 1388300 + }, + { + "epoch": 5.656036476628214, + "grad_norm": 7.922427654266357, + "learning_rate": 0.00022759128058397906, + "loss": 7.5557, + "step": 1388400 + }, + { + "epoch": 5.656443854651595, + "grad_norm": 10.830290794372559, + "learning_rate": 0.00022738870558850616, + "loss": 7.5255, + "step": 1388500 + }, + { + "epoch": 5.656851232674977, + "grad_norm": 3.2147727012634277, + "learning_rate": 0.00022718622063918882, + "loss": 7.5072, + "step": 1388600 + }, + { + "epoch": 5.657258610698358, + "grad_norm": 14.751627922058105, + "learning_rate": 0.0002269838257440331, + "loss": 7.521, + "step": 1388700 + }, + { + "epoch": 5.65766598872174, + "grad_norm": 3.6608469486236572, + "learning_rate": 0.0002267815209110401, + "loss": 7.5121, + "step": 1388800 + }, + { + "epoch": 5.658073366745121, + "grad_norm": 6.23715353012085, + "learning_rate": 0.0002265793061482088, + "loss": 7.5392, + "step": 1388900 + }, + { + "epoch": 5.658480744768502, + "grad_norm": 12.295629501342773, + "learning_rate": 0.0002263771814635332, + "loss": 7.5265, + "step": 1389000 + }, + { + "epoch": 5.658480744768502, + "eval_MaskedAccuracy": 0.5132895588478967, + "eval_loss": 1.5833216905593872, + "eval_runtime": 162.9295, + "eval_samples_per_second": 389.592, + "eval_steps_per_second": 1.522, + "step": 1389000 + }, + { + "epoch": 5.6588881227918835, + "grad_norm": 20.044307708740234, + "learning_rate": 0.0002261751468650044, + "loss": 7.5064, + "step": 1389100 + }, + { + "epoch": 5.659295500815265, + "grad_norm": 7.284054756164551, + "learning_rate": 0.00022597320236061015, + "loss": 7.5474, + "step": 1389200 + }, + { + "epoch": 5.6597028788386465, + "grad_norm": 9.859037399291992, + "learning_rate": 0.00022577134795833398, + "loss": 7.5277, + "step": 1389300 + }, + { + "epoch": 5.660110256862028, + "grad_norm": 11.014314651489258, + "learning_rate": 0.00022556958366615716, + "loss": 7.5124, + "step": 1389400 + }, + { + "epoch": 5.66051763488541, + "grad_norm": 9.872495651245117, + "learning_rate": 0.00022536790949205572, + "loss": 7.5045, + "step": 1389500 + }, + { + "epoch": 5.660925012908791, + "grad_norm": 13.823678970336914, + "learning_rate": 0.00022516632544400283, + "loss": 7.5307, + "step": 1389600 + }, + { + "epoch": 5.661332390932173, + "grad_norm": 4.973620891571045, + "learning_rate": 0.00022496483152996877, + "loss": 7.5371, + "step": 1389700 + }, + { + "epoch": 5.661739768955554, + "grad_norm": 23.51268196105957, + "learning_rate": 0.00022476342775791958, + "loss": 7.5074, + "step": 1389800 + }, + { + "epoch": 5.662147146978936, + "grad_norm": 8.796059608459473, + "learning_rate": 0.00022456211413581742, + "loss": 7.5343, + "step": 1389900 + }, + { + "epoch": 5.662554525002317, + "grad_norm": 4.088870048522949, + "learning_rate": 0.00022436089067162134, + "loss": 7.5248, + "step": 1390000 + }, + { + "epoch": 5.662554525002317, + "eval_MaskedAccuracy": 0.5131125215970762, + "eval_loss": 1.5887385606765747, + "eval_runtime": 171.0249, + "eval_samples_per_second": 371.151, + "eval_steps_per_second": 1.45, + "step": 1390000 + }, + { + "epoch": 5.662961903025699, + "grad_norm": 4.518383979797363, + "learning_rate": 0.00022415975737328708, + "loss": 7.5212, + "step": 1390100 + }, + { + "epoch": 5.66336928104908, + "grad_norm": 7.672942638397217, + "learning_rate": 0.00022395871424876662, + "loss": 7.5618, + "step": 1390200 + }, + { + "epoch": 5.663776659072461, + "grad_norm": 7.145913124084473, + "learning_rate": 0.00022375776130600808, + "loss": 7.5308, + "step": 1390300 + }, + { + "epoch": 5.6641840370958425, + "grad_norm": 4.374745845794678, + "learning_rate": 0.0002235568985529557, + "loss": 7.5439, + "step": 1390400 + }, + { + "epoch": 5.664591415119224, + "grad_norm": 5.299991607666016, + "learning_rate": 0.0002233561259975516, + "loss": 7.4934, + "step": 1390500 + }, + { + "epoch": 5.664998793142606, + "grad_norm": 4.063174247741699, + "learning_rate": 0.00022315544364773294, + "loss": 7.5518, + "step": 1390600 + }, + { + "epoch": 5.665406171165987, + "grad_norm": 5.611782073974609, + "learning_rate": 0.00022295485151143358, + "loss": 7.5503, + "step": 1390700 + }, + { + "epoch": 5.665813549189369, + "grad_norm": 7.833299160003662, + "learning_rate": 0.00022275434959658427, + "loss": 7.5251, + "step": 1390800 + }, + { + "epoch": 5.66622092721275, + "grad_norm": 3.6235079765319824, + "learning_rate": 0.00022255393791111166, + "loss": 7.5212, + "step": 1390900 + }, + { + "epoch": 5.666628305236132, + "grad_norm": 17.92323875427246, + "learning_rate": 0.0002223536164629392, + "loss": 7.5201, + "step": 1391000 + }, + { + "epoch": 5.666628305236132, + "eval_MaskedAccuracy": 0.513330936714161, + "eval_loss": 1.5948268175125122, + "eval_runtime": 155.461, + "eval_samples_per_second": 408.308, + "eval_steps_per_second": 1.595, + "step": 1391000 + }, + { + "epoch": 5.667035683259513, + "grad_norm": 11.434247016906738, + "learning_rate": 0.00022215338525998705, + "loss": 7.5327, + "step": 1391100 + }, + { + "epoch": 5.667443061282895, + "grad_norm": 17.200960159301758, + "learning_rate": 0.00022195324431017064, + "loss": 7.5173, + "step": 1391200 + }, + { + "epoch": 5.667850439306276, + "grad_norm": 4.2296600341796875, + "learning_rate": 0.0002217531936214034, + "loss": 7.5445, + "step": 1391300 + }, + { + "epoch": 5.668257817329657, + "grad_norm": 19.114154815673828, + "learning_rate": 0.00022155323320159428, + "loss": 7.5339, + "step": 1391400 + }, + { + "epoch": 5.668665195353039, + "grad_norm": 3.479971408843994, + "learning_rate": 0.00022135336305864826, + "loss": 7.4977, + "step": 1391500 + }, + { + "epoch": 5.66907257337642, + "grad_norm": 18.817155838012695, + "learning_rate": 0.00022115358320046774, + "loss": 7.4731, + "step": 1391600 + }, + { + "epoch": 5.6694799513998015, + "grad_norm": 19.09714126586914, + "learning_rate": 0.00022095389363495108, + "loss": 7.5146, + "step": 1391700 + }, + { + "epoch": 5.669887329423183, + "grad_norm": 3.2621140480041504, + "learning_rate": 0.00022075429436999304, + "loss": 7.5305, + "step": 1391800 + }, + { + "epoch": 5.670294707446565, + "grad_norm": 3.465158462524414, + "learning_rate": 0.00022055478541348447, + "loss": 7.5481, + "step": 1391900 + }, + { + "epoch": 5.670702085469946, + "grad_norm": 4.687906265258789, + "learning_rate": 0.00022035536677331334, + "loss": 7.5082, + "step": 1392000 + }, + { + "epoch": 5.670702085469946, + "eval_MaskedAccuracy": 0.5138050215018688, + "eval_loss": 1.5798211097717285, + "eval_runtime": 162.4081, + "eval_samples_per_second": 390.843, + "eval_steps_per_second": 1.527, + "step": 1392000 + }, + { + "epoch": 5.671109463493328, + "grad_norm": 20.191240310668945, + "learning_rate": 0.00022015603845736378, + "loss": 7.5108, + "step": 1392100 + }, + { + "epoch": 5.671516841516709, + "grad_norm": 4.79811954498291, + "learning_rate": 0.00021995680047351605, + "loss": 7.5145, + "step": 1392200 + }, + { + "epoch": 5.671924219540091, + "grad_norm": 4.349195957183838, + "learning_rate": 0.0002197576528296474, + "loss": 7.5234, + "step": 1392300 + }, + { + "epoch": 5.672331597563472, + "grad_norm": 10.458327293395996, + "learning_rate": 0.00021955859553362997, + "loss": 7.5282, + "step": 1392400 + }, + { + "epoch": 5.672738975586854, + "grad_norm": 14.85787582397461, + "learning_rate": 0.0002193596285933358, + "loss": 7.5352, + "step": 1392500 + }, + { + "epoch": 5.673146353610235, + "grad_norm": 16.2830753326416, + "learning_rate": 0.00021916075201662972, + "loss": 7.5384, + "step": 1392600 + }, + { + "epoch": 5.673553731633616, + "grad_norm": 3.118499279022217, + "learning_rate": 0.00021896196581137482, + "loss": 7.5212, + "step": 1392700 + }, + { + "epoch": 5.6739611096569975, + "grad_norm": 11.2678804397583, + "learning_rate": 0.00021876326998542995, + "loss": 7.5187, + "step": 1392800 + }, + { + "epoch": 5.674368487680379, + "grad_norm": 4.308579444885254, + "learning_rate": 0.00021856466454665076, + "loss": 7.5367, + "step": 1392900 + }, + { + "epoch": 5.674775865703761, + "grad_norm": 3.5810940265655518, + "learning_rate": 0.00021836614950288875, + "loss": 7.5346, + "step": 1393000 + }, + { + "epoch": 5.674775865703761, + "eval_MaskedAccuracy": 0.5134186143691946, + "eval_loss": 1.5889108180999756, + "eval_runtime": 157.2879, + "eval_samples_per_second": 403.566, + "eval_steps_per_second": 1.577, + "step": 1393000 + }, + { + "epoch": 5.675183243727142, + "grad_norm": 24.302597045898438, + "learning_rate": 0.00021816772486199303, + "loss": 7.5473, + "step": 1393100 + }, + { + "epoch": 5.675590621750524, + "grad_norm": 30.983192443847656, + "learning_rate": 0.00021796939063180816, + "loss": 7.5455, + "step": 1393200 + }, + { + "epoch": 5.675997999773905, + "grad_norm": 16.238937377929688, + "learning_rate": 0.00021777114682017506, + "loss": 7.5368, + "step": 1393300 + }, + { + "epoch": 5.676405377797287, + "grad_norm": 33.0887565612793, + "learning_rate": 0.00021757299343493143, + "loss": 7.5063, + "step": 1393400 + }, + { + "epoch": 5.676812755820668, + "grad_norm": 16.486955642700195, + "learning_rate": 0.00021737493048391116, + "loss": 7.5209, + "step": 1393500 + }, + { + "epoch": 5.67722013384405, + "grad_norm": 10.487832069396973, + "learning_rate": 0.00021717695797494484, + "loss": 7.5412, + "step": 1393600 + }, + { + "epoch": 5.677627511867431, + "grad_norm": 16.049772262573242, + "learning_rate": 0.0002169790759158599, + "loss": 7.523, + "step": 1393700 + }, + { + "epoch": 5.678034889890813, + "grad_norm": 33.27151870727539, + "learning_rate": 0.00021678128431447906, + "loss": 7.5293, + "step": 1393800 + }, + { + "epoch": 5.678442267914194, + "grad_norm": 16.418254852294922, + "learning_rate": 0.00021658358317862232, + "loss": 7.4919, + "step": 1393900 + }, + { + "epoch": 5.678849645937575, + "grad_norm": 7.597526550292969, + "learning_rate": 0.00021638597251610604, + "loss": 7.517, + "step": 1394000 + }, + { + "epoch": 5.678849645937575, + "eval_MaskedAccuracy": 0.5131666246289983, + "eval_loss": 1.5822957754135132, + "eval_runtime": 175.9943, + "eval_samples_per_second": 360.671, + "eval_steps_per_second": 1.409, + "step": 1394000 + }, + { + "epoch": 5.6792570239609566, + "grad_norm": 15.38145637512207, + "learning_rate": 0.0002161884523347422, + "loss": 7.4891, + "step": 1394100 + }, + { + "epoch": 5.679664401984338, + "grad_norm": 18.405000686645508, + "learning_rate": 0.00021599102264234054, + "loss": 7.5428, + "step": 1394200 + }, + { + "epoch": 5.68007178000772, + "grad_norm": 14.354438781738281, + "learning_rate": 0.0002157936834467065, + "loss": 7.5265, + "step": 1394300 + }, + { + "epoch": 5.680479158031101, + "grad_norm": 2.9753637313842773, + "learning_rate": 0.00021559643475564105, + "loss": 7.4968, + "step": 1394400 + }, + { + "epoch": 5.680886536054483, + "grad_norm": 18.088481903076172, + "learning_rate": 0.00021539927657694357, + "loss": 7.5363, + "step": 1394500 + }, + { + "epoch": 5.681293914077864, + "grad_norm": 12.306862831115723, + "learning_rate": 0.0002152022089184082, + "loss": 7.5192, + "step": 1394600 + }, + { + "epoch": 5.681701292101246, + "grad_norm": 14.621867179870605, + "learning_rate": 0.00021500523178782634, + "loss": 7.4961, + "step": 1394700 + }, + { + "epoch": 5.682108670124627, + "grad_norm": 12.882950782775879, + "learning_rate": 0.00021480834519298526, + "loss": 7.5043, + "step": 1394800 + }, + { + "epoch": 5.682516048148009, + "grad_norm": 3.314741849899292, + "learning_rate": 0.0002146115491416694, + "loss": 7.5298, + "step": 1394900 + }, + { + "epoch": 5.68292342617139, + "grad_norm": 3.0651979446411133, + "learning_rate": 0.00021441484364165885, + "loss": 7.5249, + "step": 1395000 + }, + { + "epoch": 5.68292342617139, + "eval_MaskedAccuracy": 0.5132118971430525, + "eval_loss": 1.588690161705017, + "eval_runtime": 173.4457, + "eval_samples_per_second": 365.97, + "eval_steps_per_second": 1.43, + "step": 1395000 + }, + { + "epoch": 5.683330804194772, + "grad_norm": 4.461178302764893, + "learning_rate": 0.00021421822870073016, + "loss": 7.5057, + "step": 1395100 + }, + { + "epoch": 5.683738182218153, + "grad_norm": 9.283483505249023, + "learning_rate": 0.00021402170432665717, + "loss": 7.5257, + "step": 1395200 + }, + { + "epoch": 5.684145560241534, + "grad_norm": 18.940427780151367, + "learning_rate": 0.00021382527052720888, + "loss": 7.5487, + "step": 1395300 + }, + { + "epoch": 5.684552938264916, + "grad_norm": 4.460453510284424, + "learning_rate": 0.000213628927310152, + "loss": 7.5051, + "step": 1395400 + }, + { + "epoch": 5.684960316288297, + "grad_norm": 9.872069358825684, + "learning_rate": 0.00021343267468324837, + "loss": 7.5086, + "step": 1395500 + }, + { + "epoch": 5.685367694311679, + "grad_norm": 15.643628120422363, + "learning_rate": 0.00021323651265425773, + "loss": 7.5264, + "step": 1395600 + }, + { + "epoch": 5.68577507233506, + "grad_norm": 10.625282287597656, + "learning_rate": 0.00021304044123093455, + "loss": 7.5514, + "step": 1395700 + }, + { + "epoch": 5.686182450358442, + "grad_norm": 22.49959945678711, + "learning_rate": 0.00021284446042103128, + "loss": 7.5237, + "step": 1395800 + }, + { + "epoch": 5.686589828381823, + "grad_norm": 5.622125148773193, + "learning_rate": 0.00021264857023229572, + "loss": 7.5121, + "step": 1395900 + }, + { + "epoch": 5.686997206405205, + "grad_norm": 8.442076683044434, + "learning_rate": 0.00021245277067247258, + "loss": 7.5048, + "step": 1396000 + }, + { + "epoch": 5.686997206405205, + "eval_MaskedAccuracy": 0.5133545845147907, + "eval_loss": 1.5889184474945068, + "eval_runtime": 157.9874, + "eval_samples_per_second": 401.779, + "eval_steps_per_second": 1.57, + "step": 1396000 + }, + { + "epoch": 5.687404584428586, + "grad_norm": 24.410968780517578, + "learning_rate": 0.0002122570617493028, + "loss": 7.5229, + "step": 1396100 + }, + { + "epoch": 5.687811962451968, + "grad_norm": 4.419765472412109, + "learning_rate": 0.00021206144347052364, + "loss": 7.5385, + "step": 1396200 + }, + { + "epoch": 5.688219340475349, + "grad_norm": 18.640249252319336, + "learning_rate": 0.00021186591584386957, + "loss": 7.5353, + "step": 1396300 + }, + { + "epoch": 5.68862671849873, + "grad_norm": 12.499750137329102, + "learning_rate": 0.00021167047887706911, + "loss": 7.5206, + "step": 1396400 + }, + { + "epoch": 5.6890340965221124, + "grad_norm": 14.130266189575195, + "learning_rate": 0.00021147513257785102, + "loss": 7.5055, + "step": 1396500 + }, + { + "epoch": 5.689441474545493, + "grad_norm": 12.074944496154785, + "learning_rate": 0.0002112798769539376, + "loss": 7.5404, + "step": 1396600 + }, + { + "epoch": 5.689848852568875, + "grad_norm": 22.056108474731445, + "learning_rate": 0.0002110847120130487, + "loss": 7.5548, + "step": 1396700 + }, + { + "epoch": 5.690256230592256, + "grad_norm": 13.590144157409668, + "learning_rate": 0.00021088963776289945, + "loss": 7.4977, + "step": 1396800 + }, + { + "epoch": 5.690663608615638, + "grad_norm": 4.762839317321777, + "learning_rate": 0.00021069465421120264, + "loss": 7.5441, + "step": 1396900 + }, + { + "epoch": 5.691070986639019, + "grad_norm": 15.463266372680664, + "learning_rate": 0.00021049976136566724, + "loss": 7.541, + "step": 1397000 + }, + { + "epoch": 5.691070986639019, + "eval_MaskedAccuracy": 0.5128581112931987, + "eval_loss": 1.5882012844085693, + "eval_runtime": 165.056, + "eval_samples_per_second": 384.572, + "eval_steps_per_second": 1.503, + "step": 1397000 + }, + { + "epoch": 5.691478364662401, + "grad_norm": 6.944849014282227, + "learning_rate": 0.0002103049592339978, + "loss": 7.5231, + "step": 1397100 + }, + { + "epoch": 5.691885742685782, + "grad_norm": 5.5081586837768555, + "learning_rate": 0.0002101102478238965, + "loss": 7.5173, + "step": 1397200 + }, + { + "epoch": 5.692293120709164, + "grad_norm": 12.3670654296875, + "learning_rate": 0.00020991562714306082, + "loss": 7.5551, + "step": 1397300 + }, + { + "epoch": 5.692700498732545, + "grad_norm": 10.130189895629883, + "learning_rate": 0.0002097210971991854, + "loss": 7.5331, + "step": 1397400 + }, + { + "epoch": 5.693107876755927, + "grad_norm": 20.905229568481445, + "learning_rate": 0.00020952665799996094, + "loss": 7.5056, + "step": 1397500 + }, + { + "epoch": 5.693515254779308, + "grad_norm": 9.281001091003418, + "learning_rate": 0.00020933230955307489, + "loss": 7.54, + "step": 1397600 + }, + { + "epoch": 5.693922632802689, + "grad_norm": 3.8841283321380615, + "learning_rate": 0.00020913805186621034, + "loss": 7.5051, + "step": 1397700 + }, + { + "epoch": 5.694330010826071, + "grad_norm": 11.44265365600586, + "learning_rate": 0.00020894388494704813, + "loss": 7.5378, + "step": 1397800 + }, + { + "epoch": 5.694737388849452, + "grad_norm": 30.42966079711914, + "learning_rate": 0.00020874980880326453, + "loss": 7.5033, + "step": 1397900 + }, + { + "epoch": 5.695144766872834, + "grad_norm": 3.4537734985351562, + "learning_rate": 0.0002085558234425317, + "loss": 7.5083, + "step": 1398000 + }, + { + "epoch": 5.695144766872834, + "eval_MaskedAccuracy": 0.5137838148019644, + "eval_loss": 1.5854030847549438, + "eval_runtime": 193.3485, + "eval_samples_per_second": 328.298, + "eval_steps_per_second": 1.283, + "step": 1398000 + }, + { + "epoch": 5.695552144896215, + "grad_norm": 8.677281379699707, + "learning_rate": 0.00020836192887251987, + "loss": 7.5314, + "step": 1398100 + }, + { + "epoch": 5.695959522919597, + "grad_norm": 7.363255500793457, + "learning_rate": 0.00020816812510089387, + "loss": 7.4975, + "step": 1398200 + }, + { + "epoch": 5.696366900942978, + "grad_norm": 20.45794677734375, + "learning_rate": 0.000207974412135316, + "loss": 7.5032, + "step": 1398300 + }, + { + "epoch": 5.69677427896636, + "grad_norm": 22.967304229736328, + "learning_rate": 0.00020778078998344554, + "loss": 7.5389, + "step": 1398400 + }, + { + "epoch": 5.697181656989741, + "grad_norm": 15.457267761230469, + "learning_rate": 0.0002075872586529364, + "loss": 7.5257, + "step": 1398500 + }, + { + "epoch": 5.697589035013123, + "grad_norm": 12.36081600189209, + "learning_rate": 0.00020739381815144056, + "loss": 7.5254, + "step": 1398600 + }, + { + "epoch": 5.697996413036504, + "grad_norm": 24.141223907470703, + "learning_rate": 0.0002072004684866058, + "loss": 7.5318, + "step": 1398700 + }, + { + "epoch": 5.698403791059886, + "grad_norm": 16.874622344970703, + "learning_rate": 0.0002070072096660758, + "loss": 7.5378, + "step": 1398800 + }, + { + "epoch": 5.6988111690832675, + "grad_norm": 22.59729766845703, + "learning_rate": 0.00020681404169749123, + "loss": 7.521, + "step": 1398900 + }, + { + "epoch": 5.699218547106648, + "grad_norm": 14.58524227142334, + "learning_rate": 0.00020662096458848965, + "loss": 7.5406, + "step": 1399000 + }, + { + "epoch": 5.699218547106648, + "eval_MaskedAccuracy": 0.5133135149270909, + "eval_loss": 1.5805004835128784, + "eval_runtime": 156.5144, + "eval_samples_per_second": 405.56, + "eval_steps_per_second": 1.585, + "step": 1399000 + }, + { + "epoch": 5.69962592513003, + "grad_norm": 17.078554153442383, + "learning_rate": 0.00020642797834670342, + "loss": 7.52, + "step": 1399100 + }, + { + "epoch": 5.700033303153411, + "grad_norm": 18.19713020324707, + "learning_rate": 0.00020623508297976315, + "loss": 7.5297, + "step": 1399200 + }, + { + "epoch": 5.700440681176793, + "grad_norm": 13.093094825744629, + "learning_rate": 0.0002060422784952944, + "loss": 7.5375, + "step": 1399300 + }, + { + "epoch": 5.700848059200174, + "grad_norm": 21.07042121887207, + "learning_rate": 0.00020584956490091988, + "loss": 7.5094, + "step": 1399400 + }, + { + "epoch": 5.701255437223556, + "grad_norm": 19.204206466674805, + "learning_rate": 0.00020565694220425895, + "loss": 7.5414, + "step": 1399500 + }, + { + "epoch": 5.701662815246937, + "grad_norm": 7.8062424659729, + "learning_rate": 0.00020546441041292704, + "loss": 7.5067, + "step": 1399600 + }, + { + "epoch": 5.702070193270319, + "grad_norm": 3.5404536724090576, + "learning_rate": 0.00020527196953453562, + "loss": 7.5441, + "step": 1399700 + }, + { + "epoch": 5.7024775712937, + "grad_norm": 14.274609565734863, + "learning_rate": 0.0002050796195766931, + "loss": 7.5387, + "step": 1399800 + }, + { + "epoch": 5.702884949317082, + "grad_norm": 16.907634735107422, + "learning_rate": 0.00020488736054700426, + "loss": 7.4932, + "step": 1399900 + }, + { + "epoch": 5.703292327340463, + "grad_norm": 21.000276565551758, + "learning_rate": 0.0002046951924530697, + "loss": 7.5337, + "step": 1400000 + }, + { + "epoch": 5.703292327340463, + "eval_MaskedAccuracy": 0.5128022263675616, + "eval_loss": 1.5850491523742676, + "eval_runtime": 164.8747, + "eval_samples_per_second": 384.995, + "eval_steps_per_second": 1.504, + "step": 1400000 + }, + { + "epoch": 5.703699705363845, + "grad_norm": 2.6224594116210938, + "learning_rate": 0.00020450311530248696, + "loss": 7.5409, + "step": 1400100 + }, + { + "epoch": 5.7041070833872265, + "grad_norm": 13.088146209716797, + "learning_rate": 0.0002043111291028501, + "loss": 7.5092, + "step": 1400200 + }, + { + "epoch": 5.704514461410607, + "grad_norm": 10.771458625793457, + "learning_rate": 0.00020411923386174911, + "loss": 7.517, + "step": 1400300 + }, + { + "epoch": 5.704921839433989, + "grad_norm": 13.163960456848145, + "learning_rate": 0.00020392742958677085, + "loss": 7.5096, + "step": 1400400 + }, + { + "epoch": 5.70532921745737, + "grad_norm": 7.284150123596191, + "learning_rate": 0.00020373571628549817, + "loss": 7.4986, + "step": 1400500 + }, + { + "epoch": 5.705736595480752, + "grad_norm": 17.328292846679688, + "learning_rate": 0.00020354409396551118, + "loss": 7.516, + "step": 1400600 + }, + { + "epoch": 5.706143973504133, + "grad_norm": 5.250231742858887, + "learning_rate": 0.0002033525626343847, + "loss": 7.4923, + "step": 1400700 + }, + { + "epoch": 5.706551351527515, + "grad_norm": 2.9675095081329346, + "learning_rate": 0.00020316112229969195, + "loss": 7.5409, + "step": 1400800 + }, + { + "epoch": 5.706958729550896, + "grad_norm": 6.556509971618652, + "learning_rate": 0.00020296977296900087, + "loss": 7.5035, + "step": 1400900 + }, + { + "epoch": 5.707366107574278, + "grad_norm": 3.9646568298339844, + "learning_rate": 0.0002027785146498769, + "loss": 7.4983, + "step": 1401000 + }, + { + "epoch": 5.707366107574278, + "eval_MaskedAccuracy": 0.5134965311932747, + "eval_loss": 1.5908129215240479, + "eval_runtime": 174.5783, + "eval_samples_per_second": 363.596, + "eval_steps_per_second": 1.421, + "step": 1401000 + }, + { + "epoch": 5.707773485597659, + "grad_norm": 24.632753372192383, + "learning_rate": 0.00020258734734988145, + "loss": 7.5003, + "step": 1401100 + }, + { + "epoch": 5.708180863621041, + "grad_norm": 33.74378967285156, + "learning_rate": 0.0002023962710765723, + "loss": 7.5498, + "step": 1401200 + }, + { + "epoch": 5.7085882416444225, + "grad_norm": 5.798527717590332, + "learning_rate": 0.0002022052858375038, + "loss": 7.5268, + "step": 1401300 + }, + { + "epoch": 5.708995619667803, + "grad_norm": 27.04881477355957, + "learning_rate": 0.00020201439164022641, + "loss": 7.545, + "step": 1401400 + }, + { + "epoch": 5.7094029976911855, + "grad_norm": 12.537012100219727, + "learning_rate": 0.0002018235884922875, + "loss": 7.528, + "step": 1401500 + }, + { + "epoch": 5.709810375714566, + "grad_norm": 4.775035858154297, + "learning_rate": 0.00020163287640123008, + "loss": 7.5073, + "step": 1401600 + }, + { + "epoch": 5.710217753737948, + "grad_norm": 20.909391403198242, + "learning_rate": 0.00020144225537459468, + "loss": 7.5329, + "step": 1401700 + }, + { + "epoch": 5.710625131761329, + "grad_norm": 5.003806114196777, + "learning_rate": 0.00020125172541991693, + "loss": 7.5185, + "step": 1401800 + }, + { + "epoch": 5.711032509784711, + "grad_norm": 28.107206344604492, + "learning_rate": 0.00020106128654472983, + "loss": 7.5203, + "step": 1401900 + }, + { + "epoch": 5.711439887808092, + "grad_norm": 3.4706532955169678, + "learning_rate": 0.00020087093875656263, + "loss": 7.5077, + "step": 1402000 + }, + { + "epoch": 5.711439887808092, + "eval_MaskedAccuracy": 0.5129085953446926, + "eval_loss": 1.5857908725738525, + "eval_runtime": 169.7335, + "eval_samples_per_second": 373.974, + "eval_steps_per_second": 1.461, + "step": 1402000 + }, + { + "epoch": 5.711847265831474, + "grad_norm": 5.496380805969238, + "learning_rate": 0.00020068068206294015, + "loss": 7.5444, + "step": 1402100 + }, + { + "epoch": 5.712254643854855, + "grad_norm": 4.842715263366699, + "learning_rate": 0.00020049051647138503, + "loss": 7.5026, + "step": 1402200 + }, + { + "epoch": 5.712662021878237, + "grad_norm": 3.697951316833496, + "learning_rate": 0.00020030044198941484, + "loss": 7.5159, + "step": 1402300 + }, + { + "epoch": 5.713069399901618, + "grad_norm": 34.285362243652344, + "learning_rate": 0.00020011045862454473, + "loss": 7.4916, + "step": 1402400 + }, + { + "epoch": 5.713476777925, + "grad_norm": 22.446369171142578, + "learning_rate": 0.00019992056638428533, + "loss": 7.5062, + "step": 1402500 + }, + { + "epoch": 5.7138841559483815, + "grad_norm": 22.62911605834961, + "learning_rate": 0.0001997307652761449, + "loss": 7.4998, + "step": 1402600 + }, + { + "epoch": 5.714291533971762, + "grad_norm": 10.011300086975098, + "learning_rate": 0.00019954105530762653, + "loss": 7.5392, + "step": 1402700 + }, + { + "epoch": 5.714698911995144, + "grad_norm": 4.304636001586914, + "learning_rate": 0.00019935143648623096, + "loss": 7.516, + "step": 1402800 + }, + { + "epoch": 5.715106290018525, + "grad_norm": 5.8564043045043945, + "learning_rate": 0.00019916190881945424, + "loss": 7.5233, + "step": 1402900 + }, + { + "epoch": 5.715513668041907, + "grad_norm": 4.471909046173096, + "learning_rate": 0.00019897247231478999, + "loss": 7.5239, + "step": 1403000 + }, + { + "epoch": 5.715513668041907, + "eval_MaskedAccuracy": 0.5131129358615463, + "eval_loss": 1.5917563438415527, + "eval_runtime": 173.1445, + "eval_samples_per_second": 366.607, + "eval_steps_per_second": 1.432, + "step": 1403000 + }, + { + "epoch": 5.715921046065288, + "grad_norm": 31.28317642211914, + "learning_rate": 0.00019878312697972728, + "loss": 7.4928, + "step": 1403100 + }, + { + "epoch": 5.71632842408867, + "grad_norm": 3.0140678882598877, + "learning_rate": 0.00019859387282175216, + "loss": 7.5469, + "step": 1403200 + }, + { + "epoch": 5.716735802112051, + "grad_norm": 4.018800258636475, + "learning_rate": 0.00019840470984834659, + "loss": 7.5205, + "step": 1403300 + }, + { + "epoch": 5.717143180135433, + "grad_norm": 5.32080078125, + "learning_rate": 0.0001982156380669897, + "loss": 7.5117, + "step": 1403400 + }, + { + "epoch": 5.717550558158814, + "grad_norm": 16.230554580688477, + "learning_rate": 0.000198026657485156, + "loss": 7.5254, + "step": 1403500 + }, + { + "epoch": 5.717957936182196, + "grad_norm": 16.40062141418457, + "learning_rate": 0.0001978377681103172, + "loss": 7.5317, + "step": 1403600 + }, + { + "epoch": 5.7183653142055775, + "grad_norm": 13.31722640991211, + "learning_rate": 0.00019764896994994092, + "loss": 7.4973, + "step": 1403700 + }, + { + "epoch": 5.718772692228959, + "grad_norm": 4.894228458404541, + "learning_rate": 0.00019746026301149165, + "loss": 7.5494, + "step": 1403800 + }, + { + "epoch": 5.7191800702523405, + "grad_norm": 3.2217085361480713, + "learning_rate": 0.00019727164730242935, + "loss": 7.5501, + "step": 1403900 + }, + { + "epoch": 5.719587448275721, + "grad_norm": 5.319243431091309, + "learning_rate": 0.00019708312283021187, + "loss": 7.5266, + "step": 1404000 + }, + { + "epoch": 5.719587448275721, + "eval_MaskedAccuracy": 0.5128442257461067, + "eval_loss": 1.5894906520843506, + "eval_runtime": 160.973, + "eval_samples_per_second": 394.327, + "eval_steps_per_second": 1.541, + "step": 1404000 + }, + { + "epoch": 5.719994826299103, + "grad_norm": 9.104679107666016, + "learning_rate": 0.00019689468960229188, + "loss": 7.5095, + "step": 1404100 + }, + { + "epoch": 5.720402204322484, + "grad_norm": 5.5225138664245605, + "learning_rate": 0.0001967063476261195, + "loss": 7.5199, + "step": 1404200 + }, + { + "epoch": 5.720809582345866, + "grad_norm": 9.274515151977539, + "learning_rate": 0.0001965180969091406, + "loss": 7.5142, + "step": 1404300 + }, + { + "epoch": 5.721216960369247, + "grad_norm": 22.511634826660156, + "learning_rate": 0.00019632993745879792, + "loss": 7.5138, + "step": 1404400 + }, + { + "epoch": 5.721624338392629, + "grad_norm": 15.016294479370117, + "learning_rate": 0.0001961418692825308, + "loss": 7.5143, + "step": 1404500 + }, + { + "epoch": 5.72203171641601, + "grad_norm": 3.427358388900757, + "learning_rate": 0.00019595389238777434, + "loss": 7.5305, + "step": 1404600 + }, + { + "epoch": 5.722439094439392, + "grad_norm": 21.848268508911133, + "learning_rate": 0.00019576600678195975, + "loss": 7.5188, + "step": 1404700 + }, + { + "epoch": 5.7228464724627734, + "grad_norm": 23.56867790222168, + "learning_rate": 0.00019557821247251575, + "loss": 7.5326, + "step": 1404800 + }, + { + "epoch": 5.723253850486155, + "grad_norm": 18.3046932220459, + "learning_rate": 0.0001953905094668669, + "loss": 7.5557, + "step": 1404900 + }, + { + "epoch": 5.7236612285095365, + "grad_norm": 17.465869903564453, + "learning_rate": 0.00019520289777243339, + "loss": 7.539, + "step": 1405000 + }, + { + "epoch": 5.7236612285095365, + "eval_MaskedAccuracy": 0.513291535950472, + "eval_loss": 1.5922725200653076, + "eval_runtime": 157.6129, + "eval_samples_per_second": 402.733, + "eval_steps_per_second": 1.573, + "step": 1405000 + }, + { + "epoch": 5.724068606532918, + "grad_norm": 7.314059734344482, + "learning_rate": 0.00019501537739663327, + "loss": 7.5351, + "step": 1405100 + }, + { + "epoch": 5.7244759845563, + "grad_norm": 18.200645446777344, + "learning_rate": 0.00019482794834688, + "loss": 7.493, + "step": 1405200 + }, + { + "epoch": 5.72488336257968, + "grad_norm": 22.57877540588379, + "learning_rate": 0.00019464061063058336, + "loss": 7.5048, + "step": 1405300 + }, + { + "epoch": 5.725290740603062, + "grad_norm": 3.4958767890930176, + "learning_rate": 0.00019445336425515016, + "loss": 7.5469, + "step": 1405400 + }, + { + "epoch": 5.725698118626443, + "grad_norm": 23.43501091003418, + "learning_rate": 0.00019426620922798318, + "loss": 7.5286, + "step": 1405500 + }, + { + "epoch": 5.726105496649825, + "grad_norm": 4.3498334884643555, + "learning_rate": 0.00019407914555648158, + "loss": 7.499, + "step": 1405600 + }, + { + "epoch": 5.726512874673206, + "grad_norm": 6.705742359161377, + "learning_rate": 0.00019389217324804108, + "loss": 7.5264, + "step": 1405700 + }, + { + "epoch": 5.726920252696588, + "grad_norm": 15.036599159240723, + "learning_rate": 0.00019370529231005365, + "loss": 7.5237, + "step": 1405800 + }, + { + "epoch": 5.727327630719969, + "grad_norm": 10.892059326171875, + "learning_rate": 0.00019351850274990787, + "loss": 7.4968, + "step": 1405900 + }, + { + "epoch": 5.727735008743351, + "grad_norm": 7.121788501739502, + "learning_rate": 0.00019333180457498813, + "loss": 7.5085, + "step": 1406000 + }, + { + "epoch": 5.727735008743351, + "eval_MaskedAccuracy": 0.5135558206290997, + "eval_loss": 1.5852932929992676, + "eval_runtime": 158.0285, + "eval_samples_per_second": 401.675, + "eval_steps_per_second": 1.569, + "step": 1406000 + }, + { + "epoch": 5.7281423867667325, + "grad_norm": 10.672481536865234, + "learning_rate": 0.00019314519779267604, + "loss": 7.5342, + "step": 1406100 + }, + { + "epoch": 5.728549764790114, + "grad_norm": 22.060361862182617, + "learning_rate": 0.00019295868241034892, + "loss": 7.5069, + "step": 1406200 + }, + { + "epoch": 5.7289571428134956, + "grad_norm": 15.186068534851074, + "learning_rate": 0.00019277225843537975, + "loss": 7.505, + "step": 1406300 + }, + { + "epoch": 5.729364520836876, + "grad_norm": 8.485790252685547, + "learning_rate": 0.00019258592587514053, + "loss": 7.5102, + "step": 1406400 + }, + { + "epoch": 5.729771898860259, + "grad_norm": 12.340378761291504, + "learning_rate": 0.00019239968473699762, + "loss": 7.5189, + "step": 1406500 + }, + { + "epoch": 5.730179276883639, + "grad_norm": 16.11414909362793, + "learning_rate": 0.0001922135350283136, + "loss": 7.5215, + "step": 1406600 + }, + { + "epoch": 5.730586654907021, + "grad_norm": 24.906097412109375, + "learning_rate": 0.0001920274767564485, + "loss": 7.5037, + "step": 1406700 + }, + { + "epoch": 5.730994032930402, + "grad_norm": 5.577622413635254, + "learning_rate": 0.00019184150992875788, + "loss": 7.5285, + "step": 1406800 + }, + { + "epoch": 5.731401410953784, + "grad_norm": 15.005581855773926, + "learning_rate": 0.00019165563455259379, + "loss": 7.5308, + "step": 1406900 + }, + { + "epoch": 5.731808788977165, + "grad_norm": 11.045219421386719, + "learning_rate": 0.00019146985063530535, + "loss": 7.5216, + "step": 1407000 + }, + { + "epoch": 5.731808788977165, + "eval_MaskedAccuracy": 0.5137690953558148, + "eval_loss": 1.5858678817749023, + "eval_runtime": 154.7819, + "eval_samples_per_second": 410.1, + "eval_steps_per_second": 1.602, + "step": 1407000 + }, + { + "epoch": 5.732216167000547, + "grad_norm": 3.7264459133148193, + "learning_rate": 0.000191284158184237, + "loss": 7.5264, + "step": 1407100 + }, + { + "epoch": 5.7326235450239285, + "grad_norm": 9.076326370239258, + "learning_rate": 0.00019109855720673066, + "loss": 7.5453, + "step": 1407200 + }, + { + "epoch": 5.73303092304731, + "grad_norm": 9.742874145507812, + "learning_rate": 0.00019091304771012374, + "loss": 7.4892, + "step": 1407300 + }, + { + "epoch": 5.7334383010706915, + "grad_norm": 4.512001991271973, + "learning_rate": 0.000190727629701751, + "loss": 7.4888, + "step": 1407400 + }, + { + "epoch": 5.733845679094073, + "grad_norm": 14.355818748474121, + "learning_rate": 0.00019054230318894222, + "loss": 7.5074, + "step": 1407500 + }, + { + "epoch": 5.734253057117455, + "grad_norm": 12.80526351928711, + "learning_rate": 0.00019035706817902527, + "loss": 7.5143, + "step": 1407600 + }, + { + "epoch": 5.734660435140835, + "grad_norm": 4.367220401763916, + "learning_rate": 0.00019017192467932244, + "loss": 7.5077, + "step": 1407700 + }, + { + "epoch": 5.735067813164217, + "grad_norm": 9.353137016296387, + "learning_rate": 0.00018998687269715448, + "loss": 7.5162, + "step": 1407800 + }, + { + "epoch": 5.735475191187598, + "grad_norm": 20.973501205444336, + "learning_rate": 0.000189801912239837, + "loss": 7.5106, + "step": 1407900 + }, + { + "epoch": 5.73588256921098, + "grad_norm": 13.765401840209961, + "learning_rate": 0.00018961704331468273, + "loss": 7.4998, + "step": 1408000 + }, + { + "epoch": 5.73588256921098, + "eval_MaskedAccuracy": 0.513909088501633, + "eval_loss": 1.575904130935669, + "eval_runtime": 167.9919, + "eval_samples_per_second": 377.851, + "eval_steps_per_second": 1.476, + "step": 1408000 + }, + { + "epoch": 5.736289947234361, + "grad_norm": 8.200135231018066, + "learning_rate": 0.000189432265929, + "loss": 7.5783, + "step": 1408100 + }, + { + "epoch": 5.736697325257743, + "grad_norm": 6.398289680480957, + "learning_rate": 0.00018924758009009434, + "loss": 7.5039, + "step": 1408200 + }, + { + "epoch": 5.737104703281124, + "grad_norm": 16.132312774658203, + "learning_rate": 0.0001890629858052666, + "loss": 7.5471, + "step": 1408300 + }, + { + "epoch": 5.737512081304506, + "grad_norm": 13.6964693069458, + "learning_rate": 0.00018887848308181616, + "loss": 7.5508, + "step": 1408400 + }, + { + "epoch": 5.7379194593278875, + "grad_norm": 14.69521713256836, + "learning_rate": 0.0001886940719270369, + "loss": 7.5162, + "step": 1408500 + }, + { + "epoch": 5.738326837351269, + "grad_norm": 21.59804344177246, + "learning_rate": 0.0001885097523482192, + "loss": 7.5262, + "step": 1408600 + }, + { + "epoch": 5.738734215374651, + "grad_norm": 21.779382705688477, + "learning_rate": 0.00018832552435265053, + "loss": 7.5196, + "step": 1408700 + }, + { + "epoch": 5.739141593398032, + "grad_norm": 19.573837280273438, + "learning_rate": 0.00018814138794761453, + "loss": 7.5391, + "step": 1408800 + }, + { + "epoch": 5.739548971421414, + "grad_norm": 17.48353385925293, + "learning_rate": 0.0001879573431403909, + "loss": 7.5005, + "step": 1408900 + }, + { + "epoch": 5.739956349444794, + "grad_norm": 5.65546178817749, + "learning_rate": 0.0001877733899382556, + "loss": 7.4985, + "step": 1409000 + }, + { + "epoch": 5.739956349444794, + "eval_MaskedAccuracy": 0.5129975344928691, + "eval_loss": 1.5885475873947144, + "eval_runtime": 154.3446, + "eval_samples_per_second": 411.261, + "eval_steps_per_second": 1.607, + "step": 1409000 + }, + { + "epoch": 5.740363727468176, + "grad_norm": 10.080263137817383, + "learning_rate": 0.0001875895283484819, + "loss": 7.5066, + "step": 1409100 + }, + { + "epoch": 5.740771105491557, + "grad_norm": 15.176826477050781, + "learning_rate": 0.00018740575837833862, + "loss": 7.5081, + "step": 1409200 + }, + { + "epoch": 5.741178483514939, + "grad_norm": 14.322775840759277, + "learning_rate": 0.00018722208003509099, + "loss": 7.5071, + "step": 1409300 + }, + { + "epoch": 5.74158586153832, + "grad_norm": 19.6398868560791, + "learning_rate": 0.00018703849332600078, + "loss": 7.5159, + "step": 1409400 + }, + { + "epoch": 5.741993239561702, + "grad_norm": 8.177549362182617, + "learning_rate": 0.00018685499825832612, + "loss": 7.4846, + "step": 1409500 + }, + { + "epoch": 5.7424006175850835, + "grad_norm": 13.5286283493042, + "learning_rate": 0.00018667159483932183, + "loss": 7.5117, + "step": 1409600 + }, + { + "epoch": 5.742807995608465, + "grad_norm": 11.124225616455078, + "learning_rate": 0.0001864882830762387, + "loss": 7.5177, + "step": 1409700 + }, + { + "epoch": 5.7432153736318465, + "grad_norm": 14.455705642700195, + "learning_rate": 0.00018630506297632408, + "loss": 7.5385, + "step": 1409800 + }, + { + "epoch": 5.743622751655228, + "grad_norm": 23.371421813964844, + "learning_rate": 0.00018612193454682123, + "loss": 7.4981, + "step": 1409900 + }, + { + "epoch": 5.74403012967861, + "grad_norm": 16.195911407470703, + "learning_rate": 0.00018593889779497077, + "loss": 7.5055, + "step": 1410000 + }, + { + "epoch": 5.74403012967861, + "eval_MaskedAccuracy": 0.5131065060582505, + "eval_loss": 1.5892151594161987, + "eval_runtime": 151.2469, + "eval_samples_per_second": 419.685, + "eval_steps_per_second": 1.64, + "step": 1410000 + }, + { + "epoch": 5.744437507701991, + "grad_norm": 15.349532127380371, + "learning_rate": 0.00018575595272800902, + "loss": 7.5182, + "step": 1410100 + }, + { + "epoch": 5.744844885725373, + "grad_norm": 12.161064147949219, + "learning_rate": 0.0001855730993531674, + "loss": 7.54, + "step": 1410200 + }, + { + "epoch": 5.745252263748753, + "grad_norm": 13.149774551391602, + "learning_rate": 0.00018539033767767736, + "loss": 7.5185, + "step": 1410300 + }, + { + "epoch": 5.745659641772135, + "grad_norm": 15.56015396118164, + "learning_rate": 0.0001852076677087636, + "loss": 7.5248, + "step": 1410400 + }, + { + "epoch": 5.746067019795516, + "grad_norm": 16.218429565429688, + "learning_rate": 0.0001850250894536475, + "loss": 7.5329, + "step": 1410500 + }, + { + "epoch": 5.746474397818898, + "grad_norm": 13.44758129119873, + "learning_rate": 0.00018484260291954825, + "loss": 7.5114, + "step": 1410600 + }, + { + "epoch": 5.746881775842279, + "grad_norm": 17.162885665893555, + "learning_rate": 0.00018466020811367938, + "loss": 7.5122, + "step": 1410700 + }, + { + "epoch": 5.747289153865661, + "grad_norm": 10.99380874633789, + "learning_rate": 0.0001844779050432526, + "loss": 7.5164, + "step": 1410800 + }, + { + "epoch": 5.7476965318890425, + "grad_norm": 4.253710746765137, + "learning_rate": 0.00018429569371547562, + "loss": 7.5313, + "step": 1410900 + }, + { + "epoch": 5.748103909912424, + "grad_norm": 30.04854965209961, + "learning_rate": 0.00018411357413755162, + "loss": 7.5054, + "step": 1411000 + }, + { + "epoch": 5.748103909912424, + "eval_MaskedAccuracy": 0.5134891987970711, + "eval_loss": 1.581412672996521, + "eval_runtime": 157.7877, + "eval_samples_per_second": 402.287, + "eval_steps_per_second": 1.572, + "step": 1411000 + }, + { + "epoch": 5.748511287935806, + "grad_norm": 6.196340560913086, + "learning_rate": 0.00018393154631668115, + "loss": 7.5082, + "step": 1411100 + }, + { + "epoch": 5.748918665959187, + "grad_norm": 13.771200180053711, + "learning_rate": 0.00018374961026006048, + "loss": 7.5089, + "step": 1411200 + }, + { + "epoch": 5.749326043982569, + "grad_norm": 19.914575576782227, + "learning_rate": 0.00018356776597488263, + "loss": 7.4891, + "step": 1411300 + }, + { + "epoch": 5.749733422005949, + "grad_norm": 23.734725952148438, + "learning_rate": 0.00018338601346833694, + "loss": 7.5522, + "step": 1411400 + }, + { + "epoch": 5.750140800029332, + "grad_norm": 4.969549179077148, + "learning_rate": 0.000183204352747609, + "loss": 7.5143, + "step": 1411500 + }, + { + "epoch": 5.750548178052712, + "grad_norm": 8.582549095153809, + "learning_rate": 0.00018302278381988101, + "loss": 7.5302, + "step": 1411600 + }, + { + "epoch": 5.750955556076094, + "grad_norm": 2.790029287338257, + "learning_rate": 0.000182841306692331, + "loss": 7.5042, + "step": 1411700 + }, + { + "epoch": 5.751362934099475, + "grad_norm": 8.083197593688965, + "learning_rate": 0.0001826599213721339, + "loss": 7.4969, + "step": 1411800 + }, + { + "epoch": 5.751770312122857, + "grad_norm": 4.746738910675049, + "learning_rate": 0.00018247862786646105, + "loss": 7.4961, + "step": 1411900 + }, + { + "epoch": 5.7521776901462385, + "grad_norm": 17.360036849975586, + "learning_rate": 0.0001822974261824794, + "loss": 7.5505, + "step": 1412000 + }, + { + "epoch": 5.7521776901462385, + "eval_MaskedAccuracy": 0.5132593198820746, + "eval_loss": 1.5834758281707764, + "eval_runtime": 173.8437, + "eval_samples_per_second": 365.133, + "eval_steps_per_second": 1.427, + "step": 1412000 + }, + { + "epoch": 5.75258506816962, + "grad_norm": 6.999335765838623, + "learning_rate": 0.00018211631632735368, + "loss": 7.4944, + "step": 1412100 + }, + { + "epoch": 5.7529924461930015, + "grad_norm": 6.317979335784912, + "learning_rate": 0.0001819352983082432, + "loss": 7.5337, + "step": 1412200 + }, + { + "epoch": 5.753399824216383, + "grad_norm": 9.501605033874512, + "learning_rate": 0.00018175437213230518, + "loss": 7.512, + "step": 1412300 + }, + { + "epoch": 5.753807202239765, + "grad_norm": 5.46494722366333, + "learning_rate": 0.00018157353780669228, + "loss": 7.5546, + "step": 1412400 + }, + { + "epoch": 5.754214580263146, + "grad_norm": 3.670668125152588, + "learning_rate": 0.00018139279533855413, + "loss": 7.5186, + "step": 1412500 + }, + { + "epoch": 5.754621958286528, + "grad_norm": 11.392471313476562, + "learning_rate": 0.0001812121447350363, + "loss": 7.5155, + "step": 1412600 + }, + { + "epoch": 5.755029336309908, + "grad_norm": 15.43734359741211, + "learning_rate": 0.00018103158600328107, + "loss": 7.5315, + "step": 1412700 + }, + { + "epoch": 5.75543671433329, + "grad_norm": 16.57461166381836, + "learning_rate": 0.00018085111915042642, + "loss": 7.5286, + "step": 1412800 + }, + { + "epoch": 5.755844092356671, + "grad_norm": 3.595592498779297, + "learning_rate": 0.0001806707441836077, + "loss": 7.5372, + "step": 1412900 + }, + { + "epoch": 5.756251470380053, + "grad_norm": 21.875734329223633, + "learning_rate": 0.00018049046110995607, + "loss": 7.5122, + "step": 1413000 + }, + { + "epoch": 5.756251470380053, + "eval_MaskedAccuracy": 0.5138528158223182, + "eval_loss": 1.5747325420379639, + "eval_runtime": 176.1096, + "eval_samples_per_second": 360.435, + "eval_steps_per_second": 1.408, + "step": 1413000 + }, + { + "epoch": 5.7566588484034344, + "grad_norm": 4.31231164932251, + "learning_rate": 0.00018031026993659837, + "loss": 7.5071, + "step": 1413100 + }, + { + "epoch": 5.757066226426816, + "grad_norm": 3.410404682159424, + "learning_rate": 0.00018013017067065935, + "loss": 7.4807, + "step": 1413200 + }, + { + "epoch": 5.7574736044501975, + "grad_norm": 3.8272275924682617, + "learning_rate": 0.00017995016331925865, + "loss": 7.5132, + "step": 1413300 + }, + { + "epoch": 5.757880982473579, + "grad_norm": 5.379833698272705, + "learning_rate": 0.0001797702478895133, + "loss": 7.5148, + "step": 1413400 + }, + { + "epoch": 5.758288360496961, + "grad_norm": 3.9139113426208496, + "learning_rate": 0.0001795904243885363, + "loss": 7.5202, + "step": 1413500 + }, + { + "epoch": 5.758695738520342, + "grad_norm": 5.3844828605651855, + "learning_rate": 0.00017941069282343726, + "loss": 7.5246, + "step": 1413600 + }, + { + "epoch": 5.759103116543724, + "grad_norm": 19.920928955078125, + "learning_rate": 0.00017923105320132105, + "loss": 7.5016, + "step": 1413700 + }, + { + "epoch": 5.759510494567105, + "grad_norm": 8.671945571899414, + "learning_rate": 0.00017905150552929088, + "loss": 7.4962, + "step": 1413800 + }, + { + "epoch": 5.759917872590487, + "grad_norm": 6.017998695373535, + "learning_rate": 0.0001788720498144446, + "loss": 7.4906, + "step": 1413900 + }, + { + "epoch": 5.760325250613867, + "grad_norm": 8.696832656860352, + "learning_rate": 0.00017869268606387725, + "loss": 7.5044, + "step": 1414000 + }, + { + "epoch": 5.760325250613867, + "eval_MaskedAccuracy": 0.5131223645917778, + "eval_loss": 1.5926177501678467, + "eval_runtime": 162.3583, + "eval_samples_per_second": 390.962, + "eval_steps_per_second": 1.527, + "step": 1414000 + }, + { + "epoch": 5.760732628637249, + "grad_norm": 19.09294891357422, + "learning_rate": 0.0001785134142846798, + "loss": 7.5388, + "step": 1414100 + }, + { + "epoch": 5.76114000666063, + "grad_norm": 8.947702407836914, + "learning_rate": 0.0001783342344839394, + "loss": 7.5146, + "step": 1414200 + }, + { + "epoch": 5.761547384684012, + "grad_norm": 8.759279251098633, + "learning_rate": 0.00017815514666874116, + "loss": 7.5019, + "step": 1414300 + }, + { + "epoch": 5.7619547627073935, + "grad_norm": 36.35895919799805, + "learning_rate": 0.0001779761508461649, + "loss": 7.5131, + "step": 1414400 + }, + { + "epoch": 5.762362140730775, + "grad_norm": 3.747073173522949, + "learning_rate": 0.00017779724702328753, + "loss": 7.5365, + "step": 1414500 + }, + { + "epoch": 5.7627695187541566, + "grad_norm": 9.462847709655762, + "learning_rate": 0.0001776184352071813, + "loss": 7.5144, + "step": 1414600 + }, + { + "epoch": 5.763176896777538, + "grad_norm": 4.828328609466553, + "learning_rate": 0.0001774397154049161, + "loss": 7.5307, + "step": 1414700 + }, + { + "epoch": 5.76358427480092, + "grad_norm": 24.189462661743164, + "learning_rate": 0.00017726108762355767, + "loss": 7.4861, + "step": 1414800 + }, + { + "epoch": 5.763991652824301, + "grad_norm": 8.334132194519043, + "learning_rate": 0.00017708255187016793, + "loss": 7.518, + "step": 1414900 + }, + { + "epoch": 5.764399030847683, + "grad_norm": 24.44731330871582, + "learning_rate": 0.00017690410815180598, + "loss": 7.5354, + "step": 1415000 + }, + { + "epoch": 5.764399030847683, + "eval_MaskedAccuracy": 0.5132570388452132, + "eval_loss": 1.5811527967453003, + "eval_runtime": 170.6061, + "eval_samples_per_second": 372.062, + "eval_steps_per_second": 1.454, + "step": 1415000 + }, + { + "epoch": 5.764806408871064, + "grad_norm": 22.014970779418945, + "learning_rate": 0.00017672575647552594, + "loss": 7.5485, + "step": 1415100 + }, + { + "epoch": 5.765213786894446, + "grad_norm": 3.618098020553589, + "learning_rate": 0.0001765474968483792, + "loss": 7.5262, + "step": 1415200 + }, + { + "epoch": 5.765621164917826, + "grad_norm": 7.878652095794678, + "learning_rate": 0.0001763693292774133, + "loss": 7.5139, + "step": 1415300 + }, + { + "epoch": 5.766028542941208, + "grad_norm": 4.511267185211182, + "learning_rate": 0.00017619125376967205, + "loss": 7.5011, + "step": 1415400 + }, + { + "epoch": 5.7664359209645895, + "grad_norm": 13.622882843017578, + "learning_rate": 0.00017601327033219623, + "loss": 7.5164, + "step": 1415500 + }, + { + "epoch": 5.766843298987971, + "grad_norm": 6.8950090408325195, + "learning_rate": 0.0001758353789720218, + "loss": 7.506, + "step": 1415600 + }, + { + "epoch": 5.7672506770113525, + "grad_norm": 15.81292724609375, + "learning_rate": 0.00017565757969618246, + "loss": 7.5314, + "step": 1415700 + }, + { + "epoch": 5.767658055034734, + "grad_norm": 19.056610107421875, + "learning_rate": 0.00017547987251170674, + "loss": 7.5093, + "step": 1415800 + }, + { + "epoch": 5.768065433058116, + "grad_norm": 4.119966983795166, + "learning_rate": 0.0001753022574256211, + "loss": 7.5192, + "step": 1415900 + }, + { + "epoch": 5.768472811081497, + "grad_norm": 15.643582344055176, + "learning_rate": 0.00017512473444494694, + "loss": 7.5324, + "step": 1416000 + }, + { + "epoch": 5.768472811081497, + "eval_MaskedAccuracy": 0.5141732353414097, + "eval_loss": 1.5801136493682861, + "eval_runtime": 169.0959, + "eval_samples_per_second": 375.385, + "eval_steps_per_second": 1.467, + "step": 1416000 + }, + { + "epoch": 5.768880189104879, + "grad_norm": 13.730713844299316, + "learning_rate": 0.00017494730357670368, + "loss": 7.5006, + "step": 1416100 + }, + { + "epoch": 5.76928756712826, + "grad_norm": 6.921182155609131, + "learning_rate": 0.0001747699648279048, + "loss": 7.491, + "step": 1416200 + }, + { + "epoch": 5.769694945151642, + "grad_norm": 5.342350482940674, + "learning_rate": 0.0001745927182055627, + "loss": 7.5168, + "step": 1416300 + }, + { + "epoch": 5.770102323175022, + "grad_norm": 11.932868957519531, + "learning_rate": 0.00017441556371668417, + "loss": 7.5134, + "step": 1416400 + }, + { + "epoch": 5.770509701198405, + "grad_norm": 11.356863975524902, + "learning_rate": 0.00017423850136827324, + "loss": 7.4981, + "step": 1416500 + }, + { + "epoch": 5.770917079221785, + "grad_norm": 25.6007080078125, + "learning_rate": 0.00017406153116733024, + "loss": 7.5063, + "step": 1416600 + }, + { + "epoch": 5.771324457245167, + "grad_norm": 17.394763946533203, + "learning_rate": 0.00017388465312085177, + "loss": 7.5262, + "step": 1416700 + }, + { + "epoch": 5.7717318352685485, + "grad_norm": 27.250417709350586, + "learning_rate": 0.0001737078672358303, + "loss": 7.4939, + "step": 1416800 + }, + { + "epoch": 5.77213921329193, + "grad_norm": 4.442460536956787, + "learning_rate": 0.00017353117351925565, + "loss": 7.4952, + "step": 1416900 + }, + { + "epoch": 5.772546591315312, + "grad_norm": 12.650249481201172, + "learning_rate": 0.0001733545719781134, + "loss": 7.5142, + "step": 1417000 + }, + { + "epoch": 5.772546591315312, + "eval_MaskedAccuracy": 0.5134382189619834, + "eval_loss": 1.5805528163909912, + "eval_runtime": 173.0323, + "eval_samples_per_second": 366.845, + "eval_steps_per_second": 1.433, + "step": 1417000 + }, + { + "epoch": 5.772953969338693, + "grad_norm": 20.84615135192871, + "learning_rate": 0.00017317806261938522, + "loss": 7.54, + "step": 1417100 + }, + { + "epoch": 5.773361347362075, + "grad_norm": 13.096055030822754, + "learning_rate": 0.0001730016454500498, + "loss": 7.5417, + "step": 1417200 + }, + { + "epoch": 5.773768725385456, + "grad_norm": 19.751548767089844, + "learning_rate": 0.00017282532047708182, + "loss": 7.5181, + "step": 1417300 + }, + { + "epoch": 5.774176103408838, + "grad_norm": 22.788862228393555, + "learning_rate": 0.00017264908770745163, + "loss": 7.522, + "step": 1417400 + }, + { + "epoch": 5.774583481432219, + "grad_norm": 4.602299213409424, + "learning_rate": 0.00017247294714812782, + "loss": 7.5088, + "step": 1417500 + }, + { + "epoch": 5.774990859455601, + "grad_norm": 3.314424991607666, + "learning_rate": 0.0001722968988060734, + "loss": 7.5029, + "step": 1417600 + }, + { + "epoch": 5.775398237478981, + "grad_norm": 4.984609603881836, + "learning_rate": 0.0001721209426882492, + "loss": 7.5298, + "step": 1417700 + }, + { + "epoch": 5.775805615502363, + "grad_norm": 4.923085689544678, + "learning_rate": 0.00017194507880161113, + "loss": 7.5344, + "step": 1417800 + }, + { + "epoch": 5.7762129935257445, + "grad_norm": 9.468344688415527, + "learning_rate": 0.00017176930715311214, + "loss": 7.5023, + "step": 1417900 + }, + { + "epoch": 5.776620371549126, + "grad_norm": 4.969415664672852, + "learning_rate": 0.00017159362774970134, + "loss": 7.5011, + "step": 1418000 + }, + { + "epoch": 5.776620371549126, + "eval_MaskedAccuracy": 0.5141010918163169, + "eval_loss": 1.5732693672180176, + "eval_runtime": 167.0545, + "eval_samples_per_second": 379.972, + "eval_steps_per_second": 1.485, + "step": 1418000 + }, + { + "epoch": 5.7770277495725075, + "grad_norm": 19.92969512939453, + "learning_rate": 0.00017141804059832467, + "loss": 7.5398, + "step": 1418100 + }, + { + "epoch": 5.777435127595889, + "grad_norm": 14.730072975158691, + "learning_rate": 0.00017124254570592313, + "loss": 7.538, + "step": 1418200 + }, + { + "epoch": 5.777842505619271, + "grad_norm": 29.363462448120117, + "learning_rate": 0.00017106714307943587, + "loss": 7.4936, + "step": 1418300 + }, + { + "epoch": 5.778249883642652, + "grad_norm": 13.004620552062988, + "learning_rate": 0.00017089183272579728, + "loss": 7.5343, + "step": 1418400 + }, + { + "epoch": 5.778657261666034, + "grad_norm": 17.11601448059082, + "learning_rate": 0.00017071661465193835, + "loss": 7.5099, + "step": 1418500 + }, + { + "epoch": 5.779064639689415, + "grad_norm": 15.091670036315918, + "learning_rate": 0.00017054148886478597, + "loss": 7.525, + "step": 1418600 + }, + { + "epoch": 5.779472017712797, + "grad_norm": 5.1989521980285645, + "learning_rate": 0.0001703664553712643, + "loss": 7.5075, + "step": 1418700 + }, + { + "epoch": 5.779879395736178, + "grad_norm": 3.2822389602661133, + "learning_rate": 0.00017019151417829297, + "loss": 7.5084, + "step": 1418800 + }, + { + "epoch": 5.78028677375956, + "grad_norm": 15.049131393432617, + "learning_rate": 0.00017001666529278836, + "loss": 7.4896, + "step": 1418900 + }, + { + "epoch": 5.78069415178294, + "grad_norm": 3.6609060764312744, + "learning_rate": 0.0001698419087216636, + "loss": 7.5162, + "step": 1419000 + }, + { + "epoch": 5.78069415178294, + "eval_MaskedAccuracy": 0.5130734803741487, + "eval_loss": 1.594419240951538, + "eval_runtime": 175.0282, + "eval_samples_per_second": 362.661, + "eval_steps_per_second": 1.417, + "step": 1419000 + }, + { + "epoch": 5.781101529806322, + "grad_norm": 15.390701293945312, + "learning_rate": 0.0001696672444718274, + "loss": 7.5092, + "step": 1419100 + }, + { + "epoch": 5.7815089078297035, + "grad_norm": 8.904915809631348, + "learning_rate": 0.0001694926725501855, + "loss": 7.553, + "step": 1419200 + }, + { + "epoch": 5.781916285853085, + "grad_norm": 11.09874439239502, + "learning_rate": 0.000169318192963639, + "loss": 7.5293, + "step": 1419300 + }, + { + "epoch": 5.782323663876467, + "grad_norm": 16.156326293945312, + "learning_rate": 0.00016914380571908682, + "loss": 7.4898, + "step": 1419400 + }, + { + "epoch": 5.782731041899848, + "grad_norm": 34.805091857910156, + "learning_rate": 0.00016896951082342305, + "loss": 7.5133, + "step": 1419500 + }, + { + "epoch": 5.78313841992323, + "grad_norm": 8.421262741088867, + "learning_rate": 0.0001687953082835382, + "loss": 7.5162, + "step": 1419600 + }, + { + "epoch": 5.783545797946611, + "grad_norm": 3.0364320278167725, + "learning_rate": 0.00016862119810631988, + "loss": 7.4995, + "step": 1419700 + }, + { + "epoch": 5.783953175969993, + "grad_norm": 12.082828521728516, + "learning_rate": 0.00016844718029865153, + "loss": 7.5227, + "step": 1419800 + }, + { + "epoch": 5.784360553993374, + "grad_norm": 4.081977844238281, + "learning_rate": 0.0001682732548674128, + "loss": 7.5162, + "step": 1419900 + }, + { + "epoch": 5.784767932016756, + "grad_norm": 9.002562522888184, + "learning_rate": 0.00016809942181947988, + "loss": 7.5442, + "step": 1420000 + }, + { + "epoch": 5.784767932016756, + "eval_MaskedAccuracy": 0.513356205793277, + "eval_loss": 1.5863076448440552, + "eval_runtime": 181.1339, + "eval_samples_per_second": 350.437, + "eval_steps_per_second": 1.369, + "step": 1420000 + }, + { + "epoch": 5.785175310040137, + "grad_norm": 16.186351776123047, + "learning_rate": 0.0001679256811617254, + "loss": 7.5288, + "step": 1420100 + }, + { + "epoch": 5.785582688063519, + "grad_norm": 8.277743339538574, + "learning_rate": 0.0001677520329010184, + "loss": 7.5296, + "step": 1420200 + }, + { + "epoch": 5.7859900660868995, + "grad_norm": 16.341815948486328, + "learning_rate": 0.00016757847704422393, + "loss": 7.5262, + "step": 1420300 + }, + { + "epoch": 5.786397444110281, + "grad_norm": 16.191190719604492, + "learning_rate": 0.00016740501359820348, + "loss": 7.5162, + "step": 1420400 + }, + { + "epoch": 5.7868048221336625, + "grad_norm": 19.828542709350586, + "learning_rate": 0.00016723164256981544, + "loss": 7.5429, + "step": 1420500 + }, + { + "epoch": 5.787212200157044, + "grad_norm": 3.413541316986084, + "learning_rate": 0.0001670583639659137, + "loss": 7.5456, + "step": 1420600 + }, + { + "epoch": 5.787619578180426, + "grad_norm": 15.310487747192383, + "learning_rate": 0.00016688517779334924, + "loss": 7.5234, + "step": 1420700 + }, + { + "epoch": 5.788026956203807, + "grad_norm": 13.728946685791016, + "learning_rate": 0.00016671208405896876, + "loss": 7.5219, + "step": 1420800 + }, + { + "epoch": 5.788434334227189, + "grad_norm": 15.777775764465332, + "learning_rate": 0.00016653908276961587, + "loss": 7.5322, + "step": 1420900 + }, + { + "epoch": 5.78884171225057, + "grad_norm": 17.24802017211914, + "learning_rate": 0.00016636617393213, + "loss": 7.5075, + "step": 1421000 + }, + { + "epoch": 5.78884171225057, + "eval_MaskedAccuracy": 0.5130345098445821, + "eval_loss": 1.5933095216751099, + "eval_runtime": 190.5582, + "eval_samples_per_second": 333.106, + "eval_steps_per_second": 1.301, + "step": 1421000 + }, + { + "epoch": 5.789249090273952, + "grad_norm": 4.251654624938965, + "learning_rate": 0.00016619335755334685, + "loss": 7.5079, + "step": 1421100 + }, + { + "epoch": 5.789656468297333, + "grad_norm": 31.306337356567383, + "learning_rate": 0.00016602063364009952, + "loss": 7.4902, + "step": 1421200 + }, + { + "epoch": 5.790063846320715, + "grad_norm": 22.56920051574707, + "learning_rate": 0.000165848002199216, + "loss": 7.4929, + "step": 1421300 + }, + { + "epoch": 5.7904712243440954, + "grad_norm": 10.100997924804688, + "learning_rate": 0.00016567546323752134, + "loss": 7.5175, + "step": 1421400 + }, + { + "epoch": 5.790878602367478, + "grad_norm": 3.6591389179229736, + "learning_rate": 0.0001655030167618374, + "loss": 7.5263, + "step": 1421500 + }, + { + "epoch": 5.7912859803908585, + "grad_norm": 8.138897895812988, + "learning_rate": 0.00016533066277898173, + "loss": 7.509, + "step": 1421600 + }, + { + "epoch": 5.79169335841424, + "grad_norm": 23.298463821411133, + "learning_rate": 0.00016515840129576847, + "loss": 7.5223, + "step": 1421700 + }, + { + "epoch": 5.792100736437622, + "grad_norm": 17.69959831237793, + "learning_rate": 0.00016498623231900794, + "loss": 7.5256, + "step": 1421800 + }, + { + "epoch": 5.792508114461003, + "grad_norm": 14.322881698608398, + "learning_rate": 0.00016481415585550686, + "loss": 7.5199, + "step": 1421900 + }, + { + "epoch": 5.792915492484385, + "grad_norm": 19.614898681640625, + "learning_rate": 0.00016464217191206835, + "loss": 7.5402, + "step": 1422000 + }, + { + "epoch": 5.792915492484385, + "eval_MaskedAccuracy": 0.5130547197273498, + "eval_loss": 1.587104320526123, + "eval_runtime": 165.2217, + "eval_samples_per_second": 384.187, + "eval_steps_per_second": 1.501, + "step": 1422000 + }, + { + "epoch": 5.793322870507766, + "grad_norm": 22.410751342773438, + "learning_rate": 0.0001644702804954915, + "loss": 7.5272, + "step": 1422100 + }, + { + "epoch": 5.793730248531148, + "grad_norm": 14.932026863098145, + "learning_rate": 0.00016429848161257302, + "loss": 7.5039, + "step": 1422200 + }, + { + "epoch": 5.794137626554529, + "grad_norm": 20.829832077026367, + "learning_rate": 0.00016412677527010444, + "loss": 7.4982, + "step": 1422300 + }, + { + "epoch": 5.794545004577911, + "grad_norm": 24.133546829223633, + "learning_rate": 0.00016395516147487416, + "loss": 7.4878, + "step": 1422400 + }, + { + "epoch": 5.794952382601292, + "grad_norm": 12.178784370422363, + "learning_rate": 0.0001637836402336674, + "loss": 7.525, + "step": 1422500 + }, + { + "epoch": 5.795359760624674, + "grad_norm": 15.152470588684082, + "learning_rate": 0.00016361221155326516, + "loss": 7.5057, + "step": 1422600 + }, + { + "epoch": 5.7957671386480545, + "grad_norm": 9.138758659362793, + "learning_rate": 0.0001634408754404445, + "loss": 7.5014, + "step": 1422700 + }, + { + "epoch": 5.796174516671436, + "grad_norm": 18.050846099853516, + "learning_rate": 0.0001632696319019799, + "loss": 7.5523, + "step": 1422800 + }, + { + "epoch": 5.7965818946948175, + "grad_norm": 19.427255630493164, + "learning_rate": 0.00016309848094464097, + "loss": 7.5159, + "step": 1422900 + }, + { + "epoch": 5.796989272718199, + "grad_norm": 10.630948066711426, + "learning_rate": 0.0001629274225751945, + "loss": 7.5131, + "step": 1423000 + }, + { + "epoch": 5.796989272718199, + "eval_MaskedAccuracy": 0.5133400752451206, + "eval_loss": 1.588206171989441, + "eval_runtime": 157.6728, + "eval_samples_per_second": 402.58, + "eval_steps_per_second": 1.573, + "step": 1423000 + }, + { + "epoch": 5.797396650741581, + "grad_norm": 4.26202917098999, + "learning_rate": 0.00016275645680040323, + "loss": 7.5006, + "step": 1423100 + }, + { + "epoch": 5.797804028764962, + "grad_norm": 8.990422248840332, + "learning_rate": 0.00016258558362702648, + "loss": 7.51, + "step": 1423200 + }, + { + "epoch": 5.798211406788344, + "grad_norm": 13.804823875427246, + "learning_rate": 0.0001624148030618199, + "loss": 7.5149, + "step": 1423300 + }, + { + "epoch": 5.798618784811725, + "grad_norm": 12.746509552001953, + "learning_rate": 0.00016224411511153497, + "loss": 7.5131, + "step": 1423400 + }, + { + "epoch": 5.799026162835107, + "grad_norm": 10.440051078796387, + "learning_rate": 0.00016207351978292017, + "loss": 7.5118, + "step": 1423500 + }, + { + "epoch": 5.799433540858488, + "grad_norm": 24.82433319091797, + "learning_rate": 0.00016190301708272, + "loss": 7.49, + "step": 1423600 + }, + { + "epoch": 5.79984091888187, + "grad_norm": 5.237053871154785, + "learning_rate": 0.00016173260701767515, + "loss": 7.4757, + "step": 1423700 + }, + { + "epoch": 5.800248296905251, + "grad_norm": 14.583934783935547, + "learning_rate": 0.00016156228959452316, + "loss": 7.5304, + "step": 1423800 + }, + { + "epoch": 5.800655674928633, + "grad_norm": 48.17268371582031, + "learning_rate": 0.00016139206481999757, + "loss": 7.5201, + "step": 1423900 + }, + { + "epoch": 5.8010630529520135, + "grad_norm": 3.127892255783081, + "learning_rate": 0.00016122193270082808, + "loss": 7.5152, + "step": 1424000 + }, + { + "epoch": 5.8010630529520135, + "eval_MaskedAccuracy": 0.5130820432215011, + "eval_loss": 1.5851596593856812, + "eval_runtime": 173.9997, + "eval_samples_per_second": 364.805, + "eval_steps_per_second": 1.425, + "step": 1424000 + }, + { + "epoch": 5.801470430975395, + "grad_norm": 14.792893409729004, + "learning_rate": 0.00016105189324374018, + "loss": 7.5313, + "step": 1424100 + }, + { + "epoch": 5.801877808998777, + "grad_norm": 15.091297149658203, + "learning_rate": 0.00016088194645545793, + "loss": 7.5159, + "step": 1424200 + }, + { + "epoch": 5.802285187022158, + "grad_norm": 26.00152587890625, + "learning_rate": 0.00016071209234269965, + "loss": 7.5087, + "step": 1424300 + }, + { + "epoch": 5.80269256504554, + "grad_norm": 14.155200004577637, + "learning_rate": 0.00016054233091218048, + "loss": 7.5039, + "step": 1424400 + }, + { + "epoch": 5.803099943068921, + "grad_norm": 39.95311737060547, + "learning_rate": 0.00016037266217061209, + "loss": 7.5115, + "step": 1424500 + }, + { + "epoch": 5.803507321092303, + "grad_norm": 3.3049662113189697, + "learning_rate": 0.00016020308612470257, + "loss": 7.4999, + "step": 1424600 + }, + { + "epoch": 5.803914699115684, + "grad_norm": 5.990292072296143, + "learning_rate": 0.00016003360278115547, + "loss": 7.5388, + "step": 1424700 + }, + { + "epoch": 5.804322077139066, + "grad_norm": 7.175048351287842, + "learning_rate": 0.000159864212146672, + "loss": 7.5164, + "step": 1424800 + }, + { + "epoch": 5.804729455162447, + "grad_norm": 31.079818725585938, + "learning_rate": 0.00015969491422794874, + "loss": 7.5047, + "step": 1424900 + }, + { + "epoch": 5.805136833185829, + "grad_norm": 12.982433319091797, + "learning_rate": 0.00015952570903167928, + "loss": 7.5094, + "step": 1425000 + }, + { + "epoch": 5.805136833185829, + "eval_MaskedAccuracy": 0.5132631796938145, + "eval_loss": 1.581839919090271, + "eval_runtime": 159.0711, + "eval_samples_per_second": 399.042, + "eval_steps_per_second": 1.559, + "step": 1425000 + }, + { + "epoch": 5.80554421120921, + "grad_norm": 4.535707473754883, + "learning_rate": 0.00015935659656455287, + "loss": 7.5213, + "step": 1425100 + }, + { + "epoch": 5.805951589232592, + "grad_norm": 11.968395233154297, + "learning_rate": 0.0001591875768332556, + "loss": 7.524, + "step": 1425200 + }, + { + "epoch": 5.806358967255973, + "grad_norm": 6.857604503631592, + "learning_rate": 0.00015901864984446974, + "loss": 7.5128, + "step": 1425300 + }, + { + "epoch": 5.806766345279354, + "grad_norm": 3.4068024158477783, + "learning_rate": 0.00015884981560487378, + "loss": 7.5053, + "step": 1425400 + }, + { + "epoch": 5.807173723302736, + "grad_norm": 12.815781593322754, + "learning_rate": 0.00015868107412114293, + "loss": 7.495, + "step": 1425500 + }, + { + "epoch": 5.807581101326117, + "grad_norm": 12.488129615783691, + "learning_rate": 0.00015851242539994783, + "loss": 7.5018, + "step": 1425600 + }, + { + "epoch": 5.807988479349499, + "grad_norm": 10.342538833618164, + "learning_rate": 0.00015834386944795683, + "loss": 7.5324, + "step": 1425700 + }, + { + "epoch": 5.80839585737288, + "grad_norm": 22.352279663085938, + "learning_rate": 0.00015817540627183327, + "loss": 7.4829, + "step": 1425800 + }, + { + "epoch": 5.808803235396262, + "grad_norm": 19.0584774017334, + "learning_rate": 0.00015800703587823768, + "loss": 7.5372, + "step": 1425900 + }, + { + "epoch": 5.809210613419643, + "grad_norm": 13.602452278137207, + "learning_rate": 0.00015783875827382656, + "loss": 7.4858, + "step": 1426000 + }, + { + "epoch": 5.809210613419643, + "eval_MaskedAccuracy": 0.5133862877492475, + "eval_loss": 1.5769734382629395, + "eval_runtime": 173.2155, + "eval_samples_per_second": 366.457, + "eval_steps_per_second": 1.432, + "step": 1426000 + }, + { + "epoch": 5.809617991443025, + "grad_norm": 9.673832893371582, + "learning_rate": 0.00015767057346525206, + "loss": 7.5318, + "step": 1426100 + }, + { + "epoch": 5.810025369466406, + "grad_norm": 6.554992198944092, + "learning_rate": 0.0001575024814591651, + "loss": 7.533, + "step": 1426200 + }, + { + "epoch": 5.810432747489788, + "grad_norm": 3.984445095062256, + "learning_rate": 0.00015733448226221014, + "loss": 7.5076, + "step": 1426300 + }, + { + "epoch": 5.8108401255131685, + "grad_norm": 9.314393997192383, + "learning_rate": 0.00015716657588102918, + "loss": 7.5052, + "step": 1426400 + }, + { + "epoch": 5.811247503536551, + "grad_norm": 11.197092056274414, + "learning_rate": 0.00015699876232226084, + "loss": 7.5127, + "step": 1426500 + }, + { + "epoch": 5.811654881559932, + "grad_norm": 7.632115840911865, + "learning_rate": 0.00015683104159253924, + "loss": 7.5313, + "step": 1426600 + }, + { + "epoch": 5.812062259583313, + "grad_norm": 3.435426950454712, + "learning_rate": 0.00015666341369849574, + "loss": 7.4944, + "step": 1426700 + }, + { + "epoch": 5.812469637606695, + "grad_norm": 4.164614677429199, + "learning_rate": 0.00015649587864675693, + "loss": 7.5022, + "step": 1426800 + }, + { + "epoch": 5.812877015630076, + "grad_norm": 2.644254684448242, + "learning_rate": 0.0001563284364439469, + "loss": 7.5023, + "step": 1426900 + }, + { + "epoch": 5.813284393653458, + "grad_norm": 26.294357299804688, + "learning_rate": 0.00015616108709668518, + "loss": 7.4974, + "step": 1427000 + }, + { + "epoch": 5.813284393653458, + "eval_MaskedAccuracy": 0.5134179198324821, + "eval_loss": 1.5896483659744263, + "eval_runtime": 186.502, + "eval_samples_per_second": 340.35, + "eval_steps_per_second": 1.33, + "step": 1427000 + }, + { + "epoch": 5.813691771676839, + "grad_norm": 3.466761827468872, + "learning_rate": 0.00015599383061158854, + "loss": 7.5437, + "step": 1427100 + }, + { + "epoch": 5.814099149700221, + "grad_norm": 23.73604393005371, + "learning_rate": 0.0001558266669952689, + "loss": 7.5146, + "step": 1427200 + }, + { + "epoch": 5.814506527723602, + "grad_norm": 13.889824867248535, + "learning_rate": 0.00015565959625433542, + "loss": 7.5074, + "step": 1427300 + }, + { + "epoch": 5.814913905746984, + "grad_norm": 4.097889423370361, + "learning_rate": 0.00015549261839539318, + "loss": 7.5213, + "step": 1427400 + }, + { + "epoch": 5.815321283770365, + "grad_norm": 8.272371292114258, + "learning_rate": 0.00015532573342504365, + "loss": 7.5008, + "step": 1427500 + }, + { + "epoch": 5.815728661793747, + "grad_norm": 11.328155517578125, + "learning_rate": 0.00015515894134988513, + "loss": 7.5225, + "step": 1427600 + }, + { + "epoch": 5.816136039817128, + "grad_norm": 10.377781867980957, + "learning_rate": 0.000154992242176511, + "loss": 7.5254, + "step": 1427700 + }, + { + "epoch": 5.816543417840509, + "grad_norm": 24.338708877563477, + "learning_rate": 0.00015482563591151275, + "loss": 7.5151, + "step": 1427800 + }, + { + "epoch": 5.816950795863891, + "grad_norm": 19.643484115600586, + "learning_rate": 0.00015465912256147637, + "loss": 7.5253, + "step": 1427900 + }, + { + "epoch": 5.817358173887272, + "grad_norm": 19.863996505737305, + "learning_rate": 0.00015449270213298592, + "loss": 7.499, + "step": 1428000 + }, + { + "epoch": 5.817358173887272, + "eval_MaskedAccuracy": 0.514176879805612, + "eval_loss": 1.5824474096298218, + "eval_runtime": 153.1606, + "eval_samples_per_second": 414.441, + "eval_steps_per_second": 1.619, + "step": 1428000 + }, + { + "epoch": 5.817765551910654, + "grad_norm": 29.033245086669922, + "learning_rate": 0.00015432637463261924, + "loss": 7.49, + "step": 1428100 + }, + { + "epoch": 5.818172929934035, + "grad_norm": 7.3021240234375, + "learning_rate": 0.00015416014006695397, + "loss": 7.5345, + "step": 1428200 + }, + { + "epoch": 5.818580307957417, + "grad_norm": 4.279619216918945, + "learning_rate": 0.0001539939984425615, + "loss": 7.5325, + "step": 1428300 + }, + { + "epoch": 5.818987685980798, + "grad_norm": 11.395988464355469, + "learning_rate": 0.00015382794976601027, + "loss": 7.4974, + "step": 1428400 + }, + { + "epoch": 5.81939506400418, + "grad_norm": 4.971868515014648, + "learning_rate": 0.00015366199404386524, + "loss": 7.5171, + "step": 1428500 + }, + { + "epoch": 5.819802442027561, + "grad_norm": 11.167874336242676, + "learning_rate": 0.0001534961312826874, + "loss": 7.4983, + "step": 1428600 + }, + { + "epoch": 5.820209820050943, + "grad_norm": 8.332849502563477, + "learning_rate": 0.00015333036148903434, + "loss": 7.4879, + "step": 1428700 + }, + { + "epoch": 5.820617198074324, + "grad_norm": 14.909524917602539, + "learning_rate": 0.00015316468466945965, + "loss": 7.5394, + "step": 1428800 + }, + { + "epoch": 5.821024576097706, + "grad_norm": 12.37127685546875, + "learning_rate": 0.00015299910083051344, + "loss": 7.4944, + "step": 1428900 + }, + { + "epoch": 5.821431954121087, + "grad_norm": 15.7769136428833, + "learning_rate": 0.00015283360997874208, + "loss": 7.5223, + "step": 1429000 + }, + { + "epoch": 5.821431954121087, + "eval_MaskedAccuracy": 0.51342468818523, + "eval_loss": 1.5892704725265503, + "eval_runtime": 191.8393, + "eval_samples_per_second": 330.881, + "eval_steps_per_second": 1.293, + "step": 1429000 + }, + { + "epoch": 5.821839332144468, + "grad_norm": 27.168245315551758, + "learning_rate": 0.00015266821212068886, + "loss": 7.5041, + "step": 1429100 + }, + { + "epoch": 5.82224671016785, + "grad_norm": 2.8287925720214844, + "learning_rate": 0.00015250290726289198, + "loss": 7.5165, + "step": 1429200 + }, + { + "epoch": 5.822654088191231, + "grad_norm": 4.237165451049805, + "learning_rate": 0.00015233769541188734, + "loss": 7.5103, + "step": 1429300 + }, + { + "epoch": 5.823061466214613, + "grad_norm": 15.783507347106934, + "learning_rate": 0.0001521725765742062, + "loss": 7.5219, + "step": 1429400 + }, + { + "epoch": 5.823468844237994, + "grad_norm": 10.036375999450684, + "learning_rate": 0.00015200755075637698, + "loss": 7.5448, + "step": 1429500 + }, + { + "epoch": 5.823876222261376, + "grad_norm": 6.849361896514893, + "learning_rate": 0.0001518426179649242, + "loss": 7.5031, + "step": 1429600 + }, + { + "epoch": 5.824283600284757, + "grad_norm": 9.938248634338379, + "learning_rate": 0.0001516777782063682, + "loss": 7.528, + "step": 1429700 + }, + { + "epoch": 5.824690978308139, + "grad_norm": 12.386895179748535, + "learning_rate": 0.00015151303148722609, + "loss": 7.5247, + "step": 1429800 + }, + { + "epoch": 5.82509835633152, + "grad_norm": 10.73524284362793, + "learning_rate": 0.00015134837781401118, + "loss": 7.5242, + "step": 1429900 + }, + { + "epoch": 5.825505734354902, + "grad_norm": 4.935142993927002, + "learning_rate": 0.00015118381719323332, + "loss": 7.5333, + "step": 1430000 + }, + { + "epoch": 5.825505734354902, + "eval_MaskedAccuracy": 0.5131773851836039, + "eval_loss": 1.5934122800827026, + "eval_runtime": 161.6301, + "eval_samples_per_second": 392.724, + "eval_steps_per_second": 1.534, + "step": 1430000 + }, + { + "epoch": 5.8259131123782835, + "grad_norm": 8.51754093170166, + "learning_rate": 0.00015101934963139766, + "loss": 7.5085, + "step": 1430100 + }, + { + "epoch": 5.826320490401665, + "grad_norm": 18.393491744995117, + "learning_rate": 0.00015085497513500743, + "loss": 7.5272, + "step": 1430200 + }, + { + "epoch": 5.826727868425046, + "grad_norm": 31.64918327331543, + "learning_rate": 0.00015069069371056098, + "loss": 7.5202, + "step": 1430300 + }, + { + "epoch": 5.827135246448427, + "grad_norm": 3.425719976425171, + "learning_rate": 0.00015052650536455327, + "loss": 7.5535, + "step": 1430400 + }, + { + "epoch": 5.827542624471809, + "grad_norm": 18.87350082397461, + "learning_rate": 0.0001503624101034753, + "loss": 7.5311, + "step": 1430500 + }, + { + "epoch": 5.82795000249519, + "grad_norm": 3.842937707901001, + "learning_rate": 0.0001501984079338148, + "loss": 7.5187, + "step": 1430600 + }, + { + "epoch": 5.828357380518572, + "grad_norm": 8.800285339355469, + "learning_rate": 0.00015003449886205513, + "loss": 7.491, + "step": 1430700 + }, + { + "epoch": 5.828764758541953, + "grad_norm": 6.007112979888916, + "learning_rate": 0.00014987068289467755, + "loss": 7.4973, + "step": 1430800 + }, + { + "epoch": 5.829172136565335, + "grad_norm": 32.813323974609375, + "learning_rate": 0.00014970696003815774, + "loss": 7.5156, + "step": 1430900 + }, + { + "epoch": 5.829579514588716, + "grad_norm": 18.104747772216797, + "learning_rate": 0.00014954333029896898, + "loss": 7.5164, + "step": 1431000 + }, + { + "epoch": 5.829579514588716, + "eval_MaskedAccuracy": 0.5130792734574849, + "eval_loss": 1.5899848937988281, + "eval_runtime": 164.4066, + "eval_samples_per_second": 386.092, + "eval_steps_per_second": 1.508, + "step": 1431000 + }, + { + "epoch": 5.829986892612098, + "grad_norm": 9.131357192993164, + "learning_rate": 0.00014937979368357993, + "loss": 7.5244, + "step": 1431100 + }, + { + "epoch": 5.830394270635479, + "grad_norm": 3.4502384662628174, + "learning_rate": 0.00014921635019845647, + "loss": 7.5191, + "step": 1431200 + }, + { + "epoch": 5.830801648658861, + "grad_norm": 3.947558879852295, + "learning_rate": 0.00014905299985006035, + "loss": 7.5305, + "step": 1431300 + }, + { + "epoch": 5.831209026682242, + "grad_norm": 40.45591354370117, + "learning_rate": 0.00014888974264484967, + "loss": 7.5185, + "step": 1431400 + }, + { + "epoch": 5.831616404705624, + "grad_norm": 5.565196990966797, + "learning_rate": 0.00014872657858927866, + "loss": 7.538, + "step": 1431500 + }, + { + "epoch": 5.832023782729005, + "grad_norm": 25.74686050415039, + "learning_rate": 0.00014856350768979834, + "loss": 7.4946, + "step": 1431600 + }, + { + "epoch": 5.832431160752386, + "grad_norm": 6.645580291748047, + "learning_rate": 0.00014840052995285583, + "loss": 7.5205, + "step": 1431700 + }, + { + "epoch": 5.832838538775768, + "grad_norm": 12.331221580505371, + "learning_rate": 0.00014823764538489446, + "loss": 7.516, + "step": 1431800 + }, + { + "epoch": 5.833245916799149, + "grad_norm": 13.032149314880371, + "learning_rate": 0.00014807485399235352, + "loss": 7.5061, + "step": 1431900 + }, + { + "epoch": 5.833653294822531, + "grad_norm": 9.871225357055664, + "learning_rate": 0.00014791215578166864, + "loss": 7.5037, + "step": 1432000 + }, + { + "epoch": 5.833653294822531, + "eval_MaskedAccuracy": 0.5136045432551434, + "eval_loss": 1.5865452289581299, + "eval_runtime": 167.9042, + "eval_samples_per_second": 378.049, + "eval_steps_per_second": 1.477, + "step": 1432000 + }, + { + "epoch": 5.834060672845912, + "grad_norm": 13.493284225463867, + "learning_rate": 0.0001477495507592737, + "loss": 7.4847, + "step": 1432100 + }, + { + "epoch": 5.834468050869294, + "grad_norm": 4.093759536743164, + "learning_rate": 0.00014758703893159636, + "loss": 7.5054, + "step": 1432200 + }, + { + "epoch": 5.834875428892675, + "grad_norm": 14.402068138122559, + "learning_rate": 0.00014742462030506188, + "loss": 7.551, + "step": 1432300 + }, + { + "epoch": 5.835282806916057, + "grad_norm": 4.072689533233643, + "learning_rate": 0.00014726229488609119, + "loss": 7.5064, + "step": 1432400 + }, + { + "epoch": 5.8356901849394385, + "grad_norm": 26.50486946105957, + "learning_rate": 0.00014710006268110211, + "loss": 7.4957, + "step": 1432500 + }, + { + "epoch": 5.83609756296282, + "grad_norm": 10.46023941040039, + "learning_rate": 0.00014693792369650866, + "loss": 7.5267, + "step": 1432600 + }, + { + "epoch": 5.836504940986201, + "grad_norm": 18.413921356201172, + "learning_rate": 0.00014677587793872037, + "loss": 7.5463, + "step": 1432700 + }, + { + "epoch": 5.836912319009582, + "grad_norm": 9.61780071258545, + "learning_rate": 0.00014661392541414457, + "loss": 7.5262, + "step": 1432800 + }, + { + "epoch": 5.837319697032964, + "grad_norm": 19.005826950073242, + "learning_rate": 0.00014645206612918388, + "loss": 7.5296, + "step": 1432900 + }, + { + "epoch": 5.837727075056345, + "grad_norm": 7.550987243652344, + "learning_rate": 0.0001462903000902375, + "loss": 7.5098, + "step": 1433000 + }, + { + "epoch": 5.837727075056345, + "eval_MaskedAccuracy": 0.5138815917066393, + "eval_loss": 1.5830613374710083, + "eval_runtime": 172.1335, + "eval_samples_per_second": 368.76, + "eval_steps_per_second": 1.441, + "step": 1433000 + }, + { + "epoch": 5.838134453079727, + "grad_norm": 4.856964111328125, + "learning_rate": 0.00014612862730370085, + "loss": 7.4893, + "step": 1433100 + }, + { + "epoch": 5.838541831103108, + "grad_norm": 10.035447120666504, + "learning_rate": 0.00014596704777596547, + "loss": 7.5173, + "step": 1433200 + }, + { + "epoch": 5.83894920912649, + "grad_norm": 26.232330322265625, + "learning_rate": 0.00014580556151341995, + "loss": 7.5221, + "step": 1433300 + }, + { + "epoch": 5.839356587149871, + "grad_norm": 7.708934307098389, + "learning_rate": 0.00014564416852244855, + "loss": 7.5182, + "step": 1433400 + }, + { + "epoch": 5.839763965173253, + "grad_norm": 7.310030460357666, + "learning_rate": 0.0001454828688094316, + "loss": 7.5197, + "step": 1433500 + }, + { + "epoch": 5.8401713431966344, + "grad_norm": 17.08176612854004, + "learning_rate": 0.00014532166238074643, + "loss": 7.5286, + "step": 1433600 + }, + { + "epoch": 5.840578721220016, + "grad_norm": 10.942180633544922, + "learning_rate": 0.00014516054924276658, + "loss": 7.4983, + "step": 1433700 + }, + { + "epoch": 5.8409860992433975, + "grad_norm": 3.745558023452759, + "learning_rate": 0.0001449995294018614, + "loss": 7.504, + "step": 1433800 + }, + { + "epoch": 5.841393477266779, + "grad_norm": 3.755819797515869, + "learning_rate": 0.0001448386028643968, + "loss": 7.5042, + "step": 1433900 + }, + { + "epoch": 5.84180085529016, + "grad_norm": 4.809333801269531, + "learning_rate": 0.00014467776963673486, + "loss": 7.4873, + "step": 1434000 + }, + { + "epoch": 5.84180085529016, + "eval_MaskedAccuracy": 0.5132207532249975, + "eval_loss": 1.5858699083328247, + "eval_runtime": 174.3579, + "eval_samples_per_second": 364.056, + "eval_steps_per_second": 1.422, + "step": 1434000 + }, + { + "epoch": 5.842208233313541, + "grad_norm": 8.828631401062012, + "learning_rate": 0.0001445170297252353, + "loss": 7.5113, + "step": 1434100 + }, + { + "epoch": 5.842615611336923, + "grad_norm": 7.251721382141113, + "learning_rate": 0.0001443563831362521, + "loss": 7.526, + "step": 1434200 + }, + { + "epoch": 5.843022989360304, + "grad_norm": 7.140491962432861, + "learning_rate": 0.00014419582987613673, + "loss": 7.5218, + "step": 1434300 + }, + { + "epoch": 5.843430367383686, + "grad_norm": 9.469985961914062, + "learning_rate": 0.00014403536995123657, + "loss": 7.4989, + "step": 1434400 + }, + { + "epoch": 5.843837745407067, + "grad_norm": 20.95997428894043, + "learning_rate": 0.0001438750033678955, + "loss": 7.5093, + "step": 1434500 + }, + { + "epoch": 5.844245123430449, + "grad_norm": 25.432119369506836, + "learning_rate": 0.00014371473013245387, + "loss": 7.4939, + "step": 1434600 + }, + { + "epoch": 5.84465250145383, + "grad_norm": 30.063068389892578, + "learning_rate": 0.00014355455025124794, + "loss": 7.5289, + "step": 1434700 + }, + { + "epoch": 5.845059879477212, + "grad_norm": 4.15604829788208, + "learning_rate": 0.00014339446373061077, + "loss": 7.5243, + "step": 1434800 + }, + { + "epoch": 5.8454672575005935, + "grad_norm": 5.345333099365234, + "learning_rate": 0.00014323447057687104, + "loss": 7.5189, + "step": 1434900 + }, + { + "epoch": 5.845874635523975, + "grad_norm": 7.9726433753967285, + "learning_rate": 0.00014307457079635436, + "loss": 7.5263, + "step": 1435000 + }, + { + "epoch": 5.845874635523975, + "eval_MaskedAccuracy": 0.5138395214722764, + "eval_loss": 1.5867456197738647, + "eval_runtime": 163.9514, + "eval_samples_per_second": 387.163, + "eval_steps_per_second": 1.513, + "step": 1435000 + }, + { + "epoch": 5.8462820135473565, + "grad_norm": 3.4750051498413086, + "learning_rate": 0.00014291476439538204, + "loss": 7.5256, + "step": 1435100 + }, + { + "epoch": 5.846689391570738, + "grad_norm": 5.654435634613037, + "learning_rate": 0.00014275505138027232, + "loss": 7.5117, + "step": 1435200 + }, + { + "epoch": 5.847096769594119, + "grad_norm": 25.007041931152344, + "learning_rate": 0.00014259543175734006, + "loss": 7.4898, + "step": 1435300 + }, + { + "epoch": 5.8475041476175, + "grad_norm": 14.512124061584473, + "learning_rate": 0.00014243590553289517, + "loss": 7.5383, + "step": 1435400 + }, + { + "epoch": 5.847911525640882, + "grad_norm": 4.9716291427612305, + "learning_rate": 0.00014227647271324506, + "loss": 7.5172, + "step": 1435500 + }, + { + "epoch": 5.848318903664263, + "grad_norm": 12.040976524353027, + "learning_rate": 0.0001421171333046923, + "loss": 7.5086, + "step": 1435600 + }, + { + "epoch": 5.848726281687645, + "grad_norm": 26.50587272644043, + "learning_rate": 0.00014195788731353722, + "loss": 7.5304, + "step": 1435700 + }, + { + "epoch": 5.849133659711026, + "grad_norm": 7.013994216918945, + "learning_rate": 0.0001417987347460752, + "loss": 7.5006, + "step": 1435800 + }, + { + "epoch": 5.849541037734408, + "grad_norm": 20.126253128051758, + "learning_rate": 0.00014163967560859838, + "loss": 7.5247, + "step": 1435900 + }, + { + "epoch": 5.8499484157577895, + "grad_norm": 9.635932922363281, + "learning_rate": 0.0001414807099073949, + "loss": 7.5034, + "step": 1436000 + }, + { + "epoch": 5.8499484157577895, + "eval_MaskedAccuracy": 0.5133615099935407, + "eval_loss": 1.5901832580566406, + "eval_runtime": 154.7722, + "eval_samples_per_second": 410.125, + "eval_steps_per_second": 1.602, + "step": 1436000 + }, + { + "epoch": 5.850355793781171, + "grad_norm": 23.026901245117188, + "learning_rate": 0.00014132183764875054, + "loss": 7.5276, + "step": 1436100 + }, + { + "epoch": 5.8507631718045525, + "grad_norm": 9.818020820617676, + "learning_rate": 0.00014116305883894568, + "loss": 7.5121, + "step": 1436200 + }, + { + "epoch": 5.851170549827934, + "grad_norm": 3.112827777862549, + "learning_rate": 0.00014100437348425773, + "loss": 7.5157, + "step": 1436300 + }, + { + "epoch": 5.851577927851315, + "grad_norm": 3.668189525604248, + "learning_rate": 0.0001408457815909602, + "loss": 7.5273, + "step": 1436400 + }, + { + "epoch": 5.851985305874697, + "grad_norm": 5.518843650817871, + "learning_rate": 0.00014068728316532367, + "loss": 7.524, + "step": 1436500 + }, + { + "epoch": 5.852392683898078, + "grad_norm": 13.072844505310059, + "learning_rate": 0.0001405288782136139, + "loss": 7.4977, + "step": 1436600 + }, + { + "epoch": 5.852800061921459, + "grad_norm": 7.497927188873291, + "learning_rate": 0.0001403705667420937, + "loss": 7.5197, + "step": 1436700 + }, + { + "epoch": 5.853207439944841, + "grad_norm": 7.040710926055908, + "learning_rate": 0.00014021234875702211, + "loss": 7.546, + "step": 1436800 + }, + { + "epoch": 5.853614817968222, + "grad_norm": 7.165277481079102, + "learning_rate": 0.00014005422426465413, + "loss": 7.5479, + "step": 1436900 + }, + { + "epoch": 5.854022195991604, + "grad_norm": 16.675113677978516, + "learning_rate": 0.0001398961932712411, + "loss": 7.5055, + "step": 1437000 + }, + { + "epoch": 5.854022195991604, + "eval_MaskedAccuracy": 0.5135469924658986, + "eval_loss": 1.5882340669631958, + "eval_runtime": 158.4854, + "eval_samples_per_second": 400.516, + "eval_steps_per_second": 1.565, + "step": 1437000 + }, + { + "epoch": 5.854429574014985, + "grad_norm": 6.026208400726318, + "learning_rate": 0.000139738255783031, + "loss": 7.5223, + "step": 1437100 + }, + { + "epoch": 5.854836952038367, + "grad_norm": 6.308501243591309, + "learning_rate": 0.00013958041180626798, + "loss": 7.5261, + "step": 1437200 + }, + { + "epoch": 5.8552443300617485, + "grad_norm": 5.170680522918701, + "learning_rate": 0.0001394226613471928, + "loss": 7.4833, + "step": 1437300 + }, + { + "epoch": 5.85565170808513, + "grad_norm": 4.743289947509766, + "learning_rate": 0.00013926500441204147, + "loss": 7.5125, + "step": 1437400 + }, + { + "epoch": 5.856059086108512, + "grad_norm": 5.6337571144104, + "learning_rate": 0.00013910744100704736, + "loss": 7.4908, + "step": 1437500 + }, + { + "epoch": 5.856466464131893, + "grad_norm": 3.7580676078796387, + "learning_rate": 0.00013894997113844025, + "loss": 7.5016, + "step": 1437600 + }, + { + "epoch": 5.856873842155274, + "grad_norm": 4.790675640106201, + "learning_rate": 0.00013879259481244487, + "loss": 7.5079, + "step": 1437700 + }, + { + "epoch": 5.857281220178655, + "grad_norm": 21.260013580322266, + "learning_rate": 0.00013863531203528356, + "loss": 7.5416, + "step": 1437800 + }, + { + "epoch": 5.857688598202037, + "grad_norm": 19.645368576049805, + "learning_rate": 0.00013847812281317457, + "loss": 7.5363, + "step": 1437900 + }, + { + "epoch": 5.858095976225418, + "grad_norm": 25.295434951782227, + "learning_rate": 0.00013832102715233204, + "loss": 7.494, + "step": 1438000 + }, + { + "epoch": 5.858095976225418, + "eval_MaskedAccuracy": 0.5135544458623212, + "eval_loss": 1.5859079360961914, + "eval_runtime": 160.8898, + "eval_samples_per_second": 394.531, + "eval_steps_per_second": 1.541, + "step": 1438000 + }, + { + "epoch": 5.8585033542488, + "grad_norm": 10.009047508239746, + "learning_rate": 0.00013816402505896781, + "loss": 7.5017, + "step": 1438100 + }, + { + "epoch": 5.858910732272181, + "grad_norm": 5.341984272003174, + "learning_rate": 0.00013800711653928768, + "loss": 7.5598, + "step": 1438200 + }, + { + "epoch": 5.859318110295563, + "grad_norm": 14.228255271911621, + "learning_rate": 0.0001378503015994967, + "loss": 7.5264, + "step": 1438300 + }, + { + "epoch": 5.8597254883189445, + "grad_norm": 10.650612831115723, + "learning_rate": 0.0001376935802457934, + "loss": 7.5107, + "step": 1438400 + }, + { + "epoch": 5.860132866342326, + "grad_norm": 20.671329498291016, + "learning_rate": 0.00013753695248437388, + "loss": 7.5118, + "step": 1438500 + }, + { + "epoch": 5.8605402443657075, + "grad_norm": 10.042603492736816, + "learning_rate": 0.00013738041832143092, + "loss": 7.5162, + "step": 1438600 + }, + { + "epoch": 5.860947622389089, + "grad_norm": 4.876357555389404, + "learning_rate": 0.00013722397776315314, + "loss": 7.5134, + "step": 1438700 + }, + { + "epoch": 5.861355000412471, + "grad_norm": 10.248143196105957, + "learning_rate": 0.00013706763081572533, + "loss": 7.5192, + "step": 1438800 + }, + { + "epoch": 5.861762378435852, + "grad_norm": 11.08122444152832, + "learning_rate": 0.0001369113774853283, + "loss": 7.5037, + "step": 1438900 + }, + { + "epoch": 5.862169756459233, + "grad_norm": 17.589916229248047, + "learning_rate": 0.0001367552177781404, + "loss": 7.5149, + "step": 1439000 + }, + { + "epoch": 5.862169756459233, + "eval_MaskedAccuracy": 0.5135288636038816, + "eval_loss": 1.5831726789474487, + "eval_runtime": 165.6449, + "eval_samples_per_second": 383.205, + "eval_steps_per_second": 1.497, + "step": 1439000 + }, + { + "epoch": 5.862577134482614, + "grad_norm": 8.701517105102539, + "learning_rate": 0.00013659915170033456, + "loss": 7.5096, + "step": 1439100 + }, + { + "epoch": 5.862984512505996, + "grad_norm": 4.694896221160889, + "learning_rate": 0.0001364431792580816, + "loss": 7.5167, + "step": 1439200 + }, + { + "epoch": 5.863391890529377, + "grad_norm": 32.01338577270508, + "learning_rate": 0.0001362873004575476, + "loss": 7.5181, + "step": 1439300 + }, + { + "epoch": 5.863799268552759, + "grad_norm": 19.998645782470703, + "learning_rate": 0.00013613151530489546, + "loss": 7.474, + "step": 1439400 + }, + { + "epoch": 5.86420664657614, + "grad_norm": 14.526906967163086, + "learning_rate": 0.00013597582380628413, + "loss": 7.5189, + "step": 1439500 + }, + { + "epoch": 5.864614024599522, + "grad_norm": 3.2489209175109863, + "learning_rate": 0.00013582022596786894, + "loss": 7.5216, + "step": 1439600 + }, + { + "epoch": 5.8650214026229035, + "grad_norm": 7.698704242706299, + "learning_rate": 0.0001356647217958015, + "loss": 7.5021, + "step": 1439700 + }, + { + "epoch": 5.865428780646285, + "grad_norm": 9.126141548156738, + "learning_rate": 0.00013550931129622946, + "loss": 7.5083, + "step": 1439800 + }, + { + "epoch": 5.865836158669667, + "grad_norm": 15.408364295959473, + "learning_rate": 0.00013535399447529715, + "loss": 7.5032, + "step": 1439900 + }, + { + "epoch": 5.866243536693048, + "grad_norm": 20.983110427856445, + "learning_rate": 0.00013519877133914525, + "loss": 7.4762, + "step": 1440000 + }, + { + "epoch": 5.866243536693048, + "eval_MaskedAccuracy": 0.5134474078317008, + "eval_loss": 1.5856764316558838, + "eval_runtime": 165.5224, + "eval_samples_per_second": 383.489, + "eval_steps_per_second": 1.498, + "step": 1440000 + }, + { + "epoch": 5.86665091471643, + "grad_norm": 2.6975395679473877, + "learning_rate": 0.00013504364189391042, + "loss": 7.5193, + "step": 1440100 + }, + { + "epoch": 5.867058292739811, + "grad_norm": 9.552655220031738, + "learning_rate": 0.00013488860614572603, + "loss": 7.4968, + "step": 1440200 + }, + { + "epoch": 5.867465670763192, + "grad_norm": 4.0049662590026855, + "learning_rate": 0.00013473366410072108, + "loss": 7.5209, + "step": 1440300 + }, + { + "epoch": 5.867873048786573, + "grad_norm": 18.50998306274414, + "learning_rate": 0.00013457881576502177, + "loss": 7.509, + "step": 1440400 + }, + { + "epoch": 5.868280426809955, + "grad_norm": 14.251355171203613, + "learning_rate": 0.00013442406114474987, + "loss": 7.4969, + "step": 1440500 + }, + { + "epoch": 5.868687804833336, + "grad_norm": 7.884382724761963, + "learning_rate": 0.0001342694002460231, + "loss": 7.5282, + "step": 1440600 + }, + { + "epoch": 5.869095182856718, + "grad_norm": 21.401798248291016, + "learning_rate": 0.00013411483307495698, + "loss": 7.4923, + "step": 1440700 + }, + { + "epoch": 5.8695025608800995, + "grad_norm": 41.12330627441406, + "learning_rate": 0.00013396035963766155, + "loss": 7.5121, + "step": 1440800 + }, + { + "epoch": 5.869909938903481, + "grad_norm": 16.419288635253906, + "learning_rate": 0.00013380597994024443, + "loss": 7.4969, + "step": 1440900 + }, + { + "epoch": 5.8703173169268625, + "grad_norm": 12.37655258178711, + "learning_rate": 0.00013365169398880881, + "loss": 7.501, + "step": 1441000 + }, + { + "epoch": 5.8703173169268625, + "eval_MaskedAccuracy": 0.5133409005214465, + "eval_loss": 1.5829401016235352, + "eval_runtime": 163.5677, + "eval_samples_per_second": 388.072, + "eval_steps_per_second": 1.516, + "step": 1441000 + }, + { + "epoch": 5.870724694950244, + "grad_norm": 6.052454948425293, + "learning_rate": 0.00013349750178945474, + "loss": 7.5154, + "step": 1441100 + }, + { + "epoch": 5.871132072973626, + "grad_norm": 5.5943379402160645, + "learning_rate": 0.0001333434033482782, + "loss": 7.5461, + "step": 1441200 + }, + { + "epoch": 5.871539450997007, + "grad_norm": 7.4150471687316895, + "learning_rate": 0.00013318939867137166, + "loss": 7.5171, + "step": 1441300 + }, + { + "epoch": 5.871946829020388, + "grad_norm": 4.02662467956543, + "learning_rate": 0.00013303548776482353, + "loss": 7.5081, + "step": 1441400 + }, + { + "epoch": 5.87235420704377, + "grad_norm": 14.457428932189941, + "learning_rate": 0.00013288167063471825, + "loss": 7.4945, + "step": 1441500 + }, + { + "epoch": 5.872761585067151, + "grad_norm": 10.926926612854004, + "learning_rate": 0.0001327279472871378, + "loss": 7.5316, + "step": 1441600 + }, + { + "epoch": 5.873168963090532, + "grad_norm": 14.696325302124023, + "learning_rate": 0.00013257431772815918, + "loss": 7.4902, + "step": 1441700 + }, + { + "epoch": 5.873576341113914, + "grad_norm": 18.471006393432617, + "learning_rate": 0.00013242078196385674, + "loss": 7.4907, + "step": 1441800 + }, + { + "epoch": 5.8739837191372954, + "grad_norm": 6.5945143699646, + "learning_rate": 0.00013226734000029944, + "loss": 7.5085, + "step": 1441900 + }, + { + "epoch": 5.874391097160677, + "grad_norm": 8.984210014343262, + "learning_rate": 0.00013211399184355515, + "loss": 7.5213, + "step": 1442000 + }, + { + "epoch": 5.874391097160677, + "eval_MaskedAccuracy": 0.513202929537671, + "eval_loss": 1.5851502418518066, + "eval_runtime": 165.5599, + "eval_samples_per_second": 383.402, + "eval_steps_per_second": 1.498, + "step": 1442000 + }, + { + "epoch": 5.8747984751840585, + "grad_norm": 9.500515937805176, + "learning_rate": 0.00013196073749968635, + "loss": 7.4766, + "step": 1442100 + }, + { + "epoch": 5.87520585320744, + "grad_norm": 4.075962066650391, + "learning_rate": 0.0001318075769747516, + "loss": 7.5027, + "step": 1442200 + }, + { + "epoch": 5.875613231230822, + "grad_norm": 25.375219345092773, + "learning_rate": 0.00013165451027480574, + "loss": 7.4876, + "step": 1442300 + }, + { + "epoch": 5.876020609254203, + "grad_norm": 40.18368148803711, + "learning_rate": 0.000131501537405901, + "loss": 7.5283, + "step": 1442400 + }, + { + "epoch": 5.876427987277585, + "grad_norm": 7.134500980377197, + "learning_rate": 0.00013134865837408468, + "loss": 7.5371, + "step": 1442500 + }, + { + "epoch": 5.876835365300966, + "grad_norm": 5.869187831878662, + "learning_rate": 0.0001311958731854011, + "loss": 7.5293, + "step": 1442600 + }, + { + "epoch": 5.877242743324347, + "grad_norm": 32.77632522583008, + "learning_rate": 0.00013104318184589084, + "loss": 7.5221, + "step": 1442700 + }, + { + "epoch": 5.877650121347728, + "grad_norm": 14.90440845489502, + "learning_rate": 0.00013089058436159008, + "loss": 7.5192, + "step": 1442800 + }, + { + "epoch": 5.87805749937111, + "grad_norm": 31.729684829711914, + "learning_rate": 0.0001307380807385323, + "loss": 7.5285, + "step": 1442900 + }, + { + "epoch": 5.878464877394491, + "grad_norm": 10.556347846984863, + "learning_rate": 0.0001305856709827465, + "loss": 7.4979, + "step": 1443000 + }, + { + "epoch": 5.878464877394491, + "eval_MaskedAccuracy": 0.5139184009130835, + "eval_loss": 1.5758243799209595, + "eval_runtime": 171.8788, + "eval_samples_per_second": 369.307, + "eval_steps_per_second": 1.443, + "step": 1443000 + }, + { + "epoch": 5.878872255417873, + "grad_norm": 5.144359588623047, + "learning_rate": 0.0001304333551002587, + "loss": 7.5262, + "step": 1443100 + }, + { + "epoch": 5.8792796334412545, + "grad_norm": 2.1949470043182373, + "learning_rate": 0.00013028113309708996, + "loss": 7.5334, + "step": 1443200 + }, + { + "epoch": 5.879687011464636, + "grad_norm": 33.62776184082031, + "learning_rate": 0.00013012900497925934, + "loss": 7.5258, + "step": 1443300 + }, + { + "epoch": 5.8800943894880175, + "grad_norm": 34.01841735839844, + "learning_rate": 0.0001299769707527811, + "loss": 7.5322, + "step": 1443400 + }, + { + "epoch": 5.880501767511399, + "grad_norm": 23.741214752197266, + "learning_rate": 0.00012982503042366543, + "loss": 7.5244, + "step": 1443500 + }, + { + "epoch": 5.880909145534781, + "grad_norm": 12.783524513244629, + "learning_rate": 0.00012967318399791965, + "loss": 7.5436, + "step": 1443600 + }, + { + "epoch": 5.881316523558162, + "grad_norm": 4.492242813110352, + "learning_rate": 0.00012952143148154698, + "loss": 7.5224, + "step": 1443700 + }, + { + "epoch": 5.881723901581544, + "grad_norm": 27.41019058227539, + "learning_rate": 0.0001293697728805472, + "loss": 7.5373, + "step": 1443800 + }, + { + "epoch": 5.882131279604925, + "grad_norm": 4.752145767211914, + "learning_rate": 0.00012921820820091558, + "loss": 7.5271, + "step": 1443900 + }, + { + "epoch": 5.882538657628306, + "grad_norm": 3.3152313232421875, + "learning_rate": 0.00012906673744864512, + "loss": 7.5065, + "step": 1444000 + }, + { + "epoch": 5.882538657628306, + "eval_MaskedAccuracy": 0.5136492446660499, + "eval_loss": 1.5915486812591553, + "eval_runtime": 167.6905, + "eval_samples_per_second": 378.531, + "eval_steps_per_second": 1.479, + "step": 1444000 + }, + { + "epoch": 5.882946035651687, + "grad_norm": 10.958313941955566, + "learning_rate": 0.00012891536062972382, + "loss": 7.5251, + "step": 1444100 + }, + { + "epoch": 5.883353413675069, + "grad_norm": 3.695143699645996, + "learning_rate": 0.00012876407775013672, + "loss": 7.4796, + "step": 1444200 + }, + { + "epoch": 5.8837607916984505, + "grad_norm": 9.321735382080078, + "learning_rate": 0.00012861288881586464, + "loss": 7.5384, + "step": 1444300 + }, + { + "epoch": 5.884168169721832, + "grad_norm": 20.033639907836914, + "learning_rate": 0.0001284617938328845, + "loss": 7.524, + "step": 1444400 + }, + { + "epoch": 5.8845755477452135, + "grad_norm": 25.62944984436035, + "learning_rate": 0.00012831079280717066, + "loss": 7.5156, + "step": 1444500 + }, + { + "epoch": 5.884982925768595, + "grad_norm": 4.379734039306641, + "learning_rate": 0.00012815988574469282, + "loss": 7.536, + "step": 1444600 + }, + { + "epoch": 5.885390303791977, + "grad_norm": 19.151958465576172, + "learning_rate": 0.00012800907265141665, + "loss": 7.502, + "step": 1444700 + }, + { + "epoch": 5.885797681815358, + "grad_norm": 21.18988609313965, + "learning_rate": 0.00012785835353330477, + "loss": 7.5286, + "step": 1444800 + }, + { + "epoch": 5.88620505983874, + "grad_norm": 16.10911750793457, + "learning_rate": 0.0001277077283963162, + "loss": 7.5095, + "step": 1444900 + }, + { + "epoch": 5.886612437862121, + "grad_norm": 24.481395721435547, + "learning_rate": 0.00012755719724640558, + "loss": 7.5323, + "step": 1445000 + }, + { + "epoch": 5.886612437862121, + "eval_MaskedAccuracy": 0.5135959303774668, + "eval_loss": 1.5905863046646118, + "eval_runtime": 177.6913, + "eval_samples_per_second": 357.226, + "eval_steps_per_second": 1.396, + "step": 1445000 + }, + { + "epoch": 5.887019815885503, + "grad_norm": 23.361230850219727, + "learning_rate": 0.00012740676008952476, + "loss": 7.4898, + "step": 1445100 + }, + { + "epoch": 5.887427193908884, + "grad_norm": 3.7567873001098633, + "learning_rate": 0.00012725641693162084, + "loss": 7.5047, + "step": 1445200 + }, + { + "epoch": 5.887834571932265, + "grad_norm": 3.618382215499878, + "learning_rate": 0.0001271061677786378, + "loss": 7.5332, + "step": 1445300 + }, + { + "epoch": 5.888241949955646, + "grad_norm": 4.32653284072876, + "learning_rate": 0.00012695601263651587, + "loss": 7.5139, + "step": 1445400 + }, + { + "epoch": 5.888649327979028, + "grad_norm": 17.843873977661133, + "learning_rate": 0.00012680595151119133, + "loss": 7.5246, + "step": 1445500 + }, + { + "epoch": 5.8890567060024095, + "grad_norm": 27.57331085205078, + "learning_rate": 0.00012665598440859713, + "loss": 7.506, + "step": 1445600 + }, + { + "epoch": 5.889464084025791, + "grad_norm": 9.948957443237305, + "learning_rate": 0.00012650611133466165, + "loss": 7.5069, + "step": 1445700 + }, + { + "epoch": 5.889871462049173, + "grad_norm": 14.954339981079102, + "learning_rate": 0.0001263563322953109, + "loss": 7.5173, + "step": 1445800 + }, + { + "epoch": 5.890278840072554, + "grad_norm": 8.692944526672363, + "learning_rate": 0.00012620664729646582, + "loss": 7.504, + "step": 1445900 + }, + { + "epoch": 5.890686218095936, + "grad_norm": 26.2714786529541, + "learning_rate": 0.00012605705634404497, + "loss": 7.5458, + "step": 1446000 + }, + { + "epoch": 5.890686218095936, + "eval_MaskedAccuracy": 0.5133637309369398, + "eval_loss": 1.5888253450393677, + "eval_runtime": 169.9806, + "eval_samples_per_second": 373.431, + "eval_steps_per_second": 1.459, + "step": 1446000 + }, + { + "epoch": 5.891093596119317, + "grad_norm": 4.7359418869018555, + "learning_rate": 0.00012590755944396224, + "loss": 7.5019, + "step": 1446100 + }, + { + "epoch": 5.891500974142699, + "grad_norm": 18.73127555847168, + "learning_rate": 0.00012575815660212795, + "loss": 7.5389, + "step": 1446200 + }, + { + "epoch": 5.89190835216608, + "grad_norm": 18.924087524414062, + "learning_rate": 0.0001256088478244491, + "loss": 7.4981, + "step": 1446300 + }, + { + "epoch": 5.892315730189461, + "grad_norm": 15.551505088806152, + "learning_rate": 0.00012545963311682816, + "loss": 7.4889, + "step": 1446400 + }, + { + "epoch": 5.892723108212843, + "grad_norm": 10.590399742126465, + "learning_rate": 0.0001253105124851645, + "loss": 7.532, + "step": 1446500 + }, + { + "epoch": 5.893130486236224, + "grad_norm": 13.321043014526367, + "learning_rate": 0.00012516148593535385, + "loss": 7.5364, + "step": 1446600 + }, + { + "epoch": 5.8935378642596055, + "grad_norm": 11.357898712158203, + "learning_rate": 0.00012501255347328795, + "loss": 7.4776, + "step": 1446700 + }, + { + "epoch": 5.893945242282987, + "grad_norm": 3.8171656131744385, + "learning_rate": 0.00012486371510485507, + "loss": 7.5042, + "step": 1446800 + }, + { + "epoch": 5.8943526203063685, + "grad_norm": 10.532248497009277, + "learning_rate": 0.00012471497083593915, + "loss": 7.5628, + "step": 1446900 + }, + { + "epoch": 5.89475999832975, + "grad_norm": 18.82876205444336, + "learning_rate": 0.0001245663206724209, + "loss": 7.5004, + "step": 1447000 + }, + { + "epoch": 5.89475999832975, + "eval_MaskedAccuracy": 0.5139297068543455, + "eval_loss": 1.5773016214370728, + "eval_runtime": 168.4882, + "eval_samples_per_second": 376.739, + "eval_steps_per_second": 1.472, + "step": 1447000 + }, + { + "epoch": 5.895167376353132, + "grad_norm": 6.096856594085693, + "learning_rate": 0.00012441776462017795, + "loss": 7.5222, + "step": 1447100 + }, + { + "epoch": 5.895574754376513, + "grad_norm": 24.732280731201172, + "learning_rate": 0.0001242693026850828, + "loss": 7.4954, + "step": 1447200 + }, + { + "epoch": 5.895982132399895, + "grad_norm": 25.817455291748047, + "learning_rate": 0.00012412093487300518, + "loss": 7.5074, + "step": 1447300 + }, + { + "epoch": 5.896389510423276, + "grad_norm": 19.49112892150879, + "learning_rate": 0.00012397266118981045, + "loss": 7.5271, + "step": 1447400 + }, + { + "epoch": 5.896796888446658, + "grad_norm": 19.085458755493164, + "learning_rate": 0.00012382448164136153, + "loss": 7.5066, + "step": 1447500 + }, + { + "epoch": 5.897204266470039, + "grad_norm": 22.932804107666016, + "learning_rate": 0.00012367639623351624, + "loss": 7.52, + "step": 1447600 + }, + { + "epoch": 5.89761164449342, + "grad_norm": 10.887919425964355, + "learning_rate": 0.00012352840497212907, + "loss": 7.5119, + "step": 1447700 + }, + { + "epoch": 5.898019022516801, + "grad_norm": 3.4162111282348633, + "learning_rate": 0.00012338050786305126, + "loss": 7.5051, + "step": 1447800 + }, + { + "epoch": 5.898426400540183, + "grad_norm": 3.4957773685455322, + "learning_rate": 0.00012323270491212935, + "loss": 7.5177, + "step": 1447900 + }, + { + "epoch": 5.8988337785635645, + "grad_norm": 8.30463981628418, + "learning_rate": 0.0001230849961252072, + "loss": 7.4906, + "step": 1448000 + }, + { + "epoch": 5.8988337785635645, + "eval_MaskedAccuracy": 0.5137562541685923, + "eval_loss": 1.5782514810562134, + "eval_runtime": 161.3628, + "eval_samples_per_second": 393.374, + "eval_steps_per_second": 1.537, + "step": 1448000 + }, + { + "epoch": 5.899241156586946, + "grad_norm": 3.286139726638794, + "learning_rate": 0.0001229373815081249, + "loss": 7.5286, + "step": 1448100 + }, + { + "epoch": 5.899648534610328, + "grad_norm": 18.14317512512207, + "learning_rate": 0.00012278986106671787, + "loss": 7.5051, + "step": 1448200 + }, + { + "epoch": 5.900055912633709, + "grad_norm": 7.864316940307617, + "learning_rate": 0.00012264243480681892, + "loss": 7.5175, + "step": 1448300 + }, + { + "epoch": 5.900463290657091, + "grad_norm": 17.623258590698242, + "learning_rate": 0.00012249510273425597, + "loss": 7.5062, + "step": 1448400 + }, + { + "epoch": 5.900870668680472, + "grad_norm": 14.369786262512207, + "learning_rate": 0.000122347864854854, + "loss": 7.529, + "step": 1448500 + }, + { + "epoch": 5.901278046703854, + "grad_norm": 21.708314895629883, + "learning_rate": 0.00012220072117443488, + "loss": 7.5125, + "step": 1448600 + }, + { + "epoch": 5.901685424727235, + "grad_norm": 10.244361877441406, + "learning_rate": 0.00012205367169881467, + "loss": 7.563, + "step": 1448700 + }, + { + "epoch": 5.902092802750617, + "grad_norm": 16.77159309387207, + "learning_rate": 0.00012190671643380743, + "loss": 7.4899, + "step": 1448800 + }, + { + "epoch": 5.902500180773998, + "grad_norm": 19.80599021911621, + "learning_rate": 0.00012175985538522329, + "loss": 7.5095, + "step": 1448900 + }, + { + "epoch": 5.902907558797379, + "grad_norm": 7.711851119995117, + "learning_rate": 0.00012161308855886834, + "loss": 7.5223, + "step": 1449000 + }, + { + "epoch": 5.902907558797379, + "eval_MaskedAccuracy": 0.5132495013050716, + "eval_loss": 1.5839108228683472, + "eval_runtime": 171.2586, + "eval_samples_per_second": 370.644, + "eval_steps_per_second": 1.448, + "step": 1449000 + }, + { + "epoch": 5.9033149368207605, + "grad_norm": 8.166816711425781, + "learning_rate": 0.0001214664159605456, + "loss": 7.4815, + "step": 1449100 + }, + { + "epoch": 5.903722314844142, + "grad_norm": 14.676194190979004, + "learning_rate": 0.00012131983759605326, + "loss": 7.5067, + "step": 1449200 + }, + { + "epoch": 5.9041296928675235, + "grad_norm": 6.128334999084473, + "learning_rate": 0.00012117335347118658, + "loss": 7.4995, + "step": 1449300 + }, + { + "epoch": 5.904537070890905, + "grad_norm": 3.75400972366333, + "learning_rate": 0.00012102696359173672, + "loss": 7.5163, + "step": 1449400 + }, + { + "epoch": 5.904944448914287, + "grad_norm": 22.000093460083008, + "learning_rate": 0.0001208806679634915, + "loss": 7.5036, + "step": 1449500 + }, + { + "epoch": 5.905351826937668, + "grad_norm": 10.487384796142578, + "learning_rate": 0.00012073446659223455, + "loss": 7.5393, + "step": 1449600 + }, + { + "epoch": 5.90575920496105, + "grad_norm": 13.914013862609863, + "learning_rate": 0.00012058835948374605, + "loss": 7.5098, + "step": 1449700 + }, + { + "epoch": 5.906166582984431, + "grad_norm": 4.168239116668701, + "learning_rate": 0.00012044234664380251, + "loss": 7.5023, + "step": 1449800 + }, + { + "epoch": 5.906573961007813, + "grad_norm": 73.41988372802734, + "learning_rate": 0.00012029642807817591, + "loss": 7.4953, + "step": 1449900 + }, + { + "epoch": 5.906981339031194, + "grad_norm": 4.312287330627441, + "learning_rate": 0.00012015060379263657, + "loss": 7.5129, + "step": 1450000 + }, + { + "epoch": 5.906981339031194, + "eval_MaskedAccuracy": 0.5135990392072747, + "eval_loss": 1.5788543224334717, + "eval_runtime": 164.1998, + "eval_samples_per_second": 386.578, + "eval_steps_per_second": 1.51, + "step": 1450000 + }, + { + "epoch": 5.907388717054576, + "grad_norm": 29.152294158935547, + "learning_rate": 0.00012000487379294871, + "loss": 7.4977, + "step": 1450100 + }, + { + "epoch": 5.907796095077957, + "grad_norm": 16.983320236206055, + "learning_rate": 0.00011985923808487424, + "loss": 7.4934, + "step": 1450200 + }, + { + "epoch": 5.908203473101338, + "grad_norm": 10.714752197265625, + "learning_rate": 0.00011971369667417034, + "loss": 7.5055, + "step": 1450300 + }, + { + "epoch": 5.9086108511247195, + "grad_norm": 3.5559794902801514, + "learning_rate": 0.00011956824956659175, + "loss": 7.4962, + "step": 1450400 + }, + { + "epoch": 5.909018229148101, + "grad_norm": 6.410406112670898, + "learning_rate": 0.00011942289676788828, + "loss": 7.506, + "step": 1450500 + }, + { + "epoch": 5.909425607171483, + "grad_norm": 14.903667449951172, + "learning_rate": 0.00011927763828380662, + "loss": 7.5288, + "step": 1450600 + }, + { + "epoch": 5.909832985194864, + "grad_norm": 17.27276611328125, + "learning_rate": 0.00011913247412008981, + "loss": 7.5025, + "step": 1450700 + }, + { + "epoch": 5.910240363218246, + "grad_norm": 14.003911972045898, + "learning_rate": 0.00011898740428247663, + "loss": 7.5199, + "step": 1450800 + }, + { + "epoch": 5.910647741241627, + "grad_norm": 23.43582534790039, + "learning_rate": 0.00011884242877670301, + "loss": 7.4905, + "step": 1450900 + }, + { + "epoch": 5.911055119265009, + "grad_norm": 11.995508193969727, + "learning_rate": 0.00011869754760849997, + "loss": 7.5087, + "step": 1451000 + }, + { + "epoch": 5.911055119265009, + "eval_MaskedAccuracy": 0.51343927019386, + "eval_loss": 1.5892750024795532, + "eval_runtime": 175.4648, + "eval_samples_per_second": 361.759, + "eval_steps_per_second": 1.413, + "step": 1451000 + }, + { + "epoch": 5.91146249728839, + "grad_norm": 17.342601776123047, + "learning_rate": 0.00011855276078359587, + "loss": 7.4915, + "step": 1451100 + }, + { + "epoch": 5.911869875311772, + "grad_norm": 14.83423137664795, + "learning_rate": 0.0001184080683077145, + "loss": 7.5277, + "step": 1451200 + }, + { + "epoch": 5.912277253335153, + "grad_norm": 15.177057266235352, + "learning_rate": 0.00011826347018657685, + "loss": 7.5006, + "step": 1451300 + }, + { + "epoch": 5.912684631358534, + "grad_norm": 11.612956047058105, + "learning_rate": 0.00011811896642589933, + "loss": 7.4908, + "step": 1451400 + }, + { + "epoch": 5.913092009381916, + "grad_norm": 19.39689064025879, + "learning_rate": 0.00011797455703139484, + "loss": 7.4941, + "step": 1451500 + }, + { + "epoch": 5.913499387405297, + "grad_norm": 3.6763217449188232, + "learning_rate": 0.00011783024200877304, + "loss": 7.5167, + "step": 1451600 + }, + { + "epoch": 5.9139067654286785, + "grad_norm": 12.86085033416748, + "learning_rate": 0.0001176860213637391, + "loss": 7.498, + "step": 1451700 + }, + { + "epoch": 5.91431414345206, + "grad_norm": 4.068489074707031, + "learning_rate": 0.00011754189510199496, + "loss": 7.4984, + "step": 1451800 + }, + { + "epoch": 5.914721521475442, + "grad_norm": 20.809120178222656, + "learning_rate": 0.00011739786322923863, + "loss": 7.4948, + "step": 1451900 + }, + { + "epoch": 5.915128899498823, + "grad_norm": 11.219196319580078, + "learning_rate": 0.00011725392575116451, + "loss": 7.5258, + "step": 1452000 + }, + { + "epoch": 5.915128899498823, + "eval_MaskedAccuracy": 0.5137053143283565, + "eval_loss": 1.5857536792755127, + "eval_runtime": 186.5472, + "eval_samples_per_second": 340.268, + "eval_steps_per_second": 1.329, + "step": 1452000 + }, + { + "epoch": 5.915536277522205, + "grad_norm": 13.42127799987793, + "learning_rate": 0.0001171100826734637, + "loss": 7.5294, + "step": 1452100 + }, + { + "epoch": 5.915943655545586, + "grad_norm": 8.402169227600098, + "learning_rate": 0.00011696633400182227, + "loss": 7.4891, + "step": 1452200 + }, + { + "epoch": 5.916351033568968, + "grad_norm": 17.03421974182129, + "learning_rate": 0.00011682267974192385, + "loss": 7.4852, + "step": 1452300 + }, + { + "epoch": 5.916758411592349, + "grad_norm": 3.7382123470306396, + "learning_rate": 0.000116679119899448, + "loss": 7.5025, + "step": 1452400 + }, + { + "epoch": 5.917165789615731, + "grad_norm": 19.800050735473633, + "learning_rate": 0.00011653565448006997, + "loss": 7.4884, + "step": 1452500 + }, + { + "epoch": 5.917573167639112, + "grad_norm": 3.0500950813293457, + "learning_rate": 0.00011639228348946203, + "loss": 7.5367, + "step": 1452600 + }, + { + "epoch": 5.917980545662493, + "grad_norm": 8.724006652832031, + "learning_rate": 0.00011624900693329203, + "loss": 7.4985, + "step": 1452700 + }, + { + "epoch": 5.9183879236858745, + "grad_norm": 19.91343116760254, + "learning_rate": 0.00011610582481722494, + "loss": 7.5106, + "step": 1452800 + }, + { + "epoch": 5.918795301709256, + "grad_norm": 16.161819458007812, + "learning_rate": 0.00011596273714692142, + "loss": 7.4874, + "step": 1452900 + }, + { + "epoch": 5.919202679732638, + "grad_norm": 4.49916934967041, + "learning_rate": 0.00011581974392803816, + "loss": 7.5098, + "step": 1453000 + }, + { + "epoch": 5.919202679732638, + "eval_MaskedAccuracy": 0.5143894777400708, + "eval_loss": 1.5754600763320923, + "eval_runtime": 158.4013, + "eval_samples_per_second": 400.729, + "eval_steps_per_second": 1.566, + "step": 1453000 + }, + { + "epoch": 5.919610057756019, + "grad_norm": 23.63644027709961, + "learning_rate": 0.00011567684516622845, + "loss": 7.4949, + "step": 1453100 + }, + { + "epoch": 5.920017435779401, + "grad_norm": 23.471284866333008, + "learning_rate": 0.00011553404086714239, + "loss": 7.5216, + "step": 1453200 + }, + { + "epoch": 5.920424813802782, + "grad_norm": 16.059326171875, + "learning_rate": 0.00011539133103642536, + "loss": 7.537, + "step": 1453300 + }, + { + "epoch": 5.920832191826164, + "grad_norm": 5.33090877532959, + "learning_rate": 0.00011524871567971943, + "loss": 7.5013, + "step": 1453400 + }, + { + "epoch": 5.921239569849545, + "grad_norm": 6.212322235107422, + "learning_rate": 0.00011510619480266355, + "loss": 7.5425, + "step": 1453500 + }, + { + "epoch": 5.921646947872927, + "grad_norm": 23.205650329589844, + "learning_rate": 0.00011496376841089155, + "loss": 7.5143, + "step": 1453600 + }, + { + "epoch": 5.922054325896308, + "grad_norm": 20.84681510925293, + "learning_rate": 0.00011482143651003477, + "loss": 7.5228, + "step": 1453700 + }, + { + "epoch": 5.92246170391969, + "grad_norm": 17.809091567993164, + "learning_rate": 0.00011467919910571971, + "loss": 7.5239, + "step": 1453800 + }, + { + "epoch": 5.922869081943071, + "grad_norm": 5.129265308380127, + "learning_rate": 0.00011453705620357022, + "loss": 7.5259, + "step": 1453900 + }, + { + "epoch": 5.923276459966452, + "grad_norm": 13.345703125, + "learning_rate": 0.00011439500780920652, + "loss": 7.5022, + "step": 1454000 + }, + { + "epoch": 5.923276459966452, + "eval_MaskedAccuracy": 0.5135653023197638, + "eval_loss": 1.5850971937179565, + "eval_runtime": 182.5633, + "eval_samples_per_second": 347.693, + "eval_steps_per_second": 1.358, + "step": 1454000 + }, + { + "epoch": 5.923683837989834, + "grad_norm": 9.327455520629883, + "learning_rate": 0.00011425305392824364, + "loss": 7.4987, + "step": 1454100 + }, + { + "epoch": 5.924091216013215, + "grad_norm": 14.787358283996582, + "learning_rate": 0.00011411119456629445, + "loss": 7.5079, + "step": 1454200 + }, + { + "epoch": 5.924498594036597, + "grad_norm": 6.930863857269287, + "learning_rate": 0.00011396942972896683, + "loss": 7.5335, + "step": 1454300 + }, + { + "epoch": 5.924905972059978, + "grad_norm": 11.233614921569824, + "learning_rate": 0.00011382775942186596, + "loss": 7.5241, + "step": 1454400 + }, + { + "epoch": 5.92531335008336, + "grad_norm": 4.68874454498291, + "learning_rate": 0.00011368618365059263, + "loss": 7.5484, + "step": 1454500 + }, + { + "epoch": 5.925720728106741, + "grad_norm": 10.873495101928711, + "learning_rate": 0.00011354470242074426, + "loss": 7.494, + "step": 1454600 + }, + { + "epoch": 5.926128106130123, + "grad_norm": 11.247530937194824, + "learning_rate": 0.00011340331573791407, + "loss": 7.4829, + "step": 1454700 + }, + { + "epoch": 5.926535484153504, + "grad_norm": 39.476524353027344, + "learning_rate": 0.00011326202360769187, + "loss": 7.5259, + "step": 1454800 + }, + { + "epoch": 5.926942862176886, + "grad_norm": 25.049827575683594, + "learning_rate": 0.00011312082603566377, + "loss": 7.5139, + "step": 1454900 + }, + { + "epoch": 5.927350240200267, + "grad_norm": 3.6227469444274902, + "learning_rate": 0.00011297972302741214, + "loss": 7.5229, + "step": 1455000 + }, + { + "epoch": 5.927350240200267, + "eval_MaskedAccuracy": 0.5139925969125861, + "eval_loss": 1.578850507736206, + "eval_runtime": 176.3056, + "eval_samples_per_second": 360.034, + "eval_steps_per_second": 1.407, + "step": 1455000 + }, + { + "epoch": 5.927757618223649, + "grad_norm": 19.786165237426758, + "learning_rate": 0.00011283871458851566, + "loss": 7.5219, + "step": 1455100 + }, + { + "epoch": 5.92816499624703, + "grad_norm": 3.4540088176727295, + "learning_rate": 0.0001126978007245488, + "loss": 7.536, + "step": 1455200 + }, + { + "epoch": 5.928572374270411, + "grad_norm": 26.75469207763672, + "learning_rate": 0.0001125569814410828, + "loss": 7.4783, + "step": 1455300 + }, + { + "epoch": 5.928979752293793, + "grad_norm": 15.556597709655762, + "learning_rate": 0.00011241625674368492, + "loss": 7.4901, + "step": 1455400 + }, + { + "epoch": 5.929387130317174, + "grad_norm": 19.530500411987305, + "learning_rate": 0.000112275626637919, + "loss": 7.5292, + "step": 1455500 + }, + { + "epoch": 5.929794508340556, + "grad_norm": 23.01983642578125, + "learning_rate": 0.00011213509112934449, + "loss": 7.4952, + "step": 1455600 + }, + { + "epoch": 5.930201886363937, + "grad_norm": 28.697982788085938, + "learning_rate": 0.00011199465022351797, + "loss": 7.5308, + "step": 1455700 + }, + { + "epoch": 5.930609264387319, + "grad_norm": 9.541047096252441, + "learning_rate": 0.00011185430392599124, + "loss": 7.5132, + "step": 1455800 + }, + { + "epoch": 5.9310166424107, + "grad_norm": 14.119759559631348, + "learning_rate": 0.00011171405224231382, + "loss": 7.5025, + "step": 1455900 + }, + { + "epoch": 5.931424020434082, + "grad_norm": 3.3149988651275635, + "learning_rate": 0.00011157389517802986, + "loss": 7.5053, + "step": 1456000 + }, + { + "epoch": 5.931424020434082, + "eval_MaskedAccuracy": 0.514069930094781, + "eval_loss": 1.580933690071106, + "eval_runtime": 170.3279, + "eval_samples_per_second": 372.669, + "eval_steps_per_second": 1.456, + "step": 1456000 + }, + { + "epoch": 5.931831398457463, + "grad_norm": 8.514421463012695, + "learning_rate": 0.00011143383273868073, + "loss": 7.518, + "step": 1456100 + }, + { + "epoch": 5.932238776480845, + "grad_norm": 21.946008682250977, + "learning_rate": 0.00011129386492980401, + "loss": 7.5054, + "step": 1456200 + }, + { + "epoch": 5.932646154504226, + "grad_norm": 6.1492228507995605, + "learning_rate": 0.00011115399175693329, + "loss": 7.5188, + "step": 1456300 + }, + { + "epoch": 5.933053532527607, + "grad_norm": 24.58721351623535, + "learning_rate": 0.00011101421322559837, + "loss": 7.5078, + "step": 1456400 + }, + { + "epoch": 5.9334609105509895, + "grad_norm": 11.320701599121094, + "learning_rate": 0.00011087452934132561, + "loss": 7.4834, + "step": 1456500 + }, + { + "epoch": 5.93386828857437, + "grad_norm": 13.344386100769043, + "learning_rate": 0.00011073494010963742, + "loss": 7.5097, + "step": 1456600 + }, + { + "epoch": 5.934275666597752, + "grad_norm": 3.499661922454834, + "learning_rate": 0.00011059544553605261, + "loss": 7.51, + "step": 1456700 + }, + { + "epoch": 5.934683044621133, + "grad_norm": 14.360859870910645, + "learning_rate": 0.0001104560456260858, + "loss": 7.4833, + "step": 1456800 + }, + { + "epoch": 5.935090422644515, + "grad_norm": 14.348841667175293, + "learning_rate": 0.00011031674038524853, + "loss": 7.5203, + "step": 1456900 + }, + { + "epoch": 5.935497800667896, + "grad_norm": 6.4998459815979, + "learning_rate": 0.00011017752981904802, + "loss": 7.5111, + "step": 1457000 + }, + { + "epoch": 5.935497800667896, + "eval_MaskedAccuracy": 0.5138158737279147, + "eval_loss": 1.5889171361923218, + "eval_runtime": 192.7508, + "eval_samples_per_second": 329.316, + "eval_steps_per_second": 1.287, + "step": 1457000 + }, + { + "epoch": 5.935905178691278, + "grad_norm": 9.365138053894043, + "learning_rate": 0.00011003841393298827, + "loss": 7.4937, + "step": 1457100 + }, + { + "epoch": 5.936312556714659, + "grad_norm": 2.9428069591522217, + "learning_rate": 0.00010989939273256926, + "loss": 7.5282, + "step": 1457200 + }, + { + "epoch": 5.936719934738041, + "grad_norm": 19.085132598876953, + "learning_rate": 0.00010976046622328699, + "loss": 7.484, + "step": 1457300 + }, + { + "epoch": 5.937127312761422, + "grad_norm": 15.751285552978516, + "learning_rate": 0.00010962163441063402, + "loss": 7.5371, + "step": 1457400 + }, + { + "epoch": 5.937534690784804, + "grad_norm": 14.46463680267334, + "learning_rate": 0.0001094828973000994, + "loss": 7.5145, + "step": 1457500 + }, + { + "epoch": 5.937942068808185, + "grad_norm": 5.172186374664307, + "learning_rate": 0.00010934425489716794, + "loss": 7.5101, + "step": 1457600 + }, + { + "epoch": 5.938349446831566, + "grad_norm": 38.2766227722168, + "learning_rate": 0.00010920570720732101, + "loss": 7.5323, + "step": 1457700 + }, + { + "epoch": 5.938756824854948, + "grad_norm": 3.047947406768799, + "learning_rate": 0.00010906725423603598, + "loss": 7.5255, + "step": 1457800 + }, + { + "epoch": 5.939164202878329, + "grad_norm": 2.5195424556732178, + "learning_rate": 0.00010892889598878674, + "loss": 7.4827, + "step": 1457900 + }, + { + "epoch": 5.939571580901711, + "grad_norm": 14.40636157989502, + "learning_rate": 0.00010879063247104339, + "loss": 7.5231, + "step": 1458000 + }, + { + "epoch": 5.939571580901711, + "eval_MaskedAccuracy": 0.5136709774046209, + "eval_loss": 1.5871533155441284, + "eval_runtime": 173.1303, + "eval_samples_per_second": 366.637, + "eval_steps_per_second": 1.432, + "step": 1458000 + }, + { + "epoch": 5.939978958925092, + "grad_norm": 3.6198933124542236, + "learning_rate": 0.0001086524636882722, + "loss": 7.5091, + "step": 1458100 + }, + { + "epoch": 5.940386336948474, + "grad_norm": 5.195720195770264, + "learning_rate": 0.00010851438964593575, + "loss": 7.5194, + "step": 1458200 + }, + { + "epoch": 5.940793714971855, + "grad_norm": 5.755271911621094, + "learning_rate": 0.0001083764103494929, + "loss": 7.4978, + "step": 1458300 + }, + { + "epoch": 5.941201092995237, + "grad_norm": 11.181453704833984, + "learning_rate": 0.00010823852580439825, + "loss": 7.5298, + "step": 1458400 + }, + { + "epoch": 5.941608471018618, + "grad_norm": 6.790501117706299, + "learning_rate": 0.00010810073601610384, + "loss": 7.4772, + "step": 1458500 + }, + { + "epoch": 5.942015849042, + "grad_norm": 37.07013702392578, + "learning_rate": 0.00010796304099005648, + "loss": 7.5224, + "step": 1458600 + }, + { + "epoch": 5.942423227065381, + "grad_norm": 15.961553573608398, + "learning_rate": 0.00010782544073170042, + "loss": 7.4948, + "step": 1458700 + }, + { + "epoch": 5.942830605088763, + "grad_norm": 34.687110900878906, + "learning_rate": 0.00010768793524647601, + "loss": 7.5462, + "step": 1458800 + }, + { + "epoch": 5.9432379831121445, + "grad_norm": 25.230497360229492, + "learning_rate": 0.0001075505245398189, + "loss": 7.5262, + "step": 1458900 + }, + { + "epoch": 5.943645361135525, + "grad_norm": 19.544490814208984, + "learning_rate": 0.0001074132086171622, + "loss": 7.4948, + "step": 1459000 + }, + { + "epoch": 5.943645361135525, + "eval_MaskedAccuracy": 0.5138457305683033, + "eval_loss": 1.585815191268921, + "eval_runtime": 179.7689, + "eval_samples_per_second": 353.098, + "eval_steps_per_second": 1.38, + "step": 1459000 + }, + { + "epoch": 5.944052739158907, + "grad_norm": 8.752004623413086, + "learning_rate": 0.00010727598748393458, + "loss": 7.5053, + "step": 1459100 + }, + { + "epoch": 5.944460117182288, + "grad_norm": 33.22947692871094, + "learning_rate": 0.00010713886114556092, + "loss": 7.477, + "step": 1459200 + }, + { + "epoch": 5.94486749520567, + "grad_norm": 24.97207260131836, + "learning_rate": 0.00010700182960746278, + "loss": 7.5359, + "step": 1459300 + }, + { + "epoch": 5.945274873229051, + "grad_norm": 13.16974925994873, + "learning_rate": 0.0001068648928750579, + "loss": 7.5337, + "step": 1459400 + }, + { + "epoch": 5.945682251252433, + "grad_norm": 18.190574645996094, + "learning_rate": 0.00010672805095375971, + "loss": 7.5315, + "step": 1459500 + }, + { + "epoch": 5.946089629275814, + "grad_norm": 19.884090423583984, + "learning_rate": 0.00010659130384897889, + "loss": 7.5197, + "step": 1459600 + }, + { + "epoch": 5.946497007299196, + "grad_norm": 11.915521621704102, + "learning_rate": 0.00010645465156612104, + "loss": 7.5041, + "step": 1459700 + }, + { + "epoch": 5.946904385322577, + "grad_norm": 36.03401565551758, + "learning_rate": 0.0001063180941105886, + "loss": 7.5012, + "step": 1459800 + }, + { + "epoch": 5.947311763345959, + "grad_norm": 6.755247116088867, + "learning_rate": 0.00010618163148778167, + "loss": 7.4913, + "step": 1459900 + }, + { + "epoch": 5.94771914136934, + "grad_norm": 3.6575753688812256, + "learning_rate": 0.00010604526370309458, + "loss": 7.4957, + "step": 1460000 + }, + { + "epoch": 5.94771914136934, + "eval_MaskedAccuracy": 0.5137182524719381, + "eval_loss": 1.5914576053619385, + "eval_runtime": 170.9962, + "eval_samples_per_second": 371.213, + "eval_steps_per_second": 1.45, + "step": 1460000 + }, + { + "epoch": 5.948126519392722, + "grad_norm": 3.3408427238464355, + "learning_rate": 0.00010590899076191884, + "loss": 7.5273, + "step": 1460100 + }, + { + "epoch": 5.9485338974161035, + "grad_norm": 6.367294788360596, + "learning_rate": 0.00010577281266964175, + "loss": 7.5127, + "step": 1460200 + }, + { + "epoch": 5.948941275439484, + "grad_norm": 3.021536350250244, + "learning_rate": 0.00010563672943164713, + "loss": 7.5185, + "step": 1460300 + }, + { + "epoch": 5.949348653462866, + "grad_norm": 17.60310173034668, + "learning_rate": 0.00010550074105331562, + "loss": 7.4955, + "step": 1460400 + }, + { + "epoch": 5.949756031486247, + "grad_norm": 13.842937469482422, + "learning_rate": 0.00010536484754002345, + "loss": 7.513, + "step": 1460500 + }, + { + "epoch": 5.950163409509629, + "grad_norm": 13.60439395904541, + "learning_rate": 0.00010522904889714263, + "loss": 7.519, + "step": 1460600 + }, + { + "epoch": 5.95057078753301, + "grad_norm": 15.470193862915039, + "learning_rate": 0.00010509334513004263, + "loss": 7.4751, + "step": 1460700 + }, + { + "epoch": 5.950978165556392, + "grad_norm": 11.470630645751953, + "learning_rate": 0.00010495773624408801, + "loss": 7.5312, + "step": 1460800 + }, + { + "epoch": 5.951385543579773, + "grad_norm": 24.447980880737305, + "learning_rate": 0.00010482222224464061, + "loss": 7.5286, + "step": 1460900 + }, + { + "epoch": 5.951792921603155, + "grad_norm": 22.97228240966797, + "learning_rate": 0.00010468680313705783, + "loss": 7.4935, + "step": 1461000 + }, + { + "epoch": 5.951792921603155, + "eval_MaskedAccuracy": 0.513794019307914, + "eval_loss": 1.5820680856704712, + "eval_runtime": 171.5518, + "eval_samples_per_second": 370.011, + "eval_steps_per_second": 1.446, + "step": 1461000 + }, + { + "epoch": 5.952200299626536, + "grad_norm": 26.28276252746582, + "learning_rate": 0.00010455147892669349, + "loss": 7.5011, + "step": 1461100 + }, + { + "epoch": 5.952607677649918, + "grad_norm": 3.126626491546631, + "learning_rate": 0.0001044162496188975, + "loss": 7.5204, + "step": 1461200 + }, + { + "epoch": 5.9530150556732995, + "grad_norm": 3.186384439468384, + "learning_rate": 0.00010428111521901671, + "loss": 7.5096, + "step": 1461300 + }, + { + "epoch": 5.95342243369668, + "grad_norm": 5.268251419067383, + "learning_rate": 0.00010414607573239319, + "loss": 7.5026, + "step": 1461400 + }, + { + "epoch": 5.9538298117200625, + "grad_norm": 11.56948184967041, + "learning_rate": 0.00010401113116436601, + "loss": 7.5108, + "step": 1461500 + }, + { + "epoch": 5.954237189743443, + "grad_norm": 28.242420196533203, + "learning_rate": 0.00010387628152026999, + "loss": 7.4937, + "step": 1461600 + }, + { + "epoch": 5.954644567766825, + "grad_norm": 4.825844764709473, + "learning_rate": 0.0001037415268054368, + "loss": 7.5388, + "step": 1461700 + }, + { + "epoch": 5.955051945790206, + "grad_norm": 41.179134368896484, + "learning_rate": 0.00010360686702519378, + "loss": 7.5047, + "step": 1461800 + }, + { + "epoch": 5.955459323813588, + "grad_norm": 11.953980445861816, + "learning_rate": 0.00010347230218486505, + "loss": 7.4873, + "step": 1461900 + }, + { + "epoch": 5.955866701836969, + "grad_norm": 18.22443199157715, + "learning_rate": 0.00010333783228977027, + "loss": 7.5001, + "step": 1462000 + }, + { + "epoch": 5.955866701836969, + "eval_MaskedAccuracy": 0.5136876045121863, + "eval_loss": 1.5788699388504028, + "eval_runtime": 170.8875, + "eval_samples_per_second": 371.449, + "eval_steps_per_second": 1.451, + "step": 1462000 + }, + { + "epoch": 5.956274079860351, + "grad_norm": 3.943265199661255, + "learning_rate": 0.00010320345734522625, + "loss": 7.4993, + "step": 1462100 + }, + { + "epoch": 5.956681457883732, + "grad_norm": 3.824932098388672, + "learning_rate": 0.00010306917735654511, + "loss": 7.4942, + "step": 1462200 + }, + { + "epoch": 5.957088835907114, + "grad_norm": 16.49104118347168, + "learning_rate": 0.00010293499232903566, + "loss": 7.5071, + "step": 1462300 + }, + { + "epoch": 5.957496213930495, + "grad_norm": 4.29388952255249, + "learning_rate": 0.00010280090226800353, + "loss": 7.5231, + "step": 1462400 + }, + { + "epoch": 5.957903591953877, + "grad_norm": 5.01418924331665, + "learning_rate": 0.00010266690717874919, + "loss": 7.4935, + "step": 1462500 + }, + { + "epoch": 5.9583109699772585, + "grad_norm": 14.47789192199707, + "learning_rate": 0.00010253300706657099, + "loss": 7.5057, + "step": 1462600 + }, + { + "epoch": 5.958718348000639, + "grad_norm": 24.14881134033203, + "learning_rate": 0.00010239920193676215, + "loss": 7.5101, + "step": 1462700 + }, + { + "epoch": 5.959125726024021, + "grad_norm": 16.402585983276367, + "learning_rate": 0.00010226549179461274, + "loss": 7.4822, + "step": 1462800 + }, + { + "epoch": 5.959533104047402, + "grad_norm": 22.45849609375, + "learning_rate": 0.00010213187664540925, + "loss": 7.5053, + "step": 1462900 + }, + { + "epoch": 5.959940482070784, + "grad_norm": 33.035240173339844, + "learning_rate": 0.00010199835649443415, + "loss": 7.5059, + "step": 1463000 + }, + { + "epoch": 5.959940482070784, + "eval_MaskedAccuracy": 0.5138496094674514, + "eval_loss": 1.5820598602294922, + "eval_runtime": 173.585, + "eval_samples_per_second": 365.677, + "eval_steps_per_second": 1.429, + "step": 1463000 + }, + { + "epoch": 5.960347860094165, + "grad_norm": 4.521730899810791, + "learning_rate": 0.00010186493134696622, + "loss": 7.5253, + "step": 1463100 + }, + { + "epoch": 5.960755238117547, + "grad_norm": 5.788667678833008, + "learning_rate": 0.00010173160120828051, + "loss": 7.5097, + "step": 1463200 + }, + { + "epoch": 5.961162616140928, + "grad_norm": 15.684574127197266, + "learning_rate": 0.00010159836608364796, + "loss": 7.5296, + "step": 1463300 + }, + { + "epoch": 5.96156999416431, + "grad_norm": 13.343859672546387, + "learning_rate": 0.00010146522597833619, + "loss": 7.5307, + "step": 1463400 + }, + { + "epoch": 5.961977372187691, + "grad_norm": 3.1307003498077393, + "learning_rate": 0.0001013321808976096, + "loss": 7.5201, + "step": 1463500 + }, + { + "epoch": 5.962384750211073, + "grad_norm": 17.754648208618164, + "learning_rate": 0.000101199230846727, + "loss": 7.5361, + "step": 1463600 + }, + { + "epoch": 5.9627921282344545, + "grad_norm": 6.762691974639893, + "learning_rate": 0.00010106637583094499, + "loss": 7.4972, + "step": 1463700 + }, + { + "epoch": 5.963199506257836, + "grad_norm": 6.7893548011779785, + "learning_rate": 0.00010093361585551683, + "loss": 7.5174, + "step": 1463800 + }, + { + "epoch": 5.9636068842812175, + "grad_norm": 13.800431251525879, + "learning_rate": 0.00010080095092569082, + "loss": 7.5069, + "step": 1463900 + }, + { + "epoch": 5.964014262304598, + "grad_norm": 6.065407752990723, + "learning_rate": 0.00010066838104671137, + "loss": 7.4887, + "step": 1464000 + }, + { + "epoch": 5.964014262304598, + "eval_MaskedAccuracy": 0.5136984718314406, + "eval_loss": 1.5877317190170288, + "eval_runtime": 187.7424, + "eval_samples_per_second": 338.102, + "eval_steps_per_second": 1.321, + "step": 1464000 + }, + { + "epoch": 5.96442164032798, + "grad_norm": 15.611747741699219, + "learning_rate": 0.00010053590622382053, + "loss": 7.4778, + "step": 1464100 + }, + { + "epoch": 5.964829018351361, + "grad_norm": 24.867883682250977, + "learning_rate": 0.00010040352646225486, + "loss": 7.5207, + "step": 1464200 + }, + { + "epoch": 5.965236396374743, + "grad_norm": 13.83848762512207, + "learning_rate": 0.0001002712417672487, + "loss": 7.5243, + "step": 1464300 + }, + { + "epoch": 5.965643774398124, + "grad_norm": 16.843660354614258, + "learning_rate": 0.00010013905214403175, + "loss": 7.4785, + "step": 1464400 + }, + { + "epoch": 5.966051152421506, + "grad_norm": 5.088784217834473, + "learning_rate": 0.00010000695759783033, + "loss": 7.5036, + "step": 1464500 + }, + { + "epoch": 5.966458530444887, + "grad_norm": 18.821393966674805, + "learning_rate": 9.987495813386656e-05, + "loss": 7.5131, + "step": 1464600 + }, + { + "epoch": 5.966865908468269, + "grad_norm": 15.2603120803833, + "learning_rate": 9.974305375735924e-05, + "loss": 7.4984, + "step": 1464700 + }, + { + "epoch": 5.9672732864916505, + "grad_norm": 5.545129776000977, + "learning_rate": 9.961124447352308e-05, + "loss": 7.5165, + "step": 1464800 + }, + { + "epoch": 5.967680664515032, + "grad_norm": 2.6632354259490967, + "learning_rate": 9.94795302875693e-05, + "loss": 7.4969, + "step": 1464900 + }, + { + "epoch": 5.9680880425384135, + "grad_norm": 11.412628173828125, + "learning_rate": 9.934791120470565e-05, + "loss": 7.4946, + "step": 1465000 + }, + { + "epoch": 5.9680880425384135, + "eval_MaskedAccuracy": 0.5141648127293885, + "eval_loss": 1.5856399536132812, + "eval_runtime": 168.2749, + "eval_samples_per_second": 377.216, + "eval_steps_per_second": 1.474, + "step": 1465000 + }, + { + "epoch": 5.968495420561795, + "grad_norm": 13.691043853759766, + "learning_rate": 9.921638723013513e-05, + "loss": 7.5057, + "step": 1465100 + }, + { + "epoch": 5.968902798585177, + "grad_norm": 11.590338706970215, + "learning_rate": 9.908495836905769e-05, + "loss": 7.528, + "step": 1465200 + }, + { + "epoch": 5.969310176608557, + "grad_norm": 9.705368041992188, + "learning_rate": 9.895362462666984e-05, + "loss": 7.5023, + "step": 1465300 + }, + { + "epoch": 5.969717554631939, + "grad_norm": 3.938136100769043, + "learning_rate": 9.88223860081638e-05, + "loss": 7.5202, + "step": 1465400 + }, + { + "epoch": 5.97012493265532, + "grad_norm": 17.26057243347168, + "learning_rate": 9.869124251872763e-05, + "loss": 7.5211, + "step": 1465500 + }, + { + "epoch": 5.970532310678702, + "grad_norm": 18.391721725463867, + "learning_rate": 9.856019416354689e-05, + "loss": 7.5244, + "step": 1465600 + }, + { + "epoch": 5.970939688702083, + "grad_norm": 17.442569732666016, + "learning_rate": 9.842924094780166e-05, + "loss": 7.5026, + "step": 1465700 + }, + { + "epoch": 5.971347066725465, + "grad_norm": 33.836708068847656, + "learning_rate": 9.829838287667e-05, + "loss": 7.5119, + "step": 1465800 + }, + { + "epoch": 5.971754444748846, + "grad_norm": 3.290839195251465, + "learning_rate": 9.81676199553254e-05, + "loss": 7.5054, + "step": 1465900 + }, + { + "epoch": 5.972161822772228, + "grad_norm": 22.256479263305664, + "learning_rate": 9.803695218893726e-05, + "loss": 7.4874, + "step": 1466000 + }, + { + "epoch": 5.972161822772228, + "eval_MaskedAccuracy": 0.513176038920254, + "eval_loss": 1.5861608982086182, + "eval_runtime": 197.0123, + "eval_samples_per_second": 322.193, + "eval_steps_per_second": 1.259, + "step": 1466000 + }, + { + "epoch": 5.9725692007956095, + "grad_norm": 5.800668239593506, + "learning_rate": 9.790637958267198e-05, + "loss": 7.5038, + "step": 1466100 + }, + { + "epoch": 5.972976578818991, + "grad_norm": 30.10011100769043, + "learning_rate": 9.777590214169132e-05, + "loss": 7.5013, + "step": 1466200 + }, + { + "epoch": 5.9733839568423726, + "grad_norm": 3.665769338607788, + "learning_rate": 9.764551987115402e-05, + "loss": 7.5282, + "step": 1466300 + }, + { + "epoch": 5.973791334865753, + "grad_norm": 22.723546981811523, + "learning_rate": 9.75152327762147e-05, + "loss": 7.4963, + "step": 1466400 + }, + { + "epoch": 5.974198712889136, + "grad_norm": 24.522565841674805, + "learning_rate": 9.738504086202449e-05, + "loss": 7.5091, + "step": 1466500 + }, + { + "epoch": 5.974606090912516, + "grad_norm": 19.31745147705078, + "learning_rate": 9.725494413373036e-05, + "loss": 7.5246, + "step": 1466600 + }, + { + "epoch": 5.975013468935898, + "grad_norm": 3.7023141384124756, + "learning_rate": 9.712494259647596e-05, + "loss": 7.4691, + "step": 1466700 + }, + { + "epoch": 5.975420846959279, + "grad_norm": 14.211570739746094, + "learning_rate": 9.699503625540065e-05, + "loss": 7.5027, + "step": 1466800 + }, + { + "epoch": 5.975828224982661, + "grad_norm": 13.158486366271973, + "learning_rate": 9.686522511564062e-05, + "loss": 7.5129, + "step": 1466900 + }, + { + "epoch": 5.976235603006042, + "grad_norm": 16.808256149291992, + "learning_rate": 9.673550918232768e-05, + "loss": 7.5369, + "step": 1467000 + }, + { + "epoch": 5.976235603006042, + "eval_MaskedAccuracy": 0.5137634491849503, + "eval_loss": 1.58700430393219, + "eval_runtime": 171.3642, + "eval_samples_per_second": 370.416, + "eval_steps_per_second": 1.447, + "step": 1467000 + }, + { + "epoch": 5.976642981029424, + "grad_norm": 19.31085205078125, + "learning_rate": 9.66058884605905e-05, + "loss": 7.493, + "step": 1467100 + }, + { + "epoch": 5.9770503590528055, + "grad_norm": 9.289825439453125, + "learning_rate": 9.647636295555366e-05, + "loss": 7.5255, + "step": 1467200 + }, + { + "epoch": 5.977457737076187, + "grad_norm": 18.02328109741211, + "learning_rate": 9.634693267233765e-05, + "loss": 7.4943, + "step": 1467300 + }, + { + "epoch": 5.9778651150995685, + "grad_norm": 5.039106369018555, + "learning_rate": 9.621759761605976e-05, + "loss": 7.5191, + "step": 1467400 + }, + { + "epoch": 5.97827249312295, + "grad_norm": 21.01978874206543, + "learning_rate": 9.608835779183349e-05, + "loss": 7.5306, + "step": 1467500 + }, + { + "epoch": 5.978679871146332, + "grad_norm": 6.009078502655029, + "learning_rate": 9.595921320476804e-05, + "loss": 7.487, + "step": 1467600 + }, + { + "epoch": 5.979087249169712, + "grad_norm": 6.737271308898926, + "learning_rate": 9.583016385996886e-05, + "loss": 7.4964, + "step": 1467700 + }, + { + "epoch": 5.979494627193094, + "grad_norm": 35.005550384521484, + "learning_rate": 9.570120976253911e-05, + "loss": 7.5056, + "step": 1467800 + }, + { + "epoch": 5.979902005216475, + "grad_norm": 3.025550127029419, + "learning_rate": 9.557235091757648e-05, + "loss": 7.5109, + "step": 1467900 + }, + { + "epoch": 5.980309383239857, + "grad_norm": 6.011305809020996, + "learning_rate": 9.544358733017512e-05, + "loss": 7.5341, + "step": 1468000 + }, + { + "epoch": 5.980309383239857, + "eval_MaskedAccuracy": 0.5141501315477184, + "eval_loss": 1.5844790935516357, + "eval_runtime": 158.4112, + "eval_samples_per_second": 400.704, + "eval_steps_per_second": 1.566, + "step": 1468000 + }, + { + "epoch": 5.980716761263238, + "grad_norm": 9.449957847595215, + "learning_rate": 9.531491900542593e-05, + "loss": 7.5071, + "step": 1468100 + }, + { + "epoch": 5.98112413928662, + "grad_norm": 14.28373908996582, + "learning_rate": 9.518634594841594e-05, + "loss": 7.5178, + "step": 1468200 + }, + { + "epoch": 5.981531517310001, + "grad_norm": 20.214340209960938, + "learning_rate": 9.505786816422842e-05, + "loss": 7.5069, + "step": 1468300 + }, + { + "epoch": 5.981938895333383, + "grad_norm": 8.812238693237305, + "learning_rate": 9.492948565794244e-05, + "loss": 7.5183, + "step": 1468400 + }, + { + "epoch": 5.9823462733567645, + "grad_norm": 25.215444564819336, + "learning_rate": 9.480119843463404e-05, + "loss": 7.5348, + "step": 1468500 + }, + { + "epoch": 5.982753651380146, + "grad_norm": 26.55904197692871, + "learning_rate": 9.46730064993748e-05, + "loss": 7.5074, + "step": 1468600 + }, + { + "epoch": 5.983161029403528, + "grad_norm": 21.207361221313477, + "learning_rate": 9.454490985723302e-05, + "loss": 7.514, + "step": 1468700 + }, + { + "epoch": 5.983568407426909, + "grad_norm": 30.02730369567871, + "learning_rate": 9.44169085132728e-05, + "loss": 7.5086, + "step": 1468800 + }, + { + "epoch": 5.983975785450291, + "grad_norm": 24.488086700439453, + "learning_rate": 9.428900247255503e-05, + "loss": 7.5401, + "step": 1468900 + }, + { + "epoch": 5.984383163473671, + "grad_norm": 31.024044036865234, + "learning_rate": 9.416119174013635e-05, + "loss": 7.5347, + "step": 1469000 + }, + { + "epoch": 5.984383163473671, + "eval_MaskedAccuracy": 0.5133502734147514, + "eval_loss": 1.5921359062194824, + "eval_runtime": 168.351, + "eval_samples_per_second": 377.046, + "eval_steps_per_second": 1.473, + "step": 1469000 + }, + { + "epoch": 5.984790541497053, + "grad_norm": 5.456093788146973, + "learning_rate": 9.403347632106972e-05, + "loss": 7.5337, + "step": 1469100 + }, + { + "epoch": 5.985197919520434, + "grad_norm": 5.64096736907959, + "learning_rate": 9.390585622040481e-05, + "loss": 7.5264, + "step": 1469200 + }, + { + "epoch": 5.985605297543816, + "grad_norm": 27.000465393066406, + "learning_rate": 9.377833144318656e-05, + "loss": 7.5282, + "step": 1469300 + }, + { + "epoch": 5.986012675567197, + "grad_norm": 2.9788384437561035, + "learning_rate": 9.365090199445708e-05, + "loss": 7.5108, + "step": 1469400 + }, + { + "epoch": 5.986420053590579, + "grad_norm": 6.45781946182251, + "learning_rate": 9.352356787925448e-05, + "loss": 7.5074, + "step": 1469500 + }, + { + "epoch": 5.9868274316139605, + "grad_norm": 5.0068583488464355, + "learning_rate": 9.33963291026124e-05, + "loss": 7.4929, + "step": 1469600 + }, + { + "epoch": 5.987234809637342, + "grad_norm": 24.099658966064453, + "learning_rate": 9.326918566956142e-05, + "loss": 7.5002, + "step": 1469700 + }, + { + "epoch": 5.9876421876607235, + "grad_norm": 8.808025360107422, + "learning_rate": 9.314213758512856e-05, + "loss": 7.5103, + "step": 1469800 + }, + { + "epoch": 5.988049565684105, + "grad_norm": 22.51898193359375, + "learning_rate": 9.301518485433673e-05, + "loss": 7.5148, + "step": 1469900 + }, + { + "epoch": 5.988456943707487, + "grad_norm": 22.935468673706055, + "learning_rate": 9.288832748220463e-05, + "loss": 7.5077, + "step": 1470000 + }, + { + "epoch": 5.988456943707487, + "eval_MaskedAccuracy": 0.5140249406513584, + "eval_loss": 1.5843276977539062, + "eval_runtime": 167.0962, + "eval_samples_per_second": 379.877, + "eval_steps_per_second": 1.484, + "step": 1470000 + }, + { + "epoch": 5.988864321730867, + "grad_norm": 5.14185094833374, + "learning_rate": 9.276156547374807e-05, + "loss": 7.5165, + "step": 1470100 + }, + { + "epoch": 5.98927169975425, + "grad_norm": 19.748701095581055, + "learning_rate": 9.263489883397812e-05, + "loss": 7.5202, + "step": 1470200 + }, + { + "epoch": 5.98967907777763, + "grad_norm": 14.636068344116211, + "learning_rate": 9.250832756790298e-05, + "loss": 7.5375, + "step": 1470300 + }, + { + "epoch": 5.990086455801012, + "grad_norm": 2.6870667934417725, + "learning_rate": 9.238185168052653e-05, + "loss": 7.4993, + "step": 1470400 + }, + { + "epoch": 5.990493833824393, + "grad_norm": 16.517555236816406, + "learning_rate": 9.225547117684914e-05, + "loss": 7.5019, + "step": 1470500 + }, + { + "epoch": 5.990901211847775, + "grad_norm": 9.952735900878906, + "learning_rate": 9.212918606186695e-05, + "loss": 7.5014, + "step": 1470600 + }, + { + "epoch": 5.991308589871156, + "grad_norm": 3.9285147190093994, + "learning_rate": 9.200299634057354e-05, + "loss": 7.5063, + "step": 1470700 + }, + { + "epoch": 5.991715967894538, + "grad_norm": 14.709887504577637, + "learning_rate": 9.1876902017957e-05, + "loss": 7.5175, + "step": 1470800 + }, + { + "epoch": 5.9921233459179195, + "grad_norm": 36.995018005371094, + "learning_rate": 9.175090309900246e-05, + "loss": 7.5189, + "step": 1470900 + }, + { + "epoch": 5.992530723941301, + "grad_norm": 11.792192459106445, + "learning_rate": 9.162499958869204e-05, + "loss": 7.4943, + "step": 1471000 + }, + { + "epoch": 5.992530723941301, + "eval_MaskedAccuracy": 0.5135675904019984, + "eval_loss": 1.588697075843811, + "eval_runtime": 163.4904, + "eval_samples_per_second": 388.255, + "eval_steps_per_second": 1.517, + "step": 1471000 + }, + { + "epoch": 5.992938101964683, + "grad_norm": 16.327465057373047, + "learning_rate": 9.149919149200308e-05, + "loss": 7.5165, + "step": 1471100 + }, + { + "epoch": 5.993345479988064, + "grad_norm": 21.664289474487305, + "learning_rate": 9.137347881390924e-05, + "loss": 7.4989, + "step": 1471200 + }, + { + "epoch": 5.993752858011446, + "grad_norm": 5.597540855407715, + "learning_rate": 9.124786155938075e-05, + "loss": 7.5394, + "step": 1471300 + }, + { + "epoch": 5.994160236034826, + "grad_norm": 3.8632829189300537, + "learning_rate": 9.112233973338396e-05, + "loss": 7.5079, + "step": 1471400 + }, + { + "epoch": 5.994567614058209, + "grad_norm": 5.042380332946777, + "learning_rate": 9.099691334088161e-05, + "loss": 7.5295, + "step": 1471500 + }, + { + "epoch": 5.994974992081589, + "grad_norm": 6.029579162597656, + "learning_rate": 9.087158238683211e-05, + "loss": 7.4962, + "step": 1471600 + }, + { + "epoch": 5.995382370104971, + "grad_norm": 14.387858390808105, + "learning_rate": 9.074634687619045e-05, + "loss": 7.5266, + "step": 1471700 + }, + { + "epoch": 5.995789748128352, + "grad_norm": 4.79557466506958, + "learning_rate": 9.062120681390815e-05, + "loss": 7.5172, + "step": 1471800 + }, + { + "epoch": 5.996197126151734, + "grad_norm": 2.618720054626465, + "learning_rate": 9.049616220493274e-05, + "loss": 7.5104, + "step": 1471900 + }, + { + "epoch": 5.9966045041751155, + "grad_norm": 8.536681175231934, + "learning_rate": 9.037121305420762e-05, + "loss": 7.4938, + "step": 1472000 + }, + { + "epoch": 5.9966045041751155, + "eval_MaskedAccuracy": 0.5139832638053828, + "eval_loss": 1.5836459398269653, + "eval_runtime": 179.3431, + "eval_samples_per_second": 353.936, + "eval_steps_per_second": 1.383, + "step": 1472000 + }, + { + "epoch": 5.997011882198497, + "grad_norm": 3.333171844482422, + "learning_rate": 9.024635936667268e-05, + "loss": 7.4803, + "step": 1472100 + }, + { + "epoch": 5.9974192602218785, + "grad_norm": 17.23170280456543, + "learning_rate": 9.012160114726437e-05, + "loss": 7.4951, + "step": 1472200 + }, + { + "epoch": 5.99782663824526, + "grad_norm": 5.657397270202637, + "learning_rate": 8.999693840091489e-05, + "loss": 7.4899, + "step": 1472300 + }, + { + "epoch": 5.998234016268642, + "grad_norm": 2.1915388107299805, + "learning_rate": 8.987237113255245e-05, + "loss": 7.5321, + "step": 1472400 + }, + { + "epoch": 5.998641394292023, + "grad_norm": 16.36842918395996, + "learning_rate": 8.974789934710257e-05, + "loss": 7.5004, + "step": 1472500 + }, + { + "epoch": 5.999048772315405, + "grad_norm": 12.492622375488281, + "learning_rate": 8.962352304948547e-05, + "loss": 7.5078, + "step": 1472600 + }, + { + "epoch": 5.999456150338785, + "grad_norm": 4.250843524932861, + "learning_rate": 8.949924224461917e-05, + "loss": 7.5268, + "step": 1472700 + }, + { + "epoch": 5.999863528362167, + "grad_norm": 3.642354965209961, + "learning_rate": 8.937505693741677e-05, + "loss": 7.5178, + "step": 1472800 + }, + { + "epoch": 6.000270906385548, + "grad_norm": 12.447785377502441, + "learning_rate": 8.925096713278803e-05, + "loss": 7.5448, + "step": 1472900 + }, + { + "epoch": 6.00067828440893, + "grad_norm": 5.121676445007324, + "learning_rate": 8.912697283563867e-05, + "loss": 7.5269, + "step": 1473000 + }, + { + "epoch": 6.00067828440893, + "eval_MaskedAccuracy": 0.513692675246052, + "eval_loss": 1.5907717943191528, + "eval_runtime": 152.3174, + "eval_samples_per_second": 416.735, + "eval_steps_per_second": 1.628, + "step": 1473000 + }, + { + "epoch": 6.0010856624323115, + "grad_norm": 3.62019681930542, + "learning_rate": 8.900307405087142e-05, + "loss": 7.5368, + "step": 1473100 + }, + { + "epoch": 6.001493040455693, + "grad_norm": 19.750944137573242, + "learning_rate": 8.887927078338407e-05, + "loss": 7.5676, + "step": 1473200 + }, + { + "epoch": 6.0019004184790745, + "grad_norm": 3.51492977142334, + "learning_rate": 8.875556303807164e-05, + "loss": 7.5278, + "step": 1473300 + }, + { + "epoch": 6.002307796502456, + "grad_norm": 3.6750075817108154, + "learning_rate": 8.863195081982469e-05, + "loss": 7.5525, + "step": 1473400 + }, + { + "epoch": 6.002715174525838, + "grad_norm": 4.643901824951172, + "learning_rate": 8.850843413353042e-05, + "loss": 7.5, + "step": 1473500 + }, + { + "epoch": 6.003122552549219, + "grad_norm": 7.770880222320557, + "learning_rate": 8.838501298407213e-05, + "loss": 7.5233, + "step": 1473600 + }, + { + "epoch": 6.003529930572601, + "grad_norm": 3.9565768241882324, + "learning_rate": 8.826168737632878e-05, + "loss": 7.5046, + "step": 1473700 + }, + { + "epoch": 6.003937308595982, + "grad_norm": 47.52876663208008, + "learning_rate": 8.813845731517678e-05, + "loss": 7.4973, + "step": 1473800 + }, + { + "epoch": 6.004344686619364, + "grad_norm": 5.450173854827881, + "learning_rate": 8.80153228054878e-05, + "loss": 7.521, + "step": 1473900 + }, + { + "epoch": 6.004752064642744, + "grad_norm": 2.5958971977233887, + "learning_rate": 8.789228385213007e-05, + "loss": 7.5098, + "step": 1474000 + }, + { + "epoch": 6.004752064642744, + "eval_MaskedAccuracy": 0.5133457696758731, + "eval_loss": 1.5857244729995728, + "eval_runtime": 149.2937, + "eval_samples_per_second": 425.175, + "eval_steps_per_second": 1.661, + "step": 1474000 + }, + { + "epoch": 6.005159442666126, + "grad_norm": 4.340638160705566, + "learning_rate": 8.776934045996828e-05, + "loss": 7.5252, + "step": 1474100 + }, + { + "epoch": 6.005566820689507, + "grad_norm": 6.678283214569092, + "learning_rate": 8.764649263386264e-05, + "loss": 7.4868, + "step": 1474200 + }, + { + "epoch": 6.005974198712889, + "grad_norm": 11.900827407836914, + "learning_rate": 8.752374037866983e-05, + "loss": 7.5235, + "step": 1474300 + }, + { + "epoch": 6.0063815767362705, + "grad_norm": 3.1901357173919678, + "learning_rate": 8.740108369924347e-05, + "loss": 7.5153, + "step": 1474400 + }, + { + "epoch": 6.006788954759652, + "grad_norm": 6.039796352386475, + "learning_rate": 8.727852260043249e-05, + "loss": 7.5251, + "step": 1474500 + }, + { + "epoch": 6.0071963327830336, + "grad_norm": 9.42049789428711, + "learning_rate": 8.715605708708207e-05, + "loss": 7.528, + "step": 1474600 + }, + { + "epoch": 6.007603710806415, + "grad_norm": 3.2921876907348633, + "learning_rate": 8.703368716403451e-05, + "loss": 7.5019, + "step": 1474700 + }, + { + "epoch": 6.008011088829797, + "grad_norm": 4.643259525299072, + "learning_rate": 8.691141283612734e-05, + "loss": 7.5136, + "step": 1474800 + }, + { + "epoch": 6.008418466853178, + "grad_norm": 2.806891441345215, + "learning_rate": 8.678923410819515e-05, + "loss": 7.5136, + "step": 1474900 + }, + { + "epoch": 6.00882584487656, + "grad_norm": 6.464155197143555, + "learning_rate": 8.666715098506755e-05, + "loss": 7.5112, + "step": 1475000 + }, + { + "epoch": 6.00882584487656, + "eval_MaskedAccuracy": 0.5135272810651529, + "eval_loss": 1.587581992149353, + "eval_runtime": 153.6876, + "eval_samples_per_second": 413.02, + "eval_steps_per_second": 1.614, + "step": 1475000 + }, + { + "epoch": 6.009233222899941, + "grad_norm": 3.559119701385498, + "learning_rate": 8.654516347157198e-05, + "loss": 7.5267, + "step": 1475100 + }, + { + "epoch": 6.009640600923322, + "grad_norm": 19.796335220336914, + "learning_rate": 8.642327157253099e-05, + "loss": 7.5369, + "step": 1475200 + }, + { + "epoch": 6.010047978946703, + "grad_norm": 3.9808058738708496, + "learning_rate": 8.630147529276317e-05, + "loss": 7.5266, + "step": 1475300 + }, + { + "epoch": 6.010455356970085, + "grad_norm": 2.821082592010498, + "learning_rate": 8.617977463708416e-05, + "loss": 7.5325, + "step": 1475400 + }, + { + "epoch": 6.0108627349934665, + "grad_norm": 6.1518025398254395, + "learning_rate": 8.605816961030575e-05, + "loss": 7.5449, + "step": 1475500 + }, + { + "epoch": 6.011270113016848, + "grad_norm": 20.01251220703125, + "learning_rate": 8.593666021723478e-05, + "loss": 7.5365, + "step": 1475600 + }, + { + "epoch": 6.0116774910402295, + "grad_norm": 3.697535514831543, + "learning_rate": 8.581524646267592e-05, + "loss": 7.4953, + "step": 1475700 + }, + { + "epoch": 6.012084869063611, + "grad_norm": 4.0473785400390625, + "learning_rate": 8.569392835142931e-05, + "loss": 7.5204, + "step": 1475800 + }, + { + "epoch": 6.012492247086993, + "grad_norm": 4.885238170623779, + "learning_rate": 8.55727058882911e-05, + "loss": 7.5096, + "step": 1475900 + }, + { + "epoch": 6.012899625110374, + "grad_norm": 10.347676277160645, + "learning_rate": 8.545157907805378e-05, + "loss": 7.5226, + "step": 1476000 + }, + { + "epoch": 6.012899625110374, + "eval_MaskedAccuracy": 0.5145368133630774, + "eval_loss": 1.5742132663726807, + "eval_runtime": 166.2148, + "eval_samples_per_second": 381.891, + "eval_steps_per_second": 1.492, + "step": 1476000 + }, + { + "epoch": 6.013307003133756, + "grad_norm": 8.436140060424805, + "learning_rate": 8.533054792550602e-05, + "loss": 7.5097, + "step": 1476100 + }, + { + "epoch": 6.013714381157137, + "grad_norm": 18.134437561035156, + "learning_rate": 8.520961243543283e-05, + "loss": 7.5142, + "step": 1476200 + }, + { + "epoch": 6.014121759180519, + "grad_norm": 8.203875541687012, + "learning_rate": 8.508877261261593e-05, + "loss": 7.538, + "step": 1476300 + }, + { + "epoch": 6.0145291372039, + "grad_norm": 4.324888229370117, + "learning_rate": 8.496802846183241e-05, + "loss": 7.5197, + "step": 1476400 + }, + { + "epoch": 6.014936515227281, + "grad_norm": 7.974706172943115, + "learning_rate": 8.484737998785591e-05, + "loss": 7.5017, + "step": 1476500 + }, + { + "epoch": 6.015343893250662, + "grad_norm": 4.344145774841309, + "learning_rate": 8.472682719545635e-05, + "loss": 7.4817, + "step": 1476600 + }, + { + "epoch": 6.015751271274044, + "grad_norm": 43.55150604248047, + "learning_rate": 8.46063700894001e-05, + "loss": 7.531, + "step": 1476700 + }, + { + "epoch": 6.0161586492974255, + "grad_norm": 24.808927536010742, + "learning_rate": 8.448600867444912e-05, + "loss": 7.5286, + "step": 1476800 + }, + { + "epoch": 6.016566027320807, + "grad_norm": 29.526124954223633, + "learning_rate": 8.436574295536182e-05, + "loss": 7.5118, + "step": 1476900 + }, + { + "epoch": 6.016973405344189, + "grad_norm": 12.216218948364258, + "learning_rate": 8.424557293689314e-05, + "loss": 7.4915, + "step": 1477000 + }, + { + "epoch": 6.016973405344189, + "eval_MaskedAccuracy": 0.5140047141803903, + "eval_loss": 1.5815221071243286, + "eval_runtime": 175.0698, + "eval_samples_per_second": 362.575, + "eval_steps_per_second": 1.417, + "step": 1477000 + }, + { + "epoch": 6.01738078336757, + "grad_norm": 12.379630088806152, + "learning_rate": 8.412549862379428e-05, + "loss": 7.5399, + "step": 1477100 + }, + { + "epoch": 6.017788161390952, + "grad_norm": 7.037223815917969, + "learning_rate": 8.400552002081206e-05, + "loss": 7.4965, + "step": 1477200 + }, + { + "epoch": 6.018195539414333, + "grad_norm": 13.78564739227295, + "learning_rate": 8.388563713269037e-05, + "loss": 7.5165, + "step": 1477300 + }, + { + "epoch": 6.018602917437715, + "grad_norm": 17.575111389160156, + "learning_rate": 8.376584996416819e-05, + "loss": 7.5105, + "step": 1477400 + }, + { + "epoch": 6.019010295461096, + "grad_norm": 42.46109390258789, + "learning_rate": 8.364615851998164e-05, + "loss": 7.5303, + "step": 1477500 + }, + { + "epoch": 6.019417673484478, + "grad_norm": 29.17548942565918, + "learning_rate": 8.352656280486201e-05, + "loss": 7.5214, + "step": 1477600 + }, + { + "epoch": 6.019825051507858, + "grad_norm": 32.769691467285156, + "learning_rate": 8.340706282353902e-05, + "loss": 7.5035, + "step": 1477700 + }, + { + "epoch": 6.02023242953124, + "grad_norm": 10.002124786376953, + "learning_rate": 8.328765858073651e-05, + "loss": 7.5399, + "step": 1477800 + }, + { + "epoch": 6.0206398075546215, + "grad_norm": 31.22835922241211, + "learning_rate": 8.316835008117497e-05, + "loss": 7.5106, + "step": 1477900 + }, + { + "epoch": 6.021047185578003, + "grad_norm": 8.925127029418945, + "learning_rate": 8.30491373295714e-05, + "loss": 7.5622, + "step": 1478000 + }, + { + "epoch": 6.021047185578003, + "eval_MaskedAccuracy": 0.5135413131190498, + "eval_loss": 1.5861700773239136, + "eval_runtime": 151.9379, + "eval_samples_per_second": 417.776, + "eval_steps_per_second": 1.632, + "step": 1478000 + }, + { + "epoch": 6.0214545636013845, + "grad_norm": 19.77899169921875, + "learning_rate": 8.293002033063891e-05, + "loss": 7.5126, + "step": 1478100 + }, + { + "epoch": 6.021861941624766, + "grad_norm": 25.79058837890625, + "learning_rate": 8.281099908908722e-05, + "loss": 7.471, + "step": 1478200 + }, + { + "epoch": 6.022269319648148, + "grad_norm": 16.387996673583984, + "learning_rate": 8.269207360962114e-05, + "loss": 7.5214, + "step": 1478300 + }, + { + "epoch": 6.022676697671529, + "grad_norm": 3.027670383453369, + "learning_rate": 8.257324389694275e-05, + "loss": 7.5149, + "step": 1478400 + }, + { + "epoch": 6.023084075694911, + "grad_norm": 23.706575393676758, + "learning_rate": 8.245450995575031e-05, + "loss": 7.5386, + "step": 1478500 + }, + { + "epoch": 6.023491453718292, + "grad_norm": 21.659406661987305, + "learning_rate": 8.233587179073771e-05, + "loss": 7.5462, + "step": 1478600 + }, + { + "epoch": 6.023898831741674, + "grad_norm": 16.330801010131836, + "learning_rate": 8.221732940659524e-05, + "loss": 7.5242, + "step": 1478700 + }, + { + "epoch": 6.024306209765055, + "grad_norm": 19.215543746948242, + "learning_rate": 8.209888280801004e-05, + "loss": 7.4945, + "step": 1478800 + }, + { + "epoch": 6.024713587788437, + "grad_norm": 21.308149337768555, + "learning_rate": 8.198053199966429e-05, + "loss": 7.5352, + "step": 1478900 + }, + { + "epoch": 6.025120965811817, + "grad_norm": 3.8174691200256348, + "learning_rate": 8.186227698623734e-05, + "loss": 7.5282, + "step": 1479000 + }, + { + "epoch": 6.025120965811817, + "eval_MaskedAccuracy": 0.5139456962781497, + "eval_loss": 1.5708402395248413, + "eval_runtime": 168.6338, + "eval_samples_per_second": 376.413, + "eval_steps_per_second": 1.471, + "step": 1479000 + }, + { + "epoch": 6.025528343835199, + "grad_norm": 11.857566833496094, + "learning_rate": 8.174411777240445e-05, + "loss": 7.5141, + "step": 1479100 + }, + { + "epoch": 6.0259357218585805, + "grad_norm": 20.858306884765625, + "learning_rate": 8.162605436283722e-05, + "loss": 7.5072, + "step": 1479200 + }, + { + "epoch": 6.026343099881962, + "grad_norm": 9.31919002532959, + "learning_rate": 8.150808676220307e-05, + "loss": 7.532, + "step": 1479300 + }, + { + "epoch": 6.026750477905344, + "grad_norm": 16.40489959716797, + "learning_rate": 8.139021497516625e-05, + "loss": 7.4986, + "step": 1479400 + }, + { + "epoch": 6.027157855928725, + "grad_norm": 20.209657669067383, + "learning_rate": 8.127243900638636e-05, + "loss": 7.5194, + "step": 1479500 + }, + { + "epoch": 6.027565233952107, + "grad_norm": 13.192978858947754, + "learning_rate": 8.115475886051974e-05, + "loss": 7.5556, + "step": 1479600 + }, + { + "epoch": 6.027972611975488, + "grad_norm": 3.7593281269073486, + "learning_rate": 8.103717454221929e-05, + "loss": 7.5251, + "step": 1479700 + }, + { + "epoch": 6.02837998999887, + "grad_norm": 8.86939525604248, + "learning_rate": 8.091968605613393e-05, + "loss": 7.5129, + "step": 1479800 + }, + { + "epoch": 6.028787368022251, + "grad_norm": 7.07117223739624, + "learning_rate": 8.080229340690793e-05, + "loss": 7.5095, + "step": 1479900 + }, + { + "epoch": 6.029194746045633, + "grad_norm": 33.47684860229492, + "learning_rate": 8.068499659918283e-05, + "loss": 7.5375, + "step": 1480000 + }, + { + "epoch": 6.029194746045633, + "eval_MaskedAccuracy": 0.5141075699483529, + "eval_loss": 1.578412413597107, + "eval_runtime": 188.9794, + "eval_samples_per_second": 335.889, + "eval_steps_per_second": 1.312, + "step": 1480000 + }, + { + "epoch": 6.029602124069014, + "grad_norm": 22.33747100830078, + "learning_rate": 8.056779563759619e-05, + "loss": 7.5462, + "step": 1480100 + }, + { + "epoch": 6.030009502092395, + "grad_norm": 4.381908416748047, + "learning_rate": 8.045069052678106e-05, + "loss": 7.544, + "step": 1480200 + }, + { + "epoch": 6.0304168801157765, + "grad_norm": 25.629236221313477, + "learning_rate": 8.033368127136756e-05, + "loss": 7.5352, + "step": 1480300 + }, + { + "epoch": 6.030824258139158, + "grad_norm": 20.314191818237305, + "learning_rate": 8.02167678759819e-05, + "loss": 7.5364, + "step": 1480400 + }, + { + "epoch": 6.0312316361625395, + "grad_norm": 15.726969718933105, + "learning_rate": 8.00999503452459e-05, + "loss": 7.5213, + "step": 1480500 + }, + { + "epoch": 6.031639014185921, + "grad_norm": 17.58500862121582, + "learning_rate": 7.998322868377814e-05, + "loss": 7.5023, + "step": 1480600 + }, + { + "epoch": 6.032046392209303, + "grad_norm": 20.897462844848633, + "learning_rate": 7.986660289619344e-05, + "loss": 7.502, + "step": 1480700 + }, + { + "epoch": 6.032453770232684, + "grad_norm": 20.865142822265625, + "learning_rate": 7.975007298710239e-05, + "loss": 7.5286, + "step": 1480800 + }, + { + "epoch": 6.032861148256066, + "grad_norm": 25.1275691986084, + "learning_rate": 7.963363896111185e-05, + "loss": 7.5269, + "step": 1480900 + }, + { + "epoch": 6.033268526279447, + "grad_norm": 28.700515747070312, + "learning_rate": 7.951730082282565e-05, + "loss": 7.4916, + "step": 1481000 + }, + { + "epoch": 6.033268526279447, + "eval_MaskedAccuracy": 0.5133240535375019, + "eval_loss": 1.5870534181594849, + "eval_runtime": 173.3375, + "eval_samples_per_second": 366.199, + "eval_steps_per_second": 1.431, + "step": 1481000 + }, + { + "epoch": 6.033675904302829, + "grad_norm": 30.533897399902344, + "learning_rate": 7.940105857684282e-05, + "loss": 7.5085, + "step": 1481100 + }, + { + "epoch": 6.03408328232621, + "grad_norm": 10.506007194519043, + "learning_rate": 7.928491222775904e-05, + "loss": 7.5175, + "step": 1481200 + }, + { + "epoch": 6.034490660349592, + "grad_norm": 8.078603744506836, + "learning_rate": 7.916886178016647e-05, + "loss": 7.5289, + "step": 1481300 + }, + { + "epoch": 6.034898038372973, + "grad_norm": 9.919642448425293, + "learning_rate": 7.905290723865287e-05, + "loss": 7.531, + "step": 1481400 + }, + { + "epoch": 6.035305416396354, + "grad_norm": 6.306997776031494, + "learning_rate": 7.89370486078031e-05, + "loss": 7.5023, + "step": 1481500 + }, + { + "epoch": 6.0357127944197355, + "grad_norm": 20.438024520874023, + "learning_rate": 7.88212858921965e-05, + "loss": 7.5102, + "step": 1481600 + }, + { + "epoch": 6.036120172443117, + "grad_norm": 14.660018920898438, + "learning_rate": 7.870561909641109e-05, + "loss": 7.5188, + "step": 1481700 + }, + { + "epoch": 6.036527550466499, + "grad_norm": 5.2086615562438965, + "learning_rate": 7.859004822501947e-05, + "loss": 7.5134, + "step": 1481800 + }, + { + "epoch": 6.03693492848988, + "grad_norm": 6.651661396026611, + "learning_rate": 7.847457328259038e-05, + "loss": 7.529, + "step": 1481900 + }, + { + "epoch": 6.037342306513262, + "grad_norm": 21.134342193603516, + "learning_rate": 7.835919427368926e-05, + "loss": 7.5063, + "step": 1482000 + }, + { + "epoch": 6.037342306513262, + "eval_MaskedAccuracy": 0.5138702898446108, + "eval_loss": 1.595697283744812, + "eval_runtime": 157.862, + "eval_samples_per_second": 402.098, + "eval_steps_per_second": 1.571, + "step": 1482000 + }, + { + "epoch": 6.037749684536643, + "grad_norm": 4.608585834503174, + "learning_rate": 7.82439112028781e-05, + "loss": 7.4962, + "step": 1482100 + }, + { + "epoch": 6.038157062560025, + "grad_norm": 9.62073802947998, + "learning_rate": 7.812872407471408e-05, + "loss": 7.486, + "step": 1482200 + }, + { + "epoch": 6.038564440583406, + "grad_norm": 2.1867218017578125, + "learning_rate": 7.801363289375163e-05, + "loss": 7.5352, + "step": 1482300 + }, + { + "epoch": 6.038971818606788, + "grad_norm": 21.22916603088379, + "learning_rate": 7.789863766454078e-05, + "loss": 7.5152, + "step": 1482400 + }, + { + "epoch": 6.039379196630169, + "grad_norm": 3.346815347671509, + "learning_rate": 7.778373839162766e-05, + "loss": 7.5238, + "step": 1482500 + }, + { + "epoch": 6.039786574653551, + "grad_norm": 12.451679229736328, + "learning_rate": 7.766893507955478e-05, + "loss": 7.5203, + "step": 1482600 + }, + { + "epoch": 6.0401939526769315, + "grad_norm": 27.90259552001953, + "learning_rate": 7.755422773286169e-05, + "loss": 7.5063, + "step": 1482700 + }, + { + "epoch": 6.040601330700313, + "grad_norm": 14.531145095825195, + "learning_rate": 7.743961635608251e-05, + "loss": 7.5414, + "step": 1482800 + }, + { + "epoch": 6.0410087087236946, + "grad_norm": 29.668895721435547, + "learning_rate": 7.732510095374884e-05, + "loss": 7.5228, + "step": 1482900 + }, + { + "epoch": 6.041416086747076, + "grad_norm": 32.814239501953125, + "learning_rate": 7.721068153038813e-05, + "loss": 7.4897, + "step": 1483000 + }, + { + "epoch": 6.041416086747076, + "eval_MaskedAccuracy": 0.513304971145958, + "eval_loss": 1.585016131401062, + "eval_runtime": 173.8014, + "eval_samples_per_second": 365.221, + "eval_steps_per_second": 1.427, + "step": 1483000 + }, + { + "epoch": 6.041823464770458, + "grad_norm": 5.910619258880615, + "learning_rate": 7.709635809052385e-05, + "loss": 7.5483, + "step": 1483100 + }, + { + "epoch": 6.042230842793839, + "grad_norm": 18.325368881225586, + "learning_rate": 7.698213063867615e-05, + "loss": 7.5097, + "step": 1483200 + }, + { + "epoch": 6.042638220817221, + "grad_norm": 4.285280704498291, + "learning_rate": 7.686799917936054e-05, + "loss": 7.5598, + "step": 1483300 + }, + { + "epoch": 6.043045598840602, + "grad_norm": 5.864536285400391, + "learning_rate": 7.675396371708956e-05, + "loss": 7.5594, + "step": 1483400 + }, + { + "epoch": 6.043452976863984, + "grad_norm": 3.1510560512542725, + "learning_rate": 7.664002425637147e-05, + "loss": 7.4977, + "step": 1483500 + }, + { + "epoch": 6.043860354887365, + "grad_norm": 13.101836204528809, + "learning_rate": 7.65261808017107e-05, + "loss": 7.5379, + "step": 1483600 + }, + { + "epoch": 6.044267732910747, + "grad_norm": 17.246509552001953, + "learning_rate": 7.641243335760868e-05, + "loss": 7.5224, + "step": 1483700 + }, + { + "epoch": 6.044675110934128, + "grad_norm": 25.920820236206055, + "learning_rate": 7.629878192856211e-05, + "loss": 7.5453, + "step": 1483800 + }, + { + "epoch": 6.04508248895751, + "grad_norm": 16.338682174682617, + "learning_rate": 7.618522651906443e-05, + "loss": 7.5451, + "step": 1483900 + }, + { + "epoch": 6.0454898669808905, + "grad_norm": 17.48977279663086, + "learning_rate": 7.607176713360497e-05, + "loss": 7.5639, + "step": 1484000 + }, + { + "epoch": 6.0454898669808905, + "eval_MaskedAccuracy": 0.5138310179083767, + "eval_loss": 1.586430311203003, + "eval_runtime": 196.0605, + "eval_samples_per_second": 323.757, + "eval_steps_per_second": 1.265, + "step": 1484000 + }, + { + "epoch": 6.045897245004272, + "grad_norm": 10.223199844360352, + "learning_rate": 7.595840377666913e-05, + "loss": 7.516, + "step": 1484100 + }, + { + "epoch": 6.046304623027654, + "grad_norm": 4.165820598602295, + "learning_rate": 7.584513645273925e-05, + "loss": 7.5316, + "step": 1484200 + }, + { + "epoch": 6.046712001051035, + "grad_norm": 18.81446647644043, + "learning_rate": 7.57319651662932e-05, + "loss": 7.5156, + "step": 1484300 + }, + { + "epoch": 6.047119379074417, + "grad_norm": 19.572067260742188, + "learning_rate": 7.561888992180515e-05, + "loss": 7.5306, + "step": 1484400 + }, + { + "epoch": 6.047526757097798, + "grad_norm": 21.238670349121094, + "learning_rate": 7.550591072374554e-05, + "loss": 7.5215, + "step": 1484500 + }, + { + "epoch": 6.04793413512118, + "grad_norm": 15.118731498718262, + "learning_rate": 7.539302757658103e-05, + "loss": 7.5368, + "step": 1484600 + }, + { + "epoch": 6.048341513144561, + "grad_norm": 10.186454772949219, + "learning_rate": 7.528024048477497e-05, + "loss": 7.5218, + "step": 1484700 + }, + { + "epoch": 6.048748891167943, + "grad_norm": 12.231056213378906, + "learning_rate": 7.516754945278571e-05, + "loss": 7.5073, + "step": 1484800 + }, + { + "epoch": 6.049156269191324, + "grad_norm": 17.024585723876953, + "learning_rate": 7.5054954485069e-05, + "loss": 7.5014, + "step": 1484900 + }, + { + "epoch": 6.049563647214706, + "grad_norm": 4.59981107711792, + "learning_rate": 7.494245558607634e-05, + "loss": 7.5547, + "step": 1485000 + }, + { + "epoch": 6.049563647214706, + "eval_MaskedAccuracy": 0.5133585626392425, + "eval_loss": 1.5897096395492554, + "eval_runtime": 160.2187, + "eval_samples_per_second": 396.183, + "eval_steps_per_second": 1.548, + "step": 1485000 + }, + { + "epoch": 6.049971025238087, + "grad_norm": 21.608034133911133, + "learning_rate": 7.483005276025505e-05, + "loss": 7.5247, + "step": 1485100 + }, + { + "epoch": 6.050378403261468, + "grad_norm": 22.101882934570312, + "learning_rate": 7.471774601204954e-05, + "loss": 7.4833, + "step": 1485200 + }, + { + "epoch": 6.05078578128485, + "grad_norm": 14.103845596313477, + "learning_rate": 7.460553534589917e-05, + "loss": 7.5188, + "step": 1485300 + }, + { + "epoch": 6.051193159308231, + "grad_norm": 16.816926956176758, + "learning_rate": 7.449342076624102e-05, + "loss": 7.5451, + "step": 1485400 + }, + { + "epoch": 6.051600537331613, + "grad_norm": 13.526528358459473, + "learning_rate": 7.438140227750652e-05, + "loss": 7.5259, + "step": 1485500 + }, + { + "epoch": 6.052007915354994, + "grad_norm": 18.043670654296875, + "learning_rate": 7.426947988412573e-05, + "loss": 7.5241, + "step": 1485600 + }, + { + "epoch": 6.052415293378376, + "grad_norm": 3.703565835952759, + "learning_rate": 7.415765359052263e-05, + "loss": 7.5698, + "step": 1485700 + }, + { + "epoch": 6.052822671401757, + "grad_norm": 3.1997010707855225, + "learning_rate": 7.404592340111874e-05, + "loss": 7.5117, + "step": 1485800 + }, + { + "epoch": 6.053230049425139, + "grad_norm": 26.271591186523438, + "learning_rate": 7.393428932033097e-05, + "loss": 7.5212, + "step": 1485900 + }, + { + "epoch": 6.05363742744852, + "grad_norm": 11.788737297058105, + "learning_rate": 7.382275135257303e-05, + "loss": 7.5569, + "step": 1486000 + }, + { + "epoch": 6.05363742744852, + "eval_MaskedAccuracy": 0.5138277712720543, + "eval_loss": 1.5856705904006958, + "eval_runtime": 175.3261, + "eval_samples_per_second": 362.045, + "eval_steps_per_second": 1.415, + "step": 1486000 + }, + { + "epoch": 6.054044805471902, + "grad_norm": 16.922420501708984, + "learning_rate": 7.371130950225481e-05, + "loss": 7.5364, + "step": 1486100 + }, + { + "epoch": 6.054452183495283, + "grad_norm": 13.531429290771484, + "learning_rate": 7.359996377378155e-05, + "loss": 7.5092, + "step": 1486200 + }, + { + "epoch": 6.054859561518665, + "grad_norm": 14.451566696166992, + "learning_rate": 7.34887141715562e-05, + "loss": 7.5153, + "step": 1486300 + }, + { + "epoch": 6.055266939542046, + "grad_norm": 21.359676361083984, + "learning_rate": 7.33775606999764e-05, + "loss": 7.546, + "step": 1486400 + }, + { + "epoch": 6.055674317565427, + "grad_norm": 20.718141555786133, + "learning_rate": 7.326650336343699e-05, + "loss": 7.5072, + "step": 1486500 + }, + { + "epoch": 6.056081695588809, + "grad_norm": 13.19158935546875, + "learning_rate": 7.315554216632842e-05, + "loss": 7.5253, + "step": 1486600 + }, + { + "epoch": 6.05648907361219, + "grad_norm": 13.698712348937988, + "learning_rate": 7.304467711303758e-05, + "loss": 7.5042, + "step": 1486700 + }, + { + "epoch": 6.056896451635572, + "grad_norm": 16.147371292114258, + "learning_rate": 7.293390820794776e-05, + "loss": 7.5272, + "step": 1486800 + }, + { + "epoch": 6.057303829658953, + "grad_norm": 17.229768753051758, + "learning_rate": 7.282323545543808e-05, + "loss": 7.5149, + "step": 1486900 + }, + { + "epoch": 6.057711207682335, + "grad_norm": 12.138152122497559, + "learning_rate": 7.271265885988438e-05, + "loss": 7.5274, + "step": 1487000 + }, + { + "epoch": 6.057711207682335, + "eval_MaskedAccuracy": 0.5139290309555314, + "eval_loss": 1.5757960081100464, + "eval_runtime": 158.3188, + "eval_samples_per_second": 400.938, + "eval_steps_per_second": 1.566, + "step": 1487000 + }, + { + "epoch": 6.058118585705716, + "grad_norm": 5.158970832824707, + "learning_rate": 7.260217842565783e-05, + "loss": 7.5205, + "step": 1487100 + }, + { + "epoch": 6.058525963729098, + "grad_norm": 20.5587158203125, + "learning_rate": 7.249179415712652e-05, + "loss": 7.5331, + "step": 1487200 + }, + { + "epoch": 6.058933341752479, + "grad_norm": 16.001752853393555, + "learning_rate": 7.238150605865461e-05, + "loss": 7.5402, + "step": 1487300 + }, + { + "epoch": 6.059340719775861, + "grad_norm": 19.773279190063477, + "learning_rate": 7.227131413460207e-05, + "loss": 7.5562, + "step": 1487400 + }, + { + "epoch": 6.059748097799242, + "grad_norm": 20.93306541442871, + "learning_rate": 7.216121838932572e-05, + "loss": 7.5483, + "step": 1487500 + }, + { + "epoch": 6.060155475822624, + "grad_norm": 9.476699829101562, + "learning_rate": 7.205121882717846e-05, + "loss": 7.5344, + "step": 1487600 + }, + { + "epoch": 6.060562853846005, + "grad_norm": 13.896587371826172, + "learning_rate": 7.194131545250865e-05, + "loss": 7.5394, + "step": 1487700 + }, + { + "epoch": 6.060970231869386, + "grad_norm": 21.827335357666016, + "learning_rate": 7.183150826966167e-05, + "loss": 7.5509, + "step": 1487800 + }, + { + "epoch": 6.061377609892768, + "grad_norm": 10.596880912780762, + "learning_rate": 7.172179728297853e-05, + "loss": 7.5221, + "step": 1487900 + }, + { + "epoch": 6.061784987916149, + "grad_norm": 14.482085227966309, + "learning_rate": 7.161218249679691e-05, + "loss": 7.508, + "step": 1488000 + }, + { + "epoch": 6.061784987916149, + "eval_MaskedAccuracy": 0.5136656771419797, + "eval_loss": 1.5972084999084473, + "eval_runtime": 158.4216, + "eval_samples_per_second": 400.678, + "eval_steps_per_second": 1.565, + "step": 1488000 + }, + { + "epoch": 6.062192365939531, + "grad_norm": 19.168067932128906, + "learning_rate": 7.150266391545025e-05, + "loss": 7.5234, + "step": 1488100 + }, + { + "epoch": 6.062599743962912, + "grad_norm": 9.889928817749023, + "learning_rate": 7.139324154326872e-05, + "loss": 7.5054, + "step": 1488200 + }, + { + "epoch": 6.063007121986294, + "grad_norm": 21.029836654663086, + "learning_rate": 7.128391538457812e-05, + "loss": 7.5124, + "step": 1488300 + }, + { + "epoch": 6.063414500009675, + "grad_norm": 13.195849418640137, + "learning_rate": 7.117468544370075e-05, + "loss": 7.5186, + "step": 1488400 + }, + { + "epoch": 6.063821878033057, + "grad_norm": 13.294780731201172, + "learning_rate": 7.106555172495504e-05, + "loss": 7.5509, + "step": 1488500 + }, + { + "epoch": 6.064229256056438, + "grad_norm": 12.833215713500977, + "learning_rate": 7.095651423265603e-05, + "loss": 7.561, + "step": 1488600 + }, + { + "epoch": 6.06463663407982, + "grad_norm": 10.44172191619873, + "learning_rate": 7.084757297111431e-05, + "loss": 7.5101, + "step": 1488700 + }, + { + "epoch": 6.065044012103201, + "grad_norm": 2.6616299152374268, + "learning_rate": 7.07387279446363e-05, + "loss": 7.5579, + "step": 1488800 + }, + { + "epoch": 6.065451390126583, + "grad_norm": 5.75972318649292, + "learning_rate": 7.06299791575263e-05, + "loss": 7.508, + "step": 1488900 + }, + { + "epoch": 6.065858768149964, + "grad_norm": 12.5253267288208, + "learning_rate": 7.052132661408304e-05, + "loss": 7.5384, + "step": 1489000 + }, + { + "epoch": 6.065858768149964, + "eval_MaskedAccuracy": 0.5136171818785219, + "eval_loss": 1.5813449621200562, + "eval_runtime": 181.8537, + "eval_samples_per_second": 349.05, + "eval_steps_per_second": 1.364, + "step": 1489000 + }, + { + "epoch": 6.066266146173345, + "grad_norm": 21.288951873779297, + "learning_rate": 7.0412770318602e-05, + "loss": 7.5523, + "step": 1489100 + }, + { + "epoch": 6.066673524196727, + "grad_norm": 16.027528762817383, + "learning_rate": 7.03043102753754e-05, + "loss": 7.5512, + "step": 1489200 + }, + { + "epoch": 6.067080902220108, + "grad_norm": 16.88258171081543, + "learning_rate": 7.019594648869112e-05, + "loss": 7.5768, + "step": 1489300 + }, + { + "epoch": 6.06748828024349, + "grad_norm": 2.8600540161132812, + "learning_rate": 7.008767896283354e-05, + "loss": 7.5026, + "step": 1489400 + }, + { + "epoch": 6.067895658266871, + "grad_norm": 18.037826538085938, + "learning_rate": 6.997950770208234e-05, + "loss": 7.5338, + "step": 1489500 + }, + { + "epoch": 6.068303036290253, + "grad_norm": 16.59130096435547, + "learning_rate": 6.987143271071482e-05, + "loss": 7.5303, + "step": 1489600 + }, + { + "epoch": 6.068710414313634, + "grad_norm": 2.7622907161712646, + "learning_rate": 6.97634539930036e-05, + "loss": 7.5182, + "step": 1489700 + }, + { + "epoch": 6.069117792337016, + "grad_norm": 15.692123413085938, + "learning_rate": 6.965557155321751e-05, + "loss": 7.526, + "step": 1489800 + }, + { + "epoch": 6.069525170360397, + "grad_norm": 13.806824684143066, + "learning_rate": 6.954778539562206e-05, + "loss": 7.5205, + "step": 1489900 + }, + { + "epoch": 6.069932548383779, + "grad_norm": 18.76637840270996, + "learning_rate": 6.944009552447811e-05, + "loss": 7.5312, + "step": 1490000 + }, + { + "epoch": 6.069932548383779, + "eval_MaskedAccuracy": 0.5137253195890193, + "eval_loss": 1.580369234085083, + "eval_runtime": 156.3748, + "eval_samples_per_second": 405.922, + "eval_steps_per_second": 1.586, + "step": 1490000 + }, + { + "epoch": 6.0703399264071605, + "grad_norm": 13.451923370361328, + "learning_rate": 6.933250194404374e-05, + "loss": 7.5104, + "step": 1490100 + }, + { + "epoch": 6.070747304430541, + "grad_norm": 5.400201797485352, + "learning_rate": 6.922500465857215e-05, + "loss": 7.5199, + "step": 1490200 + }, + { + "epoch": 6.071154682453923, + "grad_norm": 6.814541339874268, + "learning_rate": 6.911760367231377e-05, + "loss": 7.5038, + "step": 1490300 + }, + { + "epoch": 6.071562060477304, + "grad_norm": 20.72386932373047, + "learning_rate": 6.90102989895144e-05, + "loss": 7.5477, + "step": 1490400 + }, + { + "epoch": 6.071969438500686, + "grad_norm": 17.206645965576172, + "learning_rate": 6.890309061441635e-05, + "loss": 7.5428, + "step": 1490500 + }, + { + "epoch": 6.072376816524067, + "grad_norm": 16.003599166870117, + "learning_rate": 6.879597855125853e-05, + "loss": 7.5401, + "step": 1490600 + }, + { + "epoch": 6.072784194547449, + "grad_norm": 12.124286651611328, + "learning_rate": 6.868896280427548e-05, + "loss": 7.5281, + "step": 1490700 + }, + { + "epoch": 6.07319157257083, + "grad_norm": 12.789175033569336, + "learning_rate": 6.858204337769781e-05, + "loss": 7.535, + "step": 1490800 + }, + { + "epoch": 6.073598950594212, + "grad_norm": 8.151293754577637, + "learning_rate": 6.847522027575316e-05, + "loss": 7.523, + "step": 1490900 + }, + { + "epoch": 6.074006328617593, + "grad_norm": 17.697446823120117, + "learning_rate": 6.836849350266432e-05, + "loss": 7.5407, + "step": 1491000 + }, + { + "epoch": 6.074006328617593, + "eval_MaskedAccuracy": 0.5141354898984748, + "eval_loss": 1.5804506540298462, + "eval_runtime": 153.9659, + "eval_samples_per_second": 412.273, + "eval_steps_per_second": 1.611, + "step": 1491000 + }, + { + "epoch": 6.074413706640975, + "grad_norm": 15.482162475585938, + "learning_rate": 6.826186306265088e-05, + "loss": 7.5333, + "step": 1491100 + }, + { + "epoch": 6.074821084664356, + "grad_norm": 8.309759140014648, + "learning_rate": 6.81553289599287e-05, + "loss": 7.539, + "step": 1491200 + }, + { + "epoch": 6.075228462687738, + "grad_norm": 16.621870040893555, + "learning_rate": 6.804889119870934e-05, + "loss": 7.542, + "step": 1491300 + }, + { + "epoch": 6.0756358407111195, + "grad_norm": 30.539064407348633, + "learning_rate": 6.79425497832011e-05, + "loss": 7.5703, + "step": 1491400 + }, + { + "epoch": 6.0760432187345, + "grad_norm": 20.90303611755371, + "learning_rate": 6.783630471760798e-05, + "loss": 7.5265, + "step": 1491500 + }, + { + "epoch": 6.076450596757882, + "grad_norm": 15.229084014892578, + "learning_rate": 6.7730156006131e-05, + "loss": 7.5384, + "step": 1491600 + }, + { + "epoch": 6.076857974781263, + "grad_norm": 3.7409682273864746, + "learning_rate": 6.762410365296634e-05, + "loss": 7.5134, + "step": 1491700 + }, + { + "epoch": 6.077265352804645, + "grad_norm": 4.02523946762085, + "learning_rate": 6.751814766230639e-05, + "loss": 7.5619, + "step": 1491800 + }, + { + "epoch": 6.077672730828026, + "grad_norm": 6.49707555770874, + "learning_rate": 6.741228803834111e-05, + "loss": 7.5522, + "step": 1491900 + }, + { + "epoch": 6.078080108851408, + "grad_norm": 24.099685668945312, + "learning_rate": 6.730652478525492e-05, + "loss": 7.5671, + "step": 1492000 + }, + { + "epoch": 6.078080108851408, + "eval_MaskedAccuracy": 0.5134329376238207, + "eval_loss": 1.5875269174575806, + "eval_runtime": 230.1251, + "eval_samples_per_second": 275.833, + "eval_steps_per_second": 1.078, + "step": 1492000 + }, + { + "epoch": 6.078487486874789, + "grad_norm": 11.89902400970459, + "learning_rate": 6.720085790722995e-05, + "loss": 7.5227, + "step": 1492100 + }, + { + "epoch": 6.078894864898171, + "grad_norm": 37.37939453125, + "learning_rate": 6.709528740844305e-05, + "loss": 7.5143, + "step": 1492200 + }, + { + "epoch": 6.079302242921552, + "grad_norm": 14.41089916229248, + "learning_rate": 6.698981329306816e-05, + "loss": 7.5072, + "step": 1492300 + }, + { + "epoch": 6.079709620944934, + "grad_norm": 22.131799697875977, + "learning_rate": 6.688443556527573e-05, + "loss": 7.5069, + "step": 1492400 + }, + { + "epoch": 6.0801169989683155, + "grad_norm": 4.21099853515625, + "learning_rate": 6.677915422923131e-05, + "loss": 7.5656, + "step": 1492500 + }, + { + "epoch": 6.080524376991697, + "grad_norm": 30.27763557434082, + "learning_rate": 6.667396928909737e-05, + "loss": 7.5174, + "step": 1492600 + }, + { + "epoch": 6.080931755015078, + "grad_norm": 8.610176086425781, + "learning_rate": 6.65688807490325e-05, + "loss": 7.5211, + "step": 1492700 + }, + { + "epoch": 6.081339133038459, + "grad_norm": 20.53519630432129, + "learning_rate": 6.646388861319148e-05, + "loss": 7.5401, + "step": 1492800 + }, + { + "epoch": 6.081746511061841, + "grad_norm": 8.410408020019531, + "learning_rate": 6.635899288572525e-05, + "loss": 7.5366, + "step": 1492900 + }, + { + "epoch": 6.082153889085222, + "grad_norm": 9.369873046875, + "learning_rate": 6.625419357078061e-05, + "loss": 7.5128, + "step": 1493000 + }, + { + "epoch": 6.082153889085222, + "eval_MaskedAccuracy": 0.5137086489015725, + "eval_loss": 1.576980471611023, + "eval_runtime": 165.4792, + "eval_samples_per_second": 383.589, + "eval_steps_per_second": 1.499, + "step": 1493000 + }, + { + "epoch": 6.082561267108604, + "grad_norm": 12.149567604064941, + "learning_rate": 6.614949067250127e-05, + "loss": 7.5179, + "step": 1493100 + }, + { + "epoch": 6.082968645131985, + "grad_norm": 2.9722039699554443, + "learning_rate": 6.604488419502623e-05, + "loss": 7.5354, + "step": 1493200 + }, + { + "epoch": 6.083376023155367, + "grad_norm": 13.29632568359375, + "learning_rate": 6.594037414249129e-05, + "loss": 7.5287, + "step": 1493300 + }, + { + "epoch": 6.083783401178748, + "grad_norm": 24.41215705871582, + "learning_rate": 6.58359605190286e-05, + "loss": 7.5353, + "step": 1493400 + }, + { + "epoch": 6.08419077920213, + "grad_norm": 5.133878707885742, + "learning_rate": 6.57316433287656e-05, + "loss": 7.5244, + "step": 1493500 + }, + { + "epoch": 6.0845981572255115, + "grad_norm": 14.606828689575195, + "learning_rate": 6.562742257582729e-05, + "loss": 7.5008, + "step": 1493600 + }, + { + "epoch": 6.085005535248893, + "grad_norm": 7.373438358306885, + "learning_rate": 6.552329826433337e-05, + "loss": 7.5131, + "step": 1493700 + }, + { + "epoch": 6.0854129132722745, + "grad_norm": 3.4081225395202637, + "learning_rate": 6.541927039840082e-05, + "loss": 7.4887, + "step": 1493800 + }, + { + "epoch": 6.085820291295656, + "grad_norm": 15.155529022216797, + "learning_rate": 6.531533898214251e-05, + "loss": 7.5363, + "step": 1493900 + }, + { + "epoch": 6.086227669319037, + "grad_norm": 11.844385147094727, + "learning_rate": 6.521150401966698e-05, + "loss": 7.5261, + "step": 1494000 + }, + { + "epoch": 6.086227669319037, + "eval_MaskedAccuracy": 0.513277743180825, + "eval_loss": 1.5814414024353027, + "eval_runtime": 158.5885, + "eval_samples_per_second": 400.256, + "eval_steps_per_second": 1.564, + "step": 1494000 + }, + { + "epoch": 6.086635047342418, + "grad_norm": 22.268585205078125, + "learning_rate": 6.510776551507941e-05, + "loss": 7.5169, + "step": 1494100 + }, + { + "epoch": 6.0870424253658, + "grad_norm": 35.34524154663086, + "learning_rate": 6.50041234724817e-05, + "loss": 7.5192, + "step": 1494200 + }, + { + "epoch": 6.087449803389181, + "grad_norm": 6.962163925170898, + "learning_rate": 6.49005778959707e-05, + "loss": 7.5179, + "step": 1494300 + }, + { + "epoch": 6.087857181412563, + "grad_norm": 3.497898817062378, + "learning_rate": 6.479712878964077e-05, + "loss": 7.5114, + "step": 1494400 + }, + { + "epoch": 6.088264559435944, + "grad_norm": 8.635665893554688, + "learning_rate": 6.469377615758141e-05, + "loss": 7.5209, + "step": 1494500 + }, + { + "epoch": 6.088671937459326, + "grad_norm": 20.112213134765625, + "learning_rate": 6.459052000387842e-05, + "loss": 7.5197, + "step": 1494600 + }, + { + "epoch": 6.089079315482707, + "grad_norm": 4.1904473304748535, + "learning_rate": 6.44873603326146e-05, + "loss": 7.51, + "step": 1494700 + }, + { + "epoch": 6.089486693506089, + "grad_norm": 8.575403213500977, + "learning_rate": 6.438429714786817e-05, + "loss": 7.5089, + "step": 1494800 + }, + { + "epoch": 6.0898940715294705, + "grad_norm": 28.999231338500977, + "learning_rate": 6.42813304537138e-05, + "loss": 7.5137, + "step": 1494900 + }, + { + "epoch": 6.090301449552852, + "grad_norm": 13.976512908935547, + "learning_rate": 6.41784602542223e-05, + "loss": 7.5096, + "step": 1495000 + }, + { + "epoch": 6.090301449552852, + "eval_MaskedAccuracy": 0.5134130855731317, + "eval_loss": 1.5810376405715942, + "eval_runtime": 157.1439, + "eval_samples_per_second": 403.935, + "eval_steps_per_second": 1.578, + "step": 1495000 + }, + { + "epoch": 6.0907088275762336, + "grad_norm": 9.604857444763184, + "learning_rate": 6.40756865534607e-05, + "loss": 7.5231, + "step": 1495100 + }, + { + "epoch": 6.091116205599614, + "grad_norm": 12.27506160736084, + "learning_rate": 6.39730093554921e-05, + "loss": 7.5165, + "step": 1495200 + }, + { + "epoch": 6.091523583622996, + "grad_norm": 4.105710983276367, + "learning_rate": 6.387042866437618e-05, + "loss": 7.5047, + "step": 1495300 + }, + { + "epoch": 6.091930961646377, + "grad_norm": 3.478956699371338, + "learning_rate": 6.37679444841684e-05, + "loss": 7.4837, + "step": 1495400 + }, + { + "epoch": 6.092338339669759, + "grad_norm": 12.801264762878418, + "learning_rate": 6.366555681891957e-05, + "loss": 7.5093, + "step": 1495500 + }, + { + "epoch": 6.09274571769314, + "grad_norm": 3.358525037765503, + "learning_rate": 6.356326567267916e-05, + "loss": 7.5539, + "step": 1495600 + }, + { + "epoch": 6.093153095716522, + "grad_norm": 22.82572364807129, + "learning_rate": 6.346107104949042e-05, + "loss": 7.5156, + "step": 1495700 + }, + { + "epoch": 6.093560473739903, + "grad_norm": 2.934875011444092, + "learning_rate": 6.335897295339384e-05, + "loss": 7.5188, + "step": 1495800 + }, + { + "epoch": 6.093967851763285, + "grad_norm": 3.2453079223632812, + "learning_rate": 6.325697138842585e-05, + "loss": 7.4781, + "step": 1495900 + }, + { + "epoch": 6.0943752297866665, + "grad_norm": 4.031073093414307, + "learning_rate": 6.315506635861938e-05, + "loss": 7.535, + "step": 1496000 + }, + { + "epoch": 6.0943752297866665, + "eval_MaskedAccuracy": 0.5137616484666648, + "eval_loss": 1.5853080749511719, + "eval_runtime": 164.0886, + "eval_samples_per_second": 386.84, + "eval_steps_per_second": 1.511, + "step": 1496000 + }, + { + "epoch": 6.094782607810048, + "grad_norm": 3.092862606048584, + "learning_rate": 6.305325786800281e-05, + "loss": 7.5226, + "step": 1496100 + }, + { + "epoch": 6.0951899858334295, + "grad_norm": 12.842656135559082, + "learning_rate": 6.295154592060126e-05, + "loss": 7.5319, + "step": 1496200 + }, + { + "epoch": 6.095597363856811, + "grad_norm": 6.992317199707031, + "learning_rate": 6.284993052043654e-05, + "loss": 7.5205, + "step": 1496300 + }, + { + "epoch": 6.096004741880193, + "grad_norm": 4.310663223266602, + "learning_rate": 6.274841167152531e-05, + "loss": 7.4866, + "step": 1496400 + }, + { + "epoch": 6.096412119903573, + "grad_norm": 9.675249099731445, + "learning_rate": 6.264698937788139e-05, + "loss": 7.5292, + "step": 1496500 + }, + { + "epoch": 6.096819497926955, + "grad_norm": 4.376608848571777, + "learning_rate": 6.25456636435147e-05, + "loss": 7.505, + "step": 1496600 + }, + { + "epoch": 6.097226875950336, + "grad_norm": 9.331419944763184, + "learning_rate": 6.244443447243108e-05, + "loss": 7.4936, + "step": 1496700 + }, + { + "epoch": 6.097634253973718, + "grad_norm": 4.6320977210998535, + "learning_rate": 6.234330186863277e-05, + "loss": 7.537, + "step": 1496800 + }, + { + "epoch": 6.098041631997099, + "grad_norm": 4.060110092163086, + "learning_rate": 6.224226583611792e-05, + "loss": 7.5192, + "step": 1496900 + }, + { + "epoch": 6.098449010020481, + "grad_norm": 4.308634281158447, + "learning_rate": 6.214132637888117e-05, + "loss": 7.523, + "step": 1497000 + }, + { + "epoch": 6.098449010020481, + "eval_MaskedAccuracy": 0.5131019622926449, + "eval_loss": 1.59047532081604, + "eval_runtime": 164.3752, + "eval_samples_per_second": 386.165, + "eval_steps_per_second": 1.509, + "step": 1497000 + }, + { + "epoch": 6.098856388043862, + "grad_norm": 3.2699756622314453, + "learning_rate": 6.204048350091304e-05, + "loss": 7.4865, + "step": 1497100 + }, + { + "epoch": 6.099263766067244, + "grad_norm": 5.7099833488464355, + "learning_rate": 6.193973720620037e-05, + "loss": 7.4747, + "step": 1497200 + }, + { + "epoch": 6.0996711440906255, + "grad_norm": 3.1226887702941895, + "learning_rate": 6.183908749872627e-05, + "loss": 7.5386, + "step": 1497300 + }, + { + "epoch": 6.100078522114007, + "grad_norm": 9.971474647521973, + "learning_rate": 6.173853438246976e-05, + "loss": 7.5233, + "step": 1497400 + }, + { + "epoch": 6.100485900137389, + "grad_norm": 41.65095520019531, + "learning_rate": 6.163807786140696e-05, + "loss": 7.5278, + "step": 1497500 + }, + { + "epoch": 6.10089327816077, + "grad_norm": 15.143765449523926, + "learning_rate": 6.153771793950887e-05, + "loss": 7.5195, + "step": 1497600 + }, + { + "epoch": 6.101300656184151, + "grad_norm": 44.65937042236328, + "learning_rate": 6.143745462074318e-05, + "loss": 7.5296, + "step": 1497700 + }, + { + "epoch": 6.101708034207532, + "grad_norm": 8.260921478271484, + "learning_rate": 6.133728790907413e-05, + "loss": 7.5214, + "step": 1497800 + }, + { + "epoch": 6.102115412230914, + "grad_norm": 3.633737564086914, + "learning_rate": 6.123721780846173e-05, + "loss": 7.5076, + "step": 1497900 + }, + { + "epoch": 6.102522790254295, + "grad_norm": 6.263742923736572, + "learning_rate": 6.113724432286207e-05, + "loss": 7.4943, + "step": 1498000 + }, + { + "epoch": 6.102522790254295, + "eval_MaskedAccuracy": 0.5135091319582853, + "eval_loss": 1.5897711515426636, + "eval_runtime": 165.0235, + "eval_samples_per_second": 384.648, + "eval_steps_per_second": 1.503, + "step": 1498000 + }, + { + "epoch": 6.102930168277677, + "grad_norm": 38.498966217041016, + "learning_rate": 6.103736745622785e-05, + "loss": 7.5505, + "step": 1498100 + }, + { + "epoch": 6.103337546301058, + "grad_norm": 35.71931076049805, + "learning_rate": 6.093758721250758e-05, + "loss": 7.5099, + "step": 1498200 + }, + { + "epoch": 6.10374492432444, + "grad_norm": 9.26229190826416, + "learning_rate": 6.0837903595646224e-05, + "loss": 7.5418, + "step": 1498300 + }, + { + "epoch": 6.1041523023478215, + "grad_norm": 26.859647750854492, + "learning_rate": 6.07383166095847e-05, + "loss": 7.5456, + "step": 1498400 + }, + { + "epoch": 6.104559680371203, + "grad_norm": 2.4605488777160645, + "learning_rate": 6.063882625826058e-05, + "loss": 7.491, + "step": 1498500 + }, + { + "epoch": 6.1049670583945845, + "grad_norm": 44.589752197265625, + "learning_rate": 6.053943254560676e-05, + "loss": 7.4924, + "step": 1498600 + }, + { + "epoch": 6.105374436417966, + "grad_norm": 3.9444704055786133, + "learning_rate": 6.0440135475553265e-05, + "loss": 7.5357, + "step": 1498700 + }, + { + "epoch": 6.105781814441348, + "grad_norm": 22.431640625, + "learning_rate": 6.034093505202546e-05, + "loss": 7.5317, + "step": 1498800 + }, + { + "epoch": 6.106189192464729, + "grad_norm": 12.822697639465332, + "learning_rate": 6.024183127894574e-05, + "loss": 7.5083, + "step": 1498900 + }, + { + "epoch": 6.10659657048811, + "grad_norm": 19.53934669494629, + "learning_rate": 6.014282416023171e-05, + "loss": 7.5185, + "step": 1499000 + }, + { + "epoch": 6.10659657048811, + "eval_MaskedAccuracy": 0.5143693548931055, + "eval_loss": 1.572809100151062, + "eval_runtime": 162.4281, + "eval_samples_per_second": 390.794, + "eval_steps_per_second": 1.527, + "step": 1499000 + }, + { + "epoch": 6.107003948511491, + "grad_norm": 22.403322219848633, + "learning_rate": 6.0043913699797884e-05, + "loss": 7.5242, + "step": 1499100 + }, + { + "epoch": 6.107411326534873, + "grad_norm": 10.545846939086914, + "learning_rate": 5.994509990155476e-05, + "loss": 7.5222, + "step": 1499200 + }, + { + "epoch": 6.107818704558254, + "grad_norm": 39.69652557373047, + "learning_rate": 5.9846382769408816e-05, + "loss": 7.5152, + "step": 1499300 + }, + { + "epoch": 6.108226082581636, + "grad_norm": 26.553680419921875, + "learning_rate": 5.97477623072627e-05, + "loss": 7.5034, + "step": 1499400 + }, + { + "epoch": 6.108633460605017, + "grad_norm": 15.599024772644043, + "learning_rate": 5.964923851901581e-05, + "loss": 7.5221, + "step": 1499500 + }, + { + "epoch": 6.109040838628399, + "grad_norm": 20.136043548583984, + "learning_rate": 5.955081140856323e-05, + "loss": 7.5389, + "step": 1499600 + }, + { + "epoch": 6.1094482166517805, + "grad_norm": 16.722702026367188, + "learning_rate": 5.9452480979796396e-05, + "loss": 7.523, + "step": 1499700 + }, + { + "epoch": 6.109855594675162, + "grad_norm": 3.7607839107513428, + "learning_rate": 5.9354247236602686e-05, + "loss": 7.51, + "step": 1499800 + }, + { + "epoch": 6.110262972698544, + "grad_norm": 15.021867752075195, + "learning_rate": 5.925611018286561e-05, + "loss": 7.5226, + "step": 1499900 + }, + { + "epoch": 6.110670350721925, + "grad_norm": 3.6220667362213135, + "learning_rate": 5.915806982246545e-05, + "loss": 7.5128, + "step": 1500000 + }, + { + "epoch": 6.110670350721925, + "eval_MaskedAccuracy": 0.5136200421896125, + "eval_loss": 1.5887588262557983, + "eval_runtime": 171.5534, + "eval_samples_per_second": 370.007, + "eval_steps_per_second": 1.446, + "step": 1500000 + }, + { + "epoch": 6.111077728745307, + "grad_norm": 16.785629272460938, + "learning_rate": 5.906012615927822e-05, + "loss": 7.5288, + "step": 1500100 + }, + { + "epoch": 6.111485106768687, + "grad_norm": 29.793237686157227, + "learning_rate": 5.8962279197175594e-05, + "loss": 7.5424, + "step": 1500200 + }, + { + "epoch": 6.111892484792069, + "grad_norm": 12.748746871948242, + "learning_rate": 5.886452894002673e-05, + "loss": 7.533, + "step": 1500300 + }, + { + "epoch": 6.11229986281545, + "grad_norm": 18.582733154296875, + "learning_rate": 5.876687539169579e-05, + "loss": 7.5181, + "step": 1500400 + }, + { + "epoch": 6.112707240838832, + "grad_norm": 4.814263343811035, + "learning_rate": 5.866931855604359e-05, + "loss": 7.5212, + "step": 1500500 + }, + { + "epoch": 6.113114618862213, + "grad_norm": 10.030101776123047, + "learning_rate": 5.8571858436927295e-05, + "loss": 7.5309, + "step": 1500600 + }, + { + "epoch": 6.113521996885595, + "grad_norm": 32.51324462890625, + "learning_rate": 5.847449503819984e-05, + "loss": 7.5445, + "step": 1500700 + }, + { + "epoch": 6.1139293749089765, + "grad_norm": 6.151972770690918, + "learning_rate": 5.83772283637103e-05, + "loss": 7.505, + "step": 1500800 + }, + { + "epoch": 6.114336752932358, + "grad_norm": 9.38499927520752, + "learning_rate": 5.8280058417304595e-05, + "loss": 7.5265, + "step": 1500900 + }, + { + "epoch": 6.1147441309557395, + "grad_norm": 8.609825134277344, + "learning_rate": 5.818298520282393e-05, + "loss": 7.4827, + "step": 1501000 + }, + { + "epoch": 6.1147441309557395, + "eval_MaskedAccuracy": 0.5139675753318365, + "eval_loss": 1.581284523010254, + "eval_runtime": 154.483, + "eval_samples_per_second": 410.893, + "eval_steps_per_second": 1.605, + "step": 1501000 + }, + { + "epoch": 6.115151508979121, + "grad_norm": 15.545623779296875, + "learning_rate": 5.808600872410646e-05, + "loss": 7.5159, + "step": 1501100 + }, + { + "epoch": 6.115558887002503, + "grad_norm": 5.443942546844482, + "learning_rate": 5.798912898498633e-05, + "loss": 7.5376, + "step": 1501200 + }, + { + "epoch": 6.115966265025884, + "grad_norm": 21.568159103393555, + "learning_rate": 5.789234598929311e-05, + "loss": 7.5197, + "step": 1501300 + }, + { + "epoch": 6.116373643049266, + "grad_norm": 16.5999813079834, + "learning_rate": 5.779565974085333e-05, + "loss": 7.5445, + "step": 1501400 + }, + { + "epoch": 6.116781021072646, + "grad_norm": 24.65265464782715, + "learning_rate": 5.76990702434901e-05, + "loss": 7.52, + "step": 1501500 + }, + { + "epoch": 6.117188399096028, + "grad_norm": 10.785026550292969, + "learning_rate": 5.7602577501021594e-05, + "loss": 7.5149, + "step": 1501600 + }, + { + "epoch": 6.117595777119409, + "grad_norm": 24.120969772338867, + "learning_rate": 5.750618151726301e-05, + "loss": 7.5243, + "step": 1501700 + }, + { + "epoch": 6.118003155142791, + "grad_norm": 5.7547993659973145, + "learning_rate": 5.7409882296024915e-05, + "loss": 7.4853, + "step": 1501800 + }, + { + "epoch": 6.1184105331661724, + "grad_norm": 11.60728931427002, + "learning_rate": 5.731367984111482e-05, + "loss": 7.5163, + "step": 1501900 + }, + { + "epoch": 6.118817911189554, + "grad_norm": 14.78231143951416, + "learning_rate": 5.7217574156336114e-05, + "loss": 7.5376, + "step": 1502000 + }, + { + "epoch": 6.118817911189554, + "eval_MaskedAccuracy": 0.5125785951915778, + "eval_loss": 1.5911993980407715, + "eval_runtime": 161.9036, + "eval_samples_per_second": 392.06, + "eval_steps_per_second": 1.532, + "step": 1502000 + }, + { + "epoch": 6.1192252892129355, + "grad_norm": 29.454513549804688, + "learning_rate": 5.712156524548849e-05, + "loss": 7.5158, + "step": 1502100 + }, + { + "epoch": 6.119632667236317, + "grad_norm": 22.591447830200195, + "learning_rate": 5.7025653112367435e-05, + "loss": 7.5205, + "step": 1502200 + }, + { + "epoch": 6.120040045259699, + "grad_norm": 17.294477462768555, + "learning_rate": 5.692983776076528e-05, + "loss": 7.5469, + "step": 1502300 + }, + { + "epoch": 6.12044742328308, + "grad_norm": 4.483959197998047, + "learning_rate": 5.683411919446954e-05, + "loss": 7.5126, + "step": 1502400 + }, + { + "epoch": 6.120854801306462, + "grad_norm": 8.0223388671875, + "learning_rate": 5.673849741726482e-05, + "loss": 7.5208, + "step": 1502500 + }, + { + "epoch": 6.121262179329843, + "grad_norm": 8.338587760925293, + "learning_rate": 5.6642972432931614e-05, + "loss": 7.5289, + "step": 1502600 + }, + { + "epoch": 6.121669557353224, + "grad_norm": 9.082412719726562, + "learning_rate": 5.654754424524656e-05, + "loss": 7.5317, + "step": 1502700 + }, + { + "epoch": 6.122076935376605, + "grad_norm": 5.886031627655029, + "learning_rate": 5.6452212857982136e-05, + "loss": 7.5404, + "step": 1502800 + }, + { + "epoch": 6.122484313399987, + "grad_norm": 4.301381587982178, + "learning_rate": 5.635697827490751e-05, + "loss": 7.5355, + "step": 1502900 + }, + { + "epoch": 6.122891691423368, + "grad_norm": 21.232097625732422, + "learning_rate": 5.626184049978793e-05, + "loss": 7.5442, + "step": 1503000 + }, + { + "epoch": 6.122891691423368, + "eval_MaskedAccuracy": 0.5132958692081369, + "eval_loss": 1.5840513706207275, + "eval_runtime": 167.5835, + "eval_samples_per_second": 378.772, + "eval_steps_per_second": 1.48, + "step": 1503000 + }, + { + "epoch": 6.12329906944675, + "grad_norm": 7.63557767868042, + "learning_rate": 5.616679953638451e-05, + "loss": 7.5196, + "step": 1503100 + }, + { + "epoch": 6.1237064474701315, + "grad_norm": 20.64089012145996, + "learning_rate": 5.6071855388454756e-05, + "loss": 7.5087, + "step": 1503200 + }, + { + "epoch": 6.124113825493513, + "grad_norm": 3.5224153995513916, + "learning_rate": 5.5977008059752304e-05, + "loss": 7.5247, + "step": 1503300 + }, + { + "epoch": 6.1245212035168946, + "grad_norm": 10.130736351013184, + "learning_rate": 5.5882257554026586e-05, + "loss": 7.5134, + "step": 1503400 + }, + { + "epoch": 6.124928581540276, + "grad_norm": 9.070799827575684, + "learning_rate": 5.578760387502442e-05, + "loss": 7.5448, + "step": 1503500 + }, + { + "epoch": 6.125335959563658, + "grad_norm": 15.010527610778809, + "learning_rate": 5.569304702648751e-05, + "loss": 7.5372, + "step": 1503600 + }, + { + "epoch": 6.125743337587039, + "grad_norm": 30.21007537841797, + "learning_rate": 5.5598587012154466e-05, + "loss": 7.5026, + "step": 1503700 + }, + { + "epoch": 6.126150715610421, + "grad_norm": 34.350929260253906, + "learning_rate": 5.5504223835759575e-05, + "loss": 7.5271, + "step": 1503800 + }, + { + "epoch": 6.126558093633802, + "grad_norm": 5.31763219833374, + "learning_rate": 5.540995750103348e-05, + "loss": 7.5206, + "step": 1503900 + }, + { + "epoch": 6.126965471657183, + "grad_norm": 14.548882484436035, + "learning_rate": 5.531578801170296e-05, + "loss": 7.5329, + "step": 1504000 + }, + { + "epoch": 6.126965471657183, + "eval_MaskedAccuracy": 0.5132471680824442, + "eval_loss": 1.5844782590866089, + "eval_runtime": 164.7584, + "eval_samples_per_second": 385.267, + "eval_steps_per_second": 1.505, + "step": 1504000 + }, + { + "epoch": 6.127372849680564, + "grad_norm": 4.667428493499756, + "learning_rate": 5.522171537149146e-05, + "loss": 7.525, + "step": 1504100 + }, + { + "epoch": 6.127780227703946, + "grad_norm": 21.363624572753906, + "learning_rate": 5.512773958411767e-05, + "loss": 7.5196, + "step": 1504200 + }, + { + "epoch": 6.1281876057273275, + "grad_norm": 17.82527732849121, + "learning_rate": 5.5033860653297334e-05, + "loss": 7.5239, + "step": 1504300 + }, + { + "epoch": 6.128594983750709, + "grad_norm": 28.17323112487793, + "learning_rate": 5.4940078582741645e-05, + "loss": 7.5321, + "step": 1504400 + }, + { + "epoch": 6.1290023617740905, + "grad_norm": 23.739816665649414, + "learning_rate": 5.4846393376158575e-05, + "loss": 7.563, + "step": 1504500 + }, + { + "epoch": 6.129409739797472, + "grad_norm": 10.429069519042969, + "learning_rate": 5.475280503725186e-05, + "loss": 7.5084, + "step": 1504600 + }, + { + "epoch": 6.129817117820854, + "grad_norm": 10.665395736694336, + "learning_rate": 5.465931356972176e-05, + "loss": 7.5192, + "step": 1504700 + }, + { + "epoch": 6.130224495844235, + "grad_norm": 18.882339477539062, + "learning_rate": 5.456591897726429e-05, + "loss": 7.5131, + "step": 1504800 + }, + { + "epoch": 6.130631873867617, + "grad_norm": 15.698235511779785, + "learning_rate": 5.447262126357175e-05, + "loss": 7.519, + "step": 1504900 + }, + { + "epoch": 6.131039251890998, + "grad_norm": 9.932863235473633, + "learning_rate": 5.437942043233305e-05, + "loss": 7.5235, + "step": 1505000 + }, + { + "epoch": 6.131039251890998, + "eval_MaskedAccuracy": 0.5135489777019518, + "eval_loss": 1.5909253358840942, + "eval_runtime": 205.4964, + "eval_samples_per_second": 308.891, + "eval_steps_per_second": 1.207, + "step": 1505000 + }, + { + "epoch": 6.13144662991438, + "grad_norm": 11.029427528381348, + "learning_rate": 5.4286316487232534e-05, + "loss": 7.5275, + "step": 1505100 + }, + { + "epoch": 6.13185400793776, + "grad_norm": 15.226016998291016, + "learning_rate": 5.419330943195129e-05, + "loss": 7.5026, + "step": 1505200 + }, + { + "epoch": 6.132261385961142, + "grad_norm": 8.539198875427246, + "learning_rate": 5.410039927016648e-05, + "loss": 7.5387, + "step": 1505300 + }, + { + "epoch": 6.132668763984523, + "grad_norm": 3.748371124267578, + "learning_rate": 5.400758600555089e-05, + "loss": 7.5239, + "step": 1505400 + }, + { + "epoch": 6.133076142007905, + "grad_norm": 19.625595092773438, + "learning_rate": 5.391486964177449e-05, + "loss": 7.5148, + "step": 1505500 + }, + { + "epoch": 6.1334835200312865, + "grad_norm": 18.956878662109375, + "learning_rate": 5.3822250182502516e-05, + "loss": 7.5166, + "step": 1505600 + }, + { + "epoch": 6.133890898054668, + "grad_norm": 16.757740020751953, + "learning_rate": 5.372972763139697e-05, + "loss": 7.5499, + "step": 1505700 + }, + { + "epoch": 6.13429827607805, + "grad_norm": 27.8960018157959, + "learning_rate": 5.3637301992115615e-05, + "loss": 7.484, + "step": 1505800 + }, + { + "epoch": 6.134705654101431, + "grad_norm": 16.285743713378906, + "learning_rate": 5.354497326831234e-05, + "loss": 7.5491, + "step": 1505900 + }, + { + "epoch": 6.135113032124813, + "grad_norm": 3.0651907920837402, + "learning_rate": 5.345274146363771e-05, + "loss": 7.5537, + "step": 1506000 + }, + { + "epoch": 6.135113032124813, + "eval_MaskedAccuracy": 0.5133615278180056, + "eval_loss": 1.5865812301635742, + "eval_runtime": 173.4388, + "eval_samples_per_second": 365.985, + "eval_steps_per_second": 1.43, + "step": 1506000 + }, + { + "epoch": 6.135520410148194, + "grad_norm": 8.346098899841309, + "learning_rate": 5.336060658173798e-05, + "loss": 7.556, + "step": 1506100 + }, + { + "epoch": 6.135927788171576, + "grad_norm": 7.614825248718262, + "learning_rate": 5.3268568626255686e-05, + "loss": 7.5436, + "step": 1506200 + }, + { + "epoch": 6.136335166194957, + "grad_norm": 24.567646026611328, + "learning_rate": 5.317662760082977e-05, + "loss": 7.5348, + "step": 1506300 + }, + { + "epoch": 6.136742544218338, + "grad_norm": 12.6890287399292, + "learning_rate": 5.3084783509094744e-05, + "loss": 7.5319, + "step": 1506400 + }, + { + "epoch": 6.137149922241719, + "grad_norm": 17.08099365234375, + "learning_rate": 5.299303635468242e-05, + "loss": 7.533, + "step": 1506500 + }, + { + "epoch": 6.137557300265101, + "grad_norm": 13.692460060119629, + "learning_rate": 5.290138614121951e-05, + "loss": 7.4986, + "step": 1506600 + }, + { + "epoch": 6.1379646782884825, + "grad_norm": 9.78744125366211, + "learning_rate": 5.280983287232919e-05, + "loss": 7.5201, + "step": 1506700 + }, + { + "epoch": 6.138372056311864, + "grad_norm": 10.857870101928711, + "learning_rate": 5.271837655163163e-05, + "loss": 7.5503, + "step": 1506800 + }, + { + "epoch": 6.1387794343352455, + "grad_norm": 18.279020309448242, + "learning_rate": 5.2627017182742455e-05, + "loss": 7.5017, + "step": 1506900 + }, + { + "epoch": 6.139186812358627, + "grad_norm": 10.454538345336914, + "learning_rate": 5.2535754769273487e-05, + "loss": 7.5279, + "step": 1507000 + }, + { + "epoch": 6.139186812358627, + "eval_MaskedAccuracy": 0.5133254952590061, + "eval_loss": 1.5855728387832642, + "eval_runtime": 174.1098, + "eval_samples_per_second": 364.574, + "eval_steps_per_second": 1.424, + "step": 1507000 + }, + { + "epoch": 6.139594190382009, + "grad_norm": 9.251683235168457, + "learning_rate": 5.244458931483292e-05, + "loss": 7.5185, + "step": 1507100 + }, + { + "epoch": 6.14000156840539, + "grad_norm": 22.32879066467285, + "learning_rate": 5.2353520823024836e-05, + "loss": 7.5305, + "step": 1507200 + }, + { + "epoch": 6.140408946428772, + "grad_norm": 22.638103485107422, + "learning_rate": 5.2262549297449725e-05, + "loss": 7.5294, + "step": 1507300 + }, + { + "epoch": 6.140816324452153, + "grad_norm": 22.381141662597656, + "learning_rate": 5.2171674741704125e-05, + "loss": 7.5399, + "step": 1507400 + }, + { + "epoch": 6.141223702475535, + "grad_norm": 5.647333145141602, + "learning_rate": 5.2080897159381016e-05, + "loss": 7.4983, + "step": 1507500 + }, + { + "epoch": 6.141631080498916, + "grad_norm": 24.05869483947754, + "learning_rate": 5.199021655406925e-05, + "loss": 7.5205, + "step": 1507600 + }, + { + "epoch": 6.142038458522297, + "grad_norm": 22.714885711669922, + "learning_rate": 5.1899632929353695e-05, + "loss": 7.5062, + "step": 1507700 + }, + { + "epoch": 6.142445836545678, + "grad_norm": 15.513699531555176, + "learning_rate": 5.180914628881572e-05, + "loss": 7.5508, + "step": 1507800 + }, + { + "epoch": 6.14285321456906, + "grad_norm": 5.411686897277832, + "learning_rate": 5.17187566360328e-05, + "loss": 7.5067, + "step": 1507900 + }, + { + "epoch": 6.1432605925924415, + "grad_norm": 15.956671714782715, + "learning_rate": 5.162846397457855e-05, + "loss": 7.5378, + "step": 1508000 + }, + { + "epoch": 6.1432605925924415, + "eval_MaskedAccuracy": 0.5133262331475851, + "eval_loss": 1.5871728658676147, + "eval_runtime": 176.6592, + "eval_samples_per_second": 359.313, + "eval_steps_per_second": 1.404, + "step": 1508000 + }, + { + "epoch": 6.143667970615823, + "grad_norm": 10.043844223022461, + "learning_rate": 5.153826830802267e-05, + "loss": 7.5047, + "step": 1508100 + }, + { + "epoch": 6.144075348639205, + "grad_norm": 8.08338737487793, + "learning_rate": 5.1448169639931225e-05, + "loss": 7.5137, + "step": 1508200 + }, + { + "epoch": 6.144482726662586, + "grad_norm": 14.614755630493164, + "learning_rate": 5.1358167973865915e-05, + "loss": 7.5398, + "step": 1508300 + }, + { + "epoch": 6.144890104685968, + "grad_norm": 13.929628372192383, + "learning_rate": 5.1268263313385335e-05, + "loss": 7.5202, + "step": 1508400 + }, + { + "epoch": 6.145297482709349, + "grad_norm": 11.499002456665039, + "learning_rate": 5.117845566204373e-05, + "loss": 7.5273, + "step": 1508500 + }, + { + "epoch": 6.145704860732731, + "grad_norm": 6.545090675354004, + "learning_rate": 5.108874502339197e-05, + "loss": 7.5356, + "step": 1508600 + }, + { + "epoch": 6.146112238756112, + "grad_norm": 19.642810821533203, + "learning_rate": 5.099913140097616e-05, + "loss": 7.5384, + "step": 1508700 + }, + { + "epoch": 6.146519616779494, + "grad_norm": 12.959432601928711, + "learning_rate": 5.0909614798340014e-05, + "loss": 7.5248, + "step": 1508800 + }, + { + "epoch": 6.146926994802875, + "grad_norm": 15.121339797973633, + "learning_rate": 5.082019521902198e-05, + "loss": 7.5185, + "step": 1508900 + }, + { + "epoch": 6.147334372826256, + "grad_norm": 6.627651214599609, + "learning_rate": 5.073087266655732e-05, + "loss": 7.5288, + "step": 1509000 + }, + { + "epoch": 6.147334372826256, + "eval_MaskedAccuracy": 0.5135563572028291, + "eval_loss": 1.5837535858154297, + "eval_runtime": 168.5913, + "eval_samples_per_second": 376.508, + "eval_steps_per_second": 1.471, + "step": 1509000 + }, + { + "epoch": 6.1477417508496375, + "grad_norm": 16.430068969726562, + "learning_rate": 5.064164714447782e-05, + "loss": 7.5318, + "step": 1509100 + }, + { + "epoch": 6.148149128873019, + "grad_norm": 5.408413887023926, + "learning_rate": 5.055251865631089e-05, + "loss": 7.5317, + "step": 1509200 + }, + { + "epoch": 6.1485565068964005, + "grad_norm": 6.283885478973389, + "learning_rate": 5.046348720558015e-05, + "loss": 7.5328, + "step": 1509300 + }, + { + "epoch": 6.148963884919782, + "grad_norm": 13.609271049499512, + "learning_rate": 5.037455279580546e-05, + "loss": 7.5264, + "step": 1509400 + }, + { + "epoch": 6.149371262943164, + "grad_norm": 19.50933265686035, + "learning_rate": 5.02857154305028e-05, + "loss": 7.5452, + "step": 1509500 + }, + { + "epoch": 6.149778640966545, + "grad_norm": 22.08439064025879, + "learning_rate": 5.019697511318486e-05, + "loss": 7.5474, + "step": 1509600 + }, + { + "epoch": 6.150186018989927, + "grad_norm": 18.241321563720703, + "learning_rate": 5.010833184735959e-05, + "loss": 7.495, + "step": 1509700 + }, + { + "epoch": 6.150593397013308, + "grad_norm": 15.491229057312012, + "learning_rate": 5.001978563653154e-05, + "loss": 7.5639, + "step": 1509800 + }, + { + "epoch": 6.15100077503669, + "grad_norm": 3.1370866298675537, + "learning_rate": 4.9931336484201755e-05, + "loss": 7.5406, + "step": 1509900 + }, + { + "epoch": 6.151408153060071, + "grad_norm": 18.79566192626953, + "learning_rate": 4.984298439386675e-05, + "loss": 7.5138, + "step": 1510000 + }, + { + "epoch": 6.151408153060071, + "eval_MaskedAccuracy": 0.5132923806800755, + "eval_loss": 1.5832467079162598, + "eval_runtime": 199.0901, + "eval_samples_per_second": 318.831, + "eval_steps_per_second": 1.246, + "step": 1510000 + }, + { + "epoch": 6.151815531083453, + "grad_norm": 20.028486251831055, + "learning_rate": 4.975472936901961e-05, + "loss": 7.5247, + "step": 1510100 + }, + { + "epoch": 6.1522229091068334, + "grad_norm": 4.8658599853515625, + "learning_rate": 4.966657141314969e-05, + "loss": 7.5394, + "step": 1510200 + }, + { + "epoch": 6.152630287130215, + "grad_norm": 36.72107696533203, + "learning_rate": 4.9578510529742375e-05, + "loss": 7.518, + "step": 1510300 + }, + { + "epoch": 6.1530376651535965, + "grad_norm": 37.59674835205078, + "learning_rate": 4.949054672227884e-05, + "loss": 7.5205, + "step": 1510400 + }, + { + "epoch": 6.153445043176978, + "grad_norm": 3.0896642208099365, + "learning_rate": 4.940267999423704e-05, + "loss": 7.5015, + "step": 1510500 + }, + { + "epoch": 6.15385242120036, + "grad_norm": 5.979048252105713, + "learning_rate": 4.9314910349090614e-05, + "loss": 7.5147, + "step": 1510600 + }, + { + "epoch": 6.154259799223741, + "grad_norm": 38.82964324951172, + "learning_rate": 4.9227237790310136e-05, + "loss": 7.5136, + "step": 1510700 + }, + { + "epoch": 6.154667177247123, + "grad_norm": 7.174513816833496, + "learning_rate": 4.913966232136086e-05, + "loss": 7.5285, + "step": 1510800 + }, + { + "epoch": 6.155074555270504, + "grad_norm": 20.411285400390625, + "learning_rate": 4.905218394570606e-05, + "loss": 7.5316, + "step": 1510900 + }, + { + "epoch": 6.155481933293886, + "grad_norm": 39.270877838134766, + "learning_rate": 4.896480266680332e-05, + "loss": 7.5294, + "step": 1511000 + }, + { + "epoch": 6.155481933293886, + "eval_MaskedAccuracy": 0.5138560436909618, + "eval_loss": 1.5828372240066528, + "eval_runtime": 168.6993, + "eval_samples_per_second": 376.267, + "eval_steps_per_second": 1.47, + "step": 1511000 + }, + { + "epoch": 6.155889311317267, + "grad_norm": 4.460048198699951, + "learning_rate": 4.887751848810792e-05, + "loss": 7.5489, + "step": 1511100 + }, + { + "epoch": 6.156296689340649, + "grad_norm": 3.7612500190734863, + "learning_rate": 4.879033141307078e-05, + "loss": 7.5246, + "step": 1511200 + }, + { + "epoch": 6.15670406736403, + "grad_norm": 5.42179536819458, + "learning_rate": 4.8703241445137996e-05, + "loss": 7.5126, + "step": 1511300 + }, + { + "epoch": 6.157111445387411, + "grad_norm": 18.84461212158203, + "learning_rate": 4.861624858775356e-05, + "loss": 7.5481, + "step": 1511400 + }, + { + "epoch": 6.1575188234107925, + "grad_norm": 10.883890151977539, + "learning_rate": 4.85293528443564e-05, + "loss": 7.4938, + "step": 1511500 + }, + { + "epoch": 6.157926201434174, + "grad_norm": 30.8356990814209, + "learning_rate": 4.8442554218382146e-05, + "loss": 7.5118, + "step": 1511600 + }, + { + "epoch": 6.1583335794575556, + "grad_norm": 9.673890113830566, + "learning_rate": 4.835585271326236e-05, + "loss": 7.514, + "step": 1511700 + }, + { + "epoch": 6.158740957480937, + "grad_norm": 23.722261428833008, + "learning_rate": 4.826924833242485e-05, + "loss": 7.5246, + "step": 1511800 + }, + { + "epoch": 6.159148335504319, + "grad_norm": 16.473833084106445, + "learning_rate": 4.8182741079293106e-05, + "loss": 7.534, + "step": 1511900 + }, + { + "epoch": 6.1595557135277, + "grad_norm": 8.175965309143066, + "learning_rate": 4.809633095728808e-05, + "loss": 7.4927, + "step": 1512000 + }, + { + "epoch": 6.1595557135277, + "eval_MaskedAccuracy": 0.5136075506337975, + "eval_loss": 1.5865976810455322, + "eval_runtime": 163.032, + "eval_samples_per_second": 389.347, + "eval_steps_per_second": 1.521, + "step": 1512000 + }, + { + "epoch": 6.159963091551082, + "grad_norm": 3.847217082977295, + "learning_rate": 4.801001796982517e-05, + "loss": 7.5415, + "step": 1512100 + }, + { + "epoch": 6.160370469574463, + "grad_norm": 3.657407522201538, + "learning_rate": 4.792380212031735e-05, + "loss": 7.5284, + "step": 1512200 + }, + { + "epoch": 6.160777847597845, + "grad_norm": 14.559707641601562, + "learning_rate": 4.7837683412172776e-05, + "loss": 7.507, + "step": 1512300 + }, + { + "epoch": 6.161185225621226, + "grad_norm": 3.464966297149658, + "learning_rate": 4.7751661848796645e-05, + "loss": 7.5167, + "step": 1512400 + }, + { + "epoch": 6.161592603644608, + "grad_norm": 4.260078430175781, + "learning_rate": 4.766573743358945e-05, + "loss": 7.5152, + "step": 1512500 + }, + { + "epoch": 6.161999981667989, + "grad_norm": 4.311563968658447, + "learning_rate": 4.757991016994856e-05, + "loss": 7.4896, + "step": 1512600 + }, + { + "epoch": 6.16240735969137, + "grad_norm": 7.310108661651611, + "learning_rate": 4.749418006126697e-05, + "loss": 7.512, + "step": 1512700 + }, + { + "epoch": 6.1628147377147515, + "grad_norm": 16.643510818481445, + "learning_rate": 4.740854711093398e-05, + "loss": 7.5254, + "step": 1512800 + }, + { + "epoch": 6.163222115738133, + "grad_norm": 15.554253578186035, + "learning_rate": 4.732301132233517e-05, + "loss": 7.5593, + "step": 1512900 + }, + { + "epoch": 6.163629493761515, + "grad_norm": 11.946845054626465, + "learning_rate": 4.7237572698852495e-05, + "loss": 7.5142, + "step": 1513000 + }, + { + "epoch": 6.163629493761515, + "eval_MaskedAccuracy": 0.5135302717421424, + "eval_loss": 1.58794105052948, + "eval_runtime": 152.674, + "eval_samples_per_second": 415.762, + "eval_steps_per_second": 1.624, + "step": 1513000 + }, + { + "epoch": 6.164036871784896, + "grad_norm": 21.967002868652344, + "learning_rate": 4.7152231243863595e-05, + "loss": 7.5124, + "step": 1513100 + }, + { + "epoch": 6.164444249808278, + "grad_norm": 17.184104919433594, + "learning_rate": 4.706698696074239e-05, + "loss": 7.4828, + "step": 1513200 + }, + { + "epoch": 6.164851627831659, + "grad_norm": 6.202548027038574, + "learning_rate": 4.698183985285888e-05, + "loss": 7.5394, + "step": 1513300 + }, + { + "epoch": 6.165259005855041, + "grad_norm": 24.404335021972656, + "learning_rate": 4.6896789923580024e-05, + "loss": 7.5371, + "step": 1513400 + }, + { + "epoch": 6.165666383878422, + "grad_norm": 11.398333549499512, + "learning_rate": 4.681183717626802e-05, + "loss": 7.5247, + "step": 1513500 + }, + { + "epoch": 6.166073761901804, + "grad_norm": 6.04208517074585, + "learning_rate": 4.6726981614281246e-05, + "loss": 7.5326, + "step": 1513600 + }, + { + "epoch": 6.166481139925185, + "grad_norm": 22.410314559936523, + "learning_rate": 4.6642223240974716e-05, + "loss": 7.5411, + "step": 1513700 + }, + { + "epoch": 6.166888517948567, + "grad_norm": 2.2639403343200684, + "learning_rate": 4.6557562059699315e-05, + "loss": 7.5159, + "step": 1513800 + }, + { + "epoch": 6.167295895971948, + "grad_norm": 12.063501358032227, + "learning_rate": 4.647299807380204e-05, + "loss": 7.5088, + "step": 1513900 + }, + { + "epoch": 6.167703273995329, + "grad_norm": 2.2603001594543457, + "learning_rate": 4.6388531286626527e-05, + "loss": 7.4819, + "step": 1514000 + }, + { + "epoch": 6.167703273995329, + "eval_MaskedAccuracy": 0.5137995287013225, + "eval_loss": 1.583763599395752, + "eval_runtime": 161.2835, + "eval_samples_per_second": 393.568, + "eval_steps_per_second": 1.538, + "step": 1514000 + }, + { + "epoch": 6.168110652018711, + "grad_norm": 2.884631633758545, + "learning_rate": 4.63041617015118e-05, + "loss": 7.5158, + "step": 1514100 + }, + { + "epoch": 6.168518030042092, + "grad_norm": 27.567920684814453, + "learning_rate": 4.621988932179374e-05, + "loss": 7.5014, + "step": 1514200 + }, + { + "epoch": 6.168925408065474, + "grad_norm": 10.135107040405273, + "learning_rate": 4.613571415080356e-05, + "loss": 7.5129, + "step": 1514300 + }, + { + "epoch": 6.169332786088855, + "grad_norm": 3.33738374710083, + "learning_rate": 4.6051636191870015e-05, + "loss": 7.5095, + "step": 1514400 + }, + { + "epoch": 6.169740164112237, + "grad_norm": 11.586241722106934, + "learning_rate": 4.5967655448316196e-05, + "loss": 7.5481, + "step": 1514500 + }, + { + "epoch": 6.170147542135618, + "grad_norm": 23.621122360229492, + "learning_rate": 4.5883771923463146e-05, + "loss": 7.517, + "step": 1514600 + }, + { + "epoch": 6.170554920159, + "grad_norm": 4.466002464294434, + "learning_rate": 4.5799985620626517e-05, + "loss": 7.5084, + "step": 1514700 + }, + { + "epoch": 6.170962298182381, + "grad_norm": 3.7232768535614014, + "learning_rate": 4.571629654311955e-05, + "loss": 7.4998, + "step": 1514800 + }, + { + "epoch": 6.171369676205763, + "grad_norm": 6.155814170837402, + "learning_rate": 4.563270469425045e-05, + "loss": 7.5222, + "step": 1514900 + }, + { + "epoch": 6.171777054229144, + "grad_norm": 8.497859954833984, + "learning_rate": 4.5549210077323844e-05, + "loss": 7.4826, + "step": 1515000 + }, + { + "epoch": 6.171777054229144, + "eval_MaskedAccuracy": 0.5135340781686724, + "eval_loss": 1.5922369956970215, + "eval_runtime": 161.6478, + "eval_samples_per_second": 392.681, + "eval_steps_per_second": 1.534, + "step": 1515000 + }, + { + "epoch": 6.172184432252526, + "grad_norm": 4.6390204429626465, + "learning_rate": 4.5465812695641516e-05, + "loss": 7.5042, + "step": 1515100 + }, + { + "epoch": 6.1725918102759065, + "grad_norm": 3.622335910797119, + "learning_rate": 4.538251255249984e-05, + "loss": 7.5045, + "step": 1515200 + }, + { + "epoch": 6.172999188299288, + "grad_norm": 9.310247421264648, + "learning_rate": 4.529930965119211e-05, + "loss": 7.566, + "step": 1515300 + }, + { + "epoch": 6.17340656632267, + "grad_norm": 4.652536869049072, + "learning_rate": 4.521620399500846e-05, + "loss": 7.4942, + "step": 1515400 + }, + { + "epoch": 6.173813944346051, + "grad_norm": 2.640489101409912, + "learning_rate": 4.513319558723418e-05, + "loss": 7.4872, + "step": 1515500 + }, + { + "epoch": 6.174221322369433, + "grad_norm": 3.247558355331421, + "learning_rate": 4.5050284431150864e-05, + "loss": 7.5346, + "step": 1515600 + }, + { + "epoch": 6.174628700392814, + "grad_norm": 2.5709688663482666, + "learning_rate": 4.496747053003655e-05, + "loss": 7.5257, + "step": 1515700 + }, + { + "epoch": 6.175036078416196, + "grad_norm": 6.035948753356934, + "learning_rate": 4.4884753887165394e-05, + "loss": 7.5302, + "step": 1515800 + }, + { + "epoch": 6.175443456439577, + "grad_norm": 22.370046615600586, + "learning_rate": 4.4802134505807394e-05, + "loss": 7.5076, + "step": 1515900 + }, + { + "epoch": 6.175850834462959, + "grad_norm": 2.9051003456115723, + "learning_rate": 4.4719612389229146e-05, + "loss": 7.5285, + "step": 1516000 + }, + { + "epoch": 6.175850834462959, + "eval_MaskedAccuracy": 0.5134246717936268, + "eval_loss": 1.5783573389053345, + "eval_runtime": 165.1756, + "eval_samples_per_second": 384.294, + "eval_steps_per_second": 1.501, + "step": 1516000 + }, + { + "epoch": 6.17625821248634, + "grad_norm": 5.093085289001465, + "learning_rate": 4.463718754069321e-05, + "loss": 7.4848, + "step": 1516100 + }, + { + "epoch": 6.176665590509722, + "grad_norm": 3.852621555328369, + "learning_rate": 4.455485996345793e-05, + "loss": 7.5193, + "step": 1516200 + }, + { + "epoch": 6.177072968533103, + "grad_norm": 2.822230339050293, + "learning_rate": 4.4472629660778647e-05, + "loss": 7.5146, + "step": 1516300 + }, + { + "epoch": 6.177480346556484, + "grad_norm": 4.570256233215332, + "learning_rate": 4.4390496635905943e-05, + "loss": 7.5227, + "step": 1516400 + }, + { + "epoch": 6.177887724579866, + "grad_norm": 13.537642478942871, + "learning_rate": 4.430846089208739e-05, + "loss": 7.5323, + "step": 1516500 + }, + { + "epoch": 6.178295102603247, + "grad_norm": 19.775623321533203, + "learning_rate": 4.4226522432566154e-05, + "loss": 7.4875, + "step": 1516600 + }, + { + "epoch": 6.178702480626629, + "grad_norm": 26.371177673339844, + "learning_rate": 4.414468126058138e-05, + "loss": 7.5135, + "step": 1516700 + }, + { + "epoch": 6.17910985865001, + "grad_norm": 10.75178337097168, + "learning_rate": 4.406293737936908e-05, + "loss": 7.5432, + "step": 1516800 + }, + { + "epoch": 6.179517236673392, + "grad_norm": 9.740500450134277, + "learning_rate": 4.398129079216093e-05, + "loss": 7.5124, + "step": 1516900 + }, + { + "epoch": 6.179924614696773, + "grad_norm": 4.074995994567871, + "learning_rate": 4.38997415021846e-05, + "loss": 7.5364, + "step": 1517000 + }, + { + "epoch": 6.179924614696773, + "eval_MaskedAccuracy": 0.513934802638679, + "eval_loss": 1.575881004333496, + "eval_runtime": 158.1913, + "eval_samples_per_second": 401.261, + "eval_steps_per_second": 1.568, + "step": 1517000 + }, + { + "epoch": 6.180331992720155, + "grad_norm": 8.33947467803955, + "learning_rate": 4.3818289512664364e-05, + "loss": 7.5176, + "step": 1517100 + }, + { + "epoch": 6.180739370743536, + "grad_norm": 3.4337880611419678, + "learning_rate": 4.3736934826820645e-05, + "loss": 7.5319, + "step": 1517200 + }, + { + "epoch": 6.181146748766918, + "grad_norm": 6.742792129516602, + "learning_rate": 4.365567744786916e-05, + "loss": 7.5248, + "step": 1517300 + }, + { + "epoch": 6.181554126790299, + "grad_norm": 6.558507442474365, + "learning_rate": 4.3574517379023636e-05, + "loss": 7.4924, + "step": 1517400 + }, + { + "epoch": 6.181961504813681, + "grad_norm": 6.90985107421875, + "learning_rate": 4.3493454623491785e-05, + "loss": 7.5271, + "step": 1517500 + }, + { + "epoch": 6.182368882837062, + "grad_norm": 8.464466094970703, + "learning_rate": 4.3412489184478695e-05, + "loss": 7.5067, + "step": 1517600 + }, + { + "epoch": 6.182776260860443, + "grad_norm": 3.467275619506836, + "learning_rate": 4.333162106518546e-05, + "loss": 7.5163, + "step": 1517700 + }, + { + "epoch": 6.183183638883825, + "grad_norm": 3.563931703567505, + "learning_rate": 4.32508502688092e-05, + "loss": 7.5211, + "step": 1517800 + }, + { + "epoch": 6.183591016907206, + "grad_norm": 5.884230613708496, + "learning_rate": 4.317017679854316e-05, + "loss": 7.5059, + "step": 1517900 + }, + { + "epoch": 6.183998394930588, + "grad_norm": 16.4660587310791, + "learning_rate": 4.3089600657576995e-05, + "loss": 7.5391, + "step": 1518000 + }, + { + "epoch": 6.183998394930588, + "eval_MaskedAccuracy": 0.5139440589898651, + "eval_loss": 1.5817654132843018, + "eval_runtime": 168.5904, + "eval_samples_per_second": 376.51, + "eval_steps_per_second": 1.471, + "step": 1518000 + }, + { + "epoch": 6.184405772953969, + "grad_norm": 3.888291120529175, + "learning_rate": 4.300912184909597e-05, + "loss": 7.5127, + "step": 1518100 + }, + { + "epoch": 6.184813150977351, + "grad_norm": 3.8665812015533447, + "learning_rate": 4.292874037628201e-05, + "loss": 7.531, + "step": 1518200 + }, + { + "epoch": 6.185220529000732, + "grad_norm": 19.788267135620117, + "learning_rate": 4.284845624231284e-05, + "loss": 7.5273, + "step": 1518300 + }, + { + "epoch": 6.185627907024114, + "grad_norm": 23.828781127929688, + "learning_rate": 4.2768269450363166e-05, + "loss": 7.4969, + "step": 1518400 + }, + { + "epoch": 6.186035285047495, + "grad_norm": 5.883404731750488, + "learning_rate": 4.2688180003602385e-05, + "loss": 7.5205, + "step": 1518500 + }, + { + "epoch": 6.186442663070877, + "grad_norm": 5.5435638427734375, + "learning_rate": 4.260818790519722e-05, + "loss": 7.531, + "step": 1518600 + }, + { + "epoch": 6.186850041094258, + "grad_norm": 15.866876602172852, + "learning_rate": 4.252829315831023e-05, + "loss": 7.5274, + "step": 1518700 + }, + { + "epoch": 6.18725741911764, + "grad_norm": 26.80928611755371, + "learning_rate": 4.244849576610024e-05, + "loss": 7.5126, + "step": 1518800 + }, + { + "epoch": 6.1876647971410215, + "grad_norm": 10.450453758239746, + "learning_rate": 4.236879573172178e-05, + "loss": 7.5225, + "step": 1518900 + }, + { + "epoch": 6.188072175164402, + "grad_norm": 10.120019912719727, + "learning_rate": 4.228919305832569e-05, + "loss": 7.5081, + "step": 1519000 + }, + { + "epoch": 6.188072175164402, + "eval_MaskedAccuracy": 0.5138981129712925, + "eval_loss": 1.5910037755966187, + "eval_runtime": 167.1661, + "eval_samples_per_second": 379.718, + "eval_steps_per_second": 1.484, + "step": 1519000 + }, + { + "epoch": 6.188479553187784, + "grad_norm": 7.581277847290039, + "learning_rate": 4.220968774905958e-05, + "loss": 7.5142, + "step": 1519100 + }, + { + "epoch": 6.188886931211165, + "grad_norm": 16.236650466918945, + "learning_rate": 4.2130279807066024e-05, + "loss": 7.5143, + "step": 1519200 + }, + { + "epoch": 6.189294309234547, + "grad_norm": 2.630388021469116, + "learning_rate": 4.205096923548509e-05, + "loss": 7.5378, + "step": 1519300 + }, + { + "epoch": 6.189701687257928, + "grad_norm": 5.415067195892334, + "learning_rate": 4.197175603745242e-05, + "loss": 7.526, + "step": 1519400 + }, + { + "epoch": 6.19010906528131, + "grad_norm": 3.192352533340454, + "learning_rate": 4.189264021609915e-05, + "loss": 7.4874, + "step": 1519500 + }, + { + "epoch": 6.190516443304691, + "grad_norm": 9.589027404785156, + "learning_rate": 4.181362177455327e-05, + "loss": 7.4991, + "step": 1519600 + }, + { + "epoch": 6.190923821328073, + "grad_norm": 19.218538284301758, + "learning_rate": 4.1734700715939324e-05, + "loss": 7.5384, + "step": 1519700 + }, + { + "epoch": 6.191331199351454, + "grad_norm": 10.557171821594238, + "learning_rate": 4.165587704337692e-05, + "loss": 7.5177, + "step": 1519800 + }, + { + "epoch": 6.191738577374836, + "grad_norm": 12.861437797546387, + "learning_rate": 4.157715075998255e-05, + "loss": 7.5136, + "step": 1519900 + }, + { + "epoch": 6.192145955398217, + "grad_norm": 4.201754093170166, + "learning_rate": 4.1498521868868894e-05, + "loss": 7.5285, + "step": 1520000 + }, + { + "epoch": 6.192145955398217, + "eval_MaskedAccuracy": 0.5133526491845718, + "eval_loss": 1.5832812786102295, + "eval_runtime": 162.1236, + "eval_samples_per_second": 391.529, + "eval_steps_per_second": 1.53, + "step": 1520000 + }, + { + "epoch": 6.192553333421599, + "grad_norm": 11.104284286499023, + "learning_rate": 4.1419990373144196e-05, + "loss": 7.5235, + "step": 1520100 + }, + { + "epoch": 6.19296071144498, + "grad_norm": 3.3993284702301025, + "learning_rate": 4.1341556275913546e-05, + "loss": 7.5234, + "step": 1520200 + }, + { + "epoch": 6.193368089468361, + "grad_norm": 22.561065673828125, + "learning_rate": 4.126321958027776e-05, + "loss": 7.5138, + "step": 1520300 + }, + { + "epoch": 6.193775467491743, + "grad_norm": 4.123826026916504, + "learning_rate": 4.118498028933399e-05, + "loss": 7.5266, + "step": 1520400 + }, + { + "epoch": 6.194182845515124, + "grad_norm": 15.193856239318848, + "learning_rate": 4.1106838406175176e-05, + "loss": 7.5349, + "step": 1520500 + }, + { + "epoch": 6.194590223538506, + "grad_norm": 3.9755845069885254, + "learning_rate": 4.102879393389102e-05, + "loss": 7.5272, + "step": 1520600 + }, + { + "epoch": 6.194997601561887, + "grad_norm": 7.0829291343688965, + "learning_rate": 4.095084687556674e-05, + "loss": 7.5289, + "step": 1520700 + }, + { + "epoch": 6.195404979585269, + "grad_norm": 3.483314275741577, + "learning_rate": 4.0872997234284283e-05, + "loss": 7.5391, + "step": 1520800 + }, + { + "epoch": 6.19581235760865, + "grad_norm": 17.247650146484375, + "learning_rate": 4.0795245013121084e-05, + "loss": 7.5351, + "step": 1520900 + }, + { + "epoch": 6.196219735632032, + "grad_norm": 6.554419994354248, + "learning_rate": 4.071759021515159e-05, + "loss": 7.5394, + "step": 1521000 + }, + { + "epoch": 6.196219735632032, + "eval_MaskedAccuracy": 0.5134685783312823, + "eval_loss": 1.588786005973816, + "eval_runtime": 169.899, + "eval_samples_per_second": 373.61, + "eval_steps_per_second": 1.46, + "step": 1521000 + }, + { + "epoch": 6.196627113655413, + "grad_norm": 5.016470909118652, + "learning_rate": 4.064003284344553e-05, + "loss": 7.5088, + "step": 1521100 + }, + { + "epoch": 6.197034491678795, + "grad_norm": 6.9189043045043945, + "learning_rate": 4.056257290106897e-05, + "loss": 7.5119, + "step": 1521200 + }, + { + "epoch": 6.1974418697021765, + "grad_norm": 3.6506412029266357, + "learning_rate": 4.0485210391085124e-05, + "loss": 7.5276, + "step": 1521300 + }, + { + "epoch": 6.197849247725557, + "grad_norm": 13.37964916229248, + "learning_rate": 4.040794531655236e-05, + "loss": 7.5099, + "step": 1521400 + }, + { + "epoch": 6.198256625748939, + "grad_norm": 16.245330810546875, + "learning_rate": 4.033077768052481e-05, + "loss": 7.5414, + "step": 1521500 + }, + { + "epoch": 6.19866400377232, + "grad_norm": 3.8642942905426025, + "learning_rate": 4.025370748605361e-05, + "loss": 7.524, + "step": 1521600 + }, + { + "epoch": 6.199071381795702, + "grad_norm": 2.934980630874634, + "learning_rate": 4.0176734736185714e-05, + "loss": 7.5104, + "step": 1521700 + }, + { + "epoch": 6.199478759819083, + "grad_norm": 6.9958062171936035, + "learning_rate": 4.009985943396432e-05, + "loss": 7.5184, + "step": 1521800 + }, + { + "epoch": 6.199886137842465, + "grad_norm": 6.103975772857666, + "learning_rate": 4.0023081582428976e-05, + "loss": 7.5436, + "step": 1521900 + }, + { + "epoch": 6.200293515865846, + "grad_norm": 3.3343257904052734, + "learning_rate": 3.994640118461442e-05, + "loss": 7.5268, + "step": 1522000 + }, + { + "epoch": 6.200293515865846, + "eval_MaskedAccuracy": 0.5136080354808839, + "eval_loss": 1.5869801044464111, + "eval_runtime": 171.0967, + "eval_samples_per_second": 370.995, + "eval_steps_per_second": 1.449, + "step": 1522000 + }, + { + "epoch": 6.200700893889228, + "grad_norm": 5.00773811340332, + "learning_rate": 3.986981824355302e-05, + "loss": 7.5252, + "step": 1522100 + }, + { + "epoch": 6.201108271912609, + "grad_norm": 6.265585899353027, + "learning_rate": 3.979333276227185e-05, + "loss": 7.5237, + "step": 1522200 + }, + { + "epoch": 6.201515649935991, + "grad_norm": 3.543001651763916, + "learning_rate": 3.9716944743795216e-05, + "loss": 7.5433, + "step": 1522300 + }, + { + "epoch": 6.2019230279593724, + "grad_norm": 17.54358673095703, + "learning_rate": 3.9640654191142873e-05, + "loss": 7.5434, + "step": 1522400 + }, + { + "epoch": 6.202330405982754, + "grad_norm": 2.7225570678710938, + "learning_rate": 3.9564461107331214e-05, + "loss": 7.5082, + "step": 1522500 + }, + { + "epoch": 6.2027377840061355, + "grad_norm": 4.40688943862915, + "learning_rate": 3.948836549537261e-05, + "loss": 7.5155, + "step": 1522600 + }, + { + "epoch": 6.203145162029516, + "grad_norm": 7.3498101234436035, + "learning_rate": 3.94123673582752e-05, + "loss": 7.4926, + "step": 1522700 + }, + { + "epoch": 6.203552540052898, + "grad_norm": 4.559058666229248, + "learning_rate": 3.9336466699043944e-05, + "loss": 7.5342, + "step": 1522800 + }, + { + "epoch": 6.203959918076279, + "grad_norm": 9.517656326293945, + "learning_rate": 3.9260663520679325e-05, + "loss": 7.5407, + "step": 1522900 + }, + { + "epoch": 6.204367296099661, + "grad_norm": 11.327088356018066, + "learning_rate": 3.9184957826178475e-05, + "loss": 7.5095, + "step": 1523000 + }, + { + "epoch": 6.204367296099661, + "eval_MaskedAccuracy": 0.5134649629722284, + "eval_loss": 1.58461594581604, + "eval_runtime": 175.035, + "eval_samples_per_second": 362.647, + "eval_steps_per_second": 1.417, + "step": 1523000 + }, + { + "epoch": 6.204774674123042, + "grad_norm": 8.06092357635498, + "learning_rate": 3.910934961853442e-05, + "loss": 7.5309, + "step": 1523100 + }, + { + "epoch": 6.205182052146424, + "grad_norm": 6.9076457023620605, + "learning_rate": 3.903383890073621e-05, + "loss": 7.5178, + "step": 1523200 + }, + { + "epoch": 6.205589430169805, + "grad_norm": 15.919775009155273, + "learning_rate": 3.8958425675769104e-05, + "loss": 7.5187, + "step": 1523300 + }, + { + "epoch": 6.205996808193187, + "grad_norm": 5.286274433135986, + "learning_rate": 3.8883109946615025e-05, + "loss": 7.5551, + "step": 1523400 + }, + { + "epoch": 6.206404186216568, + "grad_norm": 23.460248947143555, + "learning_rate": 3.880789171625117e-05, + "loss": 7.514, + "step": 1523500 + }, + { + "epoch": 6.20681156423995, + "grad_norm": 5.665782928466797, + "learning_rate": 3.873277098765169e-05, + "loss": 7.528, + "step": 1523600 + }, + { + "epoch": 6.2072189422633315, + "grad_norm": 9.24010181427002, + "learning_rate": 3.865774776378606e-05, + "loss": 7.5389, + "step": 1523700 + }, + { + "epoch": 6.207626320286713, + "grad_norm": 5.158371448516846, + "learning_rate": 3.85828220476206e-05, + "loss": 7.5244, + "step": 1523800 + }, + { + "epoch": 6.2080336983100946, + "grad_norm": 9.36806583404541, + "learning_rate": 3.8507993842117614e-05, + "loss": 7.5292, + "step": 1523900 + }, + { + "epoch": 6.208441076333475, + "grad_norm": 11.890572547912598, + "learning_rate": 3.843326315023518e-05, + "loss": 7.509, + "step": 1524000 + }, + { + "epoch": 6.208441076333475, + "eval_MaskedAccuracy": 0.5138990814876041, + "eval_loss": 1.584986686706543, + "eval_runtime": 156.982, + "eval_samples_per_second": 404.352, + "eval_steps_per_second": 1.58, + "step": 1524000 + }, + { + "epoch": 6.208848454356857, + "grad_norm": 7.550594806671143, + "learning_rate": 3.835862997492808e-05, + "loss": 7.5279, + "step": 1524100 + }, + { + "epoch": 6.209255832380238, + "grad_norm": 5.89973783493042, + "learning_rate": 3.828409431914691e-05, + "loss": 7.5038, + "step": 1524200 + }, + { + "epoch": 6.20966321040362, + "grad_norm": 10.729607582092285, + "learning_rate": 3.820965618583845e-05, + "loss": 7.5017, + "step": 1524300 + }, + { + "epoch": 6.210070588427001, + "grad_norm": 21.639862060546875, + "learning_rate": 3.8135315577945516e-05, + "loss": 7.5341, + "step": 1524400 + }, + { + "epoch": 6.210477966450383, + "grad_norm": 7.035912990570068, + "learning_rate": 3.806107249840707e-05, + "loss": 7.5192, + "step": 1524500 + }, + { + "epoch": 6.210885344473764, + "grad_norm": 21.44525718688965, + "learning_rate": 3.79869269501588e-05, + "loss": 7.5253, + "step": 1524600 + }, + { + "epoch": 6.211292722497146, + "grad_norm": 19.483362197875977, + "learning_rate": 3.7912878936131835e-05, + "loss": 7.5164, + "step": 1524700 + }, + { + "epoch": 6.2117001005205275, + "grad_norm": 15.843347549438477, + "learning_rate": 3.7838928459253625e-05, + "loss": 7.5308, + "step": 1524800 + }, + { + "epoch": 6.212107478543909, + "grad_norm": 6.50057315826416, + "learning_rate": 3.7765075522447876e-05, + "loss": 7.505, + "step": 1524900 + }, + { + "epoch": 6.2125148565672905, + "grad_norm": 11.148566246032715, + "learning_rate": 3.769132012863448e-05, + "loss": 7.5198, + "step": 1525000 + }, + { + "epoch": 6.2125148565672905, + "eval_MaskedAccuracy": 0.5138115478730861, + "eval_loss": 1.5826061964035034, + "eval_runtime": 158.9275, + "eval_samples_per_second": 399.402, + "eval_steps_per_second": 1.56, + "step": 1525000 + }, + { + "epoch": 6.212922234590672, + "grad_norm": 3.6122076511383057, + "learning_rate": 3.7617662280729385e-05, + "loss": 7.5164, + "step": 1525100 + }, + { + "epoch": 6.213329612614053, + "grad_norm": 6.010428428649902, + "learning_rate": 3.7544101981644156e-05, + "loss": 7.5452, + "step": 1525200 + }, + { + "epoch": 6.213736990637434, + "grad_norm": 3.211630344390869, + "learning_rate": 3.7470639234287856e-05, + "loss": 7.5441, + "step": 1525300 + }, + { + "epoch": 6.214144368660816, + "grad_norm": 27.52581214904785, + "learning_rate": 3.739727404156428e-05, + "loss": 7.5137, + "step": 1525400 + }, + { + "epoch": 6.214551746684197, + "grad_norm": 30.541147232055664, + "learning_rate": 3.7324006406374174e-05, + "loss": 7.5236, + "step": 1525500 + }, + { + "epoch": 6.214959124707579, + "grad_norm": 3.39619779586792, + "learning_rate": 3.725083633161411e-05, + "loss": 7.5184, + "step": 1525600 + }, + { + "epoch": 6.21536650273096, + "grad_norm": 4.325729846954346, + "learning_rate": 3.717776382017712e-05, + "loss": 7.5417, + "step": 1525700 + }, + { + "epoch": 6.215773880754342, + "grad_norm": 24.767698287963867, + "learning_rate": 3.710478887495175e-05, + "loss": 7.5108, + "step": 1525800 + }, + { + "epoch": 6.216181258777723, + "grad_norm": 27.488677978515625, + "learning_rate": 3.7031911498823575e-05, + "loss": 7.544, + "step": 1525900 + }, + { + "epoch": 6.216588636801105, + "grad_norm": 10.902069091796875, + "learning_rate": 3.6959131694673304e-05, + "loss": 7.5298, + "step": 1526000 + }, + { + "epoch": 6.216588636801105, + "eval_MaskedAccuracy": 0.5132878047846802, + "eval_loss": 1.5875064134597778, + "eval_runtime": 166.3401, + "eval_samples_per_second": 381.604, + "eval_steps_per_second": 1.491, + "step": 1526000 + }, + { + "epoch": 6.2169960148244865, + "grad_norm": 19.047046661376953, + "learning_rate": 3.688644946537874e-05, + "loss": 7.506, + "step": 1526100 + }, + { + "epoch": 6.217403392847868, + "grad_norm": 3.6258020401000977, + "learning_rate": 3.6813864813812876e-05, + "loss": 7.5216, + "step": 1526200 + }, + { + "epoch": 6.21781077087125, + "grad_norm": 9.296761512756348, + "learning_rate": 3.6741377742846076e-05, + "loss": 7.5151, + "step": 1526300 + }, + { + "epoch": 6.21821814889463, + "grad_norm": 3.2167930603027344, + "learning_rate": 3.666898825534362e-05, + "loss": 7.512, + "step": 1526400 + }, + { + "epoch": 6.218625526918012, + "grad_norm": 6.6329498291015625, + "learning_rate": 3.659669635416776e-05, + "loss": 7.4934, + "step": 1526500 + }, + { + "epoch": 6.219032904941393, + "grad_norm": 10.958048820495605, + "learning_rate": 3.652450204217629e-05, + "loss": 7.5153, + "step": 1526600 + }, + { + "epoch": 6.219440282964775, + "grad_norm": 4.174325942993164, + "learning_rate": 3.6452405322223724e-05, + "loss": 7.5157, + "step": 1526700 + }, + { + "epoch": 6.219847660988156, + "grad_norm": 3.164076328277588, + "learning_rate": 3.638040619716043e-05, + "loss": 7.5035, + "step": 1526800 + }, + { + "epoch": 6.220255039011538, + "grad_norm": 9.154379844665527, + "learning_rate": 3.6308504669832514e-05, + "loss": 7.5006, + "step": 1526900 + }, + { + "epoch": 6.220662417034919, + "grad_norm": 3.227844476699829, + "learning_rate": 3.623670074308285e-05, + "loss": 7.5327, + "step": 1527000 + }, + { + "epoch": 6.220662417034919, + "eval_MaskedAccuracy": 0.5142326860902728, + "eval_loss": 1.5779016017913818, + "eval_runtime": 430.3278, + "eval_samples_per_second": 147.506, + "eval_steps_per_second": 0.576, + "step": 1527000 + }, + { + "epoch": 6.221069795058301, + "grad_norm": 3.534832000732422, + "learning_rate": 3.616499441975046e-05, + "loss": 7.5095, + "step": 1527100 + }, + { + "epoch": 6.2214771730816825, + "grad_norm": 2.9680981636047363, + "learning_rate": 3.609338570267012e-05, + "loss": 7.4944, + "step": 1527200 + }, + { + "epoch": 6.221884551105064, + "grad_norm": 8.061978340148926, + "learning_rate": 3.602187459467277e-05, + "loss": 7.5392, + "step": 1527300 + }, + { + "epoch": 6.2222919291284455, + "grad_norm": 2.630624532699585, + "learning_rate": 3.595046109858575e-05, + "loss": 7.5047, + "step": 1527400 + }, + { + "epoch": 6.222699307151827, + "grad_norm": 5.866968154907227, + "learning_rate": 3.587914521723235e-05, + "loss": 7.5182, + "step": 1527500 + }, + { + "epoch": 6.223106685175209, + "grad_norm": 7.416944980621338, + "learning_rate": 3.580792695343208e-05, + "loss": 7.4837, + "step": 1527600 + }, + { + "epoch": 6.223514063198589, + "grad_norm": 8.481433868408203, + "learning_rate": 3.5736806310000895e-05, + "loss": 7.5198, + "step": 1527700 + }, + { + "epoch": 6.223921441221971, + "grad_norm": 18.279815673828125, + "learning_rate": 3.56657832897501e-05, + "loss": 7.5131, + "step": 1527800 + }, + { + "epoch": 6.224328819245352, + "grad_norm": 8.598742485046387, + "learning_rate": 3.5594857895487754e-05, + "loss": 7.5433, + "step": 1527900 + }, + { + "epoch": 6.224736197268734, + "grad_norm": 22.971736907958984, + "learning_rate": 3.5524030130018395e-05, + "loss": 7.5203, + "step": 1528000 + }, + { + "epoch": 6.224736197268734, + "eval_MaskedAccuracy": 0.5146985033384197, + "eval_loss": 1.5790197849273682, + "eval_runtime": 159.5687, + "eval_samples_per_second": 397.797, + "eval_steps_per_second": 1.554, + "step": 1528000 + }, + { + "epoch": 6.225143575292115, + "grad_norm": 20.58827781677246, + "learning_rate": 3.545329999614137e-05, + "loss": 7.5162, + "step": 1528100 + }, + { + "epoch": 6.225550953315497, + "grad_norm": 13.78675651550293, + "learning_rate": 3.538266749665359e-05, + "loss": 7.531, + "step": 1528200 + }, + { + "epoch": 6.225958331338878, + "grad_norm": 16.073307037353516, + "learning_rate": 3.531213263434752e-05, + "loss": 7.5065, + "step": 1528300 + }, + { + "epoch": 6.22636570936226, + "grad_norm": 9.83819580078125, + "learning_rate": 3.5241695412011664e-05, + "loss": 7.5093, + "step": 1528400 + }, + { + "epoch": 6.2267730873856415, + "grad_norm": 2.6550517082214355, + "learning_rate": 3.517135583243071e-05, + "loss": 7.5435, + "step": 1528500 + }, + { + "epoch": 6.227180465409023, + "grad_norm": 13.715490341186523, + "learning_rate": 3.5101113898385543e-05, + "loss": 7.5137, + "step": 1528600 + }, + { + "epoch": 6.227587843432405, + "grad_norm": 7.189116477966309, + "learning_rate": 3.503096961265335e-05, + "loss": 7.5168, + "step": 1528700 + }, + { + "epoch": 6.227995221455786, + "grad_norm": 15.785459518432617, + "learning_rate": 3.496092297800741e-05, + "loss": 7.4834, + "step": 1528800 + }, + { + "epoch": 6.228402599479168, + "grad_norm": 5.474700450897217, + "learning_rate": 3.489097399721666e-05, + "loss": 7.5014, + "step": 1528900 + }, + { + "epoch": 6.228809977502548, + "grad_norm": 9.012724876403809, + "learning_rate": 3.482112267304694e-05, + "loss": 7.5346, + "step": 1529000 + }, + { + "epoch": 6.228809977502548, + "eval_MaskedAccuracy": 0.5141386005417878, + "eval_loss": 1.5893563032150269, + "eval_runtime": 166.1825, + "eval_samples_per_second": 381.966, + "eval_steps_per_second": 1.492, + "step": 1529000 + }, + { + "epoch": 6.22921735552593, + "grad_norm": 9.357582092285156, + "learning_rate": 3.475136900825944e-05, + "loss": 7.5225, + "step": 1529100 + }, + { + "epoch": 6.229624733549311, + "grad_norm": 4.897263526916504, + "learning_rate": 3.4681713005612175e-05, + "loss": 7.5036, + "step": 1529200 + }, + { + "epoch": 6.230032111572693, + "grad_norm": 5.631476402282715, + "learning_rate": 3.4612154667859114e-05, + "loss": 7.4929, + "step": 1529300 + }, + { + "epoch": 6.230439489596074, + "grad_norm": 4.668044090270996, + "learning_rate": 3.4542693997750315e-05, + "loss": 7.5267, + "step": 1529400 + }, + { + "epoch": 6.230846867619456, + "grad_norm": 9.86772346496582, + "learning_rate": 3.447333099803139e-05, + "loss": 7.4955, + "step": 1529500 + }, + { + "epoch": 6.2312542456428375, + "grad_norm": 14.227707862854004, + "learning_rate": 3.440406567144532e-05, + "loss": 7.5219, + "step": 1529600 + }, + { + "epoch": 6.231661623666219, + "grad_norm": 6.231542110443115, + "learning_rate": 3.433489802073016e-05, + "loss": 7.5418, + "step": 1529700 + }, + { + "epoch": 6.2320690016896005, + "grad_norm": 5.305866241455078, + "learning_rate": 3.426582804862045e-05, + "loss": 7.5455, + "step": 1529800 + }, + { + "epoch": 6.232476379712982, + "grad_norm": 3.2614071369171143, + "learning_rate": 3.419685575784688e-05, + "loss": 7.4905, + "step": 1529900 + }, + { + "epoch": 6.232883757736364, + "grad_norm": 3.040539264678955, + "learning_rate": 3.4127981151136506e-05, + "loss": 7.505, + "step": 1530000 + }, + { + "epoch": 6.232883757736364, + "eval_MaskedAccuracy": 0.514141880386851, + "eval_loss": 1.5848793983459473, + "eval_runtime": 162.9599, + "eval_samples_per_second": 389.519, + "eval_steps_per_second": 1.522, + "step": 1530000 + }, + { + "epoch": 6.233291135759745, + "grad_norm": 13.58469009399414, + "learning_rate": 3.4059204231212264e-05, + "loss": 7.5222, + "step": 1530100 + }, + { + "epoch": 6.233698513783126, + "grad_norm": 11.106911659240723, + "learning_rate": 3.39905250007929e-05, + "loss": 7.5219, + "step": 1530200 + }, + { + "epoch": 6.234105891806507, + "grad_norm": 18.91869354248047, + "learning_rate": 3.392194346259416e-05, + "loss": 7.5355, + "step": 1530300 + }, + { + "epoch": 6.234513269829889, + "grad_norm": 4.576848983764648, + "learning_rate": 3.385345961932732e-05, + "loss": 7.5324, + "step": 1530400 + }, + { + "epoch": 6.23492064785327, + "grad_norm": 7.7829108238220215, + "learning_rate": 3.378507347369977e-05, + "loss": 7.5082, + "step": 1530500 + }, + { + "epoch": 6.235328025876652, + "grad_norm": 4.414244651794434, + "learning_rate": 3.371678502841532e-05, + "loss": 7.5208, + "step": 1530600 + }, + { + "epoch": 6.2357354039000334, + "grad_norm": 9.419246673583984, + "learning_rate": 3.364859428617363e-05, + "loss": 7.5306, + "step": 1530700 + }, + { + "epoch": 6.236142781923415, + "grad_norm": 9.71709156036377, + "learning_rate": 3.358050124967097e-05, + "loss": 7.5313, + "step": 1530800 + }, + { + "epoch": 6.2365501599467965, + "grad_norm": 6.383625030517578, + "learning_rate": 3.3512505921598716e-05, + "loss": 7.5133, + "step": 1530900 + }, + { + "epoch": 6.236957537970178, + "grad_norm": 4.312445640563965, + "learning_rate": 3.3444608304645974e-05, + "loss": 7.4945, + "step": 1531000 + }, + { + "epoch": 6.236957537970178, + "eval_MaskedAccuracy": 0.51399551571728, + "eval_loss": 1.588167428970337, + "eval_runtime": 160.9326, + "eval_samples_per_second": 394.426, + "eval_steps_per_second": 1.541, + "step": 1531000 + }, + { + "epoch": 6.23736491599356, + "grad_norm": 6.269240379333496, + "learning_rate": 3.3376808401496074e-05, + "loss": 7.5257, + "step": 1531100 + }, + { + "epoch": 6.237772294016941, + "grad_norm": 4.7655863761901855, + "learning_rate": 3.330910621483068e-05, + "loss": 7.5186, + "step": 1531200 + }, + { + "epoch": 6.238179672040323, + "grad_norm": 5.086463451385498, + "learning_rate": 3.324150174732555e-05, + "loss": 7.5303, + "step": 1531300 + }, + { + "epoch": 6.238587050063703, + "grad_norm": 5.014735221862793, + "learning_rate": 3.317399500165379e-05, + "loss": 7.5043, + "step": 1531400 + }, + { + "epoch": 6.238994428087085, + "grad_norm": 4.463014602661133, + "learning_rate": 3.31065859804846e-05, + "loss": 7.5297, + "step": 1531500 + }, + { + "epoch": 6.239401806110466, + "grad_norm": 2.4507670402526855, + "learning_rate": 3.303927468648234e-05, + "loss": 7.5197, + "step": 1531600 + }, + { + "epoch": 6.239809184133848, + "grad_norm": 8.728959083557129, + "learning_rate": 3.2972061122308544e-05, + "loss": 7.5108, + "step": 1531700 + }, + { + "epoch": 6.240216562157229, + "grad_norm": 4.4223504066467285, + "learning_rate": 3.29049452906207e-05, + "loss": 7.5097, + "step": 1531800 + }, + { + "epoch": 6.240623940180611, + "grad_norm": 3.243269681930542, + "learning_rate": 3.283792719407194e-05, + "loss": 7.5469, + "step": 1531900 + }, + { + "epoch": 6.2410313182039925, + "grad_norm": 14.289515495300293, + "learning_rate": 3.2771006835311846e-05, + "loss": 7.5332, + "step": 1532000 + }, + { + "epoch": 6.2410313182039925, + "eval_MaskedAccuracy": 0.5137271034820764, + "eval_loss": 1.579954981803894, + "eval_runtime": 165.4127, + "eval_samples_per_second": 383.743, + "eval_steps_per_second": 1.499, + "step": 1532000 + }, + { + "epoch": 6.241438696227374, + "grad_norm": 8.382390975952148, + "learning_rate": 3.2704184216986484e-05, + "loss": 7.5481, + "step": 1532100 + }, + { + "epoch": 6.2418460742507555, + "grad_norm": 5.989925861358643, + "learning_rate": 3.2637459341737396e-05, + "loss": 7.5342, + "step": 1532200 + }, + { + "epoch": 6.242253452274137, + "grad_norm": 7.9933762550354, + "learning_rate": 3.257083221220268e-05, + "loss": 7.509, + "step": 1532300 + }, + { + "epoch": 6.242660830297519, + "grad_norm": 12.534502983093262, + "learning_rate": 3.250430283101642e-05, + "loss": 7.5119, + "step": 1532400 + }, + { + "epoch": 6.2430682083209, + "grad_norm": 7.783627510070801, + "learning_rate": 3.2437871200809e-05, + "loss": 7.5304, + "step": 1532500 + }, + { + "epoch": 6.243475586344282, + "grad_norm": 10.44333553314209, + "learning_rate": 3.237153732420656e-05, + "loss": 7.52, + "step": 1532600 + }, + { + "epoch": 6.243882964367662, + "grad_norm": 8.387107849121094, + "learning_rate": 3.2305301203831845e-05, + "loss": 7.5243, + "step": 1532700 + }, + { + "epoch": 6.244290342391044, + "grad_norm": 9.744940757751465, + "learning_rate": 3.223916284230359e-05, + "loss": 7.5026, + "step": 1532800 + }, + { + "epoch": 6.244697720414425, + "grad_norm": 17.366146087646484, + "learning_rate": 3.217312224223639e-05, + "loss": 7.5411, + "step": 1532900 + }, + { + "epoch": 6.245105098437807, + "grad_norm": 11.222664833068848, + "learning_rate": 3.210717940624129e-05, + "loss": 7.5166, + "step": 1533000 + }, + { + "epoch": 6.245105098437807, + "eval_MaskedAccuracy": 0.5139523056378643, + "eval_loss": 1.584529995918274, + "eval_runtime": 162.7064, + "eval_samples_per_second": 390.126, + "eval_steps_per_second": 1.524, + "step": 1533000 + }, + { + "epoch": 6.2455124764611885, + "grad_norm": 10.221765518188477, + "learning_rate": 3.2041334336925136e-05, + "loss": 7.5256, + "step": 1533100 + }, + { + "epoch": 6.24591985448457, + "grad_norm": 6.283722877502441, + "learning_rate": 3.1975587036891684e-05, + "loss": 7.5122, + "step": 1533200 + }, + { + "epoch": 6.2463272325079515, + "grad_norm": 3.4898505210876465, + "learning_rate": 3.190993750873981e-05, + "loss": 7.5054, + "step": 1533300 + }, + { + "epoch": 6.246734610531333, + "grad_norm": 5.075881004333496, + "learning_rate": 3.1844385755065e-05, + "loss": 7.5287, + "step": 1533400 + }, + { + "epoch": 6.247141988554715, + "grad_norm": 7.452518939971924, + "learning_rate": 3.1778931778459104e-05, + "loss": 7.5467, + "step": 1533500 + }, + { + "epoch": 6.247549366578096, + "grad_norm": 5.194850444793701, + "learning_rate": 3.1713575581509666e-05, + "loss": 7.5174, + "step": 1533600 + }, + { + "epoch": 6.247956744601478, + "grad_norm": 5.435227394104004, + "learning_rate": 3.164831716680072e-05, + "loss": 7.5003, + "step": 1533700 + }, + { + "epoch": 6.248364122624859, + "grad_norm": 8.2947416305542, + "learning_rate": 3.15831565369123e-05, + "loss": 7.5104, + "step": 1533800 + }, + { + "epoch": 6.248771500648241, + "grad_norm": 11.16115951538086, + "learning_rate": 3.151809369442027e-05, + "loss": 7.5256, + "step": 1533900 + }, + { + "epoch": 6.249178878671621, + "grad_norm": 2.5040526390075684, + "learning_rate": 3.1453128641897385e-05, + "loss": 7.5049, + "step": 1534000 + }, + { + "epoch": 6.249178878671621, + "eval_MaskedAccuracy": 0.5138741488608206, + "eval_loss": 1.5921443700790405, + "eval_runtime": 186.762, + "eval_samples_per_second": 339.876, + "eval_steps_per_second": 1.328, + "step": 1534000 + }, + { + "epoch": 6.249586256695003, + "grad_norm": 7.071235179901123, + "learning_rate": 3.138826138191145e-05, + "loss": 7.4999, + "step": 1534100 + }, + { + "epoch": 6.249993634718384, + "grad_norm": 12.430635452270508, + "learning_rate": 3.1323491917027747e-05, + "loss": 7.5131, + "step": 1534200 + }, + { + "epoch": 6.250401012741766, + "grad_norm": 23.40462303161621, + "learning_rate": 3.125882024980636e-05, + "loss": 7.5258, + "step": 1534300 + }, + { + "epoch": 6.2508083907651475, + "grad_norm": 9.99567985534668, + "learning_rate": 3.119424638280426e-05, + "loss": 7.5132, + "step": 1534400 + }, + { + "epoch": 6.251215768788529, + "grad_norm": 7.518438816070557, + "learning_rate": 3.112977031857426e-05, + "loss": 7.505, + "step": 1534500 + }, + { + "epoch": 6.251623146811911, + "grad_norm": 5.996940612792969, + "learning_rate": 3.106539205966588e-05, + "loss": 7.4946, + "step": 1534600 + }, + { + "epoch": 6.252030524835292, + "grad_norm": 14.199740409851074, + "learning_rate": 3.100111160862424e-05, + "loss": 7.5317, + "step": 1534700 + }, + { + "epoch": 6.252437902858674, + "grad_norm": 9.197056770324707, + "learning_rate": 3.0936928967990237e-05, + "loss": 7.5315, + "step": 1534800 + }, + { + "epoch": 6.252845280882055, + "grad_norm": 16.446992874145508, + "learning_rate": 3.087284414030175e-05, + "loss": 7.5215, + "step": 1534900 + }, + { + "epoch": 6.253252658905437, + "grad_norm": 4.474786281585693, + "learning_rate": 3.08088571280925e-05, + "loss": 7.5534, + "step": 1535000 + }, + { + "epoch": 6.253252658905437, + "eval_MaskedAccuracy": 0.5135466027244717, + "eval_loss": 1.5894109010696411, + "eval_runtime": 151.873, + "eval_samples_per_second": 417.955, + "eval_steps_per_second": 1.633, + "step": 1535000 + }, + { + "epoch": 6.253660036928818, + "grad_norm": 5.775058746337891, + "learning_rate": 3.074496793389151e-05, + "loss": 7.5043, + "step": 1535100 + }, + { + "epoch": 6.254067414952199, + "grad_norm": 12.942586898803711, + "learning_rate": 3.068117656022559e-05, + "loss": 7.5248, + "step": 1535200 + }, + { + "epoch": 6.25447479297558, + "grad_norm": 2.3693482875823975, + "learning_rate": 3.061748300961626e-05, + "loss": 7.5243, + "step": 1535300 + }, + { + "epoch": 6.254882170998962, + "grad_norm": 2.6879940032958984, + "learning_rate": 3.055388728458171e-05, + "loss": 7.5414, + "step": 1535400 + }, + { + "epoch": 6.2552895490223435, + "grad_norm": 3.1551711559295654, + "learning_rate": 3.0490389387636312e-05, + "loss": 7.5223, + "step": 1535500 + }, + { + "epoch": 6.255696927045725, + "grad_norm": 2.420809745788574, + "learning_rate": 3.0426989321290488e-05, + "loss": 7.4876, + "step": 1535600 + }, + { + "epoch": 6.2561043050691065, + "grad_norm": 2.420135974884033, + "learning_rate": 3.0363687088050593e-05, + "loss": 7.5174, + "step": 1535700 + }, + { + "epoch": 6.256511683092488, + "grad_norm": 10.525727272033691, + "learning_rate": 3.0300482690419547e-05, + "loss": 7.5498, + "step": 1535800 + }, + { + "epoch": 6.25691906111587, + "grad_norm": 6.917478084564209, + "learning_rate": 3.023737613089595e-05, + "loss": 7.5377, + "step": 1535900 + }, + { + "epoch": 6.257326439139251, + "grad_norm": 7.227997779846191, + "learning_rate": 3.0174367411974718e-05, + "loss": 7.5425, + "step": 1536000 + }, + { + "epoch": 6.257326439139251, + "eval_MaskedAccuracy": 0.5133481368811607, + "eval_loss": 1.5899070501327515, + "eval_runtime": 167.0447, + "eval_samples_per_second": 379.994, + "eval_steps_per_second": 1.485, + "step": 1536000 + }, + { + "epoch": 6.257733817162633, + "grad_norm": 20.933719635009766, + "learning_rate": 3.01114565361472e-05, + "loss": 7.529, + "step": 1536100 + }, + { + "epoch": 6.258141195186014, + "grad_norm": 7.92340087890625, + "learning_rate": 3.00486435059006e-05, + "loss": 7.5243, + "step": 1536200 + }, + { + "epoch": 6.258548573209396, + "grad_norm": 3.8228554725646973, + "learning_rate": 2.9985928323717917e-05, + "loss": 7.5142, + "step": 1536300 + }, + { + "epoch": 6.258955951232776, + "grad_norm": 7.7100419998168945, + "learning_rate": 2.9923310992078577e-05, + "loss": 7.5011, + "step": 1536400 + }, + { + "epoch": 6.259363329256158, + "grad_norm": 13.193169593811035, + "learning_rate": 2.9860791513458714e-05, + "loss": 7.5286, + "step": 1536500 + }, + { + "epoch": 6.259770707279539, + "grad_norm": 14.40185260772705, + "learning_rate": 2.9798369890329746e-05, + "loss": 7.5324, + "step": 1536600 + }, + { + "epoch": 6.260178085302921, + "grad_norm": 2.7055928707122803, + "learning_rate": 2.9736046125159188e-05, + "loss": 7.514, + "step": 1536700 + }, + { + "epoch": 6.2605854633263025, + "grad_norm": 6.486224174499512, + "learning_rate": 2.967382022041179e-05, + "loss": 7.5186, + "step": 1536800 + }, + { + "epoch": 6.260992841349684, + "grad_norm": 4.19632625579834, + "learning_rate": 2.961169217854702e-05, + "loss": 7.523, + "step": 1536900 + }, + { + "epoch": 6.261400219373066, + "grad_norm": 10.856332778930664, + "learning_rate": 2.9549662002021355e-05, + "loss": 7.5569, + "step": 1537000 + }, + { + "epoch": 6.261400219373066, + "eval_MaskedAccuracy": 0.5135443953707618, + "eval_loss": 1.5890820026397705, + "eval_runtime": 180.6532, + "eval_samples_per_second": 351.369, + "eval_steps_per_second": 1.373, + "step": 1537000 + }, + { + "epoch": 6.261807597396447, + "grad_norm": 3.8834424018859863, + "learning_rate": 2.9487729693287067e-05, + "loss": 7.4853, + "step": 1537100 + }, + { + "epoch": 6.262214975419829, + "grad_norm": 10.020374298095703, + "learning_rate": 2.9425895254793066e-05, + "loss": 7.54, + "step": 1537200 + }, + { + "epoch": 6.26262235344321, + "grad_norm": 6.428145408630371, + "learning_rate": 2.936415868898367e-05, + "loss": 7.529, + "step": 1537300 + }, + { + "epoch": 6.263029731466592, + "grad_norm": 16.600547790527344, + "learning_rate": 2.9302519998299572e-05, + "loss": 7.5321, + "step": 1537400 + }, + { + "epoch": 6.263437109489973, + "grad_norm": 10.559361457824707, + "learning_rate": 2.924097918517798e-05, + "loss": 7.5275, + "step": 1537500 + }, + { + "epoch": 6.263844487513355, + "grad_norm": 19.23090171813965, + "learning_rate": 2.9179536252051844e-05, + "loss": 7.541, + "step": 1537600 + }, + { + "epoch": 6.264251865536735, + "grad_norm": 20.963829040527344, + "learning_rate": 2.9118191201349954e-05, + "loss": 7.5423, + "step": 1537700 + }, + { + "epoch": 6.264659243560117, + "grad_norm": 7.682584762573242, + "learning_rate": 2.9056944035498205e-05, + "loss": 7.5433, + "step": 1537800 + }, + { + "epoch": 6.2650666215834985, + "grad_norm": 6.028975009918213, + "learning_rate": 2.8995794756917608e-05, + "loss": 7.5453, + "step": 1537900 + }, + { + "epoch": 6.26547399960688, + "grad_norm": 32.538124084472656, + "learning_rate": 2.8934743368025824e-05, + "loss": 7.5045, + "step": 1538000 + }, + { + "epoch": 6.26547399960688, + "eval_MaskedAccuracy": 0.5139198570778778, + "eval_loss": 1.5882899761199951, + "eval_runtime": 167.5515, + "eval_samples_per_second": 378.845, + "eval_steps_per_second": 1.48, + "step": 1538000 + }, + { + "epoch": 6.2658813776302615, + "grad_norm": 24.366785049438477, + "learning_rate": 2.8873789871236668e-05, + "loss": 7.5334, + "step": 1538100 + }, + { + "epoch": 6.266288755653643, + "grad_norm": 21.920454025268555, + "learning_rate": 2.8812934268959743e-05, + "loss": 7.5098, + "step": 1538200 + }, + { + "epoch": 6.266696133677025, + "grad_norm": 10.484559059143066, + "learning_rate": 2.875217656360108e-05, + "loss": 7.523, + "step": 1538300 + }, + { + "epoch": 6.267103511700406, + "grad_norm": 9.51395320892334, + "learning_rate": 2.869151675756289e-05, + "loss": 7.5014, + "step": 1538400 + }, + { + "epoch": 6.267510889723788, + "grad_norm": 10.454301834106445, + "learning_rate": 2.8630954853243323e-05, + "loss": 7.5519, + "step": 1538500 + }, + { + "epoch": 6.267918267747169, + "grad_norm": 10.728215217590332, + "learning_rate": 2.8570490853036567e-05, + "loss": 7.5193, + "step": 1538600 + }, + { + "epoch": 6.268325645770551, + "grad_norm": 6.6606340408325195, + "learning_rate": 2.8510124759333137e-05, + "loss": 7.5209, + "step": 1538700 + }, + { + "epoch": 6.268733023793932, + "grad_norm": 10.098960876464844, + "learning_rate": 2.844985657451964e-05, + "loss": 7.5453, + "step": 1538800 + }, + { + "epoch": 6.269140401817314, + "grad_norm": 10.603069305419922, + "learning_rate": 2.83896863009789e-05, + "loss": 7.5287, + "step": 1538900 + }, + { + "epoch": 6.2695477798406944, + "grad_norm": 2.9649927616119385, + "learning_rate": 2.832961394108973e-05, + "loss": 7.5094, + "step": 1539000 + }, + { + "epoch": 6.2695477798406944, + "eval_MaskedAccuracy": 0.5134989380885026, + "eval_loss": 1.5853767395019531, + "eval_runtime": 158.3888, + "eval_samples_per_second": 400.761, + "eval_steps_per_second": 1.566, + "step": 1539000 + }, + { + "epoch": 6.269955157864076, + "grad_norm": 9.790529251098633, + "learning_rate": 2.826963949722687e-05, + "loss": 7.4891, + "step": 1539100 + }, + { + "epoch": 6.2703625358874575, + "grad_norm": 19.240970611572266, + "learning_rate": 2.8209762971761705e-05, + "loss": 7.5075, + "step": 1539200 + }, + { + "epoch": 6.270769913910839, + "grad_norm": 7.542076587677002, + "learning_rate": 2.8149984367061514e-05, + "loss": 7.537, + "step": 1539300 + }, + { + "epoch": 6.271177291934221, + "grad_norm": 7.113610744476318, + "learning_rate": 2.8090303685489716e-05, + "loss": 7.5219, + "step": 1539400 + }, + { + "epoch": 6.271584669957602, + "grad_norm": 14.183826446533203, + "learning_rate": 2.8030720929405435e-05, + "loss": 7.5073, + "step": 1539500 + }, + { + "epoch": 6.271992047980984, + "grad_norm": 13.633611679077148, + "learning_rate": 2.7971236101164614e-05, + "loss": 7.507, + "step": 1539600 + }, + { + "epoch": 6.272399426004365, + "grad_norm": 35.740501403808594, + "learning_rate": 2.791184920311897e-05, + "loss": 7.5294, + "step": 1539700 + }, + { + "epoch": 6.272806804027747, + "grad_norm": 9.583195686340332, + "learning_rate": 2.7852560237616386e-05, + "loss": 7.501, + "step": 1539800 + }, + { + "epoch": 6.273214182051128, + "grad_norm": 7.148852825164795, + "learning_rate": 2.7793369207000513e-05, + "loss": 7.5262, + "step": 1539900 + }, + { + "epoch": 6.27362156007451, + "grad_norm": 8.172830581665039, + "learning_rate": 2.773427611361206e-05, + "loss": 7.5402, + "step": 1540000 + }, + { + "epoch": 6.27362156007451, + "eval_MaskedAccuracy": 0.5130502717006363, + "eval_loss": 1.5896717309951782, + "eval_runtime": 162.3102, + "eval_samples_per_second": 391.078, + "eval_steps_per_second": 1.528, + "step": 1540000 + }, + { + "epoch": 6.274028938097891, + "grad_norm": 7.16539192199707, + "learning_rate": 2.7675280959786952e-05, + "loss": 7.5082, + "step": 1540100 + }, + { + "epoch": 6.274436316121272, + "grad_norm": 10.530797958374023, + "learning_rate": 2.761638374785754e-05, + "loss": 7.5043, + "step": 1540200 + }, + { + "epoch": 6.2748436941446535, + "grad_norm": 4.062539577484131, + "learning_rate": 2.755758448015251e-05, + "loss": 7.529, + "step": 1540300 + }, + { + "epoch": 6.275251072168035, + "grad_norm": 4.5090179443359375, + "learning_rate": 2.7498883158996543e-05, + "loss": 7.526, + "step": 1540400 + }, + { + "epoch": 6.2756584501914165, + "grad_norm": 5.734238147735596, + "learning_rate": 2.7440279786710342e-05, + "loss": 7.5248, + "step": 1540500 + }, + { + "epoch": 6.276065828214798, + "grad_norm": 4.6185784339904785, + "learning_rate": 2.7381774365610962e-05, + "loss": 7.5338, + "step": 1540600 + }, + { + "epoch": 6.27647320623818, + "grad_norm": 6.488243103027344, + "learning_rate": 2.732336689801083e-05, + "loss": 7.5309, + "step": 1540700 + }, + { + "epoch": 6.276880584261561, + "grad_norm": 16.597230911254883, + "learning_rate": 2.7265057386219804e-05, + "loss": 7.5216, + "step": 1540800 + }, + { + "epoch": 6.277287962284943, + "grad_norm": 13.147786140441895, + "learning_rate": 2.7206845832542878e-05, + "loss": 7.4982, + "step": 1540900 + }, + { + "epoch": 6.277695340308324, + "grad_norm": 3.276792526245117, + "learning_rate": 2.7148732239281367e-05, + "loss": 7.506, + "step": 1541000 + }, + { + "epoch": 6.277695340308324, + "eval_MaskedAccuracy": 0.5135812078446895, + "eval_loss": 1.5850106477737427, + "eval_runtime": 162.1925, + "eval_samples_per_second": 391.362, + "eval_steps_per_second": 1.529, + "step": 1541000 + }, + { + "epoch": 6.278102718331706, + "grad_norm": 14.958015441894531, + "learning_rate": 2.7090716608733243e-05, + "loss": 7.5147, + "step": 1541100 + }, + { + "epoch": 6.278510096355087, + "grad_norm": 4.234958648681641, + "learning_rate": 2.7032798943191818e-05, + "loss": 7.4796, + "step": 1541200 + }, + { + "epoch": 6.278917474378469, + "grad_norm": 4.868026256561279, + "learning_rate": 2.697497924494681e-05, + "loss": 7.5059, + "step": 1541300 + }, + { + "epoch": 6.2793248524018495, + "grad_norm": 10.32494831085205, + "learning_rate": 2.6917257516284467e-05, + "loss": 7.5118, + "step": 1541400 + }, + { + "epoch": 6.279732230425231, + "grad_norm": 4.418272495269775, + "learning_rate": 2.685963375948653e-05, + "loss": 7.5131, + "step": 1541500 + }, + { + "epoch": 6.2801396084486125, + "grad_norm": 6.862038612365723, + "learning_rate": 2.6802107976831325e-05, + "loss": 7.5109, + "step": 1541600 + }, + { + "epoch": 6.280546986471994, + "grad_norm": 2.388366460800171, + "learning_rate": 2.6744680170592958e-05, + "loss": 7.4737, + "step": 1541700 + }, + { + "epoch": 6.280954364495376, + "grad_norm": 2.647691488265991, + "learning_rate": 2.6687350343042097e-05, + "loss": 7.541, + "step": 1541800 + }, + { + "epoch": 6.281361742518757, + "grad_norm": 6.9391374588012695, + "learning_rate": 2.663011849644541e-05, + "loss": 7.5037, + "step": 1541900 + }, + { + "epoch": 6.281769120542139, + "grad_norm": 6.617556571960449, + "learning_rate": 2.6572984633065148e-05, + "loss": 7.5134, + "step": 1542000 + }, + { + "epoch": 6.281769120542139, + "eval_MaskedAccuracy": 0.5137466046271422, + "eval_loss": 1.5850129127502441, + "eval_runtime": 164.3074, + "eval_samples_per_second": 386.325, + "eval_steps_per_second": 1.509, + "step": 1542000 + }, + { + "epoch": 6.28217649856552, + "grad_norm": 3.4860963821411133, + "learning_rate": 2.6515948755160534e-05, + "loss": 7.5133, + "step": 1542100 + }, + { + "epoch": 6.282583876588902, + "grad_norm": 8.580883026123047, + "learning_rate": 2.645901086498634e-05, + "loss": 7.5207, + "step": 1542200 + }, + { + "epoch": 6.282991254612283, + "grad_norm": 3.5101170539855957, + "learning_rate": 2.640217096479354e-05, + "loss": 7.5088, + "step": 1542300 + }, + { + "epoch": 6.283398632635665, + "grad_norm": 2.9503724575042725, + "learning_rate": 2.634542905682933e-05, + "loss": 7.5506, + "step": 1542400 + }, + { + "epoch": 6.283806010659046, + "grad_norm": 6.986172676086426, + "learning_rate": 2.628878514333717e-05, + "loss": 7.5387, + "step": 1542500 + }, + { + "epoch": 6.284213388682428, + "grad_norm": 4.252118110656738, + "learning_rate": 2.6232239226556366e-05, + "loss": 7.5069, + "step": 1542600 + }, + { + "epoch": 6.2846207667058085, + "grad_norm": 2.297511100769043, + "learning_rate": 2.61757913087226e-05, + "loss": 7.4889, + "step": 1542700 + }, + { + "epoch": 6.28502814472919, + "grad_norm": 3.1588618755340576, + "learning_rate": 2.6119441392067372e-05, + "loss": 7.4981, + "step": 1542800 + }, + { + "epoch": 6.285435522752572, + "grad_norm": 6.755675792694092, + "learning_rate": 2.6063189478818877e-05, + "loss": 7.5002, + "step": 1542900 + }, + { + "epoch": 6.285842900775953, + "grad_norm": 6.133284091949463, + "learning_rate": 2.600703557120061e-05, + "loss": 7.5354, + "step": 1543000 + }, + { + "epoch": 6.285842900775953, + "eval_MaskedAccuracy": 0.5135435694030508, + "eval_loss": 1.5861613750457764, + "eval_runtime": 166.0879, + "eval_samples_per_second": 382.183, + "eval_steps_per_second": 1.493, + "step": 1543000 + }, + { + "epoch": 6.286250278799335, + "grad_norm": 19.410377502441406, + "learning_rate": 2.595097967143269e-05, + "loss": 7.4911, + "step": 1543100 + }, + { + "epoch": 6.286657656822716, + "grad_norm": 22.64906883239746, + "learning_rate": 2.5895021781731402e-05, + "loss": 7.5217, + "step": 1543200 + }, + { + "epoch": 6.287065034846098, + "grad_norm": 4.644437789916992, + "learning_rate": 2.5839161904309486e-05, + "loss": 7.4868, + "step": 1543300 + }, + { + "epoch": 6.287472412869479, + "grad_norm": 5.192335605621338, + "learning_rate": 2.5783400041374586e-05, + "loss": 7.5224, + "step": 1543400 + }, + { + "epoch": 6.287879790892861, + "grad_norm": 3.651285409927368, + "learning_rate": 2.572773619513192e-05, + "loss": 7.5123, + "step": 1543500 + }, + { + "epoch": 6.288287168916242, + "grad_norm": 3.404053211212158, + "learning_rate": 2.567217036778169e-05, + "loss": 7.5286, + "step": 1543600 + }, + { + "epoch": 6.288694546939624, + "grad_norm": 11.100665092468262, + "learning_rate": 2.5616702561521106e-05, + "loss": 7.5262, + "step": 1543700 + }, + { + "epoch": 6.289101924963005, + "grad_norm": 6.327223300933838, + "learning_rate": 2.556133277854287e-05, + "loss": 7.5167, + "step": 1543800 + }, + { + "epoch": 6.289509302986387, + "grad_norm": 6.676726818084717, + "learning_rate": 2.550606102103614e-05, + "loss": 7.5107, + "step": 1543900 + }, + { + "epoch": 6.2899166810097675, + "grad_norm": 3.6032087802886963, + "learning_rate": 2.5450887291186158e-05, + "loss": 7.5253, + "step": 1544000 + }, + { + "epoch": 6.2899166810097675, + "eval_MaskedAccuracy": 0.5136579686421746, + "eval_loss": 1.5964300632476807, + "eval_runtime": 153.8311, + "eval_samples_per_second": 412.634, + "eval_steps_per_second": 1.612, + "step": 1544000 + }, + { + "epoch": 6.290324059033149, + "grad_norm": 4.43002986907959, + "learning_rate": 2.5395811591173996e-05, + "loss": 7.5194, + "step": 1544100 + }, + { + "epoch": 6.290731437056531, + "grad_norm": 3.6076090335845947, + "learning_rate": 2.5340833923177492e-05, + "loss": 7.5204, + "step": 1544200 + }, + { + "epoch": 6.291138815079912, + "grad_norm": 5.127583980560303, + "learning_rate": 2.5285954289369706e-05, + "loss": 7.518, + "step": 1544300 + }, + { + "epoch": 6.291546193103294, + "grad_norm": 3.441530227661133, + "learning_rate": 2.5231172691920852e-05, + "loss": 7.4968, + "step": 1544400 + }, + { + "epoch": 6.291953571126675, + "grad_norm": 2.1325786113739014, + "learning_rate": 2.517648913299621e-05, + "loss": 7.492, + "step": 1544500 + }, + { + "epoch": 6.292360949150057, + "grad_norm": 6.765881061553955, + "learning_rate": 2.5121903614757894e-05, + "loss": 7.5395, + "step": 1544600 + }, + { + "epoch": 6.292768327173438, + "grad_norm": 3.79774808883667, + "learning_rate": 2.506741613936427e-05, + "loss": 7.5131, + "step": 1544700 + }, + { + "epoch": 6.29317570519682, + "grad_norm": 3.0351104736328125, + "learning_rate": 2.5013026708969013e-05, + "loss": 7.5093, + "step": 1544800 + }, + { + "epoch": 6.293583083220201, + "grad_norm": 6.138299465179443, + "learning_rate": 2.4958735325722757e-05, + "loss": 7.5447, + "step": 1544900 + }, + { + "epoch": 6.293990461243583, + "grad_norm": 7.833155155181885, + "learning_rate": 2.490454199177196e-05, + "loss": 7.5159, + "step": 1545000 + }, + { + "epoch": 6.293990461243583, + "eval_MaskedAccuracy": 0.5137496808109812, + "eval_loss": 1.5737380981445312, + "eval_runtime": 170.0133, + "eval_samples_per_second": 373.359, + "eval_steps_per_second": 1.459, + "step": 1545000 + }, + { + "epoch": 6.294397839266964, + "grad_norm": 2.2542214393615723, + "learning_rate": 2.485044670925896e-05, + "loss": 7.5076, + "step": 1545100 + }, + { + "epoch": 6.294805217290345, + "grad_norm": 3.1822574138641357, + "learning_rate": 2.479644948032277e-05, + "loss": 7.5026, + "step": 1545200 + }, + { + "epoch": 6.295212595313727, + "grad_norm": 5.585951328277588, + "learning_rate": 2.4742550307097957e-05, + "loss": 7.555, + "step": 1545300 + }, + { + "epoch": 6.295619973337108, + "grad_norm": 15.215363502502441, + "learning_rate": 2.468874919171529e-05, + "loss": 7.5263, + "step": 1545400 + }, + { + "epoch": 6.29602735136049, + "grad_norm": 8.550811767578125, + "learning_rate": 2.4635046136302315e-05, + "loss": 7.5195, + "step": 1545500 + }, + { + "epoch": 6.296434729383871, + "grad_norm": 4.234350681304932, + "learning_rate": 2.458144114298169e-05, + "loss": 7.528, + "step": 1545600 + }, + { + "epoch": 6.296842107407253, + "grad_norm": 3.0124824047088623, + "learning_rate": 2.4527934213873043e-05, + "loss": 7.5405, + "step": 1545700 + }, + { + "epoch": 6.297249485430634, + "grad_norm": 3.5371623039245605, + "learning_rate": 2.4474525351091507e-05, + "loss": 7.5232, + "step": 1545800 + }, + { + "epoch": 6.297656863454016, + "grad_norm": 4.27669620513916, + "learning_rate": 2.4421214556748977e-05, + "loss": 7.5253, + "step": 1545900 + }, + { + "epoch": 6.298064241477397, + "grad_norm": 15.751446723937988, + "learning_rate": 2.4368001832952584e-05, + "loss": 7.5517, + "step": 1546000 + }, + { + "epoch": 6.298064241477397, + "eval_MaskedAccuracy": 0.5144174046331489, + "eval_loss": 1.5825165510177612, + "eval_runtime": 154.0075, + "eval_samples_per_second": 412.162, + "eval_steps_per_second": 1.61, + "step": 1546000 + }, + { + "epoch": 6.298471619500779, + "grad_norm": 2.991913080215454, + "learning_rate": 2.4314887181806695e-05, + "loss": 7.5131, + "step": 1546100 + }, + { + "epoch": 6.29887899752416, + "grad_norm": 4.033088207244873, + "learning_rate": 2.426187060541071e-05, + "loss": 7.551, + "step": 1546200 + }, + { + "epoch": 6.299286375547542, + "grad_norm": 2.9316580295562744, + "learning_rate": 2.420895210586127e-05, + "loss": 7.5507, + "step": 1546300 + }, + { + "epoch": 6.2996937535709225, + "grad_norm": 17.982954025268555, + "learning_rate": 2.4156131685249995e-05, + "loss": 7.5173, + "step": 1546400 + }, + { + "epoch": 6.300101131594304, + "grad_norm": 2.853330135345459, + "learning_rate": 2.4103409345665162e-05, + "loss": 7.5133, + "step": 1546500 + }, + { + "epoch": 6.300508509617686, + "grad_norm": 16.396175384521484, + "learning_rate": 2.4050785089191555e-05, + "loss": 7.5211, + "step": 1546600 + }, + { + "epoch": 6.300915887641067, + "grad_norm": 3.7255969047546387, + "learning_rate": 2.399825891790913e-05, + "loss": 7.4858, + "step": 1546700 + }, + { + "epoch": 6.301323265664449, + "grad_norm": 4.791287422180176, + "learning_rate": 2.39458308338951e-05, + "loss": 7.5008, + "step": 1546800 + }, + { + "epoch": 6.30173064368783, + "grad_norm": 4.549746513366699, + "learning_rate": 2.389350083922225e-05, + "loss": 7.5088, + "step": 1546900 + }, + { + "epoch": 6.302138021711212, + "grad_norm": 4.2029924392700195, + "learning_rate": 2.384126893595865e-05, + "loss": 7.5219, + "step": 1547000 + }, + { + "epoch": 6.302138021711212, + "eval_MaskedAccuracy": 0.5134780985385201, + "eval_loss": 1.5871491432189941, + "eval_runtime": 183.8151, + "eval_samples_per_second": 345.325, + "eval_steps_per_second": 1.349, + "step": 1547000 + }, + { + "epoch": 6.302545399734593, + "grad_norm": 3.2818925380706787, + "learning_rate": 2.3789135126170228e-05, + "loss": 7.52, + "step": 1547100 + }, + { + "epoch": 6.302952777757975, + "grad_norm": 4.296736240386963, + "learning_rate": 2.3737099411917534e-05, + "loss": 7.5068, + "step": 1547200 + }, + { + "epoch": 6.303360155781356, + "grad_norm": 14.635024070739746, + "learning_rate": 2.368516179525814e-05, + "loss": 7.4914, + "step": 1547300 + }, + { + "epoch": 6.303767533804738, + "grad_norm": 3.2431273460388184, + "learning_rate": 2.3633322278245203e-05, + "loss": 7.5023, + "step": 1547400 + }, + { + "epoch": 6.304174911828119, + "grad_norm": 3.529853343963623, + "learning_rate": 2.3581580862928547e-05, + "loss": 7.5368, + "step": 1547500 + }, + { + "epoch": 6.304582289851501, + "grad_norm": 2.9401190280914307, + "learning_rate": 2.3529937551353235e-05, + "loss": 7.5288, + "step": 1547600 + }, + { + "epoch": 6.304989667874882, + "grad_norm": 5.547223091125488, + "learning_rate": 2.3478392345561626e-05, + "loss": 7.4997, + "step": 1547700 + }, + { + "epoch": 6.305397045898263, + "grad_norm": 4.2137932777404785, + "learning_rate": 2.3426945247591316e-05, + "loss": 7.496, + "step": 1547800 + }, + { + "epoch": 6.305804423921645, + "grad_norm": 3.1072821617126465, + "learning_rate": 2.3375596259476027e-05, + "loss": 7.5234, + "step": 1547900 + }, + { + "epoch": 6.306211801945026, + "grad_norm": 2.5990145206451416, + "learning_rate": 2.332434538324618e-05, + "loss": 7.498, + "step": 1548000 + }, + { + "epoch": 6.306211801945026, + "eval_MaskedAccuracy": 0.5140046198565413, + "eval_loss": 1.5883818864822388, + "eval_runtime": 158.6248, + "eval_samples_per_second": 400.164, + "eval_steps_per_second": 1.563, + "step": 1548000 + }, + { + "epoch": 6.306619179968408, + "grad_norm": 3.3525407314300537, + "learning_rate": 2.3273192620928023e-05, + "loss": 7.5136, + "step": 1548100 + }, + { + "epoch": 6.307026557991789, + "grad_norm": 20.4449520111084, + "learning_rate": 2.3222137974543414e-05, + "loss": 7.5274, + "step": 1548200 + }, + { + "epoch": 6.307433936015171, + "grad_norm": 3.1215569972991943, + "learning_rate": 2.3171181446111346e-05, + "loss": 7.5342, + "step": 1548300 + }, + { + "epoch": 6.307841314038552, + "grad_norm": 3.0884578227996826, + "learning_rate": 2.3120323037646223e-05, + "loss": 7.5303, + "step": 1548400 + }, + { + "epoch": 6.308248692061934, + "grad_norm": 3.8087260723114014, + "learning_rate": 2.3069562751158775e-05, + "loss": 7.56, + "step": 1548500 + }, + { + "epoch": 6.308656070085315, + "grad_norm": 8.330710411071777, + "learning_rate": 2.3018900588655905e-05, + "loss": 7.4897, + "step": 1548600 + }, + { + "epoch": 6.309063448108697, + "grad_norm": 4.615542888641357, + "learning_rate": 2.2968336552140338e-05, + "loss": 7.537, + "step": 1548700 + }, + { + "epoch": 6.309470826132078, + "grad_norm": 4.869720458984375, + "learning_rate": 2.2917870643611423e-05, + "loss": 7.5145, + "step": 1548800 + }, + { + "epoch": 6.30987820415546, + "grad_norm": 3.726224899291992, + "learning_rate": 2.2867502865064127e-05, + "loss": 7.4863, + "step": 1548900 + }, + { + "epoch": 6.310285582178841, + "grad_norm": 3.5653560161590576, + "learning_rate": 2.2817233218489566e-05, + "loss": 7.5168, + "step": 1549000 + }, + { + "epoch": 6.310285582178841, + "eval_MaskedAccuracy": 0.5135034707953396, + "eval_loss": 1.5824087858200073, + "eval_runtime": 160.2875, + "eval_samples_per_second": 396.013, + "eval_steps_per_second": 1.547, + "step": 1549000 + }, + { + "epoch": 6.310692960202222, + "grad_norm": 3.5148019790649414, + "learning_rate": 2.2767061705875443e-05, + "loss": 7.5085, + "step": 1549100 + }, + { + "epoch": 6.311100338225604, + "grad_norm": 4.2641119956970215, + "learning_rate": 2.271698832920565e-05, + "loss": 7.5447, + "step": 1549200 + }, + { + "epoch": 6.311507716248985, + "grad_norm": 4.688575744628906, + "learning_rate": 2.2667013090459345e-05, + "loss": 7.528, + "step": 1549300 + }, + { + "epoch": 6.311915094272367, + "grad_norm": 5.592088222503662, + "learning_rate": 2.2617135991612394e-05, + "loss": 7.5138, + "step": 1549400 + }, + { + "epoch": 6.312322472295748, + "grad_norm": 3.5355324745178223, + "learning_rate": 2.2567357034636735e-05, + "loss": 7.5125, + "step": 1549500 + }, + { + "epoch": 6.31272985031913, + "grad_norm": 3.9834372997283936, + "learning_rate": 2.2517676221500475e-05, + "loss": 7.5466, + "step": 1549600 + }, + { + "epoch": 6.313137228342511, + "grad_norm": 3.0744802951812744, + "learning_rate": 2.246809355416788e-05, + "loss": 7.5269, + "step": 1549700 + }, + { + "epoch": 6.313544606365893, + "grad_norm": 5.867956161499023, + "learning_rate": 2.2418609034598952e-05, + "loss": 7.5172, + "step": 1549800 + }, + { + "epoch": 6.313951984389274, + "grad_norm": 4.516674995422363, + "learning_rate": 2.2369222664750145e-05, + "loss": 7.5246, + "step": 1549900 + }, + { + "epoch": 6.314359362412656, + "grad_norm": 3.641747236251831, + "learning_rate": 2.2319934446574075e-05, + "loss": 7.5129, + "step": 1550000 + }, + { + "epoch": 6.314359362412656, + "eval_MaskedAccuracy": 0.5141479137000701, + "eval_loss": 1.585058331489563, + "eval_runtime": 177.9819, + "eval_samples_per_second": 356.643, + "eval_steps_per_second": 1.393, + "step": 1550000 + }, + { + "epoch": 6.3147667404360375, + "grad_norm": 3.76349139213562, + "learning_rate": 2.2270744382019154e-05, + "loss": 7.4931, + "step": 1550100 + }, + { + "epoch": 6.315174118459418, + "grad_norm": 2.777132511138916, + "learning_rate": 2.2221652473030475e-05, + "loss": 7.5116, + "step": 1550200 + }, + { + "epoch": 6.3155814964828, + "grad_norm": 21.24774742126465, + "learning_rate": 2.2172658721548655e-05, + "loss": 7.517, + "step": 1550300 + }, + { + "epoch": 6.315988874506181, + "grad_norm": 11.929756164550781, + "learning_rate": 2.2123763129510794e-05, + "loss": 7.5168, + "step": 1550400 + }, + { + "epoch": 6.316396252529563, + "grad_norm": 3.7070703506469727, + "learning_rate": 2.2074965698849787e-05, + "loss": 7.4922, + "step": 1550500 + }, + { + "epoch": 6.316803630552944, + "grad_norm": 11.011404037475586, + "learning_rate": 2.202626643149497e-05, + "loss": 7.5135, + "step": 1550600 + }, + { + "epoch": 6.317211008576326, + "grad_norm": 12.582701683044434, + "learning_rate": 2.1977665329371752e-05, + "loss": 7.5118, + "step": 1550700 + }, + { + "epoch": 6.317618386599707, + "grad_norm": 12.216609001159668, + "learning_rate": 2.192916239440169e-05, + "loss": 7.5235, + "step": 1550800 + }, + { + "epoch": 6.318025764623089, + "grad_norm": 7.560016632080078, + "learning_rate": 2.1880757628501945e-05, + "loss": 7.516, + "step": 1550900 + }, + { + "epoch": 6.31843314264647, + "grad_norm": 15.873871803283691, + "learning_rate": 2.1832451033586595e-05, + "loss": 7.5127, + "step": 1551000 + }, + { + "epoch": 6.31843314264647, + "eval_MaskedAccuracy": 0.5135422715831626, + "eval_loss": 1.5892564058303833, + "eval_runtime": 157.5923, + "eval_samples_per_second": 402.786, + "eval_steps_per_second": 1.574, + "step": 1551000 + }, + { + "epoch": 6.318840520669852, + "grad_norm": 4.224386215209961, + "learning_rate": 2.1784242611565508e-05, + "loss": 7.5359, + "step": 1551100 + }, + { + "epoch": 6.3192478986932334, + "grad_norm": 14.862460136413574, + "learning_rate": 2.1736132364344478e-05, + "loss": 7.5073, + "step": 1551200 + }, + { + "epoch": 6.319655276716615, + "grad_norm": 13.30049991607666, + "learning_rate": 2.1688120293825685e-05, + "loss": 7.4914, + "step": 1551300 + }, + { + "epoch": 6.320062654739996, + "grad_norm": 6.62608003616333, + "learning_rate": 2.164020640190713e-05, + "loss": 7.5474, + "step": 1551400 + }, + { + "epoch": 6.320470032763377, + "grad_norm": 8.579277992248535, + "learning_rate": 2.1592390690483152e-05, + "loss": 7.5181, + "step": 1551500 + }, + { + "epoch": 6.320877410786759, + "grad_norm": 6.059241771697998, + "learning_rate": 2.154467316144405e-05, + "loss": 7.4969, + "step": 1551600 + }, + { + "epoch": 6.32128478881014, + "grad_norm": 12.62930965423584, + "learning_rate": 2.1497053816676465e-05, + "loss": 7.5222, + "step": 1551700 + }, + { + "epoch": 6.321692166833522, + "grad_norm": 8.592183113098145, + "learning_rate": 2.1449532658063176e-05, + "loss": 7.5298, + "step": 1551800 + }, + { + "epoch": 6.322099544856903, + "grad_norm": 11.156678199768066, + "learning_rate": 2.1402109687482526e-05, + "loss": 7.505, + "step": 1551900 + }, + { + "epoch": 6.322506922880285, + "grad_norm": 7.913949489593506, + "learning_rate": 2.1354784906810086e-05, + "loss": 7.5041, + "step": 1552000 + }, + { + "epoch": 6.322506922880285, + "eval_MaskedAccuracy": 0.5137276939288936, + "eval_loss": 1.5954068899154663, + "eval_runtime": 157.1305, + "eval_samples_per_second": 403.97, + "eval_steps_per_second": 1.578, + "step": 1552000 + }, + { + "epoch": 6.322914300903666, + "grad_norm": 5.202408790588379, + "learning_rate": 2.130755831791617e-05, + "loss": 7.4768, + "step": 1552100 + }, + { + "epoch": 6.323321678927048, + "grad_norm": 9.220905303955078, + "learning_rate": 2.1260429922668334e-05, + "loss": 7.53, + "step": 1552200 + }, + { + "epoch": 6.323729056950429, + "grad_norm": 8.458250045776367, + "learning_rate": 2.1213399722929684e-05, + "loss": 7.4975, + "step": 1552300 + }, + { + "epoch": 6.324136434973811, + "grad_norm": 21.704702377319336, + "learning_rate": 2.1166467720559447e-05, + "loss": 7.5063, + "step": 1552400 + }, + { + "epoch": 6.3245438129971925, + "grad_norm": 5.595561504364014, + "learning_rate": 2.1119633917413532e-05, + "loss": 7.5118, + "step": 1552500 + }, + { + "epoch": 6.324951191020574, + "grad_norm": 16.391130447387695, + "learning_rate": 2.1072898315342825e-05, + "loss": 7.486, + "step": 1552600 + }, + { + "epoch": 6.325358569043955, + "grad_norm": 17.90876007080078, + "learning_rate": 2.1026260916195762e-05, + "loss": 7.5579, + "step": 1552700 + }, + { + "epoch": 6.325765947067336, + "grad_norm": 3.0364785194396973, + "learning_rate": 2.0979721721815476e-05, + "loss": 7.5202, + "step": 1552800 + }, + { + "epoch": 6.326173325090718, + "grad_norm": 2.3322737216949463, + "learning_rate": 2.093328073404243e-05, + "loss": 7.5142, + "step": 1552900 + }, + { + "epoch": 6.326580703114099, + "grad_norm": 7.759130477905273, + "learning_rate": 2.0886937954712498e-05, + "loss": 7.499, + "step": 1553000 + }, + { + "epoch": 6.326580703114099, + "eval_MaskedAccuracy": 0.5136656005610186, + "eval_loss": 1.5845599174499512, + "eval_runtime": 151.8068, + "eval_samples_per_second": 418.137, + "eval_steps_per_second": 1.634, + "step": 1553000 + }, + { + "epoch": 6.326988081137481, + "grad_norm": 5.862680912017822, + "learning_rate": 2.0840693385657803e-05, + "loss": 7.5128, + "step": 1553100 + }, + { + "epoch": 6.327395459160862, + "grad_norm": 3.3214495182037354, + "learning_rate": 2.0794547028706842e-05, + "loss": 7.501, + "step": 1553200 + }, + { + "epoch": 6.327802837184244, + "grad_norm": 4.727343559265137, + "learning_rate": 2.0748498885683912e-05, + "loss": 7.5259, + "step": 1553300 + }, + { + "epoch": 6.328210215207625, + "grad_norm": 12.341339111328125, + "learning_rate": 2.0702548958409484e-05, + "loss": 7.4982, + "step": 1553400 + }, + { + "epoch": 6.328617593231007, + "grad_norm": 9.620864868164062, + "learning_rate": 2.0656697248700478e-05, + "loss": 7.5229, + "step": 1553500 + }, + { + "epoch": 6.3290249712543885, + "grad_norm": 3.068958044052124, + "learning_rate": 2.06109437583692e-05, + "loss": 7.5123, + "step": 1553600 + }, + { + "epoch": 6.32943234927777, + "grad_norm": 27.00047492980957, + "learning_rate": 2.05652884892248e-05, + "loss": 7.4955, + "step": 1553700 + }, + { + "epoch": 6.3298397273011515, + "grad_norm": 12.979219436645508, + "learning_rate": 2.0519731443072178e-05, + "loss": 7.5298, + "step": 1553800 + }, + { + "epoch": 6.330247105324533, + "grad_norm": 3.2111220359802246, + "learning_rate": 2.0474272621712674e-05, + "loss": 7.4988, + "step": 1553900 + }, + { + "epoch": 6.330654483347914, + "grad_norm": 3.862499952316284, + "learning_rate": 2.0428912026943186e-05, + "loss": 7.524, + "step": 1554000 + }, + { + "epoch": 6.330654483347914, + "eval_MaskedAccuracy": 0.5132917372360736, + "eval_loss": 1.5841280221939087, + "eval_runtime": 151.9416, + "eval_samples_per_second": 417.766, + "eval_steps_per_second": 1.632, + "step": 1554000 + }, + { + "epoch": 6.331061861371295, + "grad_norm": 8.490737915039062, + "learning_rate": 2.0383649660557302e-05, + "loss": 7.5266, + "step": 1554100 + }, + { + "epoch": 6.331469239394677, + "grad_norm": 9.859129905700684, + "learning_rate": 2.033848552434442e-05, + "loss": 7.525, + "step": 1554200 + }, + { + "epoch": 6.331876617418058, + "grad_norm": 5.727023601531982, + "learning_rate": 2.029341962009012e-05, + "loss": 7.5259, + "step": 1554300 + }, + { + "epoch": 6.33228399544144, + "grad_norm": 11.75688648223877, + "learning_rate": 2.024845194957605e-05, + "loss": 7.4937, + "step": 1554400 + }, + { + "epoch": 6.332691373464821, + "grad_norm": 6.944560527801514, + "learning_rate": 2.0203582514580015e-05, + "loss": 7.5038, + "step": 1554500 + }, + { + "epoch": 6.333098751488203, + "grad_norm": 6.843235969543457, + "learning_rate": 2.0158811316875938e-05, + "loss": 7.5152, + "step": 1554600 + }, + { + "epoch": 6.333506129511584, + "grad_norm": 4.89893102645874, + "learning_rate": 2.0114138358234142e-05, + "loss": 7.5155, + "step": 1554700 + }, + { + "epoch": 6.333913507534966, + "grad_norm": 7.306215286254883, + "learning_rate": 2.006956364042024e-05, + "loss": 7.5191, + "step": 1554800 + }, + { + "epoch": 6.3343208855583475, + "grad_norm": 5.872547626495361, + "learning_rate": 2.0025087165197002e-05, + "loss": 7.4958, + "step": 1554900 + }, + { + "epoch": 6.334728263581729, + "grad_norm": 3.7879750728607178, + "learning_rate": 1.9980708934322866e-05, + "loss": 7.5321, + "step": 1555000 + }, + { + "epoch": 6.334728263581729, + "eval_MaskedAccuracy": 0.5140193556513327, + "eval_loss": 1.5914322137832642, + "eval_runtime": 153.473, + "eval_samples_per_second": 413.597, + "eval_steps_per_second": 1.616, + "step": 1555000 + }, + { + "epoch": 6.335135641605111, + "grad_norm": 8.08877182006836, + "learning_rate": 1.9936428949551795e-05, + "loss": 7.5091, + "step": 1555100 + }, + { + "epoch": 6.335543019628491, + "grad_norm": 6.396212577819824, + "learning_rate": 1.9892247212634958e-05, + "loss": 7.5296, + "step": 1555200 + }, + { + "epoch": 6.335950397651873, + "grad_norm": 5.900701522827148, + "learning_rate": 1.984816372531885e-05, + "loss": 7.508, + "step": 1555300 + }, + { + "epoch": 6.336357775675254, + "grad_norm": 3.2566726207733154, + "learning_rate": 1.9804178489346125e-05, + "loss": 7.5052, + "step": 1555400 + }, + { + "epoch": 6.336765153698636, + "grad_norm": 3.6948935985565186, + "learning_rate": 1.9760291506456324e-05, + "loss": 7.5557, + "step": 1555500 + }, + { + "epoch": 6.337172531722017, + "grad_norm": 6.9162068367004395, + "learning_rate": 1.9716502778384026e-05, + "loss": 7.5245, + "step": 1555600 + }, + { + "epoch": 6.337579909745399, + "grad_norm": 4.947198390960693, + "learning_rate": 1.9672812306860425e-05, + "loss": 7.5214, + "step": 1555700 + }, + { + "epoch": 6.33798728776878, + "grad_norm": 10.452269554138184, + "learning_rate": 1.9629220093613275e-05, + "loss": 7.5273, + "step": 1555800 + }, + { + "epoch": 6.338394665792162, + "grad_norm": 4.318188667297363, + "learning_rate": 1.9585726140365742e-05, + "loss": 7.4954, + "step": 1555900 + }, + { + "epoch": 6.3388020438155435, + "grad_norm": 13.407175064086914, + "learning_rate": 1.954233044883721e-05, + "loss": 7.5215, + "step": 1556000 + }, + { + "epoch": 6.3388020438155435, + "eval_MaskedAccuracy": 0.5139136207573256, + "eval_loss": 1.5885847806930542, + "eval_runtime": 152.5531, + "eval_samples_per_second": 416.091, + "eval_steps_per_second": 1.626, + "step": 1556000 + }, + { + "epoch": 6.339209421838925, + "grad_norm": 4.189796447753906, + "learning_rate": 1.9499033020743675e-05, + "loss": 7.4855, + "step": 1556100 + }, + { + "epoch": 6.3396167998623065, + "grad_norm": 3.5075576305389404, + "learning_rate": 1.9455833857796718e-05, + "loss": 7.5178, + "step": 1556200 + }, + { + "epoch": 6.340024177885688, + "grad_norm": 3.9251222610473633, + "learning_rate": 1.941273296170434e-05, + "loss": 7.5157, + "step": 1556300 + }, + { + "epoch": 6.340431555909069, + "grad_norm": 5.285815715789795, + "learning_rate": 1.9369730334170382e-05, + "loss": 7.5364, + "step": 1556400 + }, + { + "epoch": 6.34083893393245, + "grad_norm": 4.4611496925354, + "learning_rate": 1.9326825976895263e-05, + "loss": 7.5316, + "step": 1556500 + }, + { + "epoch": 6.341246311955832, + "grad_norm": 3.0326576232910156, + "learning_rate": 1.928401989157483e-05, + "loss": 7.5232, + "step": 1556600 + }, + { + "epoch": 6.341653689979213, + "grad_norm": 8.034393310546875, + "learning_rate": 1.9241312079901564e-05, + "loss": 7.4537, + "step": 1556700 + }, + { + "epoch": 6.342061068002595, + "grad_norm": 7.434968948364258, + "learning_rate": 1.9198702543564097e-05, + "loss": 7.5038, + "step": 1556800 + }, + { + "epoch": 6.342468446025976, + "grad_norm": 5.263147354125977, + "learning_rate": 1.9156191284246835e-05, + "loss": 7.523, + "step": 1556900 + }, + { + "epoch": 6.342875824049358, + "grad_norm": 8.264650344848633, + "learning_rate": 1.9113778303630574e-05, + "loss": 7.4962, + "step": 1557000 + }, + { + "epoch": 6.342875824049358, + "eval_MaskedAccuracy": 0.5140052365365326, + "eval_loss": 1.5736886262893677, + "eval_runtime": 153.4389, + "eval_samples_per_second": 413.689, + "eval_steps_per_second": 1.616, + "step": 1557000 + }, + { + "epoch": 6.343283202072739, + "grad_norm": 4.549633026123047, + "learning_rate": 1.9071463603392052e-05, + "loss": 7.5306, + "step": 1557100 + }, + { + "epoch": 6.343690580096121, + "grad_norm": 6.143858432769775, + "learning_rate": 1.9029247185204394e-05, + "loss": 7.516, + "step": 1557200 + }, + { + "epoch": 6.3440979581195025, + "grad_norm": 5.988174915313721, + "learning_rate": 1.898712905073646e-05, + "loss": 7.4866, + "step": 1557300 + }, + { + "epoch": 6.344505336142884, + "grad_norm": 7.643118858337402, + "learning_rate": 1.8945109201653615e-05, + "loss": 7.5416, + "step": 1557400 + }, + { + "epoch": 6.344912714166266, + "grad_norm": 6.2892656326293945, + "learning_rate": 1.8903187639616777e-05, + "loss": 7.5139, + "step": 1557500 + }, + { + "epoch": 6.345320092189647, + "grad_norm": 3.603832960128784, + "learning_rate": 1.886136436628375e-05, + "loss": 7.4909, + "step": 1557600 + }, + { + "epoch": 6.345727470213028, + "grad_norm": 5.11536979675293, + "learning_rate": 1.881963938330741e-05, + "loss": 7.5166, + "step": 1557700 + }, + { + "epoch": 6.346134848236409, + "grad_norm": 45.51047134399414, + "learning_rate": 1.877801269233813e-05, + "loss": 7.5349, + "step": 1557800 + }, + { + "epoch": 6.346542226259791, + "grad_norm": 33.30308532714844, + "learning_rate": 1.8736484295020995e-05, + "loss": 7.5352, + "step": 1557900 + }, + { + "epoch": 6.346949604283172, + "grad_norm": 16.877361297607422, + "learning_rate": 1.869505419299835e-05, + "loss": 7.5148, + "step": 1558000 + }, + { + "epoch": 6.346949604283172, + "eval_MaskedAccuracy": 0.5138252345160843, + "eval_loss": 1.5794929265975952, + "eval_runtime": 153.5837, + "eval_samples_per_second": 413.299, + "eval_steps_per_second": 1.615, + "step": 1558000 + }, + { + "epoch": 6.347356982306554, + "grad_norm": 26.333221435546875, + "learning_rate": 1.865372238790754e-05, + "loss": 7.5101, + "step": 1558100 + }, + { + "epoch": 6.347764360329935, + "grad_norm": 4.551407814025879, + "learning_rate": 1.8612488881383453e-05, + "loss": 7.5327, + "step": 1558200 + }, + { + "epoch": 6.348171738353317, + "grad_norm": 5.632599353790283, + "learning_rate": 1.857135367505535e-05, + "loss": 7.502, + "step": 1558300 + }, + { + "epoch": 6.3485791163766985, + "grad_norm": 8.501633644104004, + "learning_rate": 1.8530316770550363e-05, + "loss": 7.5343, + "step": 1558400 + }, + { + "epoch": 6.34898649440008, + "grad_norm": 4.900854587554932, + "learning_rate": 1.8489378169490323e-05, + "loss": 7.5467, + "step": 1558500 + }, + { + "epoch": 6.3493938724234615, + "grad_norm": 13.692998886108398, + "learning_rate": 1.8448537873493815e-05, + "loss": 7.5175, + "step": 1558600 + }, + { + "epoch": 6.349801250446843, + "grad_norm": 4.244467258453369, + "learning_rate": 1.840779588417597e-05, + "loss": 7.5005, + "step": 1558700 + }, + { + "epoch": 6.350208628470225, + "grad_norm": 25.029279708862305, + "learning_rate": 1.8367152203147006e-05, + "loss": 7.5113, + "step": 1558800 + }, + { + "epoch": 6.350616006493606, + "grad_norm": 4.887223243713379, + "learning_rate": 1.8326606832013838e-05, + "loss": 7.5298, + "step": 1558900 + }, + { + "epoch": 6.351023384516987, + "grad_norm": 3.9406700134277344, + "learning_rate": 1.8286159772379687e-05, + "loss": 7.5382, + "step": 1559000 + }, + { + "epoch": 6.351023384516987, + "eval_MaskedAccuracy": 0.5132906179554274, + "eval_loss": 1.5843143463134766, + "eval_runtime": 158.9723, + "eval_samples_per_second": 399.29, + "eval_steps_per_second": 1.56, + "step": 1559000 + }, + { + "epoch": 6.351430762540368, + "grad_norm": 6.848637104034424, + "learning_rate": 1.8245811025843708e-05, + "loss": 7.5133, + "step": 1559100 + }, + { + "epoch": 6.35183814056375, + "grad_norm": 8.351176261901855, + "learning_rate": 1.820556059400087e-05, + "loss": 7.5041, + "step": 1559200 + }, + { + "epoch": 6.352245518587131, + "grad_norm": 2.5494906902313232, + "learning_rate": 1.8165408478442223e-05, + "loss": 7.5183, + "step": 1559300 + }, + { + "epoch": 6.352652896610513, + "grad_norm": 4.4983038902282715, + "learning_rate": 1.812535468075584e-05, + "loss": 7.5481, + "step": 1559400 + }, + { + "epoch": 6.3530602746338944, + "grad_norm": 3.0655312538146973, + "learning_rate": 1.8085399202524756e-05, + "loss": 7.5151, + "step": 1559500 + }, + { + "epoch": 6.353467652657276, + "grad_norm": 4.258170127868652, + "learning_rate": 1.8045542045328738e-05, + "loss": 7.5208, + "step": 1559600 + }, + { + "epoch": 6.3538750306806575, + "grad_norm": 5.146317005157471, + "learning_rate": 1.800578321074391e-05, + "loss": 7.51, + "step": 1559700 + }, + { + "epoch": 6.354282408704039, + "grad_norm": 7.607929229736328, + "learning_rate": 1.79661227003414e-05, + "loss": 7.5159, + "step": 1559800 + }, + { + "epoch": 6.354689786727421, + "grad_norm": 4.449549674987793, + "learning_rate": 1.792656051568987e-05, + "loss": 7.4969, + "step": 1559900 + }, + { + "epoch": 6.355097164750802, + "grad_norm": 3.163651704788208, + "learning_rate": 1.7887096658353308e-05, + "loss": 7.5189, + "step": 1560000 + }, + { + "epoch": 6.355097164750802, + "eval_MaskedAccuracy": 0.513831468785368, + "eval_loss": 1.5857032537460327, + "eval_runtime": 161.4409, + "eval_samples_per_second": 393.184, + "eval_steps_per_second": 1.536, + "step": 1560000 + }, + { + "epoch": 6.355504542774184, + "grad_norm": 6.928320407867432, + "learning_rate": 1.7847731129891757e-05, + "loss": 7.5037, + "step": 1560100 + }, + { + "epoch": 6.355911920797564, + "grad_norm": 5.489813804626465, + "learning_rate": 1.7808463931861707e-05, + "loss": 7.5059, + "step": 1560200 + }, + { + "epoch": 6.356319298820946, + "grad_norm": 2.625778913497925, + "learning_rate": 1.776929506581545e-05, + "loss": 7.5187, + "step": 1560300 + }, + { + "epoch": 6.356726676844327, + "grad_norm": 4.402017593383789, + "learning_rate": 1.7730224533301418e-05, + "loss": 7.501, + "step": 1560400 + }, + { + "epoch": 6.357134054867709, + "grad_norm": 9.853277206420898, + "learning_rate": 1.7691252335864725e-05, + "loss": 7.4903, + "step": 1560500 + }, + { + "epoch": 6.35754143289109, + "grad_norm": 4.044744968414307, + "learning_rate": 1.7652378475045767e-05, + "loss": 7.5104, + "step": 1560600 + }, + { + "epoch": 6.357948810914472, + "grad_norm": 16.034936904907227, + "learning_rate": 1.76136029523816e-05, + "loss": 7.5028, + "step": 1560700 + }, + { + "epoch": 6.3583561889378535, + "grad_norm": 11.029666900634766, + "learning_rate": 1.757492576940514e-05, + "loss": 7.5431, + "step": 1560800 + }, + { + "epoch": 6.358763566961235, + "grad_norm": 7.335923671722412, + "learning_rate": 1.7536346927645733e-05, + "loss": 7.4897, + "step": 1560900 + }, + { + "epoch": 6.3591709449846165, + "grad_norm": 2.46474552154541, + "learning_rate": 1.7497866428628268e-05, + "loss": 7.5329, + "step": 1561000 + }, + { + "epoch": 6.3591709449846165, + "eval_MaskedAccuracy": 0.5139506078771523, + "eval_loss": 1.5861738920211792, + "eval_runtime": 163.5106, + "eval_samples_per_second": 388.207, + "eval_steps_per_second": 1.517, + "step": 1561000 + }, + { + "epoch": 6.359578323007998, + "grad_norm": 3.765188455581665, + "learning_rate": 1.745948427387403e-05, + "loss": 7.5007, + "step": 1561100 + }, + { + "epoch": 6.35998570103138, + "grad_norm": 3.644688606262207, + "learning_rate": 1.7421200464900977e-05, + "loss": 7.4808, + "step": 1561200 + }, + { + "epoch": 6.360393079054761, + "grad_norm": 3.6915690898895264, + "learning_rate": 1.738301500322235e-05, + "loss": 7.5105, + "step": 1561300 + }, + { + "epoch": 6.360800457078142, + "grad_norm": 8.23080062866211, + "learning_rate": 1.7344927890347818e-05, + "loss": 7.4961, + "step": 1561400 + }, + { + "epoch": 6.361207835101523, + "grad_norm": 2.4154865741729736, + "learning_rate": 1.7306939127783145e-05, + "loss": 7.5105, + "step": 1561500 + }, + { + "epoch": 6.361615213124905, + "grad_norm": 4.842697620391846, + "learning_rate": 1.726904871703026e-05, + "loss": 7.5211, + "step": 1561600 + }, + { + "epoch": 6.362022591148286, + "grad_norm": 3.899458885192871, + "learning_rate": 1.723125665958718e-05, + "loss": 7.5516, + "step": 1561700 + }, + { + "epoch": 6.362429969171668, + "grad_norm": 5.276438236236572, + "learning_rate": 1.71935629569483e-05, + "loss": 7.5145, + "step": 1561800 + }, + { + "epoch": 6.3628373471950495, + "grad_norm": 20.220172882080078, + "learning_rate": 1.7155967610603337e-05, + "loss": 7.5103, + "step": 1561900 + }, + { + "epoch": 6.363244725218431, + "grad_norm": 8.841887474060059, + "learning_rate": 1.7118470622038957e-05, + "loss": 7.529, + "step": 1562000 + }, + { + "epoch": 6.363244725218431, + "eval_MaskedAccuracy": 0.5141052740534775, + "eval_loss": 1.5849666595458984, + "eval_runtime": 160.3905, + "eval_samples_per_second": 395.759, + "eval_steps_per_second": 1.546, + "step": 1562000 + }, + { + "epoch": 6.3636521032418125, + "grad_norm": 15.423996925354004, + "learning_rate": 1.708107199273767e-05, + "loss": 7.4776, + "step": 1562100 + }, + { + "epoch": 6.364059481265194, + "grad_norm": 8.583870887756348, + "learning_rate": 1.7043771724177823e-05, + "loss": 7.4877, + "step": 1562200 + }, + { + "epoch": 6.364466859288576, + "grad_norm": 7.320174694061279, + "learning_rate": 1.7006569817834142e-05, + "loss": 7.5218, + "step": 1562300 + }, + { + "epoch": 6.364874237311957, + "grad_norm": 15.85832405090332, + "learning_rate": 1.6969466275177817e-05, + "loss": 7.5264, + "step": 1562400 + }, + { + "epoch": 6.365281615335339, + "grad_norm": 3.109506368637085, + "learning_rate": 1.6932461097674995e-05, + "loss": 7.4765, + "step": 1562500 + }, + { + "epoch": 6.36568899335872, + "grad_norm": 2.3691346645355225, + "learning_rate": 1.6895554286789353e-05, + "loss": 7.5039, + "step": 1562600 + }, + { + "epoch": 6.366096371382101, + "grad_norm": 10.584104537963867, + "learning_rate": 1.6858745843979863e-05, + "loss": 7.4992, + "step": 1562700 + }, + { + "epoch": 6.366503749405482, + "grad_norm": 7.2751946449279785, + "learning_rate": 1.6822035770701287e-05, + "loss": 7.5276, + "step": 1562800 + }, + { + "epoch": 6.366911127428864, + "grad_norm": 5.5859761238098145, + "learning_rate": 1.6785424068405414e-05, + "loss": 7.5109, + "step": 1562900 + }, + { + "epoch": 6.367318505452245, + "grad_norm": 14.105605125427246, + "learning_rate": 1.6748910738539864e-05, + "loss": 7.5131, + "step": 1563000 + }, + { + "epoch": 6.367318505452245, + "eval_MaskedAccuracy": 0.5144510631890555, + "eval_loss": 1.588063359260559, + "eval_runtime": 162.8593, + "eval_samples_per_second": 389.76, + "eval_steps_per_second": 1.523, + "step": 1563000 + }, + { + "epoch": 6.367725883475627, + "grad_norm": 10.73180103302002, + "learning_rate": 1.6712495782547813e-05, + "loss": 7.4896, + "step": 1563100 + }, + { + "epoch": 6.3681332614990085, + "grad_norm": 13.278154373168945, + "learning_rate": 1.667617920186908e-05, + "loss": 7.4768, + "step": 1563200 + }, + { + "epoch": 6.36854063952239, + "grad_norm": 11.352387428283691, + "learning_rate": 1.6639960997939677e-05, + "loss": 7.5252, + "step": 1563300 + }, + { + "epoch": 6.368948017545772, + "grad_norm": 4.686359882354736, + "learning_rate": 1.6603841172191113e-05, + "loss": 7.4968, + "step": 1563400 + }, + { + "epoch": 6.369355395569153, + "grad_norm": 9.022072792053223, + "learning_rate": 1.6567819726051885e-05, + "loss": 7.5334, + "step": 1563500 + }, + { + "epoch": 6.369762773592535, + "grad_norm": 5.520549774169922, + "learning_rate": 1.6531896660945527e-05, + "loss": 7.5159, + "step": 1563600 + }, + { + "epoch": 6.370170151615916, + "grad_norm": 4.639460563659668, + "learning_rate": 1.6496071978292727e-05, + "loss": 7.5079, + "step": 1563700 + }, + { + "epoch": 6.370577529639298, + "grad_norm": 2.850416660308838, + "learning_rate": 1.6460345679509558e-05, + "loss": 7.5264, + "step": 1563800 + }, + { + "epoch": 6.370984907662679, + "grad_norm": 5.534444808959961, + "learning_rate": 1.6424717766008685e-05, + "loss": 7.5156, + "step": 1563900 + }, + { + "epoch": 6.37139228568606, + "grad_norm": 23.55628776550293, + "learning_rate": 1.638918823919837e-05, + "loss": 7.5402, + "step": 1564000 + }, + { + "epoch": 6.37139228568606, + "eval_MaskedAccuracy": 0.5138514868274114, + "eval_loss": 1.5761414766311646, + "eval_runtime": 161.0599, + "eval_samples_per_second": 394.114, + "eval_steps_per_second": 1.54, + "step": 1564000 + }, + { + "epoch": 6.371799663709441, + "grad_norm": 8.773727416992188, + "learning_rate": 1.635375710048358e-05, + "loss": 7.5176, + "step": 1564100 + }, + { + "epoch": 6.372207041732823, + "grad_norm": 7.016404628753662, + "learning_rate": 1.6318424351264843e-05, + "loss": 7.5226, + "step": 1564200 + }, + { + "epoch": 6.3726144197562045, + "grad_norm": 5.481175422668457, + "learning_rate": 1.6283189992939363e-05, + "loss": 7.4929, + "step": 1564300 + }, + { + "epoch": 6.373021797779586, + "grad_norm": 10.70868968963623, + "learning_rate": 1.624805402689984e-05, + "loss": 7.5332, + "step": 1564400 + }, + { + "epoch": 6.3734291758029675, + "grad_norm": 3.794466972351074, + "learning_rate": 1.6213016454535483e-05, + "loss": 7.5421, + "step": 1564500 + }, + { + "epoch": 6.373836553826349, + "grad_norm": 23.68807601928711, + "learning_rate": 1.6178077277231848e-05, + "loss": 7.5147, + "step": 1564600 + }, + { + "epoch": 6.374243931849731, + "grad_norm": 6.079192638397217, + "learning_rate": 1.6143236496369767e-05, + "loss": 7.5127, + "step": 1564700 + }, + { + "epoch": 6.374651309873112, + "grad_norm": 8.446185111999512, + "learning_rate": 1.6108494113326774e-05, + "loss": 7.5139, + "step": 1564800 + }, + { + "epoch": 6.375058687896494, + "grad_norm": 3.0648036003112793, + "learning_rate": 1.607385012947678e-05, + "loss": 7.4996, + "step": 1564900 + }, + { + "epoch": 6.375466065919875, + "grad_norm": 4.1956915855407715, + "learning_rate": 1.6039304546189004e-05, + "loss": 7.5252, + "step": 1565000 + }, + { + "epoch": 6.375466065919875, + "eval_MaskedAccuracy": 0.5143203620581782, + "eval_loss": 1.5797115564346313, + "eval_runtime": 152.8501, + "eval_samples_per_second": 415.283, + "eval_steps_per_second": 1.623, + "step": 1565000 + }, + { + "epoch": 6.375873443943257, + "grad_norm": 3.805178642272949, + "learning_rate": 1.600485736482963e-05, + "loss": 7.5215, + "step": 1565100 + }, + { + "epoch": 6.376280821966637, + "grad_norm": 17.14861488342285, + "learning_rate": 1.5970508586760117e-05, + "loss": 7.5125, + "step": 1565200 + }, + { + "epoch": 6.376688199990019, + "grad_norm": 4.253889560699463, + "learning_rate": 1.5936258213338865e-05, + "loss": 7.5076, + "step": 1565300 + }, + { + "epoch": 6.3770955780134, + "grad_norm": 8.798266410827637, + "learning_rate": 1.590210624591957e-05, + "loss": 7.4796, + "step": 1565400 + }, + { + "epoch": 6.377502956036782, + "grad_norm": 5.30897855758667, + "learning_rate": 1.5868052685852874e-05, + "loss": 7.5318, + "step": 1565500 + }, + { + "epoch": 6.3779103340601635, + "grad_norm": 6.551238536834717, + "learning_rate": 1.583409753448474e-05, + "loss": 7.5349, + "step": 1565600 + }, + { + "epoch": 6.378317712083545, + "grad_norm": 8.6677885055542, + "learning_rate": 1.5800240793157513e-05, + "loss": 7.5319, + "step": 1565700 + }, + { + "epoch": 6.378725090106927, + "grad_norm": 3.4282755851745605, + "learning_rate": 1.576648246321022e-05, + "loss": 7.5404, + "step": 1565800 + }, + { + "epoch": 6.379132468130308, + "grad_norm": 5.345614910125732, + "learning_rate": 1.5732822545977163e-05, + "loss": 7.5257, + "step": 1565900 + }, + { + "epoch": 6.37953984615369, + "grad_norm": 5.274416923522949, + "learning_rate": 1.569926104278932e-05, + "loss": 7.5251, + "step": 1566000 + }, + { + "epoch": 6.37953984615369, + "eval_MaskedAccuracy": 0.5136792658576195, + "eval_loss": 1.5815542936325073, + "eval_runtime": 150.9756, + "eval_samples_per_second": 420.439, + "eval_steps_per_second": 1.643, + "step": 1566000 + }, + { + "epoch": 6.379947224177071, + "grad_norm": 3.6069445610046387, + "learning_rate": 1.566579795497325e-05, + "loss": 7.5443, + "step": 1566100 + }, + { + "epoch": 6.380354602200453, + "grad_norm": 3.70088529586792, + "learning_rate": 1.563243328385192e-05, + "loss": 7.5102, + "step": 1566200 + }, + { + "epoch": 6.380761980223834, + "grad_norm": 3.7640960216522217, + "learning_rate": 1.559916703074499e-05, + "loss": 7.512, + "step": 1566300 + }, + { + "epoch": 6.381169358247215, + "grad_norm": 7.848387241363525, + "learning_rate": 1.5565999196966772e-05, + "loss": 7.5263, + "step": 1566400 + }, + { + "epoch": 6.381576736270596, + "grad_norm": 4.667734622955322, + "learning_rate": 1.5532929783829444e-05, + "loss": 7.5417, + "step": 1566500 + }, + { + "epoch": 6.381984114293978, + "grad_norm": 7.917548656463623, + "learning_rate": 1.5499958792639604e-05, + "loss": 7.5363, + "step": 1566600 + }, + { + "epoch": 6.3823914923173595, + "grad_norm": 3.24745512008667, + "learning_rate": 1.546708622470142e-05, + "loss": 7.5244, + "step": 1566700 + }, + { + "epoch": 6.382798870340741, + "grad_norm": 7.438083648681641, + "learning_rate": 1.5434312081314263e-05, + "loss": 7.5467, + "step": 1566800 + }, + { + "epoch": 6.3832062483641225, + "grad_norm": 4.114525318145752, + "learning_rate": 1.540163636377395e-05, + "loss": 7.5005, + "step": 1566900 + }, + { + "epoch": 6.383613626387504, + "grad_norm": 9.119081497192383, + "learning_rate": 1.536905907337213e-05, + "loss": 7.5167, + "step": 1567000 + }, + { + "epoch": 6.383613626387504, + "eval_MaskedAccuracy": 0.5140174664665519, + "eval_loss": 1.583139181137085, + "eval_runtime": 158.7008, + "eval_samples_per_second": 399.973, + "eval_steps_per_second": 1.563, + "step": 1567000 + }, + { + "epoch": 6.384021004410886, + "grad_norm": 14.878480911254883, + "learning_rate": 1.5336580211396862e-05, + "loss": 7.5111, + "step": 1567100 + }, + { + "epoch": 6.384428382434267, + "grad_norm": 2.7665915489196777, + "learning_rate": 1.53041997791323e-05, + "loss": 7.5391, + "step": 1567200 + }, + { + "epoch": 6.384835760457649, + "grad_norm": 3.556154251098633, + "learning_rate": 1.527191777785848e-05, + "loss": 7.5148, + "step": 1567300 + }, + { + "epoch": 6.38524313848103, + "grad_norm": 3.4475274085998535, + "learning_rate": 1.5239734208852098e-05, + "loss": 7.525, + "step": 1567400 + }, + { + "epoch": 6.385650516504412, + "grad_norm": 5.463974475860596, + "learning_rate": 1.5207649073384855e-05, + "loss": 7.5549, + "step": 1567500 + }, + { + "epoch": 6.386057894527793, + "grad_norm": 12.315871238708496, + "learning_rate": 1.5175662372725715e-05, + "loss": 7.5281, + "step": 1567600 + }, + { + "epoch": 6.386465272551174, + "grad_norm": 4.808372974395752, + "learning_rate": 1.5143774108138895e-05, + "loss": 7.5086, + "step": 1567700 + }, + { + "epoch": 6.386872650574555, + "grad_norm": 3.1701083183288574, + "learning_rate": 1.5111984280885853e-05, + "loss": 7.4936, + "step": 1567800 + }, + { + "epoch": 6.387280028597937, + "grad_norm": 4.402795791625977, + "learning_rate": 1.508029289222251e-05, + "loss": 7.5438, + "step": 1567900 + }, + { + "epoch": 6.3876874066213185, + "grad_norm": 4.104953289031982, + "learning_rate": 1.5048699943402289e-05, + "loss": 7.5011, + "step": 1568000 + }, + { + "epoch": 6.3876874066213185, + "eval_MaskedAccuracy": 0.5136394569990271, + "eval_loss": 1.5913456678390503, + "eval_runtime": 161.1058, + "eval_samples_per_second": 394.002, + "eval_steps_per_second": 1.539, + "step": 1568000 + }, + { + "epoch": 6.3880947846447, + "grad_norm": 3.2469029426574707, + "learning_rate": 1.5017205435674459e-05, + "loss": 7.5338, + "step": 1568100 + }, + { + "epoch": 6.388502162668082, + "grad_norm": 7.085195541381836, + "learning_rate": 1.4985809370283602e-05, + "loss": 7.5311, + "step": 1568200 + }, + { + "epoch": 6.388909540691463, + "grad_norm": 6.949836254119873, + "learning_rate": 1.4954511748471229e-05, + "loss": 7.5079, + "step": 1568300 + }, + { + "epoch": 6.389316918714845, + "grad_norm": 6.27656888961792, + "learning_rate": 1.4923312571474701e-05, + "loss": 7.5389, + "step": 1568400 + }, + { + "epoch": 6.389724296738226, + "grad_norm": 9.262924194335938, + "learning_rate": 1.4892211840527508e-05, + "loss": 7.5219, + "step": 1568500 + }, + { + "epoch": 6.390131674761608, + "grad_norm": 3.329096555709839, + "learning_rate": 1.4861209556859257e-05, + "loss": 7.5307, + "step": 1568600 + }, + { + "epoch": 6.390539052784989, + "grad_norm": 8.52308177947998, + "learning_rate": 1.4830305721695655e-05, + "loss": 7.502, + "step": 1568700 + }, + { + "epoch": 6.390946430808371, + "grad_norm": 7.067248821258545, + "learning_rate": 1.4799500336258288e-05, + "loss": 7.5316, + "step": 1568800 + }, + { + "epoch": 6.391353808831752, + "grad_norm": 9.274039268493652, + "learning_rate": 1.4768793401765402e-05, + "loss": 7.5129, + "step": 1568900 + }, + { + "epoch": 6.391761186855133, + "grad_norm": 11.559528350830078, + "learning_rate": 1.4738184919430536e-05, + "loss": 7.5088, + "step": 1569000 + }, + { + "epoch": 6.391761186855133, + "eval_MaskedAccuracy": 0.5130918001902454, + "eval_loss": 1.5823763608932495, + "eval_runtime": 170.4927, + "eval_samples_per_second": 372.309, + "eval_steps_per_second": 1.455, + "step": 1569000 + }, + { + "epoch": 6.3921685648785145, + "grad_norm": 16.58714485168457, + "learning_rate": 1.4707674890464179e-05, + "loss": 7.4964, + "step": 1569100 + }, + { + "epoch": 6.392575942901896, + "grad_norm": 8.326848030090332, + "learning_rate": 1.467726331607267e-05, + "loss": 7.5183, + "step": 1569200 + }, + { + "epoch": 6.3929833209252775, + "grad_norm": 14.714838027954102, + "learning_rate": 1.4646950197457929e-05, + "loss": 7.4614, + "step": 1569300 + }, + { + "epoch": 6.393390698948659, + "grad_norm": 2.6382505893707275, + "learning_rate": 1.4616735535818535e-05, + "loss": 7.5017, + "step": 1569400 + }, + { + "epoch": 6.393798076972041, + "grad_norm": 5.560884952545166, + "learning_rate": 1.4586619332349186e-05, + "loss": 7.5287, + "step": 1569500 + }, + { + "epoch": 6.394205454995422, + "grad_norm": 13.803794860839844, + "learning_rate": 1.4556601588240438e-05, + "loss": 7.5174, + "step": 1569600 + }, + { + "epoch": 6.394612833018804, + "grad_norm": 4.421621799468994, + "learning_rate": 1.4526682304678949e-05, + "loss": 7.5351, + "step": 1569700 + }, + { + "epoch": 6.395020211042185, + "grad_norm": 9.589630126953125, + "learning_rate": 1.4496861482847798e-05, + "loss": 7.5342, + "step": 1569800 + }, + { + "epoch": 6.395427589065567, + "grad_norm": 4.070805072784424, + "learning_rate": 1.446713912392588e-05, + "loss": 7.4844, + "step": 1569900 + }, + { + "epoch": 6.395834967088948, + "grad_norm": 12.339241981506348, + "learning_rate": 1.4437515229088253e-05, + "loss": 7.52, + "step": 1570000 + }, + { + "epoch": 6.395834967088948, + "eval_MaskedAccuracy": 0.5141620454833628, + "eval_loss": 1.5788440704345703, + "eval_runtime": 175.9873, + "eval_samples_per_second": 360.685, + "eval_steps_per_second": 1.409, + "step": 1570000 + }, + { + "epoch": 6.39624234511233, + "grad_norm": 7.558104038238525, + "learning_rate": 1.4407989799506074e-05, + "loss": 7.4956, + "step": 1570100 + }, + { + "epoch": 6.3966497231357105, + "grad_norm": 6.623258113861084, + "learning_rate": 1.4378562836346616e-05, + "loss": 7.5005, + "step": 1570200 + }, + { + "epoch": 6.397057101159092, + "grad_norm": 9.98888874053955, + "learning_rate": 1.4349234340773585e-05, + "loss": 7.4881, + "step": 1570300 + }, + { + "epoch": 6.3974644791824735, + "grad_norm": 5.980010986328125, + "learning_rate": 1.4320004313945938e-05, + "loss": 7.5196, + "step": 1570400 + }, + { + "epoch": 6.397871857205855, + "grad_norm": 4.522244453430176, + "learning_rate": 1.429087275701985e-05, + "loss": 7.5148, + "step": 1570500 + }, + { + "epoch": 6.398279235229237, + "grad_norm": 2.6743404865264893, + "learning_rate": 1.4261839671146826e-05, + "loss": 7.5055, + "step": 1570600 + }, + { + "epoch": 6.398686613252618, + "grad_norm": 3.34663987159729, + "learning_rate": 1.4232905057474483e-05, + "loss": 7.5199, + "step": 1570700 + }, + { + "epoch": 6.399093991276, + "grad_norm": 10.206490516662598, + "learning_rate": 1.4204068917147113e-05, + "loss": 7.5251, + "step": 1570800 + }, + { + "epoch": 6.399501369299381, + "grad_norm": 4.575131893157959, + "learning_rate": 1.4175331251304563e-05, + "loss": 7.5042, + "step": 1570900 + }, + { + "epoch": 6.399908747322763, + "grad_norm": 2.9370667934417725, + "learning_rate": 1.4146692061082811e-05, + "loss": 7.5048, + "step": 1571000 + }, + { + "epoch": 6.399908747322763, + "eval_MaskedAccuracy": 0.5134899479556091, + "eval_loss": 1.5859872102737427, + "eval_runtime": 152.4059, + "eval_samples_per_second": 416.493, + "eval_steps_per_second": 1.627, + "step": 1571000 + }, + { + "epoch": 6.400316125346144, + "grad_norm": 4.213624477386475, + "learning_rate": 1.411815134761482e-05, + "loss": 7.5082, + "step": 1571100 + }, + { + "epoch": 6.400723503369526, + "grad_norm": 12.737908363342285, + "learning_rate": 1.4089709112028228e-05, + "loss": 7.4791, + "step": 1571200 + }, + { + "epoch": 6.401130881392907, + "grad_norm": 9.245916366577148, + "learning_rate": 1.4061365355447684e-05, + "loss": 7.4917, + "step": 1571300 + }, + { + "epoch": 6.401538259416288, + "grad_norm": 6.643359184265137, + "learning_rate": 1.4033120078993914e-05, + "loss": 7.5082, + "step": 1571400 + }, + { + "epoch": 6.4019456374396695, + "grad_norm": 5.419043064117432, + "learning_rate": 1.400497328378352e-05, + "loss": 7.5098, + "step": 1571500 + }, + { + "epoch": 6.402353015463051, + "grad_norm": 3.9829065799713135, + "learning_rate": 1.3976924970929485e-05, + "loss": 7.4848, + "step": 1571600 + }, + { + "epoch": 6.4027603934864326, + "grad_norm": 6.7930827140808105, + "learning_rate": 1.3948975141540344e-05, + "loss": 7.5118, + "step": 1571700 + }, + { + "epoch": 6.403167771509814, + "grad_norm": 9.234661102294922, + "learning_rate": 1.3921123796721353e-05, + "loss": 7.5022, + "step": 1571800 + }, + { + "epoch": 6.403575149533196, + "grad_norm": 7.889405250549316, + "learning_rate": 1.3893370937573576e-05, + "loss": 7.5065, + "step": 1571900 + }, + { + "epoch": 6.403982527556577, + "grad_norm": 3.2302517890930176, + "learning_rate": 1.3865716565194205e-05, + "loss": 7.5146, + "step": 1572000 + }, + { + "epoch": 6.403982527556577, + "eval_MaskedAccuracy": 0.5136081210469635, + "eval_loss": 1.5881190299987793, + "eval_runtime": 164.6993, + "eval_samples_per_second": 385.405, + "eval_steps_per_second": 1.506, + "step": 1572000 + }, + { + "epoch": 6.404389905579959, + "grad_norm": 5.261856555938721, + "learning_rate": 1.3838160680676323e-05, + "loss": 7.5059, + "step": 1572100 + }, + { + "epoch": 6.40479728360334, + "grad_norm": 4.127289772033691, + "learning_rate": 1.38107032851099e-05, + "loss": 7.5274, + "step": 1572200 + }, + { + "epoch": 6.405204661626722, + "grad_norm": 3.562062978744507, + "learning_rate": 1.3783344379580229e-05, + "loss": 7.5379, + "step": 1572300 + }, + { + "epoch": 6.405612039650103, + "grad_norm": 3.4243810176849365, + "learning_rate": 1.3756083965168728e-05, + "loss": 7.4834, + "step": 1572400 + }, + { + "epoch": 6.406019417673485, + "grad_norm": 4.1015143394470215, + "learning_rate": 1.3728922042953475e-05, + "loss": 7.5272, + "step": 1572500 + }, + { + "epoch": 6.406426795696866, + "grad_norm": 3.1138134002685547, + "learning_rate": 1.3701858614008138e-05, + "loss": 7.4946, + "step": 1572600 + }, + { + "epoch": 6.406834173720247, + "grad_norm": 5.690712928771973, + "learning_rate": 1.3674893679402758e-05, + "loss": 7.5256, + "step": 1572700 + }, + { + "epoch": 6.4072415517436285, + "grad_norm": 5.451653003692627, + "learning_rate": 1.3648027240203498e-05, + "loss": 7.5216, + "step": 1572800 + }, + { + "epoch": 6.40764892976701, + "grad_norm": 5.183475494384766, + "learning_rate": 1.3621259297472122e-05, + "loss": 7.5454, + "step": 1572900 + }, + { + "epoch": 6.408056307790392, + "grad_norm": 4.239622592926025, + "learning_rate": 1.3594589852267604e-05, + "loss": 7.4933, + "step": 1573000 + }, + { + "epoch": 6.408056307790392, + "eval_MaskedAccuracy": 0.5141377363086594, + "eval_loss": 1.58185875415802, + "eval_runtime": 223.9942, + "eval_samples_per_second": 283.382, + "eval_steps_per_second": 1.107, + "step": 1573000 + }, + { + "epoch": 6.408463685813773, + "grad_norm": 4.208160877227783, + "learning_rate": 1.3568018905643638e-05, + "loss": 7.5127, + "step": 1573100 + }, + { + "epoch": 6.408871063837155, + "grad_norm": 5.692768573760986, + "learning_rate": 1.3541546458650899e-05, + "loss": 7.5169, + "step": 1573200 + }, + { + "epoch": 6.409278441860536, + "grad_norm": 15.91248893737793, + "learning_rate": 1.3515172512336166e-05, + "loss": 7.5104, + "step": 1573300 + }, + { + "epoch": 6.409685819883918, + "grad_norm": 5.291548252105713, + "learning_rate": 1.3488897067742353e-05, + "loss": 7.5227, + "step": 1573400 + }, + { + "epoch": 6.410093197907299, + "grad_norm": 7.054795265197754, + "learning_rate": 1.3462720125907653e-05, + "loss": 7.502, + "step": 1573500 + }, + { + "epoch": 6.410500575930681, + "grad_norm": 8.391228675842285, + "learning_rate": 1.3436641687867511e-05, + "loss": 7.5509, + "step": 1573600 + }, + { + "epoch": 6.410907953954062, + "grad_norm": 2.310081720352173, + "learning_rate": 1.3410661754652617e-05, + "loss": 7.5209, + "step": 1573700 + }, + { + "epoch": 6.411315331977444, + "grad_norm": 5.13197135925293, + "learning_rate": 1.3384780327290112e-05, + "loss": 7.5029, + "step": 1573800 + }, + { + "epoch": 6.411722710000825, + "grad_norm": 2.7105376720428467, + "learning_rate": 1.3358997406803501e-05, + "loss": 7.5187, + "step": 1573900 + }, + { + "epoch": 6.412130088024206, + "grad_norm": 5.311029434204102, + "learning_rate": 1.333331299421189e-05, + "loss": 7.4873, + "step": 1574000 + }, + { + "epoch": 6.412130088024206, + "eval_MaskedAccuracy": 0.5134961140972867, + "eval_loss": 1.5884283781051636, + "eval_runtime": 165.2433, + "eval_samples_per_second": 384.137, + "eval_steps_per_second": 1.501, + "step": 1574000 + }, + { + "epoch": 6.412537466047588, + "grad_norm": 4.1327900886535645, + "learning_rate": 1.3307727090530741e-05, + "loss": 7.4987, + "step": 1574100 + }, + { + "epoch": 6.412944844070969, + "grad_norm": 3.0449063777923584, + "learning_rate": 1.328223969677193e-05, + "loss": 7.5378, + "step": 1574200 + }, + { + "epoch": 6.413352222094351, + "grad_norm": 11.36555290222168, + "learning_rate": 1.325685081394237e-05, + "loss": 7.4879, + "step": 1574300 + }, + { + "epoch": 6.413759600117732, + "grad_norm": 3.7845590114593506, + "learning_rate": 1.3231560443046744e-05, + "loss": 7.5143, + "step": 1574400 + }, + { + "epoch": 6.414166978141114, + "grad_norm": 13.816906929016113, + "learning_rate": 1.3206368585084195e-05, + "loss": 7.5011, + "step": 1574500 + }, + { + "epoch": 6.414574356164495, + "grad_norm": 12.928583145141602, + "learning_rate": 1.318127524105082e-05, + "loss": 7.4846, + "step": 1574600 + }, + { + "epoch": 6.414981734187877, + "grad_norm": 9.984003067016602, + "learning_rate": 1.3156280411938826e-05, + "loss": 7.5424, + "step": 1574700 + }, + { + "epoch": 6.415389112211258, + "grad_norm": 15.186087608337402, + "learning_rate": 1.3131384098736561e-05, + "loss": 7.5193, + "step": 1574800 + }, + { + "epoch": 6.41579649023464, + "grad_norm": 3.359351634979248, + "learning_rate": 1.310658630242818e-05, + "loss": 7.5105, + "step": 1574900 + }, + { + "epoch": 6.416203868258021, + "grad_norm": 5.87614631652832, + "learning_rate": 1.3081887023994011e-05, + "loss": 7.5329, + "step": 1575000 + }, + { + "epoch": 6.416203868258021, + "eval_MaskedAccuracy": 0.5141055018772199, + "eval_loss": 1.5858298540115356, + "eval_runtime": 160.8309, + "eval_samples_per_second": 394.675, + "eval_steps_per_second": 1.542, + "step": 1575000 + }, + { + "epoch": 6.416611246281403, + "grad_norm": 3.7395567893981934, + "learning_rate": 1.3057286264410479e-05, + "loss": 7.5396, + "step": 1575100 + }, + { + "epoch": 6.4170186243047835, + "grad_norm": 7.1665940284729, + "learning_rate": 1.3032784024650144e-05, + "loss": 7.5205, + "step": 1575200 + }, + { + "epoch": 6.417426002328165, + "grad_norm": 2.775817632675171, + "learning_rate": 1.3008380305681945e-05, + "loss": 7.5332, + "step": 1575300 + }, + { + "epoch": 6.417833380351547, + "grad_norm": 11.79802131652832, + "learning_rate": 1.2984075108470668e-05, + "loss": 7.4973, + "step": 1575400 + }, + { + "epoch": 6.418240758374928, + "grad_norm": 18.12425994873047, + "learning_rate": 1.2959868433977236e-05, + "loss": 7.523, + "step": 1575500 + }, + { + "epoch": 6.41864813639831, + "grad_norm": 4.883094787597656, + "learning_rate": 1.2935760283158411e-05, + "loss": 7.5362, + "step": 1575600 + }, + { + "epoch": 6.419055514421691, + "grad_norm": 5.846532344818115, + "learning_rate": 1.2911750656967353e-05, + "loss": 7.5088, + "step": 1575700 + }, + { + "epoch": 6.419462892445073, + "grad_norm": 3.527452230453491, + "learning_rate": 1.2887839556353617e-05, + "loss": 7.5245, + "step": 1575800 + }, + { + "epoch": 6.419870270468454, + "grad_norm": 15.958311080932617, + "learning_rate": 1.286402698226233e-05, + "loss": 7.5036, + "step": 1575900 + }, + { + "epoch": 6.420277648491836, + "grad_norm": 8.36783218383789, + "learning_rate": 1.2840312935635002e-05, + "loss": 7.5173, + "step": 1576000 + }, + { + "epoch": 6.420277648491836, + "eval_MaskedAccuracy": 0.5133222906597588, + "eval_loss": 1.5864717960357666, + "eval_runtime": 157.9601, + "eval_samples_per_second": 401.848, + "eval_steps_per_second": 1.57, + "step": 1576000 + }, + { + "epoch": 6.420685026515217, + "grad_norm": 7.594944000244141, + "learning_rate": 1.2816697417408736e-05, + "loss": 7.499, + "step": 1576100 + }, + { + "epoch": 6.421092404538599, + "grad_norm": 8.242910385131836, + "learning_rate": 1.2793180428517868e-05, + "loss": 7.4981, + "step": 1576200 + }, + { + "epoch": 6.42149978256198, + "grad_norm": 13.289484977722168, + "learning_rate": 1.2769761969891999e-05, + "loss": 7.5153, + "step": 1576300 + }, + { + "epoch": 6.421907160585361, + "grad_norm": 5.258137226104736, + "learning_rate": 1.2746442042456588e-05, + "loss": 7.5096, + "step": 1576400 + }, + { + "epoch": 6.422314538608743, + "grad_norm": 12.008098602294922, + "learning_rate": 1.2723220647133773e-05, + "loss": 7.5106, + "step": 1576500 + }, + { + "epoch": 6.422721916632124, + "grad_norm": 12.856749534606934, + "learning_rate": 1.2700097784841802e-05, + "loss": 7.504, + "step": 1576600 + }, + { + "epoch": 6.423129294655506, + "grad_norm": 12.059041976928711, + "learning_rate": 1.2677073456494521e-05, + "loss": 7.5214, + "step": 1576700 + }, + { + "epoch": 6.423536672678887, + "grad_norm": 10.986212730407715, + "learning_rate": 1.265414766300243e-05, + "loss": 7.5158, + "step": 1576800 + }, + { + "epoch": 6.423944050702269, + "grad_norm": 12.987505912780762, + "learning_rate": 1.263132040527213e-05, + "loss": 7.4991, + "step": 1576900 + }, + { + "epoch": 6.42435142872565, + "grad_norm": 8.581718444824219, + "learning_rate": 1.2608591684205837e-05, + "loss": 7.5328, + "step": 1577000 + }, + { + "epoch": 6.42435142872565, + "eval_MaskedAccuracy": 0.5135896450978338, + "eval_loss": 1.5826035737991333, + "eval_runtime": 153.6081, + "eval_samples_per_second": 413.233, + "eval_steps_per_second": 1.614, + "step": 1577000 + }, + { + "epoch": 6.424758806749032, + "grad_norm": 5.405650615692139, + "learning_rate": 1.2585961500701854e-05, + "loss": 7.4957, + "step": 1577100 + }, + { + "epoch": 6.425166184772413, + "grad_norm": 10.33969497680664, + "learning_rate": 1.2563429855655443e-05, + "loss": 7.5112, + "step": 1577200 + }, + { + "epoch": 6.425573562795795, + "grad_norm": 20.685165405273438, + "learning_rate": 1.2540996749957157e-05, + "loss": 7.5436, + "step": 1577300 + }, + { + "epoch": 6.425980940819176, + "grad_norm": 8.307595252990723, + "learning_rate": 1.251866218449396e-05, + "loss": 7.5399, + "step": 1577400 + }, + { + "epoch": 6.426388318842558, + "grad_norm": 9.786670684814453, + "learning_rate": 1.2496426160148366e-05, + "loss": 7.4785, + "step": 1577500 + }, + { + "epoch": 6.426795696865939, + "grad_norm": 17.664812088012695, + "learning_rate": 1.2474288677800156e-05, + "loss": 7.484, + "step": 1577600 + }, + { + "epoch": 6.42720307488932, + "grad_norm": 17.866188049316406, + "learning_rate": 1.2452249738324344e-05, + "loss": 7.5178, + "step": 1577700 + }, + { + "epoch": 6.427610452912702, + "grad_norm": 10.082836151123047, + "learning_rate": 1.2430309342592104e-05, + "loss": 7.5313, + "step": 1577800 + }, + { + "epoch": 6.428017830936083, + "grad_norm": 7.397581577301025, + "learning_rate": 1.2408467491471011e-05, + "loss": 7.5053, + "step": 1577900 + }, + { + "epoch": 6.428425208959465, + "grad_norm": 17.505754470825195, + "learning_rate": 1.2386724185824467e-05, + "loss": 7.485, + "step": 1578000 + }, + { + "epoch": 6.428425208959465, + "eval_MaskedAccuracy": 0.514182775242332, + "eval_loss": 1.5848979949951172, + "eval_runtime": 152.7666, + "eval_samples_per_second": 415.51, + "eval_steps_per_second": 1.623, + "step": 1578000 + }, + { + "epoch": 6.428832586982846, + "grad_norm": 19.836593627929688, + "learning_rate": 1.2365079426512016e-05, + "loss": 7.5257, + "step": 1578100 + }, + { + "epoch": 6.429239965006228, + "grad_norm": 5.72556734085083, + "learning_rate": 1.2343533214389587e-05, + "loss": 7.511, + "step": 1578200 + }, + { + "epoch": 6.429647343029609, + "grad_norm": 11.72077751159668, + "learning_rate": 1.232208555030923e-05, + "loss": 7.5083, + "step": 1578300 + }, + { + "epoch": 6.430054721052991, + "grad_norm": 11.613635063171387, + "learning_rate": 1.2300736435118291e-05, + "loss": 7.5003, + "step": 1578400 + }, + { + "epoch": 6.430462099076372, + "grad_norm": 6.428407669067383, + "learning_rate": 1.2279485869661065e-05, + "loss": 7.4788, + "step": 1578500 + }, + { + "epoch": 6.430869477099754, + "grad_norm": 6.941511631011963, + "learning_rate": 1.2258333854777984e-05, + "loss": 7.5334, + "step": 1578600 + }, + { + "epoch": 6.431276855123135, + "grad_norm": 21.483781814575195, + "learning_rate": 1.2237280391305018e-05, + "loss": 7.4977, + "step": 1578700 + }, + { + "epoch": 6.431684233146517, + "grad_norm": 19.6728572845459, + "learning_rate": 1.221632548007456e-05, + "loss": 7.5281, + "step": 1578800 + }, + { + "epoch": 6.4320916111698985, + "grad_norm": 9.46621322631836, + "learning_rate": 1.219546912191512e-05, + "loss": 7.5062, + "step": 1578900 + }, + { + "epoch": 6.432498989193279, + "grad_norm": 6.844732761383057, + "learning_rate": 1.2174711317651337e-05, + "loss": 7.4939, + "step": 1579000 + }, + { + "epoch": 6.432498989193279, + "eval_MaskedAccuracy": 0.5135044825236108, + "eval_loss": 1.5933804512023926, + "eval_runtime": 160.879, + "eval_samples_per_second": 394.557, + "eval_steps_per_second": 1.542, + "step": 1579000 + }, + { + "epoch": 6.432906367216661, + "grad_norm": 19.491687774658203, + "learning_rate": 1.2154052068103675e-05, + "loss": 7.5178, + "step": 1579100 + }, + { + "epoch": 6.433313745240042, + "grad_norm": 8.367425918579102, + "learning_rate": 1.2133491374089022e-05, + "loss": 7.518, + "step": 1579200 + }, + { + "epoch": 6.433721123263424, + "grad_norm": 8.211845397949219, + "learning_rate": 1.2113029236420363e-05, + "loss": 7.5384, + "step": 1579300 + }, + { + "epoch": 6.434128501286805, + "grad_norm": 8.700518608093262, + "learning_rate": 1.2092665655906547e-05, + "loss": 7.5014, + "step": 1579400 + }, + { + "epoch": 6.434535879310187, + "grad_norm": 15.144051551818848, + "learning_rate": 1.2072400633352532e-05, + "loss": 7.5121, + "step": 1579500 + }, + { + "epoch": 6.434943257333568, + "grad_norm": 12.884886741638184, + "learning_rate": 1.2052234169559683e-05, + "loss": 7.5048, + "step": 1579600 + }, + { + "epoch": 6.43535063535695, + "grad_norm": 2.9523839950561523, + "learning_rate": 1.2032166265325214e-05, + "loss": 7.4923, + "step": 1579700 + }, + { + "epoch": 6.435758013380331, + "grad_norm": 10.289582252502441, + "learning_rate": 1.2012196921442453e-05, + "loss": 7.5141, + "step": 1579800 + }, + { + "epoch": 6.436165391403713, + "grad_norm": 12.150877952575684, + "learning_rate": 1.1992326138701138e-05, + "loss": 7.4757, + "step": 1579900 + }, + { + "epoch": 6.436572769427094, + "grad_norm": 7.628262042999268, + "learning_rate": 1.1972553917886819e-05, + "loss": 7.5078, + "step": 1580000 + }, + { + "epoch": 6.436572769427094, + "eval_MaskedAccuracy": 0.5136051176975212, + "eval_loss": 1.576341152191162, + "eval_runtime": 192.0348, + "eval_samples_per_second": 330.544, + "eval_steps_per_second": 1.291, + "step": 1580000 + }, + { + "epoch": 6.436980147450476, + "grad_norm": 10.1820707321167, + "learning_rate": 1.1952880259780914e-05, + "loss": 7.4935, + "step": 1580100 + }, + { + "epoch": 6.437387525473857, + "grad_norm": 5.5103044509887695, + "learning_rate": 1.1933305165161531e-05, + "loss": 7.5113, + "step": 1580200 + }, + { + "epoch": 6.437794903497238, + "grad_norm": 3.9326303005218506, + "learning_rate": 1.1913828634802331e-05, + "loss": 7.5108, + "step": 1580300 + }, + { + "epoch": 6.43820228152062, + "grad_norm": 12.331070899963379, + "learning_rate": 1.1894450669473385e-05, + "loss": 7.5332, + "step": 1580400 + }, + { + "epoch": 6.438609659544001, + "grad_norm": 8.85410213470459, + "learning_rate": 1.1875171269940862e-05, + "loss": 7.5079, + "step": 1580500 + }, + { + "epoch": 6.439017037567383, + "grad_norm": 9.459521293640137, + "learning_rate": 1.1855990436967075e-05, + "loss": 7.4942, + "step": 1580600 + }, + { + "epoch": 6.439424415590764, + "grad_norm": 7.890523910522461, + "learning_rate": 1.1836908171310452e-05, + "loss": 7.4848, + "step": 1580700 + }, + { + "epoch": 6.439831793614146, + "grad_norm": 25.802410125732422, + "learning_rate": 1.1817924473724993e-05, + "loss": 7.5027, + "step": 1580800 + }, + { + "epoch": 6.440239171637527, + "grad_norm": 5.142045974731445, + "learning_rate": 1.1799039344961371e-05, + "loss": 7.5088, + "step": 1580900 + }, + { + "epoch": 6.440646549660909, + "grad_norm": 7.51984167098999, + "learning_rate": 1.1780252785766629e-05, + "loss": 7.5149, + "step": 1581000 + }, + { + "epoch": 6.440646549660909, + "eval_MaskedAccuracy": 0.5139996727286729, + "eval_loss": 1.575803518295288, + "eval_runtime": 161.763, + "eval_samples_per_second": 392.401, + "eval_steps_per_second": 1.533, + "step": 1581000 + }, + { + "epoch": 6.44105392768429, + "grad_norm": 3.9571900367736816, + "learning_rate": 1.1761564796882889e-05, + "loss": 7.5341, + "step": 1581100 + }, + { + "epoch": 6.441461305707672, + "grad_norm": 2.9896321296691895, + "learning_rate": 1.1742975379049166e-05, + "loss": 7.5011, + "step": 1581200 + }, + { + "epoch": 6.4418686837310535, + "grad_norm": 10.502874374389648, + "learning_rate": 1.1724484533000904e-05, + "loss": 7.4909, + "step": 1581300 + }, + { + "epoch": 6.442276061754434, + "grad_norm": 10.83086109161377, + "learning_rate": 1.1706092259468554e-05, + "loss": 7.504, + "step": 1581400 + }, + { + "epoch": 6.442683439777816, + "grad_norm": 2.590233564376831, + "learning_rate": 1.1687798559179252e-05, + "loss": 7.549, + "step": 1581500 + }, + { + "epoch": 6.443090817801197, + "grad_norm": 4.046431064605713, + "learning_rate": 1.1669603432856774e-05, + "loss": 7.5088, + "step": 1581600 + }, + { + "epoch": 6.443498195824579, + "grad_norm": 9.649344444274902, + "learning_rate": 1.165150688121996e-05, + "loss": 7.5432, + "step": 1581700 + }, + { + "epoch": 6.44390557384796, + "grad_norm": 3.338127374649048, + "learning_rate": 1.1633508904984299e-05, + "loss": 7.5146, + "step": 1581800 + }, + { + "epoch": 6.444312951871342, + "grad_norm": 18.291717529296875, + "learning_rate": 1.1615609504861685e-05, + "loss": 7.5071, + "step": 1581900 + }, + { + "epoch": 6.444720329894723, + "grad_norm": 3.2955408096313477, + "learning_rate": 1.1597808681559567e-05, + "loss": 7.5291, + "step": 1582000 + }, + { + "epoch": 6.444720329894723, + "eval_MaskedAccuracy": 0.5137903031881628, + "eval_loss": 1.5809904336929321, + "eval_runtime": 159.7544, + "eval_samples_per_second": 397.335, + "eval_steps_per_second": 1.552, + "step": 1582000 + }, + { + "epoch": 6.445127707918105, + "grad_norm": 11.436583518981934, + "learning_rate": 1.1580106435781803e-05, + "loss": 7.5191, + "step": 1582100 + }, + { + "epoch": 6.445535085941486, + "grad_norm": 5.290562629699707, + "learning_rate": 1.156250276822811e-05, + "loss": 7.5033, + "step": 1582200 + }, + { + "epoch": 6.445942463964868, + "grad_norm": 3.2766430377960205, + "learning_rate": 1.1544997679594573e-05, + "loss": 7.4911, + "step": 1582300 + }, + { + "epoch": 6.4463498419882495, + "grad_norm": 7.670306205749512, + "learning_rate": 1.1527591170573147e-05, + "loss": 7.5289, + "step": 1582400 + }, + { + "epoch": 6.446757220011631, + "grad_norm": 6.270802974700928, + "learning_rate": 1.1510283241851915e-05, + "loss": 7.5287, + "step": 1582500 + }, + { + "epoch": 6.4471645980350125, + "grad_norm": 4.065499305725098, + "learning_rate": 1.1493073894115615e-05, + "loss": 7.5492, + "step": 1582600 + }, + { + "epoch": 6.447571976058393, + "grad_norm": 7.180081844329834, + "learning_rate": 1.1475963128043996e-05, + "loss": 7.5232, + "step": 1582700 + }, + { + "epoch": 6.447979354081775, + "grad_norm": 5.466172218322754, + "learning_rate": 1.1458950944314066e-05, + "loss": 7.5087, + "step": 1582800 + }, + { + "epoch": 6.448386732105156, + "grad_norm": 3.34328031539917, + "learning_rate": 1.1442037343598078e-05, + "loss": 7.495, + "step": 1582900 + }, + { + "epoch": 6.448794110128538, + "grad_norm": 8.226127624511719, + "learning_rate": 1.1425222326565008e-05, + "loss": 7.4944, + "step": 1583000 + }, + { + "epoch": 6.448794110128538, + "eval_MaskedAccuracy": 0.5132201020882595, + "eval_loss": 1.5804826021194458, + "eval_runtime": 237.9313, + "eval_samples_per_second": 266.783, + "eval_steps_per_second": 1.042, + "step": 1583000 + }, + { + "epoch": 6.449201488151919, + "grad_norm": 12.55417251586914, + "learning_rate": 1.1408505893879386e-05, + "loss": 7.5197, + "step": 1583100 + }, + { + "epoch": 6.449608866175301, + "grad_norm": 10.142425537109375, + "learning_rate": 1.139188804620213e-05, + "loss": 7.535, + "step": 1583200 + }, + { + "epoch": 6.450016244198682, + "grad_norm": 6.337942600250244, + "learning_rate": 1.1375368784190545e-05, + "loss": 7.5153, + "step": 1583300 + }, + { + "epoch": 6.450423622222064, + "grad_norm": 13.39480972290039, + "learning_rate": 1.1358948108497264e-05, + "loss": 7.5049, + "step": 1583400 + }, + { + "epoch": 6.450831000245445, + "grad_norm": 6.626671314239502, + "learning_rate": 1.1342626019771572e-05, + "loss": 7.5002, + "step": 1583500 + }, + { + "epoch": 6.451238378268827, + "grad_norm": 2.58742356300354, + "learning_rate": 1.1326402518659157e-05, + "loss": 7.5136, + "step": 1583600 + }, + { + "epoch": 6.4516457562922085, + "grad_norm": 5.8868184089660645, + "learning_rate": 1.1310277605800997e-05, + "loss": 7.5153, + "step": 1583700 + }, + { + "epoch": 6.45205313431559, + "grad_norm": 8.069238662719727, + "learning_rate": 1.1294251281834755e-05, + "loss": 7.5035, + "step": 1583800 + }, + { + "epoch": 6.4524605123389716, + "grad_norm": 5.264612197875977, + "learning_rate": 1.12783235473942e-05, + "loss": 7.5259, + "step": 1583900 + }, + { + "epoch": 6.452867890362352, + "grad_norm": 16.8353271484375, + "learning_rate": 1.1262494403108685e-05, + "loss": 7.4895, + "step": 1584000 + }, + { + "epoch": 6.452867890362352, + "eval_MaskedAccuracy": 0.5131780757268724, + "eval_loss": 1.5877623558044434, + "eval_runtime": 168.9802, + "eval_samples_per_second": 375.642, + "eval_steps_per_second": 1.468, + "step": 1584000 + }, + { + "epoch": 6.453275268385734, + "grad_norm": 9.595535278320312, + "learning_rate": 1.124676384960422e-05, + "loss": 7.4899, + "step": 1584100 + }, + { + "epoch": 6.453682646409115, + "grad_norm": 3.171947479248047, + "learning_rate": 1.1231131887502687e-05, + "loss": 7.5199, + "step": 1584200 + }, + { + "epoch": 6.454090024432497, + "grad_norm": 5.0405449867248535, + "learning_rate": 1.121559851742233e-05, + "loss": 7.4804, + "step": 1584300 + }, + { + "epoch": 6.454497402455878, + "grad_norm": 2.5416932106018066, + "learning_rate": 1.1200163739977016e-05, + "loss": 7.5072, + "step": 1584400 + }, + { + "epoch": 6.45490478047926, + "grad_norm": 6.3112568855285645, + "learning_rate": 1.1184827555776944e-05, + "loss": 7.5124, + "step": 1584500 + }, + { + "epoch": 6.455312158502641, + "grad_norm": 5.8980631828308105, + "learning_rate": 1.116958996542848e-05, + "loss": 7.4989, + "step": 1584600 + }, + { + "epoch": 6.455719536526023, + "grad_norm": 14.551684379577637, + "learning_rate": 1.1154450969533821e-05, + "loss": 7.5118, + "step": 1584700 + }, + { + "epoch": 6.4561269145494045, + "grad_norm": 4.331643104553223, + "learning_rate": 1.1139410568692128e-05, + "loss": 7.5193, + "step": 1584800 + }, + { + "epoch": 6.456534292572786, + "grad_norm": 2.8261187076568604, + "learning_rate": 1.1124468763497561e-05, + "loss": 7.5006, + "step": 1584900 + }, + { + "epoch": 6.4569416705961675, + "grad_norm": 8.621109962463379, + "learning_rate": 1.1109625554540683e-05, + "loss": 7.5418, + "step": 1585000 + }, + { + "epoch": 6.4569416705961675, + "eval_MaskedAccuracy": 0.5142094645817232, + "eval_loss": 1.581610918045044, + "eval_runtime": 175.0879, + "eval_samples_per_second": 362.538, + "eval_steps_per_second": 1.416, + "step": 1585000 + }, + { + "epoch": 6.457349048619549, + "grad_norm": 7.384837627410889, + "learning_rate": 1.1094880942408735e-05, + "loss": 7.5623, + "step": 1585100 + }, + { + "epoch": 6.45775642664293, + "grad_norm": 7.93504524230957, + "learning_rate": 1.1080234927684532e-05, + "loss": 7.5121, + "step": 1585200 + }, + { + "epoch": 6.458163804666311, + "grad_norm": 8.416412353515625, + "learning_rate": 1.1065687510947014e-05, + "loss": 7.5209, + "step": 1585300 + }, + { + "epoch": 6.458571182689693, + "grad_norm": 8.54407024383545, + "learning_rate": 1.1051238692771492e-05, + "loss": 7.5198, + "step": 1585400 + }, + { + "epoch": 6.458978560713074, + "grad_norm": 5.531566143035889, + "learning_rate": 1.1036888473729149e-05, + "loss": 7.5146, + "step": 1585500 + }, + { + "epoch": 6.459385938736456, + "grad_norm": 4.485382080078125, + "learning_rate": 1.102263685438701e-05, + "loss": 7.5261, + "step": 1585600 + }, + { + "epoch": 6.459793316759837, + "grad_norm": 3.3528404235839844, + "learning_rate": 1.1008483835308761e-05, + "loss": 7.5484, + "step": 1585700 + }, + { + "epoch": 6.460200694783219, + "grad_norm": 2.1114022731781006, + "learning_rate": 1.099442941705395e-05, + "loss": 7.5096, + "step": 1585800 + }, + { + "epoch": 6.4606080728066, + "grad_norm": 3.5055644512176514, + "learning_rate": 1.0980473600178517e-05, + "loss": 7.5133, + "step": 1585900 + }, + { + "epoch": 6.461015450829982, + "grad_norm": 15.722739219665527, + "learning_rate": 1.0966616385233708e-05, + "loss": 7.5503, + "step": 1586000 + }, + { + "epoch": 6.461015450829982, + "eval_MaskedAccuracy": 0.51397272595115, + "eval_loss": 1.589124083518982, + "eval_runtime": 185.0763, + "eval_samples_per_second": 342.972, + "eval_steps_per_second": 1.34, + "step": 1586000 + }, + { + "epoch": 6.4614228288533635, + "grad_norm": 5.017574310302734, + "learning_rate": 1.0952857772767406e-05, + "loss": 7.5233, + "step": 1586100 + }, + { + "epoch": 6.461830206876745, + "grad_norm": 2.8360648155212402, + "learning_rate": 1.0939197763323929e-05, + "loss": 7.5366, + "step": 1586200 + }, + { + "epoch": 6.462237584900127, + "grad_norm": 2.794994831085205, + "learning_rate": 1.0925636357443144e-05, + "loss": 7.525, + "step": 1586300 + }, + { + "epoch": 6.462644962923507, + "grad_norm": 4.378689289093018, + "learning_rate": 1.0912173555661342e-05, + "loss": 7.4783, + "step": 1586400 + }, + { + "epoch": 6.463052340946889, + "grad_norm": 6.236534595489502, + "learning_rate": 1.089880935851034e-05, + "loss": 7.5189, + "step": 1586500 + }, + { + "epoch": 6.46345971897027, + "grad_norm": 3.1669552326202393, + "learning_rate": 1.088554376651895e-05, + "loss": 7.4782, + "step": 1586600 + }, + { + "epoch": 6.463867096993652, + "grad_norm": 2.9984211921691895, + "learning_rate": 1.087237678021153e-05, + "loss": 7.5291, + "step": 1586700 + }, + { + "epoch": 6.464274475017033, + "grad_norm": 20.27840805053711, + "learning_rate": 1.0859308400108288e-05, + "loss": 7.5295, + "step": 1586800 + }, + { + "epoch": 6.464681853040415, + "grad_norm": 2.6895344257354736, + "learning_rate": 1.0846338626726658e-05, + "loss": 7.5427, + "step": 1586900 + }, + { + "epoch": 6.465089231063796, + "grad_norm": 3.2413620948791504, + "learning_rate": 1.0833467460578534e-05, + "loss": 7.505, + "step": 1587000 + }, + { + "epoch": 6.465089231063796, + "eval_MaskedAccuracy": 0.5138568338712798, + "eval_loss": 1.5885602235794067, + "eval_runtime": 154.1563, + "eval_samples_per_second": 411.764, + "eval_steps_per_second": 1.609, + "step": 1587000 + }, + { + "epoch": 6.465496609087178, + "grad_norm": 4.166514873504639, + "learning_rate": 1.0820694902173316e-05, + "loss": 7.5256, + "step": 1587100 + }, + { + "epoch": 6.4659039871105595, + "grad_norm": 4.5649943351745605, + "learning_rate": 1.0808020952015985e-05, + "loss": 7.5255, + "step": 1587200 + }, + { + "epoch": 6.466311365133941, + "grad_norm": 10.997081756591797, + "learning_rate": 1.079544561060736e-05, + "loss": 7.4938, + "step": 1587300 + }, + { + "epoch": 6.4667187431573225, + "grad_norm": 4.487954139709473, + "learning_rate": 1.0782968878444657e-05, + "loss": 7.5526, + "step": 1587400 + }, + { + "epoch": 6.467126121180704, + "grad_norm": 15.784062385559082, + "learning_rate": 1.0770590756021208e-05, + "loss": 7.5212, + "step": 1587500 + }, + { + "epoch": 6.467533499204086, + "grad_norm": 2.7647440433502197, + "learning_rate": 1.0758311243826476e-05, + "loss": 7.5318, + "step": 1587600 + }, + { + "epoch": 6.467940877227466, + "grad_norm": 5.251222610473633, + "learning_rate": 1.0746130342345768e-05, + "loss": 7.4894, + "step": 1587700 + }, + { + "epoch": 6.468348255250848, + "grad_norm": 9.594130516052246, + "learning_rate": 1.0734048052060783e-05, + "loss": 7.5199, + "step": 1587800 + }, + { + "epoch": 6.468755633274229, + "grad_norm": 3.82230544090271, + "learning_rate": 1.0722064373449081e-05, + "loss": 7.5112, + "step": 1587900 + }, + { + "epoch": 6.469163011297611, + "grad_norm": 8.367039680480957, + "learning_rate": 1.0710179306984607e-05, + "loss": 7.5348, + "step": 1588000 + }, + { + "epoch": 6.469163011297611, + "eval_MaskedAccuracy": 0.513640790322325, + "eval_loss": 1.5872994661331177, + "eval_runtime": 164.0861, + "eval_samples_per_second": 386.846, + "eval_steps_per_second": 1.511, + "step": 1588000 + }, + { + "epoch": 6.469570389320992, + "grad_norm": 3.6034107208251953, + "learning_rate": 1.0698392853136869e-05, + "loss": 7.5058, + "step": 1588100 + }, + { + "epoch": 6.469977767344374, + "grad_norm": 13.84794807434082, + "learning_rate": 1.0686705012372343e-05, + "loss": 7.5032, + "step": 1588200 + }, + { + "epoch": 6.470385145367755, + "grad_norm": 4.37919282913208, + "learning_rate": 1.0675115785153065e-05, + "loss": 7.5362, + "step": 1588300 + }, + { + "epoch": 6.470792523391137, + "grad_norm": 3.441962242126465, + "learning_rate": 1.0663625171936911e-05, + "loss": 7.525, + "step": 1588400 + }, + { + "epoch": 6.4711999014145185, + "grad_norm": 8.499149322509766, + "learning_rate": 1.0652233173178164e-05, + "loss": 7.5046, + "step": 1588500 + }, + { + "epoch": 6.4716072794379, + "grad_norm": 21.030723571777344, + "learning_rate": 1.0640939789327512e-05, + "loss": 7.476, + "step": 1588600 + }, + { + "epoch": 6.472014657461282, + "grad_norm": 4.852090835571289, + "learning_rate": 1.0629745020831204e-05, + "loss": 7.5316, + "step": 1588700 + }, + { + "epoch": 6.472422035484663, + "grad_norm": 3.8750147819519043, + "learning_rate": 1.0618648868131876e-05, + "loss": 7.5006, + "step": 1588800 + }, + { + "epoch": 6.472829413508045, + "grad_norm": 7.608121871948242, + "learning_rate": 1.0607651331668303e-05, + "loss": 7.5461, + "step": 1588900 + }, + { + "epoch": 6.473236791531425, + "grad_norm": 3.698364496231079, + "learning_rate": 1.0596752411875099e-05, + "loss": 7.5182, + "step": 1589000 + }, + { + "epoch": 6.473236791531425, + "eval_MaskedAccuracy": 0.5138018319626427, + "eval_loss": 1.5852344036102295, + "eval_runtime": 157.879, + "eval_samples_per_second": 402.055, + "eval_steps_per_second": 1.571, + "step": 1589000 + }, + { + "epoch": 6.473644169554807, + "grad_norm": 5.7360968589782715, + "learning_rate": 1.0585952109183555e-05, + "loss": 7.5142, + "step": 1589100 + }, + { + "epoch": 6.474051547578188, + "grad_norm": 2.508575916290283, + "learning_rate": 1.0575250424020241e-05, + "loss": 7.4523, + "step": 1589200 + }, + { + "epoch": 6.47445892560157, + "grad_norm": 2.6369049549102783, + "learning_rate": 1.0564647356808142e-05, + "loss": 7.5342, + "step": 1589300 + }, + { + "epoch": 6.474866303624951, + "grad_norm": 2.3004262447357178, + "learning_rate": 1.0554142907967208e-05, + "loss": 7.5372, + "step": 1589400 + }, + { + "epoch": 6.475273681648333, + "grad_norm": 4.236722946166992, + "learning_rate": 1.054373707791181e-05, + "loss": 7.5406, + "step": 1589500 + }, + { + "epoch": 6.4756810596717145, + "grad_norm": 2.2029640674591064, + "learning_rate": 1.0533429867053856e-05, + "loss": 7.5114, + "step": 1589600 + }, + { + "epoch": 6.476088437695096, + "grad_norm": 4.125149726867676, + "learning_rate": 1.0523221275801079e-05, + "loss": 7.5253, + "step": 1589700 + }, + { + "epoch": 6.4764958157184775, + "grad_norm": 3.957239866256714, + "learning_rate": 1.0513111304556538e-05, + "loss": 7.513, + "step": 1589800 + }, + { + "epoch": 6.476903193741859, + "grad_norm": 9.370373725891113, + "learning_rate": 1.0503099953719934e-05, + "loss": 7.4979, + "step": 1589900 + }, + { + "epoch": 6.477310571765241, + "grad_norm": 3.218662977218628, + "learning_rate": 1.0493187223687658e-05, + "loss": 7.5451, + "step": 1590000 + }, + { + "epoch": 6.477310571765241, + "eval_MaskedAccuracy": 0.5137643071789635, + "eval_loss": 1.5946288108825684, + "eval_runtime": 168.1511, + "eval_samples_per_second": 377.494, + "eval_steps_per_second": 1.475, + "step": 1590000 + }, + { + "epoch": 6.477717949788622, + "grad_norm": 5.178323745727539, + "learning_rate": 1.0483373114851103e-05, + "loss": 7.4875, + "step": 1590100 + }, + { + "epoch": 6.478125327812003, + "grad_norm": 3.749709129333496, + "learning_rate": 1.047365762759836e-05, + "loss": 7.5125, + "step": 1590200 + }, + { + "epoch": 6.478532705835384, + "grad_norm": 3.386944532394409, + "learning_rate": 1.0464040762313626e-05, + "loss": 7.5194, + "step": 1590300 + }, + { + "epoch": 6.478940083858766, + "grad_norm": 5.481697082519531, + "learning_rate": 1.0454522519377218e-05, + "loss": 7.5216, + "step": 1590400 + }, + { + "epoch": 6.479347461882147, + "grad_norm": 3.3972976207733154, + "learning_rate": 1.0445102899165296e-05, + "loss": 7.5015, + "step": 1590500 + }, + { + "epoch": 6.479754839905529, + "grad_norm": 4.845938682556152, + "learning_rate": 1.043578190205017e-05, + "loss": 7.5048, + "step": 1590600 + }, + { + "epoch": 6.4801622179289105, + "grad_norm": 3.8213133811950684, + "learning_rate": 1.0426559528400503e-05, + "loss": 7.5027, + "step": 1590700 + }, + { + "epoch": 6.480569595952292, + "grad_norm": 9.342162132263184, + "learning_rate": 1.0417435778580849e-05, + "loss": 7.4939, + "step": 1590800 + }, + { + "epoch": 6.4809769739756735, + "grad_norm": 5.004269599914551, + "learning_rate": 1.0408410652951845e-05, + "loss": 7.5404, + "step": 1590900 + }, + { + "epoch": 6.481384351999055, + "grad_norm": 10.357196807861328, + "learning_rate": 1.0399484151870282e-05, + "loss": 7.5, + "step": 1591000 + }, + { + "epoch": 6.481384351999055, + "eval_MaskedAccuracy": 0.5139598372954575, + "eval_loss": 1.5865259170532227, + "eval_runtime": 170.4569, + "eval_samples_per_second": 372.387, + "eval_steps_per_second": 1.455, + "step": 1591000 + }, + { + "epoch": 6.481791730022437, + "grad_norm": 3.1712253093719482, + "learning_rate": 1.0390656275689327e-05, + "loss": 7.4713, + "step": 1591100 + }, + { + "epoch": 6.482199108045818, + "grad_norm": 4.492165565490723, + "learning_rate": 1.0381927024758006e-05, + "loss": 7.5415, + "step": 1591200 + }, + { + "epoch": 6.4826064860692, + "grad_norm": 3.1798746585845947, + "learning_rate": 1.0373296399420902e-05, + "loss": 7.5001, + "step": 1591300 + }, + { + "epoch": 6.48301386409258, + "grad_norm": 2.6678378582000732, + "learning_rate": 1.0364764400019843e-05, + "loss": 7.5141, + "step": 1591400 + }, + { + "epoch": 6.483421242115962, + "grad_norm": 2.719944477081299, + "learning_rate": 1.0356331026891933e-05, + "loss": 7.5104, + "step": 1591500 + }, + { + "epoch": 6.483828620139343, + "grad_norm": 5.455245018005371, + "learning_rate": 1.0347996280370401e-05, + "loss": 7.5199, + "step": 1591600 + }, + { + "epoch": 6.484235998162725, + "grad_norm": 2.79030704498291, + "learning_rate": 1.0339760160784878e-05, + "loss": 7.514, + "step": 1591700 + }, + { + "epoch": 6.484643376186106, + "grad_norm": 6.094854354858398, + "learning_rate": 1.0331622668461119e-05, + "loss": 7.5171, + "step": 1591800 + }, + { + "epoch": 6.485050754209488, + "grad_norm": 9.372957229614258, + "learning_rate": 1.0323583803720725e-05, + "loss": 7.5203, + "step": 1591900 + }, + { + "epoch": 6.4854581322328695, + "grad_norm": 2.224552869796753, + "learning_rate": 1.0315643566881688e-05, + "loss": 7.5276, + "step": 1592000 + }, + { + "epoch": 6.4854581322328695, + "eval_MaskedAccuracy": 0.5127872637926909, + "eval_loss": 1.585646152496338, + "eval_runtime": 157.3491, + "eval_samples_per_second": 403.409, + "eval_steps_per_second": 1.576, + "step": 1592000 + }, + { + "epoch": 6.485865510256251, + "grad_norm": 10.867362022399902, + "learning_rate": 1.030780195825758e-05, + "loss": 7.5037, + "step": 1592100 + }, + { + "epoch": 6.4862728882796326, + "grad_norm": 3.9829726219177246, + "learning_rate": 1.0300058978158635e-05, + "loss": 7.5175, + "step": 1592200 + }, + { + "epoch": 6.486680266303014, + "grad_norm": 13.730901718139648, + "learning_rate": 1.0292414626890936e-05, + "loss": 7.5117, + "step": 1592300 + }, + { + "epoch": 6.487087644326396, + "grad_norm": 3.234405040740967, + "learning_rate": 1.0284868904756694e-05, + "loss": 7.4918, + "step": 1592400 + }, + { + "epoch": 6.487495022349777, + "grad_norm": 6.621614933013916, + "learning_rate": 1.0277421812054517e-05, + "loss": 7.4883, + "step": 1592500 + }, + { + "epoch": 6.487902400373159, + "grad_norm": 2.6514699459075928, + "learning_rate": 1.0270073349078306e-05, + "loss": 7.5101, + "step": 1592600 + }, + { + "epoch": 6.488309778396539, + "grad_norm": 2.6967132091522217, + "learning_rate": 1.0262823516118905e-05, + "loss": 7.5188, + "step": 1592700 + }, + { + "epoch": 6.488717156419921, + "grad_norm": 5.549939155578613, + "learning_rate": 1.0255672313462732e-05, + "loss": 7.5004, + "step": 1592800 + }, + { + "epoch": 6.489124534443302, + "grad_norm": 2.9948530197143555, + "learning_rate": 1.0248619741392903e-05, + "loss": 7.4946, + "step": 1592900 + }, + { + "epoch": 6.489531912466684, + "grad_norm": 7.538805961608887, + "learning_rate": 1.0241665800188049e-05, + "loss": 7.5029, + "step": 1593000 + }, + { + "epoch": 6.489531912466684, + "eval_MaskedAccuracy": 0.5136503409021163, + "eval_loss": 1.5819203853607178, + "eval_runtime": 152.7341, + "eval_samples_per_second": 415.598, + "eval_steps_per_second": 1.624, + "step": 1593000 + }, + { + "epoch": 6.4899392904900655, + "grad_norm": 3.414398193359375, + "learning_rate": 1.0234810490122685e-05, + "loss": 7.5272, + "step": 1593100 + }, + { + "epoch": 6.490346668513447, + "grad_norm": 5.589507579803467, + "learning_rate": 1.0228053811468562e-05, + "loss": 7.5081, + "step": 1593200 + }, + { + "epoch": 6.4907540465368285, + "grad_norm": 3.89789080619812, + "learning_rate": 1.0221395764492432e-05, + "loss": 7.515, + "step": 1593300 + }, + { + "epoch": 6.49116142456021, + "grad_norm": 10.572711944580078, + "learning_rate": 1.0214836349457435e-05, + "loss": 7.5403, + "step": 1593400 + }, + { + "epoch": 6.491568802583592, + "grad_norm": 16.077180862426758, + "learning_rate": 1.0208375566622848e-05, + "loss": 7.4979, + "step": 1593500 + }, + { + "epoch": 6.491976180606973, + "grad_norm": 9.048277854919434, + "learning_rate": 1.0202013416244615e-05, + "loss": 7.5185, + "step": 1593600 + }, + { + "epoch": 6.492383558630355, + "grad_norm": 4.375500202178955, + "learning_rate": 1.0195749898573698e-05, + "loss": 7.5342, + "step": 1593700 + }, + { + "epoch": 6.492790936653736, + "grad_norm": 4.217859745025635, + "learning_rate": 1.0189585013857742e-05, + "loss": 7.5132, + "step": 1593800 + }, + { + "epoch": 6.493198314677118, + "grad_norm": 5.354471206665039, + "learning_rate": 1.0183518762341066e-05, + "loss": 7.4876, + "step": 1593900 + }, + { + "epoch": 6.493605692700498, + "grad_norm": 3.2286570072174072, + "learning_rate": 1.0177551144262986e-05, + "loss": 7.5053, + "step": 1594000 + }, + { + "epoch": 6.493605692700498, + "eval_MaskedAccuracy": 0.5137151254428228, + "eval_loss": 1.5947133302688599, + "eval_runtime": 151.9626, + "eval_samples_per_second": 417.708, + "eval_steps_per_second": 1.632, + "step": 1594000 + }, + { + "epoch": 6.49401307072388, + "grad_norm": 6.318943500518799, + "learning_rate": 1.0171682159859514e-05, + "loss": 7.529, + "step": 1594100 + }, + { + "epoch": 6.494420448747261, + "grad_norm": 3.0594611167907715, + "learning_rate": 1.0165911809362497e-05, + "loss": 7.5077, + "step": 1594200 + }, + { + "epoch": 6.494827826770643, + "grad_norm": 7.546759605407715, + "learning_rate": 1.0160240093000742e-05, + "loss": 7.5603, + "step": 1594300 + }, + { + "epoch": 6.4952352047940245, + "grad_norm": 3.6959216594696045, + "learning_rate": 1.0154667010997782e-05, + "loss": 7.5156, + "step": 1594400 + }, + { + "epoch": 6.495642582817406, + "grad_norm": 3.110666513442993, + "learning_rate": 1.0149192563574114e-05, + "loss": 7.5325, + "step": 1594500 + }, + { + "epoch": 6.496049960840788, + "grad_norm": 2.758005380630493, + "learning_rate": 1.014381675094635e-05, + "loss": 7.5314, + "step": 1594600 + }, + { + "epoch": 6.496457338864169, + "grad_norm": 4.565113544464111, + "learning_rate": 1.0138539573326957e-05, + "loss": 7.5346, + "step": 1594700 + }, + { + "epoch": 6.496864716887551, + "grad_norm": 6.778021812438965, + "learning_rate": 1.0133361030924785e-05, + "loss": 7.5082, + "step": 1594800 + }, + { + "epoch": 6.497272094910932, + "grad_norm": 6.886841773986816, + "learning_rate": 1.0128281123943979e-05, + "loss": 7.5191, + "step": 1594900 + }, + { + "epoch": 6.497679472934314, + "grad_norm": 4.827107906341553, + "learning_rate": 1.0123299852585925e-05, + "loss": 7.5382, + "step": 1595000 + }, + { + "epoch": 6.497679472934314, + "eval_MaskedAccuracy": 0.5138491514953356, + "eval_loss": 1.5851107835769653, + "eval_runtime": 172.8377, + "eval_samples_per_second": 367.258, + "eval_steps_per_second": 1.435, + "step": 1595000 + }, + { + "epoch": 6.498086850957695, + "grad_norm": 3.5091304779052734, + "learning_rate": 1.0118417217047295e-05, + "loss": 7.5279, + "step": 1595100 + }, + { + "epoch": 6.498494228981076, + "grad_norm": 3.0194764137268066, + "learning_rate": 1.011363321752116e-05, + "loss": 7.5539, + "step": 1595200 + }, + { + "epoch": 6.498901607004457, + "grad_norm": 4.65777063369751, + "learning_rate": 1.010894785419698e-05, + "loss": 7.5182, + "step": 1595300 + }, + { + "epoch": 6.499308985027839, + "grad_norm": 18.234819412231445, + "learning_rate": 1.0104361127259519e-05, + "loss": 7.5082, + "step": 1595400 + }, + { + "epoch": 6.4997163630512205, + "grad_norm": 3.018636465072632, + "learning_rate": 1.0099873036890212e-05, + "loss": 7.5009, + "step": 1595500 + }, + { + "epoch": 6.500123741074602, + "grad_norm": 3.6853644847869873, + "learning_rate": 1.009548358326662e-05, + "loss": 7.5047, + "step": 1595600 + }, + { + "epoch": 6.5005311190979835, + "grad_norm": 3.3983805179595947, + "learning_rate": 1.0091192766562143e-05, + "loss": 7.496, + "step": 1595700 + }, + { + "epoch": 6.500938497121365, + "grad_norm": 14.519161224365234, + "learning_rate": 1.0087000586946863e-05, + "loss": 7.5264, + "step": 1595800 + }, + { + "epoch": 6.501345875144747, + "grad_norm": 10.01481819152832, + "learning_rate": 1.008290704458587e-05, + "loss": 7.5127, + "step": 1595900 + }, + { + "epoch": 6.501753253168128, + "grad_norm": 5.591251850128174, + "learning_rate": 1.0078912139641481e-05, + "loss": 7.5212, + "step": 1596000 + }, + { + "epoch": 6.501753253168128, + "eval_MaskedAccuracy": 0.5138266559577939, + "eval_loss": 1.5807507038116455, + "eval_runtime": 155.3243, + "eval_samples_per_second": 408.667, + "eval_steps_per_second": 1.597, + "step": 1596000 + }, + { + "epoch": 6.50216063119151, + "grad_norm": 2.69199800491333, + "learning_rate": 1.007501587227132e-05, + "loss": 7.517, + "step": 1596100 + }, + { + "epoch": 6.502568009214891, + "grad_norm": 3.9832923412323, + "learning_rate": 1.0071218242629667e-05, + "loss": 7.5131, + "step": 1596200 + }, + { + "epoch": 6.502975387238273, + "grad_norm": 2.246173620223999, + "learning_rate": 1.0067519250866656e-05, + "loss": 7.5379, + "step": 1596300 + }, + { + "epoch": 6.503382765261653, + "grad_norm": 16.615299224853516, + "learning_rate": 1.0063918897128273e-05, + "loss": 7.5131, + "step": 1596400 + }, + { + "epoch": 6.503790143285035, + "grad_norm": 3.7326560020446777, + "learning_rate": 1.0060417181557158e-05, + "loss": 7.5267, + "step": 1596500 + }, + { + "epoch": 6.504197521308416, + "grad_norm": 12.541718482971191, + "learning_rate": 1.005701410429156e-05, + "loss": 7.5, + "step": 1596600 + }, + { + "epoch": 6.504604899331798, + "grad_norm": 3.1988630294799805, + "learning_rate": 1.0053709665466081e-05, + "loss": 7.5298, + "step": 1596700 + }, + { + "epoch": 6.5050122773551795, + "grad_norm": 2.962094783782959, + "learning_rate": 1.005050386521148e-05, + "loss": 7.5127, + "step": 1596800 + }, + { + "epoch": 6.505419655378561, + "grad_norm": 4.029211521148682, + "learning_rate": 1.004739670365433e-05, + "loss": 7.4984, + "step": 1596900 + }, + { + "epoch": 6.505827033401943, + "grad_norm": 12.007691383361816, + "learning_rate": 1.0044388180917625e-05, + "loss": 7.5096, + "step": 1597000 + }, + { + "epoch": 6.505827033401943, + "eval_MaskedAccuracy": 0.513536260494133, + "eval_loss": 1.5880944728851318, + "eval_runtime": 170.5549, + "eval_samples_per_second": 372.173, + "eval_steps_per_second": 1.454, + "step": 1597000 + }, + { + "epoch": 6.506234411425324, + "grad_norm": 3.1685709953308105, + "learning_rate": 1.004147829711993e-05, + "loss": 7.5305, + "step": 1597100 + }, + { + "epoch": 6.506641789448706, + "grad_norm": 3.2527971267700195, + "learning_rate": 1.0038667052376744e-05, + "loss": 7.4967, + "step": 1597200 + }, + { + "epoch": 6.507049167472087, + "grad_norm": 4.4246368408203125, + "learning_rate": 1.0035954446799152e-05, + "loss": 7.5019, + "step": 1597300 + }, + { + "epoch": 6.507456545495469, + "grad_norm": 6.173441410064697, + "learning_rate": 1.0033340480494348e-05, + "loss": 7.5042, + "step": 1597400 + }, + { + "epoch": 6.50786392351885, + "grad_norm": 10.910491943359375, + "learning_rate": 1.0030825153565378e-05, + "loss": 7.5051, + "step": 1597500 + }, + { + "epoch": 6.508271301542232, + "grad_norm": 6.449568748474121, + "learning_rate": 1.0028408466111964e-05, + "loss": 7.5127, + "step": 1597600 + }, + { + "epoch": 6.508678679565612, + "grad_norm": 10.72154712677002, + "learning_rate": 1.0026090418229666e-05, + "loss": 7.5266, + "step": 1597700 + }, + { + "epoch": 6.509086057588994, + "grad_norm": 3.6500210762023926, + "learning_rate": 1.0023871010010177e-05, + "loss": 7.5066, + "step": 1597800 + }, + { + "epoch": 6.5094934356123755, + "grad_norm": 7.185964584350586, + "learning_rate": 1.0021750241541024e-05, + "loss": 7.5379, + "step": 1597900 + }, + { + "epoch": 6.509900813635757, + "grad_norm": 3.6083433628082275, + "learning_rate": 1.0019728112906143e-05, + "loss": 7.5008, + "step": 1598000 + }, + { + "epoch": 6.509900813635757, + "eval_MaskedAccuracy": 0.5139002103389801, + "eval_loss": 1.58592689037323, + "eval_runtime": 166.8944, + "eval_samples_per_second": 380.336, + "eval_steps_per_second": 1.486, + "step": 1598000 + }, + { + "epoch": 6.5103081916591385, + "grad_norm": 13.888603210449219, + "learning_rate": 1.0017804624185296e-05, + "loss": 7.5217, + "step": 1598100 + }, + { + "epoch": 6.51071556968252, + "grad_norm": 13.128150939941406, + "learning_rate": 1.001597977545496e-05, + "loss": 7.4866, + "step": 1598200 + }, + { + "epoch": 6.511122947705902, + "grad_norm": 3.8094708919525146, + "learning_rate": 1.0014253566787136e-05, + "loss": 7.5137, + "step": 1598300 + }, + { + "epoch": 6.511530325729283, + "grad_norm": 3.978328227996826, + "learning_rate": 1.0012625998249971e-05, + "loss": 7.5076, + "step": 1598400 + }, + { + "epoch": 6.511937703752665, + "grad_norm": 7.387543678283691, + "learning_rate": 1.0011097069907731e-05, + "loss": 7.5153, + "step": 1598500 + }, + { + "epoch": 6.512345081776046, + "grad_norm": 3.7918782234191895, + "learning_rate": 1.0009666781821084e-05, + "loss": 7.5407, + "step": 1598600 + }, + { + "epoch": 6.512752459799428, + "grad_norm": 4.427021503448486, + "learning_rate": 1.0008335134046257e-05, + "loss": 7.5382, + "step": 1598700 + }, + { + "epoch": 6.513159837822809, + "grad_norm": 12.303589820861816, + "learning_rate": 1.0007102126636154e-05, + "loss": 7.4989, + "step": 1598800 + }, + { + "epoch": 6.513567215846191, + "grad_norm": 10.409649848937988, + "learning_rate": 1.0005967759639529e-05, + "loss": 7.5282, + "step": 1598900 + }, + { + "epoch": 6.5139745938695714, + "grad_norm": 7.53159761428833, + "learning_rate": 1.000493203310126e-05, + "loss": 7.4865, + "step": 1599000 + }, + { + "epoch": 6.5139745938695714, + "eval_MaskedAccuracy": 0.5138469709881768, + "eval_loss": 1.5772393941879272, + "eval_runtime": 162.0602, + "eval_samples_per_second": 391.682, + "eval_steps_per_second": 1.53, + "step": 1599000 + }, + { + "epoch": 6.514381971892953, + "grad_norm": 5.471723556518555, + "learning_rate": 1.0003994947062063e-05, + "loss": 7.5011, + "step": 1599100 + }, + { + "epoch": 6.5147893499163345, + "grad_norm": 2.725353956222534, + "learning_rate": 1.0003156501559048e-05, + "loss": 7.4923, + "step": 1599200 + }, + { + "epoch": 6.515196727939716, + "grad_norm": 3.509063959121704, + "learning_rate": 1.0002416696625745e-05, + "loss": 7.5084, + "step": 1599300 + }, + { + "epoch": 6.515604105963098, + "grad_norm": 5.341278076171875, + "learning_rate": 1.0001775532290948e-05, + "loss": 7.5007, + "step": 1599400 + }, + { + "epoch": 6.516011483986479, + "grad_norm": 3.1526851654052734, + "learning_rate": 1.000123300857988e-05, + "loss": 7.5017, + "step": 1599500 + }, + { + "epoch": 6.516418862009861, + "grad_norm": 9.90170669555664, + "learning_rate": 1.0000789125514688e-05, + "loss": 7.5098, + "step": 1599600 + }, + { + "epoch": 6.516826240033242, + "grad_norm": 3.8604884147644043, + "learning_rate": 1.0000443883112281e-05, + "loss": 7.5306, + "step": 1599700 + }, + { + "epoch": 6.517233618056624, + "grad_norm": 8.72430419921875, + "learning_rate": 1.00001972813865e-05, + "loss": 7.5348, + "step": 1599800 + }, + { + "epoch": 6.517640996080005, + "grad_norm": 8.924925804138184, + "learning_rate": 1.000004932034704e-05, + "loss": 7.5241, + "step": 1599900 + }, + { + "epoch": 6.518048374103387, + "grad_norm": 4.717263221740723, + "learning_rate": 1e-05, + "loss": 7.5391, + "step": 1600000 + }, + { + "epoch": 6.518048374103387, + "eval_MaskedAccuracy": 0.5132993789243037, + "eval_loss": 1.5939823389053345, + "eval_runtime": 242.1229, + "eval_samples_per_second": 262.164, + "eval_steps_per_second": 1.024, + "step": 1600000 + } + ], + "logging_steps": 100, + "max_steps": 1600000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.272945468178432e+20, + "train_batch_size": 400, + "trial_name": null, + "trial_params": null +}