diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,129652 @@ +{ + "best_metric": 0.8729588504245591, + "best_model_checkpoint": "/tmp/training/timesformer-base-finetuned-k400-timesformer-2-10-epochs-v_2sec_others-1735155772.870025/checkpoint-50776", + "epoch": 39.02494719168066, + "eval_steps": 500, + "global_step": 184630, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.416237881167741e-05, + "grad_norm": 20.10410499572754, + "learning_rate": 2.7081189405838704e-08, + "loss": 0.7079, + "step": 10 + }, + { + "epoch": 0.00010832475762335481, + "grad_norm": 12.395395278930664, + "learning_rate": 5.416237881167741e-08, + "loss": 0.7502, + "step": 20 + }, + { + "epoch": 0.00016248713643503222, + "grad_norm": 18.303630828857422, + "learning_rate": 8.124356821751612e-08, + "loss": 0.7339, + "step": 30 + }, + { + "epoch": 0.00021664951524670963, + "grad_norm": 16.716684341430664, + "learning_rate": 1.0832475762335482e-07, + "loss": 0.6762, + "step": 40 + }, + { + "epoch": 0.00027081189405838704, + "grad_norm": 13.273477554321289, + "learning_rate": 1.3540594702919353e-07, + "loss": 0.7062, + "step": 50 + }, + { + "epoch": 0.00032497427287006444, + "grad_norm": 10.835434913635254, + "learning_rate": 1.6248713643503224e-07, + "loss": 0.6229, + "step": 60 + }, + { + "epoch": 0.00037913665168174185, + "grad_norm": 8.233874320983887, + "learning_rate": 1.8956832584087095e-07, + "loss": 0.6267, + "step": 70 + }, + { + "epoch": 0.00043329903049341926, + "grad_norm": 16.0180721282959, + "learning_rate": 2.1664951524670963e-07, + "loss": 0.5836, + "step": 80 + }, + { + "epoch": 0.00048746140930509667, + "grad_norm": 10.363085746765137, + "learning_rate": 2.4373070465254834e-07, + "loss": 0.5214, + "step": 90 + }, + { + "epoch": 0.0005416237881167741, + "grad_norm": 10.672710418701172, + "learning_rate": 2.7081189405838706e-07, + "loss": 0.561, + "step": 100 + }, + { + "epoch": 0.0005957861669284515, + "grad_norm": 10.219686508178711, + "learning_rate": 2.9789308346422577e-07, + "loss": 0.4567, + "step": 110 + }, + { + "epoch": 0.0006499485457401289, + "grad_norm": 9.160877227783203, + "learning_rate": 3.249742728700645e-07, + "loss": 0.4479, + "step": 120 + }, + { + "epoch": 0.0007041109245518063, + "grad_norm": 10.465577125549316, + "learning_rate": 3.5205546227590314e-07, + "loss": 0.408, + "step": 130 + }, + { + "epoch": 0.0007582733033634837, + "grad_norm": 8.945135116577148, + "learning_rate": 3.791366516817419e-07, + "loss": 0.3772, + "step": 140 + }, + { + "epoch": 0.0008124356821751611, + "grad_norm": 7.2595367431640625, + "learning_rate": 4.062178410875806e-07, + "loss": 0.4109, + "step": 150 + }, + { + "epoch": 0.0008665980609868385, + "grad_norm": 4.461381912231445, + "learning_rate": 4.3329903049341927e-07, + "loss": 0.4839, + "step": 160 + }, + { + "epoch": 0.0009207604397985159, + "grad_norm": 4.287492275238037, + "learning_rate": 4.6038021989925803e-07, + "loss": 0.4121, + "step": 170 + }, + { + "epoch": 0.0009749228186101933, + "grad_norm": 5.897005558013916, + "learning_rate": 4.874614093050967e-07, + "loss": 0.3397, + "step": 180 + }, + { + "epoch": 0.0010290851974218708, + "grad_norm": 5.811868190765381, + "learning_rate": 5.145425987109355e-07, + "loss": 0.3653, + "step": 190 + }, + { + "epoch": 0.0010832475762335481, + "grad_norm": 5.0204315185546875, + "learning_rate": 5.416237881167741e-07, + "loss": 0.3293, + "step": 
200 + }, + { + "epoch": 0.0011374099550452257, + "grad_norm": 3.7267143726348877, + "learning_rate": 5.687049775226128e-07, + "loss": 0.2977, + "step": 210 + }, + { + "epoch": 0.001191572333856903, + "grad_norm": 3.854102611541748, + "learning_rate": 5.957861669284515e-07, + "loss": 0.3031, + "step": 220 + }, + { + "epoch": 0.0012457347126685805, + "grad_norm": 4.71415901184082, + "learning_rate": 6.228673563342903e-07, + "loss": 0.2406, + "step": 230 + }, + { + "epoch": 0.0012998970914802578, + "grad_norm": 2.8576502799987793, + "learning_rate": 6.49948545740129e-07, + "loss": 0.3423, + "step": 240 + }, + { + "epoch": 0.0013540594702919353, + "grad_norm": 4.175868511199951, + "learning_rate": 6.770297351459676e-07, + "loss": 0.3983, + "step": 250 + }, + { + "epoch": 0.0014082218491036126, + "grad_norm": 5.465442657470703, + "learning_rate": 7.041109245518063e-07, + "loss": 0.3079, + "step": 260 + }, + { + "epoch": 0.00146238422791529, + "grad_norm": 3.185835123062134, + "learning_rate": 7.31192113957645e-07, + "loss": 0.3912, + "step": 270 + }, + { + "epoch": 0.0015165466067269674, + "grad_norm": 5.0673136711120605, + "learning_rate": 7.582733033634838e-07, + "loss": 0.4281, + "step": 280 + }, + { + "epoch": 0.001570708985538645, + "grad_norm": 8.75891399383545, + "learning_rate": 7.853544927693226e-07, + "loss": 0.2525, + "step": 290 + }, + { + "epoch": 0.0016248713643503222, + "grad_norm": 2.9567043781280518, + "learning_rate": 8.124356821751612e-07, + "loss": 0.2774, + "step": 300 + }, + { + "epoch": 0.0016790337431619997, + "grad_norm": 5.7689948081970215, + "learning_rate": 8.395168715809998e-07, + "loss": 0.3633, + "step": 310 + }, + { + "epoch": 0.001733196121973677, + "grad_norm": 4.984217166900635, + "learning_rate": 8.665980609868385e-07, + "loss": 0.2346, + "step": 320 + }, + { + "epoch": 0.0017873585007853545, + "grad_norm": 2.781419038772583, + "learning_rate": 8.936792503926773e-07, + "loss": 0.3225, + "step": 330 + }, + { + "epoch": 0.0018415208795970318, + "grad_norm": 5.357931137084961, + "learning_rate": 9.207604397985161e-07, + "loss": 0.302, + "step": 340 + }, + { + "epoch": 0.0018956832584087094, + "grad_norm": 4.991815567016602, + "learning_rate": 9.478416292043547e-07, + "loss": 0.2639, + "step": 350 + }, + { + "epoch": 0.0019498456372203867, + "grad_norm": 15.28780746459961, + "learning_rate": 9.749228186101934e-07, + "loss": 0.3252, + "step": 360 + }, + { + "epoch": 0.002004008016032064, + "grad_norm": 3.467195749282837, + "learning_rate": 1.002004008016032e-06, + "loss": 0.2975, + "step": 370 + }, + { + "epoch": 0.0020581703948437417, + "grad_norm": 2.50441837310791, + "learning_rate": 1.029085197421871e-06, + "loss": 0.373, + "step": 380 + }, + { + "epoch": 0.002112332773655419, + "grad_norm": 2.72977614402771, + "learning_rate": 1.0561663868277096e-06, + "loss": 0.2565, + "step": 390 + }, + { + "epoch": 0.0021664951524670963, + "grad_norm": 11.968005180358887, + "learning_rate": 1.0832475762335482e-06, + "loss": 0.3352, + "step": 400 + }, + { + "epoch": 0.0022206575312787736, + "grad_norm": 4.734884738922119, + "learning_rate": 1.1103287656393869e-06, + "loss": 0.3146, + "step": 410 + }, + { + "epoch": 0.0022748199100904513, + "grad_norm": 6.225593566894531, + "learning_rate": 1.1374099550452255e-06, + "loss": 0.4213, + "step": 420 + }, + { + "epoch": 0.0023289822889021286, + "grad_norm": 3.426356792449951, + "learning_rate": 1.1644911444510644e-06, + "loss": 0.2279, + "step": 430 + }, + { + "epoch": 0.002383144667713806, + "grad_norm": 4.541796684265137, + 
"learning_rate": 1.191572333856903e-06, + "loss": 0.2536, + "step": 440 + }, + { + "epoch": 0.0024373070465254832, + "grad_norm": 6.385365962982178, + "learning_rate": 1.2186535232627417e-06, + "loss": 0.3258, + "step": 450 + }, + { + "epoch": 0.002491469425337161, + "grad_norm": 11.369861602783203, + "learning_rate": 1.2457347126685806e-06, + "loss": 0.2766, + "step": 460 + }, + { + "epoch": 0.0025456318041488383, + "grad_norm": 1.258400321006775, + "learning_rate": 1.272815902074419e-06, + "loss": 0.1841, + "step": 470 + }, + { + "epoch": 0.0025997941829605156, + "grad_norm": 5.790881633758545, + "learning_rate": 1.299897091480258e-06, + "loss": 0.2256, + "step": 480 + }, + { + "epoch": 0.002653956561772193, + "grad_norm": 1.7510968446731567, + "learning_rate": 1.3269782808860966e-06, + "loss": 0.1498, + "step": 490 + }, + { + "epoch": 0.0027081189405838706, + "grad_norm": 3.2240734100341797, + "learning_rate": 1.3540594702919352e-06, + "loss": 0.191, + "step": 500 + }, + { + "epoch": 0.002762281319395548, + "grad_norm": 6.453549861907959, + "learning_rate": 1.381140659697774e-06, + "loss": 0.3503, + "step": 510 + }, + { + "epoch": 0.002816443698207225, + "grad_norm": 0.880176842212677, + "learning_rate": 1.4082218491036125e-06, + "loss": 0.2512, + "step": 520 + }, + { + "epoch": 0.0028706060770189025, + "grad_norm": 6.094227313995361, + "learning_rate": 1.4353030385094514e-06, + "loss": 0.4495, + "step": 530 + }, + { + "epoch": 0.00292476845583058, + "grad_norm": 9.922907829284668, + "learning_rate": 1.46238422791529e-06, + "loss": 0.317, + "step": 540 + }, + { + "epoch": 0.0029789308346422575, + "grad_norm": 5.951837062835693, + "learning_rate": 1.4894654173211287e-06, + "loss": 0.1796, + "step": 550 + }, + { + "epoch": 0.003033093213453935, + "grad_norm": 3.550400972366333, + "learning_rate": 1.5165466067269676e-06, + "loss": 0.1991, + "step": 560 + }, + { + "epoch": 0.003087255592265612, + "grad_norm": 8.273002624511719, + "learning_rate": 1.5436277961328063e-06, + "loss": 0.2552, + "step": 570 + }, + { + "epoch": 0.00314141797107729, + "grad_norm": 0.7080442309379578, + "learning_rate": 1.5707089855386451e-06, + "loss": 0.2153, + "step": 580 + }, + { + "epoch": 0.003195580349888967, + "grad_norm": 12.202431678771973, + "learning_rate": 1.5977901749444836e-06, + "loss": 0.3161, + "step": 590 + }, + { + "epoch": 0.0032497427287006444, + "grad_norm": 11.932994842529297, + "learning_rate": 1.6248713643503224e-06, + "loss": 0.1985, + "step": 600 + }, + { + "epoch": 0.0033039051075123217, + "grad_norm": 7.520737648010254, + "learning_rate": 1.651952553756161e-06, + "loss": 0.3108, + "step": 610 + }, + { + "epoch": 0.0033580674863239995, + "grad_norm": 6.838932991027832, + "learning_rate": 1.6790337431619995e-06, + "loss": 0.297, + "step": 620 + }, + { + "epoch": 0.0034122298651356768, + "grad_norm": 14.098441123962402, + "learning_rate": 1.7061149325678386e-06, + "loss": 0.2268, + "step": 630 + }, + { + "epoch": 0.003466392243947354, + "grad_norm": 1.9403700828552246, + "learning_rate": 1.733196121973677e-06, + "loss": 0.1323, + "step": 640 + }, + { + "epoch": 0.0035205546227590314, + "grad_norm": 14.455674171447754, + "learning_rate": 1.760277311379516e-06, + "loss": 0.3326, + "step": 650 + }, + { + "epoch": 0.003574717001570709, + "grad_norm": 6.1730055809021, + "learning_rate": 1.7873585007853546e-06, + "loss": 0.1964, + "step": 660 + }, + { + "epoch": 0.0036288793803823864, + "grad_norm": 6.193252086639404, + "learning_rate": 1.8144396901911933e-06, + "loss": 0.3, + "step": 670 + }, + 
{ + "epoch": 0.0036830417591940637, + "grad_norm": 3.2565383911132812, + "learning_rate": 1.8415208795970321e-06, + "loss": 0.2528, + "step": 680 + }, + { + "epoch": 0.003737204138005741, + "grad_norm": 4.425357818603516, + "learning_rate": 1.8686020690028706e-06, + "loss": 0.2006, + "step": 690 + }, + { + "epoch": 0.0037913665168174187, + "grad_norm": 3.9478373527526855, + "learning_rate": 1.8956832584087094e-06, + "loss": 0.1726, + "step": 700 + }, + { + "epoch": 0.003845528895629096, + "grad_norm": 16.950448989868164, + "learning_rate": 1.9227644478145483e-06, + "loss": 0.2724, + "step": 710 + }, + { + "epoch": 0.0038996912744407733, + "grad_norm": 8.464611053466797, + "learning_rate": 1.9498456372203868e-06, + "loss": 0.3324, + "step": 720 + }, + { + "epoch": 0.003953853653252451, + "grad_norm": 1.5604084730148315, + "learning_rate": 1.9769268266262256e-06, + "loss": 0.2974, + "step": 730 + }, + { + "epoch": 0.004008016032064128, + "grad_norm": 9.275230407714844, + "learning_rate": 2.004008016032064e-06, + "loss": 0.3822, + "step": 740 + }, + { + "epoch": 0.004062178410875806, + "grad_norm": 2.2739484310150146, + "learning_rate": 2.031089205437903e-06, + "loss": 0.239, + "step": 750 + }, + { + "epoch": 0.004116340789687483, + "grad_norm": 1.5351502895355225, + "learning_rate": 2.058170394843742e-06, + "loss": 0.2525, + "step": 760 + }, + { + "epoch": 0.004170503168499161, + "grad_norm": 10.470738410949707, + "learning_rate": 2.0852515842495803e-06, + "loss": 0.2536, + "step": 770 + }, + { + "epoch": 0.004224665547310838, + "grad_norm": 9.513916015625, + "learning_rate": 2.112332773655419e-06, + "loss": 0.2066, + "step": 780 + }, + { + "epoch": 0.004278827926122515, + "grad_norm": 5.80160665512085, + "learning_rate": 2.1394139630612576e-06, + "loss": 0.1448, + "step": 790 + }, + { + "epoch": 0.004332990304934193, + "grad_norm": 4.15544319152832, + "learning_rate": 2.1664951524670964e-06, + "loss": 0.1131, + "step": 800 + }, + { + "epoch": 0.00438715268374587, + "grad_norm": 8.018930435180664, + "learning_rate": 2.1935763418729353e-06, + "loss": 0.2033, + "step": 810 + }, + { + "epoch": 0.004441315062557547, + "grad_norm": 7.4652299880981445, + "learning_rate": 2.2206575312787738e-06, + "loss": 0.2851, + "step": 820 + }, + { + "epoch": 0.004495477441369225, + "grad_norm": 1.039151668548584, + "learning_rate": 2.2477387206846126e-06, + "loss": 0.3434, + "step": 830 + }, + { + "epoch": 0.004549639820180903, + "grad_norm": 5.006525993347168, + "learning_rate": 2.274819910090451e-06, + "loss": 0.1206, + "step": 840 + }, + { + "epoch": 0.00460380219899258, + "grad_norm": 3.593724489212036, + "learning_rate": 2.30190109949629e-06, + "loss": 0.1714, + "step": 850 + }, + { + "epoch": 0.004657964577804257, + "grad_norm": 3.3161869049072266, + "learning_rate": 2.328982288902129e-06, + "loss": 0.2221, + "step": 860 + }, + { + "epoch": 0.0047121269566159345, + "grad_norm": 1.9727444648742676, + "learning_rate": 2.3560634783079677e-06, + "loss": 0.2049, + "step": 870 + }, + { + "epoch": 0.004766289335427612, + "grad_norm": 7.851633071899414, + "learning_rate": 2.383144667713806e-06, + "loss": 0.2649, + "step": 880 + }, + { + "epoch": 0.004820451714239289, + "grad_norm": 5.193733215332031, + "learning_rate": 2.4102258571196446e-06, + "loss": 0.1396, + "step": 890 + }, + { + "epoch": 0.0048746140930509664, + "grad_norm": 1.011207103729248, + "learning_rate": 2.4373070465254834e-06, + "loss": 0.1725, + "step": 900 + }, + { + "epoch": 0.004928776471862645, + "grad_norm": 13.909852981567383, + 
"learning_rate": 2.4643882359313223e-06, + "loss": 0.1248, + "step": 910 + }, + { + "epoch": 0.004982938850674322, + "grad_norm": 5.273774147033691, + "learning_rate": 2.491469425337161e-06, + "loss": 0.3292, + "step": 920 + }, + { + "epoch": 0.005037101229485999, + "grad_norm": 10.943607330322266, + "learning_rate": 2.5185506147429996e-06, + "loss": 0.2189, + "step": 930 + }, + { + "epoch": 0.0050912636082976765, + "grad_norm": 3.3928189277648926, + "learning_rate": 2.545631804148838e-06, + "loss": 0.1711, + "step": 940 + }, + { + "epoch": 0.005145425987109354, + "grad_norm": 0.3687291741371155, + "learning_rate": 2.572712993554677e-06, + "loss": 0.2337, + "step": 950 + }, + { + "epoch": 0.005199588365921031, + "grad_norm": 13.932756423950195, + "learning_rate": 2.599794182960516e-06, + "loss": 0.1759, + "step": 960 + }, + { + "epoch": 0.005253750744732708, + "grad_norm": 5.288064002990723, + "learning_rate": 2.6268753723663547e-06, + "loss": 0.2691, + "step": 970 + }, + { + "epoch": 0.005307913123544386, + "grad_norm": 13.720651626586914, + "learning_rate": 2.653956561772193e-06, + "loss": 0.3122, + "step": 980 + }, + { + "epoch": 0.005362075502356064, + "grad_norm": 19.008438110351562, + "learning_rate": 2.6810377511780316e-06, + "loss": 0.3128, + "step": 990 + }, + { + "epoch": 0.005416237881167741, + "grad_norm": 8.452180862426758, + "learning_rate": 2.7081189405838705e-06, + "loss": 0.264, + "step": 1000 + }, + { + "epoch": 0.0054704002599794185, + "grad_norm": 5.3774213790893555, + "learning_rate": 2.7352001299897093e-06, + "loss": 0.1, + "step": 1010 + }, + { + "epoch": 0.005524562638791096, + "grad_norm": 11.209014892578125, + "learning_rate": 2.762281319395548e-06, + "loss": 0.2373, + "step": 1020 + }, + { + "epoch": 0.005578725017602773, + "grad_norm": 0.986271858215332, + "learning_rate": 2.7893625088013866e-06, + "loss": 0.2423, + "step": 1030 + }, + { + "epoch": 0.00563288739641445, + "grad_norm": 10.323759078979492, + "learning_rate": 2.816443698207225e-06, + "loss": 0.1242, + "step": 1040 + }, + { + "epoch": 0.005687049775226128, + "grad_norm": 8.332191467285156, + "learning_rate": 2.843524887613064e-06, + "loss": 0.2449, + "step": 1050 + }, + { + "epoch": 0.005741212154037805, + "grad_norm": 0.5740471482276917, + "learning_rate": 2.870606077018903e-06, + "loss": 0.1571, + "step": 1060 + }, + { + "epoch": 0.005795374532849483, + "grad_norm": 0.3038335144519806, + "learning_rate": 2.8976872664247417e-06, + "loss": 0.11, + "step": 1070 + }, + { + "epoch": 0.00584953691166116, + "grad_norm": 0.332409530878067, + "learning_rate": 2.92476845583058e-06, + "loss": 0.3623, + "step": 1080 + }, + { + "epoch": 0.005903699290472838, + "grad_norm": 16.101699829101562, + "learning_rate": 2.951849645236419e-06, + "loss": 0.3917, + "step": 1090 + }, + { + "epoch": 0.005957861669284515, + "grad_norm": 9.438064575195312, + "learning_rate": 2.9789308346422575e-06, + "loss": 0.2522, + "step": 1100 + }, + { + "epoch": 0.006012024048096192, + "grad_norm": 9.571954727172852, + "learning_rate": 3.0060120240480963e-06, + "loss": 0.1858, + "step": 1110 + }, + { + "epoch": 0.00606618642690787, + "grad_norm": 2.3903117179870605, + "learning_rate": 3.033093213453935e-06, + "loss": 0.1358, + "step": 1120 + }, + { + "epoch": 0.006120348805719547, + "grad_norm": 7.4836602210998535, + "learning_rate": 3.0601744028597736e-06, + "loss": 0.3127, + "step": 1130 + }, + { + "epoch": 0.006174511184531224, + "grad_norm": 0.7595980763435364, + "learning_rate": 3.0872555922656125e-06, + "loss": 0.161, + "step": 1140 
+ }, + { + "epoch": 0.006228673563342902, + "grad_norm": 13.478638648986816, + "learning_rate": 3.1143367816714514e-06, + "loss": 0.1979, + "step": 1150 + }, + { + "epoch": 0.00628283594215458, + "grad_norm": 0.13827350735664368, + "learning_rate": 3.1414179710772902e-06, + "loss": 0.1584, + "step": 1160 + }, + { + "epoch": 0.006336998320966257, + "grad_norm": 0.9700174927711487, + "learning_rate": 3.1684991604831287e-06, + "loss": 0.1007, + "step": 1170 + }, + { + "epoch": 0.006391160699777934, + "grad_norm": 16.490318298339844, + "learning_rate": 3.195580349888967e-06, + "loss": 0.3193, + "step": 1180 + }, + { + "epoch": 0.006445323078589612, + "grad_norm": 16.22899627685547, + "learning_rate": 3.2226615392948056e-06, + "loss": 0.191, + "step": 1190 + }, + { + "epoch": 0.006499485457401289, + "grad_norm": 2.4667770862579346, + "learning_rate": 3.249742728700645e-06, + "loss": 0.1502, + "step": 1200 + }, + { + "epoch": 0.006553647836212966, + "grad_norm": 12.279610633850098, + "learning_rate": 3.2768239181064837e-06, + "loss": 0.2712, + "step": 1210 + }, + { + "epoch": 0.0066078102150246435, + "grad_norm": 11.72748851776123, + "learning_rate": 3.303905107512322e-06, + "loss": 0.1852, + "step": 1220 + }, + { + "epoch": 0.006661972593836322, + "grad_norm": 13.363080978393555, + "learning_rate": 3.3309862969181606e-06, + "loss": 0.2039, + "step": 1230 + }, + { + "epoch": 0.006716134972647999, + "grad_norm": 0.2834123969078064, + "learning_rate": 3.358067486323999e-06, + "loss": 0.2097, + "step": 1240 + }, + { + "epoch": 0.006770297351459676, + "grad_norm": 0.6939675807952881, + "learning_rate": 3.3851486757298384e-06, + "loss": 0.0843, + "step": 1250 + }, + { + "epoch": 0.0068244597302713535, + "grad_norm": 14.633173942565918, + "learning_rate": 3.4122298651356773e-06, + "loss": 0.3808, + "step": 1260 + }, + { + "epoch": 0.006878622109083031, + "grad_norm": 1.2923930883407593, + "learning_rate": 3.4393110545415157e-06, + "loss": 0.1107, + "step": 1270 + }, + { + "epoch": 0.006932784487894708, + "grad_norm": 4.484220504760742, + "learning_rate": 3.466392243947354e-06, + "loss": 0.2875, + "step": 1280 + }, + { + "epoch": 0.0069869468667063854, + "grad_norm": 4.865492820739746, + "learning_rate": 3.4934734333531926e-06, + "loss": 0.1493, + "step": 1290 + }, + { + "epoch": 0.007041109245518063, + "grad_norm": 1.115090012550354, + "learning_rate": 3.520554622759032e-06, + "loss": 0.1268, + "step": 1300 + }, + { + "epoch": 0.007095271624329741, + "grad_norm": 15.911785125732422, + "learning_rate": 3.5476358121648708e-06, + "loss": 0.4489, + "step": 1310 + }, + { + "epoch": 0.007149434003141418, + "grad_norm": 0.6042145490646362, + "learning_rate": 3.574717001570709e-06, + "loss": 0.1431, + "step": 1320 + }, + { + "epoch": 0.0072035963819530955, + "grad_norm": 1.5543569326400757, + "learning_rate": 3.6017981909765476e-06, + "loss": 0.0734, + "step": 1330 + }, + { + "epoch": 0.007257758760764773, + "grad_norm": 5.523393154144287, + "learning_rate": 3.6288793803823865e-06, + "loss": 0.181, + "step": 1340 + }, + { + "epoch": 0.00731192113957645, + "grad_norm": 0.9403535723686218, + "learning_rate": 3.6559605697882254e-06, + "loss": 0.4392, + "step": 1350 + }, + { + "epoch": 0.007366083518388127, + "grad_norm": 0.14056411385536194, + "learning_rate": 3.6830417591940643e-06, + "loss": 0.1773, + "step": 1360 + }, + { + "epoch": 0.007420245897199805, + "grad_norm": 11.49558162689209, + "learning_rate": 3.7101229485999027e-06, + "loss": 0.2058, + "step": 1370 + }, + { + "epoch": 0.007474408276011482, + 
"grad_norm": 1.5845856666564941, + "learning_rate": 3.737204138005741e-06, + "loss": 0.178, + "step": 1380 + }, + { + "epoch": 0.00752857065482316, + "grad_norm": 25.275712966918945, + "learning_rate": 3.76428532741158e-06, + "loss": 0.2149, + "step": 1390 + }, + { + "epoch": 0.0075827330336348375, + "grad_norm": 0.39356210827827454, + "learning_rate": 3.791366516817419e-06, + "loss": 0.3029, + "step": 1400 + }, + { + "epoch": 0.007636895412446515, + "grad_norm": 8.914508819580078, + "learning_rate": 3.818447706223257e-06, + "loss": 0.1491, + "step": 1410 + }, + { + "epoch": 0.007691057791258192, + "grad_norm": 0.5700368881225586, + "learning_rate": 3.845528895629097e-06, + "loss": 0.0903, + "step": 1420 + }, + { + "epoch": 0.007745220170069869, + "grad_norm": 11.440764427185059, + "learning_rate": 3.872610085034935e-06, + "loss": 0.1919, + "step": 1430 + }, + { + "epoch": 0.007799382548881547, + "grad_norm": 0.05412263795733452, + "learning_rate": 3.8996912744407735e-06, + "loss": 0.1732, + "step": 1440 + }, + { + "epoch": 0.007853544927693225, + "grad_norm": 0.0935371071100235, + "learning_rate": 3.926772463846613e-06, + "loss": 0.2862, + "step": 1450 + }, + { + "epoch": 0.007907707306504901, + "grad_norm": 0.7733398079872131, + "learning_rate": 3.953853653252451e-06, + "loss": 0.1773, + "step": 1460 + }, + { + "epoch": 0.00796186968531658, + "grad_norm": 9.057716369628906, + "learning_rate": 3.98093484265829e-06, + "loss": 0.3535, + "step": 1470 + }, + { + "epoch": 0.008016032064128256, + "grad_norm": 10.047187805175781, + "learning_rate": 4.008016032064128e-06, + "loss": 0.2437, + "step": 1480 + }, + { + "epoch": 0.008070194442939934, + "grad_norm": 8.595992088317871, + "learning_rate": 4.035097221469967e-06, + "loss": 0.1659, + "step": 1490 + }, + { + "epoch": 0.008124356821751612, + "grad_norm": 11.408587455749512, + "learning_rate": 4.062178410875806e-06, + "loss": 0.3872, + "step": 1500 + }, + { + "epoch": 0.008178519200563289, + "grad_norm": 0.454853355884552, + "learning_rate": 4.089259600281644e-06, + "loss": 0.2742, + "step": 1510 + }, + { + "epoch": 0.008232681579374967, + "grad_norm": 0.8397417664527893, + "learning_rate": 4.116340789687484e-06, + "loss": 0.132, + "step": 1520 + }, + { + "epoch": 0.008286843958186643, + "grad_norm": 4.540715217590332, + "learning_rate": 4.143421979093322e-06, + "loss": 0.3404, + "step": 1530 + }, + { + "epoch": 0.008341006336998321, + "grad_norm": 10.00632095336914, + "learning_rate": 4.1705031684991605e-06, + "loss": 0.2746, + "step": 1540 + }, + { + "epoch": 0.008395168715809998, + "grad_norm": 0.7299137711524963, + "learning_rate": 4.197584357905e-06, + "loss": 0.1752, + "step": 1550 + }, + { + "epoch": 0.008449331094621676, + "grad_norm": 18.551773071289062, + "learning_rate": 4.224665547310838e-06, + "loss": 0.3609, + "step": 1560 + }, + { + "epoch": 0.008503493473433352, + "grad_norm": 5.70363187789917, + "learning_rate": 4.251746736716677e-06, + "loss": 0.26, + "step": 1570 + }, + { + "epoch": 0.00855765585224503, + "grad_norm": 1.885787844657898, + "learning_rate": 4.278827926122515e-06, + "loss": 0.2018, + "step": 1580 + }, + { + "epoch": 0.008611818231056709, + "grad_norm": 9.965442657470703, + "learning_rate": 4.305909115528354e-06, + "loss": 0.1437, + "step": 1590 + }, + { + "epoch": 0.008665980609868385, + "grad_norm": 4.264651298522949, + "learning_rate": 4.332990304934193e-06, + "loss": 0.1367, + "step": 1600 + }, + { + "epoch": 0.008720142988680063, + "grad_norm": 0.563714861869812, + "learning_rate": 4.360071494340031e-06, + 
"loss": 0.1886, + "step": 1610 + }, + { + "epoch": 0.00877430536749174, + "grad_norm": 0.058974359184503555, + "learning_rate": 4.387152683745871e-06, + "loss": 0.1381, + "step": 1620 + }, + { + "epoch": 0.008828467746303418, + "grad_norm": 10.235445022583008, + "learning_rate": 4.414233873151709e-06, + "loss": 0.2079, + "step": 1630 + }, + { + "epoch": 0.008882630125115094, + "grad_norm": 11.585585594177246, + "learning_rate": 4.4413150625575475e-06, + "loss": 0.2543, + "step": 1640 + }, + { + "epoch": 0.008936792503926773, + "grad_norm": 10.081459045410156, + "learning_rate": 4.468396251963387e-06, + "loss": 0.1546, + "step": 1650 + }, + { + "epoch": 0.00899095488273845, + "grad_norm": 14.459733963012695, + "learning_rate": 4.495477441369225e-06, + "loss": 0.3679, + "step": 1660 + }, + { + "epoch": 0.009045117261550127, + "grad_norm": 0.5199190378189087, + "learning_rate": 4.522558630775064e-06, + "loss": 0.3144, + "step": 1670 + }, + { + "epoch": 0.009099279640361805, + "grad_norm": 7.516209125518799, + "learning_rate": 4.549639820180902e-06, + "loss": 0.177, + "step": 1680 + }, + { + "epoch": 0.009153442019173482, + "grad_norm": 9.246564865112305, + "learning_rate": 4.5767210095867414e-06, + "loss": 0.2036, + "step": 1690 + }, + { + "epoch": 0.00920760439798516, + "grad_norm": 0.16920925676822662, + "learning_rate": 4.60380219899258e-06, + "loss": 0.299, + "step": 1700 + }, + { + "epoch": 0.009261766776796836, + "grad_norm": 6.636193752288818, + "learning_rate": 4.630883388398418e-06, + "loss": 0.0464, + "step": 1710 + }, + { + "epoch": 0.009315929155608514, + "grad_norm": 6.875288486480713, + "learning_rate": 4.657964577804258e-06, + "loss": 0.1895, + "step": 1720 + }, + { + "epoch": 0.009370091534420191, + "grad_norm": 16.30193328857422, + "learning_rate": 4.685045767210096e-06, + "loss": 0.2676, + "step": 1730 + }, + { + "epoch": 0.009424253913231869, + "grad_norm": 12.137092590332031, + "learning_rate": 4.712126956615935e-06, + "loss": 0.1557, + "step": 1740 + }, + { + "epoch": 0.009478416292043547, + "grad_norm": 3.4417829513549805, + "learning_rate": 4.739208146021774e-06, + "loss": 0.2868, + "step": 1750 + }, + { + "epoch": 0.009532578670855224, + "grad_norm": 16.96158790588379, + "learning_rate": 4.766289335427612e-06, + "loss": 0.2293, + "step": 1760 + }, + { + "epoch": 0.009586741049666902, + "grad_norm": 0.506269097328186, + "learning_rate": 4.793370524833451e-06, + "loss": 0.2242, + "step": 1770 + }, + { + "epoch": 0.009640903428478578, + "grad_norm": 7.0682759284973145, + "learning_rate": 4.820451714239289e-06, + "loss": 0.1179, + "step": 1780 + }, + { + "epoch": 0.009695065807290256, + "grad_norm": 0.0309380404651165, + "learning_rate": 4.8475329036451284e-06, + "loss": 0.0692, + "step": 1790 + }, + { + "epoch": 0.009749228186101933, + "grad_norm": 0.15507839620113373, + "learning_rate": 4.874614093050967e-06, + "loss": 0.3089, + "step": 1800 + }, + { + "epoch": 0.009803390564913611, + "grad_norm": 0.43257203698158264, + "learning_rate": 4.901695282456805e-06, + "loss": 0.1392, + "step": 1810 + }, + { + "epoch": 0.00985755294372529, + "grad_norm": 0.7982332110404968, + "learning_rate": 4.928776471862645e-06, + "loss": 0.1016, + "step": 1820 + }, + { + "epoch": 0.009911715322536966, + "grad_norm": 9.400498390197754, + "learning_rate": 4.955857661268483e-06, + "loss": 0.1916, + "step": 1830 + }, + { + "epoch": 0.009965877701348644, + "grad_norm": 23.438247680664062, + "learning_rate": 4.982938850674322e-06, + "loss": 0.2222, + "step": 1840 + }, + { + "epoch": 
0.01002004008016032, + "grad_norm": 0.13065080344676971, + "learning_rate": 5.010020040080161e-06, + "loss": 0.1703, + "step": 1850 + }, + { + "epoch": 0.010074202458971998, + "grad_norm": 8.28375244140625, + "learning_rate": 5.037101229485999e-06, + "loss": 0.2881, + "step": 1860 + }, + { + "epoch": 0.010128364837783675, + "grad_norm": 13.20846939086914, + "learning_rate": 5.064182418891838e-06, + "loss": 0.2875, + "step": 1870 + }, + { + "epoch": 0.010182527216595353, + "grad_norm": 1.1501129865646362, + "learning_rate": 5.091263608297676e-06, + "loss": 0.0459, + "step": 1880 + }, + { + "epoch": 0.01023668959540703, + "grad_norm": 11.881379127502441, + "learning_rate": 5.1183447977035155e-06, + "loss": 0.251, + "step": 1890 + }, + { + "epoch": 0.010290851974218708, + "grad_norm": 8.79068660736084, + "learning_rate": 5.145425987109354e-06, + "loss": 0.2023, + "step": 1900 + }, + { + "epoch": 0.010345014353030386, + "grad_norm": 11.687437057495117, + "learning_rate": 5.172507176515193e-06, + "loss": 0.0947, + "step": 1910 + }, + { + "epoch": 0.010399176731842062, + "grad_norm": 18.260955810546875, + "learning_rate": 5.199588365921032e-06, + "loss": 0.2501, + "step": 1920 + }, + { + "epoch": 0.01045333911065374, + "grad_norm": 0.22816596925258636, + "learning_rate": 5.22666955532687e-06, + "loss": 0.2251, + "step": 1930 + }, + { + "epoch": 0.010507501489465417, + "grad_norm": 19.185792922973633, + "learning_rate": 5.253750744732709e-06, + "loss": 0.1623, + "step": 1940 + }, + { + "epoch": 0.010561663868277095, + "grad_norm": 11.821993827819824, + "learning_rate": 5.280831934138548e-06, + "loss": 0.1856, + "step": 1950 + }, + { + "epoch": 0.010615826247088771, + "grad_norm": 0.5218510627746582, + "learning_rate": 5.307913123544386e-06, + "loss": 0.1847, + "step": 1960 + }, + { + "epoch": 0.01066998862590045, + "grad_norm": 2.061352014541626, + "learning_rate": 5.334994312950225e-06, + "loss": 0.2875, + "step": 1970 + }, + { + "epoch": 0.010724151004712128, + "grad_norm": 0.05574014037847519, + "learning_rate": 5.362075502356063e-06, + "loss": 0.1318, + "step": 1980 + }, + { + "epoch": 0.010778313383523804, + "grad_norm": 0.7815129160881042, + "learning_rate": 5.3891566917619025e-06, + "loss": 0.3664, + "step": 1990 + }, + { + "epoch": 0.010832475762335482, + "grad_norm": 3.4889206886291504, + "learning_rate": 5.416237881167741e-06, + "loss": 0.1583, + "step": 2000 + }, + { + "epoch": 0.010886638141147159, + "grad_norm": 15.025595664978027, + "learning_rate": 5.44331907057358e-06, + "loss": 0.144, + "step": 2010 + }, + { + "epoch": 0.010940800519958837, + "grad_norm": 18.19122886657715, + "learning_rate": 5.470400259979419e-06, + "loss": 0.1852, + "step": 2020 + }, + { + "epoch": 0.010994962898770513, + "grad_norm": 9.63279914855957, + "learning_rate": 5.497481449385257e-06, + "loss": 0.3396, + "step": 2030 + }, + { + "epoch": 0.011049125277582192, + "grad_norm": 0.3777359127998352, + "learning_rate": 5.524562638791096e-06, + "loss": 0.317, + "step": 2040 + }, + { + "epoch": 0.011103287656393868, + "grad_norm": 13.24089241027832, + "learning_rate": 5.551643828196935e-06, + "loss": 0.1068, + "step": 2050 + }, + { + "epoch": 0.011157450035205546, + "grad_norm": 10.463374137878418, + "learning_rate": 5.578725017602773e-06, + "loss": 0.2755, + "step": 2060 + }, + { + "epoch": 0.011211612414017224, + "grad_norm": 12.978836059570312, + "learning_rate": 5.605806207008612e-06, + "loss": 0.1263, + "step": 2070 + }, + { + "epoch": 0.0112657747928289, + "grad_norm": 6.684110641479492, + "learning_rate": 
5.63288739641445e-06, + "loss": 0.1938, + "step": 2080 + }, + { + "epoch": 0.011319937171640579, + "grad_norm": 0.10449805110692978, + "learning_rate": 5.6599685858202895e-06, + "loss": 0.3209, + "step": 2090 + }, + { + "epoch": 0.011374099550452255, + "grad_norm": 0.06844928860664368, + "learning_rate": 5.687049775226128e-06, + "loss": 0.2169, + "step": 2100 + }, + { + "epoch": 0.011428261929263933, + "grad_norm": 15.535572052001953, + "learning_rate": 5.714130964631967e-06, + "loss": 0.3062, + "step": 2110 + }, + { + "epoch": 0.01148242430807561, + "grad_norm": 5.069727420806885, + "learning_rate": 5.741212154037806e-06, + "loss": 0.0945, + "step": 2120 + }, + { + "epoch": 0.011536586686887288, + "grad_norm": 1.6268349885940552, + "learning_rate": 5.768293343443645e-06, + "loss": 0.5221, + "step": 2130 + }, + { + "epoch": 0.011590749065698966, + "grad_norm": 10.044612884521484, + "learning_rate": 5.795374532849483e-06, + "loss": 0.1352, + "step": 2140 + }, + { + "epoch": 0.011644911444510643, + "grad_norm": 9.841995239257812, + "learning_rate": 5.822455722255322e-06, + "loss": 0.1294, + "step": 2150 + }, + { + "epoch": 0.01169907382332232, + "grad_norm": 0.5504431128501892, + "learning_rate": 5.84953691166116e-06, + "loss": 0.1555, + "step": 2160 + }, + { + "epoch": 0.011753236202133997, + "grad_norm": 8.598047256469727, + "learning_rate": 5.876618101066999e-06, + "loss": 0.1918, + "step": 2170 + }, + { + "epoch": 0.011807398580945675, + "grad_norm": 9.61973762512207, + "learning_rate": 5.903699290472838e-06, + "loss": 0.1438, + "step": 2180 + }, + { + "epoch": 0.011861560959757352, + "grad_norm": 4.57594108581543, + "learning_rate": 5.9307804798786765e-06, + "loss": 0.0852, + "step": 2190 + }, + { + "epoch": 0.01191572333856903, + "grad_norm": 1.6536269187927246, + "learning_rate": 5.957861669284515e-06, + "loss": 0.286, + "step": 2200 + }, + { + "epoch": 0.011969885717380708, + "grad_norm": 10.493042945861816, + "learning_rate": 5.984942858690354e-06, + "loss": 0.0719, + "step": 2210 + }, + { + "epoch": 0.012024048096192385, + "grad_norm": 9.874662399291992, + "learning_rate": 6.012024048096193e-06, + "loss": 0.2279, + "step": 2220 + }, + { + "epoch": 0.012078210475004063, + "grad_norm": 0.7626814842224121, + "learning_rate": 6.039105237502032e-06, + "loss": 0.1773, + "step": 2230 + }, + { + "epoch": 0.01213237285381574, + "grad_norm": 0.1158587709069252, + "learning_rate": 6.06618642690787e-06, + "loss": 0.3115, + "step": 2240 + }, + { + "epoch": 0.012186535232627417, + "grad_norm": 0.22568051517009735, + "learning_rate": 6.093267616313709e-06, + "loss": 0.167, + "step": 2250 + }, + { + "epoch": 0.012240697611439094, + "grad_norm": 8.374415397644043, + "learning_rate": 6.120348805719547e-06, + "loss": 0.1034, + "step": 2260 + }, + { + "epoch": 0.012294859990250772, + "grad_norm": 0.5191140174865723, + "learning_rate": 6.147429995125386e-06, + "loss": 0.1811, + "step": 2270 + }, + { + "epoch": 0.012349022369062448, + "grad_norm": 13.508127212524414, + "learning_rate": 6.174511184531225e-06, + "loss": 0.2613, + "step": 2280 + }, + { + "epoch": 0.012403184747874127, + "grad_norm": 10.812383651733398, + "learning_rate": 6.2015923739370635e-06, + "loss": 0.0991, + "step": 2290 + }, + { + "epoch": 0.012457347126685805, + "grad_norm": 2.6488330364227295, + "learning_rate": 6.228673563342903e-06, + "loss": 0.2613, + "step": 2300 + }, + { + "epoch": 0.012511509505497481, + "grad_norm": 6.766165256500244, + "learning_rate": 6.255754752748741e-06, + "loss": 0.1243, + "step": 2310 + }, + { + 
"epoch": 0.01256567188430916, + "grad_norm": 5.757248878479004, + "learning_rate": 6.2828359421545805e-06, + "loss": 0.1792, + "step": 2320 + }, + { + "epoch": 0.012619834263120836, + "grad_norm": 1.5272959470748901, + "learning_rate": 6.309917131560418e-06, + "loss": 0.4394, + "step": 2330 + }, + { + "epoch": 0.012673996641932514, + "grad_norm": 9.392232894897461, + "learning_rate": 6.336998320966257e-06, + "loss": 0.3065, + "step": 2340 + }, + { + "epoch": 0.01272815902074419, + "grad_norm": 0.08856473863124847, + "learning_rate": 6.364079510372095e-06, + "loss": 0.0588, + "step": 2350 + }, + { + "epoch": 0.012782321399555869, + "grad_norm": 17.71160888671875, + "learning_rate": 6.391160699777934e-06, + "loss": 0.196, + "step": 2360 + }, + { + "epoch": 0.012836483778367547, + "grad_norm": 16.369842529296875, + "learning_rate": 6.4182418891837736e-06, + "loss": 0.2099, + "step": 2370 + }, + { + "epoch": 0.012890646157179223, + "grad_norm": 2.869304895401001, + "learning_rate": 6.445323078589611e-06, + "loss": 0.2393, + "step": 2380 + }, + { + "epoch": 0.012944808535990901, + "grad_norm": 18.225059509277344, + "learning_rate": 6.4724042679954505e-06, + "loss": 0.1843, + "step": 2390 + }, + { + "epoch": 0.012998970914802578, + "grad_norm": 13.161109924316406, + "learning_rate": 6.49948545740129e-06, + "loss": 0.1357, + "step": 2400 + }, + { + "epoch": 0.013053133293614256, + "grad_norm": 0.015998415648937225, + "learning_rate": 6.526566646807128e-06, + "loss": 0.0491, + "step": 2410 + }, + { + "epoch": 0.013107295672425932, + "grad_norm": 0.2340915948152542, + "learning_rate": 6.5536478362129675e-06, + "loss": 0.1041, + "step": 2420 + }, + { + "epoch": 0.01316145805123761, + "grad_norm": 0.563816249370575, + "learning_rate": 6.580729025618805e-06, + "loss": 0.196, + "step": 2430 + }, + { + "epoch": 0.013215620430049287, + "grad_norm": 4.820296764373779, + "learning_rate": 6.607810215024644e-06, + "loss": 0.1051, + "step": 2440 + }, + { + "epoch": 0.013269782808860965, + "grad_norm": 14.42601203918457, + "learning_rate": 6.634891404430484e-06, + "loss": 0.5505, + "step": 2450 + }, + { + "epoch": 0.013323945187672643, + "grad_norm": 10.15406322479248, + "learning_rate": 6.661972593836321e-06, + "loss": 0.1675, + "step": 2460 + }, + { + "epoch": 0.01337810756648432, + "grad_norm": 0.6887327432632446, + "learning_rate": 6.6890537832421606e-06, + "loss": 0.0545, + "step": 2470 + }, + { + "epoch": 0.013432269945295998, + "grad_norm": 0.020448531955480576, + "learning_rate": 6.716134972647998e-06, + "loss": 0.0721, + "step": 2480 + }, + { + "epoch": 0.013486432324107674, + "grad_norm": 1.4815093278884888, + "learning_rate": 6.7432161620538375e-06, + "loss": 0.2573, + "step": 2490 + }, + { + "epoch": 0.013540594702919352, + "grad_norm": 6.15155029296875, + "learning_rate": 6.770297351459677e-06, + "loss": 0.2181, + "step": 2500 + }, + { + "epoch": 0.013594757081731029, + "grad_norm": 19.403987884521484, + "learning_rate": 6.797378540865515e-06, + "loss": 0.2851, + "step": 2510 + }, + { + "epoch": 0.013648919460542707, + "grad_norm": 1.2729215621948242, + "learning_rate": 6.8244597302713545e-06, + "loss": 0.0364, + "step": 2520 + }, + { + "epoch": 0.013703081839354385, + "grad_norm": 8.071385383605957, + "learning_rate": 6.851540919677192e-06, + "loss": 0.189, + "step": 2530 + }, + { + "epoch": 0.013757244218166062, + "grad_norm": 22.172866821289062, + "learning_rate": 6.878622109083031e-06, + "loss": 0.3155, + "step": 2540 + }, + { + "epoch": 0.01381140659697774, + "grad_norm": 4.125961780548096, + 
"learning_rate": 6.905703298488871e-06, + "loss": 0.1819, + "step": 2550 + }, + { + "epoch": 0.013865568975789416, + "grad_norm": 0.03139098733663559, + "learning_rate": 6.932784487894708e-06, + "loss": 0.1468, + "step": 2560 + }, + { + "epoch": 0.013919731354601094, + "grad_norm": 7.408463954925537, + "learning_rate": 6.959865677300548e-06, + "loss": 0.1542, + "step": 2570 + }, + { + "epoch": 0.013973893733412771, + "grad_norm": 0.3038051426410675, + "learning_rate": 6.986946866706385e-06, + "loss": 0.2232, + "step": 2580 + }, + { + "epoch": 0.014028056112224449, + "grad_norm": 0.04449661076068878, + "learning_rate": 7.0140280561122245e-06, + "loss": 0.1202, + "step": 2590 + }, + { + "epoch": 0.014082218491036125, + "grad_norm": 1.1187139749526978, + "learning_rate": 7.041109245518064e-06, + "loss": 0.3058, + "step": 2600 + }, + { + "epoch": 0.014136380869847804, + "grad_norm": 30.933040618896484, + "learning_rate": 7.068190434923902e-06, + "loss": 0.1647, + "step": 2610 + }, + { + "epoch": 0.014190543248659482, + "grad_norm": 9.613822937011719, + "learning_rate": 7.0952716243297415e-06, + "loss": 0.2161, + "step": 2620 + }, + { + "epoch": 0.014244705627471158, + "grad_norm": 0.10896974802017212, + "learning_rate": 7.122352813735579e-06, + "loss": 0.2912, + "step": 2630 + }, + { + "epoch": 0.014298868006282836, + "grad_norm": 0.9928270578384399, + "learning_rate": 7.149434003141418e-06, + "loss": 0.2969, + "step": 2640 + }, + { + "epoch": 0.014353030385094513, + "grad_norm": 0.23428748548030853, + "learning_rate": 7.176515192547258e-06, + "loss": 0.2334, + "step": 2650 + }, + { + "epoch": 0.014407192763906191, + "grad_norm": 8.088724136352539, + "learning_rate": 7.203596381953095e-06, + "loss": 0.2022, + "step": 2660 + }, + { + "epoch": 0.014461355142717867, + "grad_norm": 3.8421812057495117, + "learning_rate": 7.230677571358935e-06, + "loss": 0.1577, + "step": 2670 + }, + { + "epoch": 0.014515517521529546, + "grad_norm": 0.7655143737792969, + "learning_rate": 7.257758760764773e-06, + "loss": 0.2326, + "step": 2680 + }, + { + "epoch": 0.014569679900341224, + "grad_norm": 0.5175389647483826, + "learning_rate": 7.2848399501706115e-06, + "loss": 0.0681, + "step": 2690 + }, + { + "epoch": 0.0146238422791529, + "grad_norm": 11.59901237487793, + "learning_rate": 7.311921139576451e-06, + "loss": 0.1115, + "step": 2700 + }, + { + "epoch": 0.014678004657964578, + "grad_norm": 0.09244125336408615, + "learning_rate": 7.339002328982289e-06, + "loss": 0.1618, + "step": 2710 + }, + { + "epoch": 0.014732167036776255, + "grad_norm": 16.086334228515625, + "learning_rate": 7.3660835183881285e-06, + "loss": 0.099, + "step": 2720 + }, + { + "epoch": 0.014786329415587933, + "grad_norm": 18.5790958404541, + "learning_rate": 7.393164707793966e-06, + "loss": 0.356, + "step": 2730 + }, + { + "epoch": 0.01484049179439961, + "grad_norm": 14.101476669311523, + "learning_rate": 7.420245897199805e-06, + "loss": 0.1119, + "step": 2740 + }, + { + "epoch": 0.014894654173211288, + "grad_norm": 14.634298324584961, + "learning_rate": 7.447327086605645e-06, + "loss": 0.062, + "step": 2750 + }, + { + "epoch": 0.014948816552022964, + "grad_norm": 0.07589283585548401, + "learning_rate": 7.474408276011482e-06, + "loss": 0.1916, + "step": 2760 + }, + { + "epoch": 0.015002978930834642, + "grad_norm": 3.6496224403381348, + "learning_rate": 7.501489465417322e-06, + "loss": 0.1638, + "step": 2770 + }, + { + "epoch": 0.01505714130964632, + "grad_norm": 0.5233298540115356, + "learning_rate": 7.52857065482316e-06, + "loss": 0.0703, + 
"step": 2780 + }, + { + "epoch": 0.015111303688457997, + "grad_norm": 27.49069595336914, + "learning_rate": 7.555651844228999e-06, + "loss": 0.1169, + "step": 2790 + }, + { + "epoch": 0.015165466067269675, + "grad_norm": 24.941747665405273, + "learning_rate": 7.582733033634838e-06, + "loss": 0.3642, + "step": 2800 + }, + { + "epoch": 0.015219628446081351, + "grad_norm": 7.129408836364746, + "learning_rate": 7.609814223040676e-06, + "loss": 0.0745, + "step": 2810 + }, + { + "epoch": 0.01527379082489303, + "grad_norm": 4.0692949295043945, + "learning_rate": 7.636895412446515e-06, + "loss": 0.3942, + "step": 2820 + }, + { + "epoch": 0.015327953203704706, + "grad_norm": 12.763285636901855, + "learning_rate": 7.663976601852353e-06, + "loss": 0.306, + "step": 2830 + }, + { + "epoch": 0.015382115582516384, + "grad_norm": 0.1583407074213028, + "learning_rate": 7.691057791258193e-06, + "loss": 0.1179, + "step": 2840 + }, + { + "epoch": 0.015436277961328062, + "grad_norm": 2.1661629676818848, + "learning_rate": 7.718138980664032e-06, + "loss": 0.0288, + "step": 2850 + }, + { + "epoch": 0.015490440340139739, + "grad_norm": 11.14156723022461, + "learning_rate": 7.74522017006987e-06, + "loss": 0.2479, + "step": 2860 + }, + { + "epoch": 0.015544602718951417, + "grad_norm": 10.528443336486816, + "learning_rate": 7.772301359475709e-06, + "loss": 0.1476, + "step": 2870 + }, + { + "epoch": 0.015598765097763093, + "grad_norm": 2.220219850540161, + "learning_rate": 7.799382548881547e-06, + "loss": 0.1202, + "step": 2880 + }, + { + "epoch": 0.01565292747657477, + "grad_norm": 10.49760913848877, + "learning_rate": 7.826463738287385e-06, + "loss": 0.329, + "step": 2890 + }, + { + "epoch": 0.01570708985538645, + "grad_norm": 0.652311384677887, + "learning_rate": 7.853544927693226e-06, + "loss": 0.0622, + "step": 2900 + }, + { + "epoch": 0.015761252234198126, + "grad_norm": 10.15507698059082, + "learning_rate": 7.880626117099062e-06, + "loss": 0.1086, + "step": 2910 + }, + { + "epoch": 0.015815414613009803, + "grad_norm": 0.03973688185214996, + "learning_rate": 7.907707306504903e-06, + "loss": 0.1583, + "step": 2920 + }, + { + "epoch": 0.015869576991821482, + "grad_norm": 6.715467929840088, + "learning_rate": 7.934788495910741e-06, + "loss": 0.1388, + "step": 2930 + }, + { + "epoch": 0.01592373937063316, + "grad_norm": 0.12040809541940689, + "learning_rate": 7.96186968531658e-06, + "loss": 0.1741, + "step": 2940 + }, + { + "epoch": 0.015977901749444835, + "grad_norm": 12.735998153686523, + "learning_rate": 7.98895087472242e-06, + "loss": 0.1461, + "step": 2950 + }, + { + "epoch": 0.01603206412825651, + "grad_norm": 0.13440634310245514, + "learning_rate": 8.016032064128256e-06, + "loss": 0.2491, + "step": 2960 + }, + { + "epoch": 0.01608622650706819, + "grad_norm": 0.034504640847444534, + "learning_rate": 8.043113253534096e-06, + "loss": 0.1087, + "step": 2970 + }, + { + "epoch": 0.016140388885879868, + "grad_norm": 1.0604479312896729, + "learning_rate": 8.070194442939933e-06, + "loss": 0.2038, + "step": 2980 + }, + { + "epoch": 0.016194551264691544, + "grad_norm": 0.08650480210781097, + "learning_rate": 8.097275632345773e-06, + "loss": 0.1494, + "step": 2990 + }, + { + "epoch": 0.016248713643503224, + "grad_norm": 13.948455810546875, + "learning_rate": 8.124356821751612e-06, + "loss": 0.1128, + "step": 3000 + }, + { + "epoch": 0.0163028760223149, + "grad_norm": 1.1568231582641602, + "learning_rate": 8.15143801115745e-06, + "loss": 0.1303, + "step": 3010 + }, + { + "epoch": 0.016357038401126577, + "grad_norm": 
0.2540189325809479, + "learning_rate": 8.178519200563289e-06, + "loss": 0.1251, + "step": 3020 + }, + { + "epoch": 0.016411200779938254, + "grad_norm": 0.5418497323989868, + "learning_rate": 8.205600389969127e-06, + "loss": 0.139, + "step": 3030 + }, + { + "epoch": 0.016465363158749934, + "grad_norm": 0.03439989313483238, + "learning_rate": 8.232681579374967e-06, + "loss": 0.2036, + "step": 3040 + }, + { + "epoch": 0.01651952553756161, + "grad_norm": 14.814231872558594, + "learning_rate": 8.259762768780806e-06, + "loss": 0.3518, + "step": 3050 + }, + { + "epoch": 0.016573687916373286, + "grad_norm": 0.07176771759986877, + "learning_rate": 8.286843958186644e-06, + "loss": 0.0941, + "step": 3060 + }, + { + "epoch": 0.016627850295184963, + "grad_norm": 11.88023853302002, + "learning_rate": 8.313925147592483e-06, + "loss": 0.1153, + "step": 3070 + }, + { + "epoch": 0.016682012673996643, + "grad_norm": 18.649930953979492, + "learning_rate": 8.341006336998321e-06, + "loss": 0.2723, + "step": 3080 + }, + { + "epoch": 0.01673617505280832, + "grad_norm": 0.06487064808607101, + "learning_rate": 8.36808752640416e-06, + "loss": 0.0391, + "step": 3090 + }, + { + "epoch": 0.016790337431619996, + "grad_norm": 12.66914176940918, + "learning_rate": 8.39516871581e-06, + "loss": 0.1444, + "step": 3100 + }, + { + "epoch": 0.016844499810431676, + "grad_norm": 3.384040355682373, + "learning_rate": 8.422249905215836e-06, + "loss": 0.1133, + "step": 3110 + }, + { + "epoch": 0.016898662189243352, + "grad_norm": 15.202681541442871, + "learning_rate": 8.449331094621677e-06, + "loss": 0.1103, + "step": 3120 + }, + { + "epoch": 0.01695282456805503, + "grad_norm": 0.24872304499149323, + "learning_rate": 8.476412284027515e-06, + "loss": 0.0789, + "step": 3130 + }, + { + "epoch": 0.017006986946866705, + "grad_norm": 16.875961303710938, + "learning_rate": 8.503493473433353e-06, + "loss": 0.1305, + "step": 3140 + }, + { + "epoch": 0.017061149325678385, + "grad_norm": 12.362018585205078, + "learning_rate": 8.530574662839194e-06, + "loss": 0.1562, + "step": 3150 + }, + { + "epoch": 0.01711531170449006, + "grad_norm": 9.296303749084473, + "learning_rate": 8.55765585224503e-06, + "loss": 0.1391, + "step": 3160 + }, + { + "epoch": 0.017169474083301738, + "grad_norm": 17.57097816467285, + "learning_rate": 8.58473704165087e-06, + "loss": 0.1316, + "step": 3170 + }, + { + "epoch": 0.017223636462113417, + "grad_norm": 0.5297034978866577, + "learning_rate": 8.611818231056707e-06, + "loss": 0.188, + "step": 3180 + }, + { + "epoch": 0.017277798840925094, + "grad_norm": 0.49622806906700134, + "learning_rate": 8.638899420462547e-06, + "loss": 0.0447, + "step": 3190 + }, + { + "epoch": 0.01733196121973677, + "grad_norm": 0.010179011151194572, + "learning_rate": 8.665980609868386e-06, + "loss": 0.0289, + "step": 3200 + }, + { + "epoch": 0.017386123598548447, + "grad_norm": 0.004617643542587757, + "learning_rate": 8.693061799274224e-06, + "loss": 0.1062, + "step": 3210 + }, + { + "epoch": 0.017440285977360127, + "grad_norm": 0.006837939843535423, + "learning_rate": 8.720142988680063e-06, + "loss": 0.169, + "step": 3220 + }, + { + "epoch": 0.017494448356171803, + "grad_norm": 0.013601787388324738, + "learning_rate": 8.747224178085901e-06, + "loss": 0.1819, + "step": 3230 + }, + { + "epoch": 0.01754861073498348, + "grad_norm": 4.8988542556762695, + "learning_rate": 8.774305367491741e-06, + "loss": 0.2609, + "step": 3240 + }, + { + "epoch": 0.01760277311379516, + "grad_norm": 0.11133372783660889, + "learning_rate": 8.80138655689758e-06, + 
"loss": 0.1516, + "step": 3250 + }, + { + "epoch": 0.017656935492606836, + "grad_norm": 8.025609016418457, + "learning_rate": 8.828467746303418e-06, + "loss": 0.2025, + "step": 3260 + }, + { + "epoch": 0.017711097871418512, + "grad_norm": 11.38595199584961, + "learning_rate": 8.855548935709257e-06, + "loss": 0.2502, + "step": 3270 + }, + { + "epoch": 0.01776526025023019, + "grad_norm": 14.865860939025879, + "learning_rate": 8.882630125115095e-06, + "loss": 0.1727, + "step": 3280 + }, + { + "epoch": 0.01781942262904187, + "grad_norm": 0.016133446246385574, + "learning_rate": 8.909711314520933e-06, + "loss": 0.1145, + "step": 3290 + }, + { + "epoch": 0.017873585007853545, + "grad_norm": 0.028595535084605217, + "learning_rate": 8.936792503926774e-06, + "loss": 0.1594, + "step": 3300 + }, + { + "epoch": 0.01792774738666522, + "grad_norm": 0.48404446244239807, + "learning_rate": 8.96387369333261e-06, + "loss": 0.0532, + "step": 3310 + }, + { + "epoch": 0.0179819097654769, + "grad_norm": 14.107460975646973, + "learning_rate": 8.99095488273845e-06, + "loss": 0.1268, + "step": 3320 + }, + { + "epoch": 0.018036072144288578, + "grad_norm": 0.32853007316589355, + "learning_rate": 9.018036072144289e-06, + "loss": 0.2301, + "step": 3330 + }, + { + "epoch": 0.018090234523100254, + "grad_norm": 0.38807371258735657, + "learning_rate": 9.045117261550127e-06, + "loss": 0.1235, + "step": 3340 + }, + { + "epoch": 0.01814439690191193, + "grad_norm": 0.13610494136810303, + "learning_rate": 9.072198450955968e-06, + "loss": 0.1812, + "step": 3350 + }, + { + "epoch": 0.01819855928072361, + "grad_norm": 0.06020895764231682, + "learning_rate": 9.099279640361804e-06, + "loss": 0.0659, + "step": 3360 + }, + { + "epoch": 0.018252721659535287, + "grad_norm": 7.4572248458862305, + "learning_rate": 9.126360829767644e-06, + "loss": 0.1285, + "step": 3370 + }, + { + "epoch": 0.018306884038346963, + "grad_norm": 0.032984111458063126, + "learning_rate": 9.153442019173483e-06, + "loss": 0.1165, + "step": 3380 + }, + { + "epoch": 0.01836104641715864, + "grad_norm": 0.3076828420162201, + "learning_rate": 9.180523208579321e-06, + "loss": 0.4361, + "step": 3390 + }, + { + "epoch": 0.01841520879597032, + "grad_norm": 11.997506141662598, + "learning_rate": 9.20760439798516e-06, + "loss": 0.0928, + "step": 3400 + }, + { + "epoch": 0.018469371174781996, + "grad_norm": 9.479592323303223, + "learning_rate": 9.234685587390998e-06, + "loss": 0.2161, + "step": 3410 + }, + { + "epoch": 0.018523533553593673, + "grad_norm": 0.2633627951145172, + "learning_rate": 9.261766776796837e-06, + "loss": 0.2252, + "step": 3420 + }, + { + "epoch": 0.018577695932405353, + "grad_norm": 0.05867617577314377, + "learning_rate": 9.288847966202677e-06, + "loss": 0.0646, + "step": 3430 + }, + { + "epoch": 0.01863185831121703, + "grad_norm": 6.255855083465576, + "learning_rate": 9.315929155608515e-06, + "loss": 0.2436, + "step": 3440 + }, + { + "epoch": 0.018686020690028705, + "grad_norm": 21.897829055786133, + "learning_rate": 9.343010345014354e-06, + "loss": 0.1573, + "step": 3450 + }, + { + "epoch": 0.018740183068840382, + "grad_norm": 0.13150711357593536, + "learning_rate": 9.370091534420192e-06, + "loss": 0.2083, + "step": 3460 + }, + { + "epoch": 0.018794345447652062, + "grad_norm": 0.4870622456073761, + "learning_rate": 9.39717272382603e-06, + "loss": 0.2694, + "step": 3470 + }, + { + "epoch": 0.018848507826463738, + "grad_norm": 1.5932246446609497, + "learning_rate": 9.42425391323187e-06, + "loss": 0.0509, + "step": 3480 + }, + { + "epoch": 
0.018902670205275415, + "grad_norm": 13.53217887878418, + "learning_rate": 9.451335102637707e-06, + "loss": 0.0676, + "step": 3490 + }, + { + "epoch": 0.018956832584087095, + "grad_norm": 1.2342296838760376, + "learning_rate": 9.478416292043548e-06, + "loss": 0.1939, + "step": 3500 + }, + { + "epoch": 0.01901099496289877, + "grad_norm": 16.256885528564453, + "learning_rate": 9.505497481449384e-06, + "loss": 0.1537, + "step": 3510 + }, + { + "epoch": 0.019065157341710447, + "grad_norm": 0.10760337859392166, + "learning_rate": 9.532578670855225e-06, + "loss": 0.0313, + "step": 3520 + }, + { + "epoch": 0.019119319720522124, + "grad_norm": 0.02776266634464264, + "learning_rate": 9.559659860261063e-06, + "loss": 0.0411, + "step": 3530 + }, + { + "epoch": 0.019173482099333804, + "grad_norm": 13.333271980285645, + "learning_rate": 9.586741049666901e-06, + "loss": 0.25, + "step": 3540 + }, + { + "epoch": 0.01922764447814548, + "grad_norm": 0.17971506714820862, + "learning_rate": 9.613822239072742e-06, + "loss": 0.1235, + "step": 3550 + }, + { + "epoch": 0.019281806856957157, + "grad_norm": 3.618074655532837, + "learning_rate": 9.640903428478578e-06, + "loss": 0.0525, + "step": 3560 + }, + { + "epoch": 0.019335969235768836, + "grad_norm": 5.667023658752441, + "learning_rate": 9.667984617884418e-06, + "loss": 0.1042, + "step": 3570 + }, + { + "epoch": 0.019390131614580513, + "grad_norm": 0.04888833686709404, + "learning_rate": 9.695065807290257e-06, + "loss": 0.0896, + "step": 3580 + }, + { + "epoch": 0.01944429399339219, + "grad_norm": 16.012990951538086, + "learning_rate": 9.722146996696095e-06, + "loss": 0.16, + "step": 3590 + }, + { + "epoch": 0.019498456372203866, + "grad_norm": 0.9709793329238892, + "learning_rate": 9.749228186101934e-06, + "loss": 0.0153, + "step": 3600 + }, + { + "epoch": 0.019552618751015546, + "grad_norm": 0.024725614115595818, + "learning_rate": 9.776309375507772e-06, + "loss": 0.2117, + "step": 3610 + }, + { + "epoch": 0.019606781129827222, + "grad_norm": 17.21625518798828, + "learning_rate": 9.80339056491361e-06, + "loss": 0.3039, + "step": 3620 + }, + { + "epoch": 0.0196609435086389, + "grad_norm": 2.6144094467163086, + "learning_rate": 9.83047175431945e-06, + "loss": 0.1027, + "step": 3630 + }, + { + "epoch": 0.01971510588745058, + "grad_norm": 1.0703226327896118, + "learning_rate": 9.85755294372529e-06, + "loss": 0.1531, + "step": 3640 + }, + { + "epoch": 0.019769268266262255, + "grad_norm": 12.112059593200684, + "learning_rate": 9.884634133131128e-06, + "loss": 0.2491, + "step": 3650 + }, + { + "epoch": 0.01982343064507393, + "grad_norm": 13.295300483703613, + "learning_rate": 9.911715322536966e-06, + "loss": 0.304, + "step": 3660 + }, + { + "epoch": 0.019877593023885608, + "grad_norm": 10.894659996032715, + "learning_rate": 9.938796511942805e-06, + "loss": 0.1809, + "step": 3670 + }, + { + "epoch": 0.019931755402697288, + "grad_norm": 9.558890342712402, + "learning_rate": 9.965877701348645e-06, + "loss": 0.1819, + "step": 3680 + }, + { + "epoch": 0.019985917781508964, + "grad_norm": 12.2958984375, + "learning_rate": 9.992958890754482e-06, + "loss": 0.1622, + "step": 3690 + }, + { + "epoch": 0.02004008016032064, + "grad_norm": 7.649306297302246, + "learning_rate": 1.0020040080160322e-05, + "loss": 0.2289, + "step": 3700 + }, + { + "epoch": 0.02009424253913232, + "grad_norm": 7.0021562576293945, + "learning_rate": 1.004712126956616e-05, + "loss": 0.0808, + "step": 3710 + }, + { + "epoch": 0.020148404917943997, + "grad_norm": 11.905574798583984, + "learning_rate": 
1.0074202458971999e-05, + "loss": 0.157, + "step": 3720 + }, + { + "epoch": 0.020202567296755673, + "grad_norm": 1.3817291259765625, + "learning_rate": 1.0101283648377837e-05, + "loss": 0.2395, + "step": 3730 + }, + { + "epoch": 0.02025672967556735, + "grad_norm": 10.934839248657227, + "learning_rate": 1.0128364837783675e-05, + "loss": 0.3125, + "step": 3740 + }, + { + "epoch": 0.02031089205437903, + "grad_norm": 1.3985371589660645, + "learning_rate": 1.0155446027189516e-05, + "loss": 0.138, + "step": 3750 + }, + { + "epoch": 0.020365054433190706, + "grad_norm": 16.088497161865234, + "learning_rate": 1.0182527216595352e-05, + "loss": 0.08, + "step": 3760 + }, + { + "epoch": 0.020419216812002382, + "grad_norm": 0.16190293431282043, + "learning_rate": 1.0209608406001192e-05, + "loss": 0.0821, + "step": 3770 + }, + { + "epoch": 0.02047337919081406, + "grad_norm": 7.365142822265625, + "learning_rate": 1.0236689595407031e-05, + "loss": 0.1498, + "step": 3780 + }, + { + "epoch": 0.02052754156962574, + "grad_norm": 2.6703548431396484, + "learning_rate": 1.026377078481287e-05, + "loss": 0.0054, + "step": 3790 + }, + { + "epoch": 0.020581703948437415, + "grad_norm": 4.02484655380249, + "learning_rate": 1.0290851974218708e-05, + "loss": 0.2068, + "step": 3800 + }, + { + "epoch": 0.02063586632724909, + "grad_norm": 2.0805554389953613, + "learning_rate": 1.0317933163624546e-05, + "loss": 0.0575, + "step": 3810 + }, + { + "epoch": 0.02069002870606077, + "grad_norm": 0.08940979838371277, + "learning_rate": 1.0345014353030386e-05, + "loss": 0.122, + "step": 3820 + }, + { + "epoch": 0.020744191084872448, + "grad_norm": 0.03994546830654144, + "learning_rate": 1.0372095542436225e-05, + "loss": 0.141, + "step": 3830 + }, + { + "epoch": 0.020798353463684124, + "grad_norm": 0.15983133018016815, + "learning_rate": 1.0399176731842063e-05, + "loss": 0.2305, + "step": 3840 + }, + { + "epoch": 0.0208525158424958, + "grad_norm": 9.104622840881348, + "learning_rate": 1.0426257921247902e-05, + "loss": 0.1814, + "step": 3850 + }, + { + "epoch": 0.02090667822130748, + "grad_norm": 11.871993064880371, + "learning_rate": 1.045333911065374e-05, + "loss": 0.1936, + "step": 3860 + }, + { + "epoch": 0.020960840600119157, + "grad_norm": 2.99444580078125, + "learning_rate": 1.0480420300059579e-05, + "loss": 0.2136, + "step": 3870 + }, + { + "epoch": 0.021015002978930834, + "grad_norm": 0.13686329126358032, + "learning_rate": 1.0507501489465419e-05, + "loss": 0.1604, + "step": 3880 + }, + { + "epoch": 0.021069165357742514, + "grad_norm": 14.69699764251709, + "learning_rate": 1.0534582678871256e-05, + "loss": 0.2605, + "step": 3890 + }, + { + "epoch": 0.02112332773655419, + "grad_norm": 0.06052977219223976, + "learning_rate": 1.0561663868277096e-05, + "loss": 0.1282, + "step": 3900 + }, + { + "epoch": 0.021177490115365866, + "grad_norm": 0.5213247537612915, + "learning_rate": 1.0588745057682934e-05, + "loss": 0.1146, + "step": 3910 + }, + { + "epoch": 0.021231652494177543, + "grad_norm": 2.0313215255737305, + "learning_rate": 1.0615826247088773e-05, + "loss": 0.1391, + "step": 3920 + }, + { + "epoch": 0.021285814872989223, + "grad_norm": 7.082489490509033, + "learning_rate": 1.0642907436494613e-05, + "loss": 0.4085, + "step": 3930 + }, + { + "epoch": 0.0213399772518009, + "grad_norm": 23.206266403198242, + "learning_rate": 1.066998862590045e-05, + "loss": 0.1238, + "step": 3940 + }, + { + "epoch": 0.021394139630612576, + "grad_norm": 2.0371592044830322, + "learning_rate": 1.069706981530629e-05, + "loss": 0.0908, + "step": 3950 + 
}, + { + "epoch": 0.021448302009424255, + "grad_norm": 2.39154314994812, + "learning_rate": 1.0724151004712126e-05, + "loss": 0.0327, + "step": 3960 + }, + { + "epoch": 0.021502464388235932, + "grad_norm": 21.7739315032959, + "learning_rate": 1.0751232194117966e-05, + "loss": 0.2254, + "step": 3970 + }, + { + "epoch": 0.02155662676704761, + "grad_norm": 3.9335272312164307, + "learning_rate": 1.0778313383523805e-05, + "loss": 0.1415, + "step": 3980 + }, + { + "epoch": 0.021610789145859285, + "grad_norm": 28.758039474487305, + "learning_rate": 1.0805394572929643e-05, + "loss": 0.21, + "step": 3990 + }, + { + "epoch": 0.021664951524670965, + "grad_norm": 11.815359115600586, + "learning_rate": 1.0832475762335482e-05, + "loss": 0.5278, + "step": 4000 + }, + { + "epoch": 0.02171911390348264, + "grad_norm": 0.07528593391180038, + "learning_rate": 1.085955695174132e-05, + "loss": 0.1214, + "step": 4010 + }, + { + "epoch": 0.021773276282294318, + "grad_norm": 6.289927005767822, + "learning_rate": 1.088663814114716e-05, + "loss": 0.3179, + "step": 4020 + }, + { + "epoch": 0.021827438661105997, + "grad_norm": 0.013665788806974888, + "learning_rate": 1.0913719330552999e-05, + "loss": 0.1196, + "step": 4030 + }, + { + "epoch": 0.021881601039917674, + "grad_norm": 28.726531982421875, + "learning_rate": 1.0940800519958837e-05, + "loss": 0.1887, + "step": 4040 + }, + { + "epoch": 0.02193576341872935, + "grad_norm": 13.926639556884766, + "learning_rate": 1.0967881709364676e-05, + "loss": 0.2291, + "step": 4050 + }, + { + "epoch": 0.021989925797541027, + "grad_norm": 4.498834609985352, + "learning_rate": 1.0994962898770514e-05, + "loss": 0.1101, + "step": 4060 + }, + { + "epoch": 0.022044088176352707, + "grad_norm": 3.99916934967041, + "learning_rate": 1.1022044088176353e-05, + "loss": 0.0678, + "step": 4070 + }, + { + "epoch": 0.022098250555164383, + "grad_norm": 16.539644241333008, + "learning_rate": 1.1049125277582193e-05, + "loss": 0.1848, + "step": 4080 + }, + { + "epoch": 0.02215241293397606, + "grad_norm": 0.08804541081190109, + "learning_rate": 1.107620646698803e-05, + "loss": 0.3688, + "step": 4090 + }, + { + "epoch": 0.022206575312787736, + "grad_norm": 18.573204040527344, + "learning_rate": 1.110328765639387e-05, + "loss": 0.3505, + "step": 4100 + }, + { + "epoch": 0.022260737691599416, + "grad_norm": 14.636613845825195, + "learning_rate": 1.1130368845799708e-05, + "loss": 0.1392, + "step": 4110 + }, + { + "epoch": 0.022314900070411092, + "grad_norm": 0.20648688077926636, + "learning_rate": 1.1157450035205547e-05, + "loss": 0.1662, + "step": 4120 + }, + { + "epoch": 0.02236906244922277, + "grad_norm": 0.7622145414352417, + "learning_rate": 1.1184531224611387e-05, + "loss": 0.1352, + "step": 4130 + }, + { + "epoch": 0.02242322482803445, + "grad_norm": 0.35124677419662476, + "learning_rate": 1.1211612414017223e-05, + "loss": 0.1916, + "step": 4140 + }, + { + "epoch": 0.022477387206846125, + "grad_norm": 10.01506233215332, + "learning_rate": 1.1238693603423064e-05, + "loss": 0.0895, + "step": 4150 + }, + { + "epoch": 0.0225315495856578, + "grad_norm": 2.7913334369659424, + "learning_rate": 1.12657747928289e-05, + "loss": 0.1182, + "step": 4160 + }, + { + "epoch": 0.022585711964469478, + "grad_norm": 9.296822547912598, + "learning_rate": 1.129285598223474e-05, + "loss": 0.0815, + "step": 4170 + }, + { + "epoch": 0.022639874343281158, + "grad_norm": 0.27187666296958923, + "learning_rate": 1.1319937171640579e-05, + "loss": 0.1747, + "step": 4180 + }, + { + "epoch": 0.022694036722092834, + "grad_norm": 
0.00788525678217411, + "learning_rate": 1.1347018361046417e-05, + "loss": 0.3289, + "step": 4190 + }, + { + "epoch": 0.02274819910090451, + "grad_norm": 0.21577899158000946, + "learning_rate": 1.1374099550452256e-05, + "loss": 0.285, + "step": 4200 + }, + { + "epoch": 0.02280236147971619, + "grad_norm": 1.6981812715530396, + "learning_rate": 1.1401180739858094e-05, + "loss": 0.0498, + "step": 4210 + }, + { + "epoch": 0.022856523858527867, + "grad_norm": 11.118830680847168, + "learning_rate": 1.1428261929263934e-05, + "loss": 0.2278, + "step": 4220 + }, + { + "epoch": 0.022910686237339543, + "grad_norm": 20.275279998779297, + "learning_rate": 1.1455343118669773e-05, + "loss": 0.0986, + "step": 4230 + }, + { + "epoch": 0.02296484861615122, + "grad_norm": 0.7346848845481873, + "learning_rate": 1.1482424308075611e-05, + "loss": 0.1386, + "step": 4240 + }, + { + "epoch": 0.0230190109949629, + "grad_norm": 0.3359120488166809, + "learning_rate": 1.150950549748145e-05, + "loss": 0.1821, + "step": 4250 + }, + { + "epoch": 0.023073173373774576, + "grad_norm": 0.49664902687072754, + "learning_rate": 1.153658668688729e-05, + "loss": 0.3696, + "step": 4260 + }, + { + "epoch": 0.023127335752586253, + "grad_norm": 9.279929161071777, + "learning_rate": 1.1563667876293127e-05, + "loss": 0.163, + "step": 4270 + }, + { + "epoch": 0.023181498131397932, + "grad_norm": 7.005964279174805, + "learning_rate": 1.1590749065698967e-05, + "loss": 0.2215, + "step": 4280 + }, + { + "epoch": 0.02323566051020961, + "grad_norm": 10.28485107421875, + "learning_rate": 1.1617830255104804e-05, + "loss": 0.1645, + "step": 4290 + }, + { + "epoch": 0.023289822889021285, + "grad_norm": 11.885159492492676, + "learning_rate": 1.1644911444510644e-05, + "loss": 0.1746, + "step": 4300 + }, + { + "epoch": 0.023343985267832962, + "grad_norm": 0.6822012066841125, + "learning_rate": 1.1671992633916482e-05, + "loss": 0.1036, + "step": 4310 + }, + { + "epoch": 0.02339814764664464, + "grad_norm": 19.65869903564453, + "learning_rate": 1.169907382332232e-05, + "loss": 0.1455, + "step": 4320 + }, + { + "epoch": 0.023452310025456318, + "grad_norm": 13.028449058532715, + "learning_rate": 1.172615501272816e-05, + "loss": 0.181, + "step": 4330 + }, + { + "epoch": 0.023506472404267995, + "grad_norm": 20.51473045349121, + "learning_rate": 1.1753236202133997e-05, + "loss": 0.2015, + "step": 4340 + }, + { + "epoch": 0.023560634783079674, + "grad_norm": 1.0791538953781128, + "learning_rate": 1.1780317391539838e-05, + "loss": 0.1887, + "step": 4350 + }, + { + "epoch": 0.02361479716189135, + "grad_norm": 2.31613826751709, + "learning_rate": 1.1807398580945676e-05, + "loss": 0.1759, + "step": 4360 + }, + { + "epoch": 0.023668959540703027, + "grad_norm": 0.34528276324272156, + "learning_rate": 1.1834479770351514e-05, + "loss": 0.1166, + "step": 4370 + }, + { + "epoch": 0.023723121919514704, + "grad_norm": 12.702816009521484, + "learning_rate": 1.1861560959757353e-05, + "loss": 0.2566, + "step": 4380 + }, + { + "epoch": 0.023777284298326384, + "grad_norm": 17.36395835876465, + "learning_rate": 1.1888642149163191e-05, + "loss": 0.1975, + "step": 4390 + }, + { + "epoch": 0.02383144667713806, + "grad_norm": 5.461242198944092, + "learning_rate": 1.191572333856903e-05, + "loss": 0.2271, + "step": 4400 + }, + { + "epoch": 0.023885609055949737, + "grad_norm": 10.677934646606445, + "learning_rate": 1.194280452797487e-05, + "loss": 0.2211, + "step": 4410 + }, + { + "epoch": 0.023939771434761416, + "grad_norm": 14.284161567687988, + "learning_rate": 
1.1969885717380708e-05, + "loss": 0.3792, + "step": 4420 + }, + { + "epoch": 0.023993933813573093, + "grad_norm": 5.5361528396606445, + "learning_rate": 1.1996966906786547e-05, + "loss": 0.1973, + "step": 4430 + }, + { + "epoch": 0.02404809619238477, + "grad_norm": 6.730859756469727, + "learning_rate": 1.2024048096192385e-05, + "loss": 0.2131, + "step": 4440 + }, + { + "epoch": 0.024102258571196446, + "grad_norm": 3.104515314102173, + "learning_rate": 1.2051129285598224e-05, + "loss": 0.1302, + "step": 4450 + }, + { + "epoch": 0.024156420950008126, + "grad_norm": 0.23196585476398468, + "learning_rate": 1.2078210475004064e-05, + "loss": 0.1061, + "step": 4460 + }, + { + "epoch": 0.024210583328819802, + "grad_norm": 12.78976058959961, + "learning_rate": 1.21052916644099e-05, + "loss": 0.0613, + "step": 4470 + }, + { + "epoch": 0.02426474570763148, + "grad_norm": 0.46999871730804443, + "learning_rate": 1.213237285381574e-05, + "loss": 0.0904, + "step": 4480 + }, + { + "epoch": 0.024318908086443155, + "grad_norm": 8.691812515258789, + "learning_rate": 1.215945404322158e-05, + "loss": 0.2883, + "step": 4490 + }, + { + "epoch": 0.024373070465254835, + "grad_norm": 0.6712044477462769, + "learning_rate": 1.2186535232627418e-05, + "loss": 0.3228, + "step": 4500 + }, + { + "epoch": 0.02442723284406651, + "grad_norm": 0.16956719756126404, + "learning_rate": 1.2213616422033256e-05, + "loss": 0.0837, + "step": 4510 + }, + { + "epoch": 0.024481395222878188, + "grad_norm": 3.9974451065063477, + "learning_rate": 1.2240697611439095e-05, + "loss": 0.1316, + "step": 4520 + }, + { + "epoch": 0.024535557601689868, + "grad_norm": 0.14901350438594818, + "learning_rate": 1.2267778800844935e-05, + "loss": 0.0982, + "step": 4530 + }, + { + "epoch": 0.024589719980501544, + "grad_norm": 0.03604470565915108, + "learning_rate": 1.2294859990250771e-05, + "loss": 0.0491, + "step": 4540 + }, + { + "epoch": 0.02464388235931322, + "grad_norm": 4.222771167755127, + "learning_rate": 1.2321941179656612e-05, + "loss": 0.2153, + "step": 4550 + }, + { + "epoch": 0.024698044738124897, + "grad_norm": 0.20324325561523438, + "learning_rate": 1.234902236906245e-05, + "loss": 0.151, + "step": 4560 + }, + { + "epoch": 0.024752207116936577, + "grad_norm": 1.0344115495681763, + "learning_rate": 1.2376103558468288e-05, + "loss": 0.0225, + "step": 4570 + }, + { + "epoch": 0.024806369495748253, + "grad_norm": 4.032346248626709, + "learning_rate": 1.2403184747874127e-05, + "loss": 0.0836, + "step": 4580 + }, + { + "epoch": 0.02486053187455993, + "grad_norm": 0.4779151976108551, + "learning_rate": 1.2430265937279965e-05, + "loss": 0.1778, + "step": 4590 + }, + { + "epoch": 0.02491469425337161, + "grad_norm": 6.882388591766357, + "learning_rate": 1.2457347126685806e-05, + "loss": 0.2568, + "step": 4600 + }, + { + "epoch": 0.024968856632183286, + "grad_norm": 8.82935905456543, + "learning_rate": 1.2484428316091644e-05, + "loss": 0.0556, + "step": 4610 + }, + { + "epoch": 0.025001354059470293, + "eval_accuracy": 0.771717831482691, + "eval_loss": 0.5975676774978638, + "eval_runtime": 115.8538, + "eval_samples_per_second": 26.43, + "eval_steps_per_second": 3.306, + "step": 4616 + }, + { + "epoch": 1.0000216649515248, + "grad_norm": 11.221735954284668, + "learning_rate": 1.2511509505497482e-05, + "loss": 0.117, + "step": 4620 + }, + { + "epoch": 1.0000758273303363, + "grad_norm": 2.145427942276001, + "learning_rate": 1.2538590694903321e-05, + "loss": 0.1498, + "step": 4630 + }, + { + "epoch": 1.000129989709148, + "grad_norm": 4.7485270500183105, + 
"learning_rate": 1.2565671884309161e-05, + "loss": 0.1205, + "step": 4640 + }, + { + "epoch": 1.0001841520879597, + "grad_norm": 5.265538215637207, + "learning_rate": 1.2592753073714996e-05, + "loss": 0.1498, + "step": 4650 + }, + { + "epoch": 1.0002383144667715, + "grad_norm": 13.468058586120605, + "learning_rate": 1.2619834263120836e-05, + "loss": 0.147, + "step": 4660 + }, + { + "epoch": 1.000292476845583, + "grad_norm": 8.46922492980957, + "learning_rate": 1.2646915452526675e-05, + "loss": 0.2363, + "step": 4670 + }, + { + "epoch": 1.0003466392243947, + "grad_norm": 1.401848316192627, + "learning_rate": 1.2673996641932515e-05, + "loss": 0.1741, + "step": 4680 + }, + { + "epoch": 1.0004008016032064, + "grad_norm": 0.04455507546663284, + "learning_rate": 1.2701077831338353e-05, + "loss": 0.0657, + "step": 4690 + }, + { + "epoch": 1.0004549639820182, + "grad_norm": 18.212881088256836, + "learning_rate": 1.272815902074419e-05, + "loss": 0.4548, + "step": 4700 + }, + { + "epoch": 1.0005091263608297, + "grad_norm": 0.23012007772922516, + "learning_rate": 1.275524021015003e-05, + "loss": 0.0741, + "step": 4710 + }, + { + "epoch": 1.0005632887396414, + "grad_norm": 0.5398334264755249, + "learning_rate": 1.2782321399555869e-05, + "loss": 0.0215, + "step": 4720 + }, + { + "epoch": 1.0006174511184531, + "grad_norm": 14.357131004333496, + "learning_rate": 1.2809402588961709e-05, + "loss": 0.0624, + "step": 4730 + }, + { + "epoch": 1.0006716134972649, + "grad_norm": 6.364073276519775, + "learning_rate": 1.2836483778367547e-05, + "loss": 0.0585, + "step": 4740 + }, + { + "epoch": 1.0007257758760766, + "grad_norm": 14.589290618896484, + "learning_rate": 1.2863564967773387e-05, + "loss": 0.1438, + "step": 4750 + }, + { + "epoch": 1.000779938254888, + "grad_norm": 0.007784543093293905, + "learning_rate": 1.2890646157179222e-05, + "loss": 0.1042, + "step": 4760 + }, + { + "epoch": 1.0008341006336998, + "grad_norm": 2.5892255306243896, + "learning_rate": 1.2917727346585062e-05, + "loss": 0.0982, + "step": 4770 + }, + { + "epoch": 1.0008882630125115, + "grad_norm": 0.00336353643797338, + "learning_rate": 1.2944808535990901e-05, + "loss": 0.1332, + "step": 4780 + }, + { + "epoch": 1.0009424253913233, + "grad_norm": 1.3481225967407227, + "learning_rate": 1.2971889725396741e-05, + "loss": 0.1473, + "step": 4790 + }, + { + "epoch": 1.0009965877701348, + "grad_norm": 0.15100885927677155, + "learning_rate": 1.299897091480258e-05, + "loss": 0.0417, + "step": 4800 + }, + { + "epoch": 1.0010507501489465, + "grad_norm": 0.06472824513912201, + "learning_rate": 1.3026052104208416e-05, + "loss": 0.0121, + "step": 4810 + }, + { + "epoch": 1.0011049125277582, + "grad_norm": 13.793485641479492, + "learning_rate": 1.3053133293614256e-05, + "loss": 0.2383, + "step": 4820 + }, + { + "epoch": 1.00115907490657, + "grad_norm": 9.669205665588379, + "learning_rate": 1.3080214483020095e-05, + "loss": 0.3432, + "step": 4830 + }, + { + "epoch": 1.0012132372853815, + "grad_norm": 8.995847702026367, + "learning_rate": 1.3107295672425935e-05, + "loss": 0.1652, + "step": 4840 + }, + { + "epoch": 1.0012673996641932, + "grad_norm": 0.567703127861023, + "learning_rate": 1.3134376861831773e-05, + "loss": 0.0623, + "step": 4850 + }, + { + "epoch": 1.001321562043005, + "grad_norm": 0.017258131876587868, + "learning_rate": 1.316145805123761e-05, + "loss": 0.1949, + "step": 4860 + }, + { + "epoch": 1.0013757244218167, + "grad_norm": 0.15183156728744507, + "learning_rate": 1.3188539240643449e-05, + "loss": 0.1148, + "step": 4870 + }, + { + 
"epoch": 1.0014298868006284, + "grad_norm": 0.019001543521881104, + "learning_rate": 1.3215620430049289e-05, + "loss": 0.0864, + "step": 4880 + }, + { + "epoch": 1.00148404917944, + "grad_norm": 11.483125686645508, + "learning_rate": 1.3242701619455127e-05, + "loss": 0.2469, + "step": 4890 + }, + { + "epoch": 1.0015382115582516, + "grad_norm": 0.8063971996307373, + "learning_rate": 1.3269782808860967e-05, + "loss": 0.2656, + "step": 4900 + }, + { + "epoch": 1.0015923739370634, + "grad_norm": 18.782106399536133, + "learning_rate": 1.3296863998266804e-05, + "loss": 0.0798, + "step": 4910 + }, + { + "epoch": 1.001646536315875, + "grad_norm": 13.890620231628418, + "learning_rate": 1.3323945187672643e-05, + "loss": 0.1394, + "step": 4920 + }, + { + "epoch": 1.0017006986946866, + "grad_norm": 1.7208741903305054, + "learning_rate": 1.3351026377078483e-05, + "loss": 0.131, + "step": 4930 + }, + { + "epoch": 1.0017548610734983, + "grad_norm": 2.471468925476074, + "learning_rate": 1.3378107566484321e-05, + "loss": 0.2093, + "step": 4940 + }, + { + "epoch": 1.00180902345231, + "grad_norm": 0.06446637213230133, + "learning_rate": 1.3405188755890161e-05, + "loss": 0.0747, + "step": 4950 + }, + { + "epoch": 1.0018631858311218, + "grad_norm": 3.501986026763916, + "learning_rate": 1.3432269945295996e-05, + "loss": 0.1191, + "step": 4960 + }, + { + "epoch": 1.0019173482099333, + "grad_norm": 0.38957253098487854, + "learning_rate": 1.3459351134701836e-05, + "loss": 0.0978, + "step": 4970 + }, + { + "epoch": 1.001971510588745, + "grad_norm": 0.31398651003837585, + "learning_rate": 1.3486432324107675e-05, + "loss": 0.1465, + "step": 4980 + }, + { + "epoch": 1.0020256729675567, + "grad_norm": 0.00766389723867178, + "learning_rate": 1.3513513513513515e-05, + "loss": 0.1788, + "step": 4990 + }, + { + "epoch": 1.0020798353463685, + "grad_norm": 10.83895492553711, + "learning_rate": 1.3540594702919354e-05, + "loss": 0.1077, + "step": 5000 + }, + { + "epoch": 1.0021339977251802, + "grad_norm": 0.33199188113212585, + "learning_rate": 1.356767589232519e-05, + "loss": 0.1344, + "step": 5010 + }, + { + "epoch": 1.0021881601039917, + "grad_norm": 14.667470932006836, + "learning_rate": 1.359475708173103e-05, + "loss": 0.1808, + "step": 5020 + }, + { + "epoch": 1.0022423224828034, + "grad_norm": 0.023105673491954803, + "learning_rate": 1.3621838271136869e-05, + "loss": 0.178, + "step": 5030 + }, + { + "epoch": 1.0022964848616152, + "grad_norm": 0.07697730511426926, + "learning_rate": 1.3648919460542709e-05, + "loss": 0.0641, + "step": 5040 + }, + { + "epoch": 1.0023506472404269, + "grad_norm": 0.01796918362379074, + "learning_rate": 1.3676000649948547e-05, + "loss": 0.088, + "step": 5050 + }, + { + "epoch": 1.0024048096192384, + "grad_norm": 3.338207244873047, + "learning_rate": 1.3703081839354384e-05, + "loss": 0.1495, + "step": 5060 + }, + { + "epoch": 1.0024589719980501, + "grad_norm": 0.5778915882110596, + "learning_rate": 1.3730163028760223e-05, + "loss": 0.1651, + "step": 5070 + }, + { + "epoch": 1.0025131343768618, + "grad_norm": 6.403656959533691, + "learning_rate": 1.3757244218166063e-05, + "loss": 0.2021, + "step": 5080 + }, + { + "epoch": 1.0025672967556736, + "grad_norm": 0.18753719329833984, + "learning_rate": 1.3784325407571901e-05, + "loss": 0.1137, + "step": 5090 + }, + { + "epoch": 1.002621459134485, + "grad_norm": 0.21636955440044403, + "learning_rate": 1.3811406596977741e-05, + "loss": 0.1887, + "step": 5100 + }, + { + "epoch": 1.0026756215132968, + "grad_norm": 5.41344690322876, + "learning_rate": 
1.3838487786383578e-05, + "loss": 0.1605, + "step": 5110 + }, + { + "epoch": 1.0027297838921085, + "grad_norm": 27.693023681640625, + "learning_rate": 1.3865568975789417e-05, + "loss": 0.1915, + "step": 5120 + }, + { + "epoch": 1.0027839462709203, + "grad_norm": 0.07914004474878311, + "learning_rate": 1.3892650165195257e-05, + "loss": 0.0815, + "step": 5130 + }, + { + "epoch": 1.002838108649732, + "grad_norm": 21.18498420715332, + "learning_rate": 1.3919731354601095e-05, + "loss": 0.0786, + "step": 5140 + }, + { + "epoch": 1.0028922710285435, + "grad_norm": 0.675323486328125, + "learning_rate": 1.3946812544006935e-05, + "loss": 0.129, + "step": 5150 + }, + { + "epoch": 1.0029464334073552, + "grad_norm": 0.25883620977401733, + "learning_rate": 1.397389373341277e-05, + "loss": 0.1807, + "step": 5160 + }, + { + "epoch": 1.003000595786167, + "grad_norm": 2.4212636947631836, + "learning_rate": 1.400097492281861e-05, + "loss": 0.1047, + "step": 5170 + }, + { + "epoch": 1.0030547581649787, + "grad_norm": 0.05566485598683357, + "learning_rate": 1.4028056112224449e-05, + "loss": 0.0811, + "step": 5180 + }, + { + "epoch": 1.0031089205437902, + "grad_norm": 0.036301493644714355, + "learning_rate": 1.4055137301630289e-05, + "loss": 0.0976, + "step": 5190 + }, + { + "epoch": 1.003163082922602, + "grad_norm": 5.5409016609191895, + "learning_rate": 1.4082218491036128e-05, + "loss": 0.2103, + "step": 5200 + }, + { + "epoch": 1.0032172453014137, + "grad_norm": 0.024279391393065453, + "learning_rate": 1.4109299680441964e-05, + "loss": 0.1749, + "step": 5210 + }, + { + "epoch": 1.0032714076802254, + "grad_norm": 0.026982193812727928, + "learning_rate": 1.4136380869847804e-05, + "loss": 0.0942, + "step": 5220 + }, + { + "epoch": 1.0033255700590369, + "grad_norm": 15.476045608520508, + "learning_rate": 1.4163462059253643e-05, + "loss": 0.2969, + "step": 5230 + }, + { + "epoch": 1.0033797324378486, + "grad_norm": 2.254821300506592, + "learning_rate": 1.4190543248659483e-05, + "loss": 0.0864, + "step": 5240 + }, + { + "epoch": 1.0034338948166603, + "grad_norm": 0.25831055641174316, + "learning_rate": 1.4217624438065321e-05, + "loss": 0.0407, + "step": 5250 + }, + { + "epoch": 1.003488057195472, + "grad_norm": 8.5800199508667, + "learning_rate": 1.4244705627471158e-05, + "loss": 0.3385, + "step": 5260 + }, + { + "epoch": 1.0035422195742838, + "grad_norm": 0.18973223865032196, + "learning_rate": 1.4271786816876997e-05, + "loss": 0.2013, + "step": 5270 + }, + { + "epoch": 1.0035963819530953, + "grad_norm": 3.566890239715576, + "learning_rate": 1.4298868006282837e-05, + "loss": 0.1313, + "step": 5280 + }, + { + "epoch": 1.003650544331907, + "grad_norm": 0.006112874485552311, + "learning_rate": 1.4325949195688675e-05, + "loss": 0.1564, + "step": 5290 + }, + { + "epoch": 1.0037047067107188, + "grad_norm": 0.04018423333764076, + "learning_rate": 1.4353030385094515e-05, + "loss": 0.5115, + "step": 5300 + }, + { + "epoch": 1.0037588690895305, + "grad_norm": 9.201103210449219, + "learning_rate": 1.4380111574500352e-05, + "loss": 0.2361, + "step": 5310 + }, + { + "epoch": 1.003813031468342, + "grad_norm": 3.9105515480041504, + "learning_rate": 1.440719276390619e-05, + "loss": 0.1299, + "step": 5320 + }, + { + "epoch": 1.0038671938471537, + "grad_norm": 0.04575413465499878, + "learning_rate": 1.443427395331203e-05, + "loss": 0.1806, + "step": 5330 + }, + { + "epoch": 1.0039213562259655, + "grad_norm": 0.38823843002319336, + "learning_rate": 1.446135514271787e-05, + "loss": 0.1279, + "step": 5340 + }, + { + "epoch": 
1.0039755186047772, + "grad_norm": 0.06289808452129364, + "learning_rate": 1.448843633212371e-05, + "loss": 0.2172, + "step": 5350 + }, + { + "epoch": 1.0040296809835887, + "grad_norm": 8.095627784729004, + "learning_rate": 1.4515517521529546e-05, + "loss": 0.0602, + "step": 5360 + }, + { + "epoch": 1.0040838433624004, + "grad_norm": 1.2055705785751343, + "learning_rate": 1.4542598710935385e-05, + "loss": 0.0932, + "step": 5370 + }, + { + "epoch": 1.0041380057412121, + "grad_norm": 0.03328526392579079, + "learning_rate": 1.4569679900341223e-05, + "loss": 0.0946, + "step": 5380 + }, + { + "epoch": 1.0041921681200239, + "grad_norm": 0.0172802172601223, + "learning_rate": 1.4596761089747063e-05, + "loss": 0.148, + "step": 5390 + }, + { + "epoch": 1.0042463304988356, + "grad_norm": 0.6869587302207947, + "learning_rate": 1.4623842279152902e-05, + "loss": 0.1024, + "step": 5400 + }, + { + "epoch": 1.004300492877647, + "grad_norm": 0.027849959209561348, + "learning_rate": 1.4650923468558738e-05, + "loss": 0.2324, + "step": 5410 + }, + { + "epoch": 1.0043546552564588, + "grad_norm": 0.274179607629776, + "learning_rate": 1.4678004657964578e-05, + "loss": 0.1531, + "step": 5420 + }, + { + "epoch": 1.0044088176352706, + "grad_norm": 8.129202842712402, + "learning_rate": 1.4705085847370417e-05, + "loss": 0.2129, + "step": 5430 + }, + { + "epoch": 1.0044629800140823, + "grad_norm": 10.423924446105957, + "learning_rate": 1.4732167036776257e-05, + "loss": 0.1039, + "step": 5440 + }, + { + "epoch": 1.0045171423928938, + "grad_norm": 9.197463989257812, + "learning_rate": 1.4759248226182095e-05, + "loss": 0.1666, + "step": 5450 + }, + { + "epoch": 1.0045713047717055, + "grad_norm": 0.4824374318122864, + "learning_rate": 1.4786329415587932e-05, + "loss": 0.2749, + "step": 5460 + }, + { + "epoch": 1.0046254671505173, + "grad_norm": 6.999705791473389, + "learning_rate": 1.4813410604993772e-05, + "loss": 0.1256, + "step": 5470 + }, + { + "epoch": 1.004679629529329, + "grad_norm": 2.835362434387207, + "learning_rate": 1.484049179439961e-05, + "loss": 0.0721, + "step": 5480 + }, + { + "epoch": 1.0047337919081407, + "grad_norm": 0.30005159974098206, + "learning_rate": 1.486757298380545e-05, + "loss": 0.1556, + "step": 5490 + }, + { + "epoch": 1.0047879542869522, + "grad_norm": 10.57960033416748, + "learning_rate": 1.489465417321129e-05, + "loss": 0.1126, + "step": 5500 + }, + { + "epoch": 1.004842116665764, + "grad_norm": 7.267529010772705, + "learning_rate": 1.4921735362617126e-05, + "loss": 0.0511, + "step": 5510 + }, + { + "epoch": 1.0048962790445757, + "grad_norm": 0.21566058695316315, + "learning_rate": 1.4948816552022965e-05, + "loss": 0.2056, + "step": 5520 + }, + { + "epoch": 1.0049504414233874, + "grad_norm": 0.019111791625618935, + "learning_rate": 1.4975897741428805e-05, + "loss": 0.0289, + "step": 5530 + }, + { + "epoch": 1.005004603802199, + "grad_norm": 0.15545818209648132, + "learning_rate": 1.5002978930834643e-05, + "loss": 0.0706, + "step": 5540 + }, + { + "epoch": 1.0050587661810106, + "grad_norm": 0.027943026274442673, + "learning_rate": 1.5030060120240483e-05, + "loss": 0.0326, + "step": 5550 + }, + { + "epoch": 1.0051129285598224, + "grad_norm": 0.07392769306898117, + "learning_rate": 1.505714130964632e-05, + "loss": 0.2006, + "step": 5560 + }, + { + "epoch": 1.005167090938634, + "grad_norm": 0.006998785771429539, + "learning_rate": 1.5084222499052159e-05, + "loss": 0.023, + "step": 5570 + }, + { + "epoch": 1.0052212533174456, + "grad_norm": 0.056796181946992874, + "learning_rate": 
1.5111303688457999e-05, + "loss": 0.1421, + "step": 5580 + }, + { + "epoch": 1.0052754156962573, + "grad_norm": 11.64111042022705, + "learning_rate": 1.5138384877863837e-05, + "loss": 0.0878, + "step": 5590 + }, + { + "epoch": 1.005329578075069, + "grad_norm": 0.06926272809505463, + "learning_rate": 1.5165466067269676e-05, + "loss": 0.0455, + "step": 5600 + }, + { + "epoch": 1.0053837404538808, + "grad_norm": 0.06168804317712784, + "learning_rate": 1.5192547256675512e-05, + "loss": 0.198, + "step": 5610 + }, + { + "epoch": 1.0054379028326925, + "grad_norm": 1.4107177257537842, + "learning_rate": 1.5219628446081352e-05, + "loss": 0.1432, + "step": 5620 + }, + { + "epoch": 1.005492065211504, + "grad_norm": 1.07001531124115, + "learning_rate": 1.5246709635487191e-05, + "loss": 0.1253, + "step": 5630 + }, + { + "epoch": 1.0055462275903158, + "grad_norm": 0.2279176414012909, + "learning_rate": 1.527379082489303e-05, + "loss": 0.2208, + "step": 5640 + }, + { + "epoch": 1.0056003899691275, + "grad_norm": 24.20154571533203, + "learning_rate": 1.530087201429887e-05, + "loss": 0.3131, + "step": 5650 + }, + { + "epoch": 1.0056545523479392, + "grad_norm": 0.18962661921977997, + "learning_rate": 1.5327953203704706e-05, + "loss": 0.1051, + "step": 5660 + }, + { + "epoch": 1.0057087147267507, + "grad_norm": 1.4455677270889282, + "learning_rate": 1.5355034393110546e-05, + "loss": 0.1195, + "step": 5670 + }, + { + "epoch": 1.0057628771055624, + "grad_norm": 0.21034394204616547, + "learning_rate": 1.5382115582516386e-05, + "loss": 0.1809, + "step": 5680 + }, + { + "epoch": 1.0058170394843742, + "grad_norm": 7.6428093910217285, + "learning_rate": 1.5409196771922223e-05, + "loss": 0.1877, + "step": 5690 + }, + { + "epoch": 1.005871201863186, + "grad_norm": 15.961008071899414, + "learning_rate": 1.5436277961328063e-05, + "loss": 0.1935, + "step": 5700 + }, + { + "epoch": 1.0059253642419974, + "grad_norm": 0.04052744433283806, + "learning_rate": 1.54633591507339e-05, + "loss": 0.2125, + "step": 5710 + }, + { + "epoch": 1.0059795266208091, + "grad_norm": 0.8605957627296448, + "learning_rate": 1.549044034013974e-05, + "loss": 0.1004, + "step": 5720 + }, + { + "epoch": 1.0060336889996209, + "grad_norm": 0.29141169786453247, + "learning_rate": 1.5517521529545577e-05, + "loss": 0.0862, + "step": 5730 + }, + { + "epoch": 1.0060878513784326, + "grad_norm": 18.26161766052246, + "learning_rate": 1.5544602718951417e-05, + "loss": 0.1503, + "step": 5740 + }, + { + "epoch": 1.0061420137572443, + "grad_norm": 0.11275200545787811, + "learning_rate": 1.5571683908357257e-05, + "loss": 0.1391, + "step": 5750 + }, + { + "epoch": 1.0061961761360558, + "grad_norm": 0.9635252356529236, + "learning_rate": 1.5598765097763094e-05, + "loss": 0.0486, + "step": 5760 + }, + { + "epoch": 1.0062503385148676, + "grad_norm": 0.15050961077213287, + "learning_rate": 1.5625846287168934e-05, + "loss": 0.2905, + "step": 5770 + }, + { + "epoch": 1.0063045008936793, + "grad_norm": 0.08780816197395325, + "learning_rate": 1.565292747657477e-05, + "loss": 0.0179, + "step": 5780 + }, + { + "epoch": 1.006358663272491, + "grad_norm": 7.881669044494629, + "learning_rate": 1.568000866598061e-05, + "loss": 0.0973, + "step": 5790 + }, + { + "epoch": 1.0064128256513025, + "grad_norm": 0.2199449986219406, + "learning_rate": 1.570708985538645e-05, + "loss": 0.1473, + "step": 5800 + }, + { + "epoch": 1.0064669880301143, + "grad_norm": 7.633479118347168, + "learning_rate": 1.5734171044792288e-05, + "loss": 0.203, + "step": 5810 + }, + { + "epoch": 
1.006521150408926, + "grad_norm": 26.020856857299805, + "learning_rate": 1.5761252234198125e-05, + "loss": 0.1699, + "step": 5820 + }, + { + "epoch": 1.0065753127877377, + "grad_norm": 11.52465534210205, + "learning_rate": 1.5788333423603965e-05, + "loss": 0.1913, + "step": 5830 + }, + { + "epoch": 1.0066294751665492, + "grad_norm": 0.22552692890167236, + "learning_rate": 1.5815414613009805e-05, + "loss": 0.1158, + "step": 5840 + }, + { + "epoch": 1.006683637545361, + "grad_norm": 0.1789989471435547, + "learning_rate": 1.5842495802415645e-05, + "loss": 0.0614, + "step": 5850 + }, + { + "epoch": 1.0067377999241727, + "grad_norm": 13.492840766906738, + "learning_rate": 1.5869576991821482e-05, + "loss": 0.1796, + "step": 5860 + }, + { + "epoch": 1.0067919623029844, + "grad_norm": 0.03283512592315674, + "learning_rate": 1.589665818122732e-05, + "loss": 0.2383, + "step": 5870 + }, + { + "epoch": 1.0068461246817961, + "grad_norm": 7.165161609649658, + "learning_rate": 1.592373937063316e-05, + "loss": 0.3537, + "step": 5880 + }, + { + "epoch": 1.0069002870606076, + "grad_norm": 0.3631797432899475, + "learning_rate": 1.5950820560039e-05, + "loss": 0.2072, + "step": 5890 + }, + { + "epoch": 1.0069544494394194, + "grad_norm": 6.52224063873291, + "learning_rate": 1.597790174944484e-05, + "loss": 0.1192, + "step": 5900 + }, + { + "epoch": 1.007008611818231, + "grad_norm": 0.1821892112493515, + "learning_rate": 1.6004982938850672e-05, + "loss": 0.107, + "step": 5910 + }, + { + "epoch": 1.0070627741970428, + "grad_norm": 6.126409530639648, + "learning_rate": 1.6032064128256513e-05, + "loss": 0.2029, + "step": 5920 + }, + { + "epoch": 1.0071169365758543, + "grad_norm": 0.1807929277420044, + "learning_rate": 1.6059145317662353e-05, + "loss": 0.1831, + "step": 5930 + }, + { + "epoch": 1.007171098954666, + "grad_norm": 2.4333465099334717, + "learning_rate": 1.6086226507068193e-05, + "loss": 0.0583, + "step": 5940 + }, + { + "epoch": 1.0072252613334778, + "grad_norm": 0.24373111128807068, + "learning_rate": 1.611330769647403e-05, + "loss": 0.0288, + "step": 5950 + }, + { + "epoch": 1.0072794237122895, + "grad_norm": 5.126194477081299, + "learning_rate": 1.6140388885879866e-05, + "loss": 0.258, + "step": 5960 + }, + { + "epoch": 1.007333586091101, + "grad_norm": 0.23472626507282257, + "learning_rate": 1.6167470075285707e-05, + "loss": 0.078, + "step": 5970 + }, + { + "epoch": 1.0073877484699127, + "grad_norm": 0.010092008858919144, + "learning_rate": 1.6194551264691547e-05, + "loss": 0.2113, + "step": 5980 + }, + { + "epoch": 1.0074419108487245, + "grad_norm": 0.4639599919319153, + "learning_rate": 1.6221632454097387e-05, + "loss": 0.0834, + "step": 5990 + }, + { + "epoch": 1.0074960732275362, + "grad_norm": 0.04273061081767082, + "learning_rate": 1.6248713643503224e-05, + "loss": 0.1571, + "step": 6000 + }, + { + "epoch": 1.007550235606348, + "grad_norm": 0.03644074872136116, + "learning_rate": 1.627579483290906e-05, + "loss": 0.1267, + "step": 6010 + }, + { + "epoch": 1.0076043979851594, + "grad_norm": 0.29697954654693604, + "learning_rate": 1.63028760223149e-05, + "loss": 0.1746, + "step": 6020 + }, + { + "epoch": 1.0076585603639712, + "grad_norm": 7.900935173034668, + "learning_rate": 1.632995721172074e-05, + "loss": 0.396, + "step": 6030 + }, + { + "epoch": 1.007712722742783, + "grad_norm": 0.3344002664089203, + "learning_rate": 1.6357038401126577e-05, + "loss": 0.0461, + "step": 6040 + }, + { + "epoch": 1.0077668851215946, + "grad_norm": 0.03698104992508888, + "learning_rate": 1.6384119590532417e-05, + 
"loss": 0.1278, + "step": 6050 + }, + { + "epoch": 1.0078210475004061, + "grad_norm": 5.615370273590088, + "learning_rate": 1.6411200779938254e-05, + "loss": 0.0888, + "step": 6060 + }, + { + "epoch": 1.0078752098792179, + "grad_norm": 3.4630277156829834, + "learning_rate": 1.6438281969344094e-05, + "loss": 0.0601, + "step": 6070 + }, + { + "epoch": 1.0079293722580296, + "grad_norm": 0.002622748026624322, + "learning_rate": 1.6465363158749935e-05, + "loss": 0.1391, + "step": 6080 + }, + { + "epoch": 1.0079835346368413, + "grad_norm": 0.23459641635417938, + "learning_rate": 1.649244434815577e-05, + "loss": 0.0656, + "step": 6090 + }, + { + "epoch": 1.0080376970156528, + "grad_norm": 0.24089162051677704, + "learning_rate": 1.651952553756161e-05, + "loss": 0.0534, + "step": 6100 + }, + { + "epoch": 1.0080918593944646, + "grad_norm": 1.3197511434555054, + "learning_rate": 1.6546606726967448e-05, + "loss": 0.0562, + "step": 6110 + }, + { + "epoch": 1.0081460217732763, + "grad_norm": 0.3972378373146057, + "learning_rate": 1.6573687916373288e-05, + "loss": 0.2319, + "step": 6120 + }, + { + "epoch": 1.008200184152088, + "grad_norm": 0.0014920945977792144, + "learning_rate": 1.6600769105779125e-05, + "loss": 0.0256, + "step": 6130 + }, + { + "epoch": 1.0082543465308997, + "grad_norm": 9.764899253845215, + "learning_rate": 1.6627850295184965e-05, + "loss": 0.1133, + "step": 6140 + }, + { + "epoch": 1.0083085089097112, + "grad_norm": 0.0019019015599042177, + "learning_rate": 1.6654931484590805e-05, + "loss": 0.1025, + "step": 6150 + }, + { + "epoch": 1.008362671288523, + "grad_norm": 20.74660873413086, + "learning_rate": 1.6682012673996642e-05, + "loss": 0.0429, + "step": 6160 + }, + { + "epoch": 1.0084168336673347, + "grad_norm": 13.710766792297363, + "learning_rate": 1.6709093863402482e-05, + "loss": 0.1172, + "step": 6170 + }, + { + "epoch": 1.0084709960461464, + "grad_norm": 7.177912712097168, + "learning_rate": 1.673617505280832e-05, + "loss": 0.1053, + "step": 6180 + }, + { + "epoch": 1.008525158424958, + "grad_norm": 0.009729472920298576, + "learning_rate": 1.676325624221416e-05, + "loss": 0.081, + "step": 6190 + }, + { + "epoch": 1.0085793208037697, + "grad_norm": 1.7846341133117676, + "learning_rate": 1.679033743162e-05, + "loss": 0.5051, + "step": 6200 + }, + { + "epoch": 1.0086334831825814, + "grad_norm": 0.5096381902694702, + "learning_rate": 1.6817418621025836e-05, + "loss": 0.0382, + "step": 6210 + }, + { + "epoch": 1.0086876455613931, + "grad_norm": 1.3840848207473755, + "learning_rate": 1.6844499810431673e-05, + "loss": 0.2545, + "step": 6220 + }, + { + "epoch": 1.0087418079402046, + "grad_norm": 7.938632965087891, + "learning_rate": 1.6871580999837513e-05, + "loss": 0.1278, + "step": 6230 + }, + { + "epoch": 1.0087959703190164, + "grad_norm": 0.2374674379825592, + "learning_rate": 1.6898662189243353e-05, + "loss": 0.0881, + "step": 6240 + }, + { + "epoch": 1.008850132697828, + "grad_norm": 0.17536091804504395, + "learning_rate": 1.6925743378649193e-05, + "loss": 0.1825, + "step": 6250 + }, + { + "epoch": 1.0089042950766398, + "grad_norm": 0.016839461401104927, + "learning_rate": 1.695282456805503e-05, + "loss": 0.1274, + "step": 6260 + }, + { + "epoch": 1.0089584574554515, + "grad_norm": 11.03178596496582, + "learning_rate": 1.6979905757460867e-05, + "loss": 0.0759, + "step": 6270 + }, + { + "epoch": 1.009012619834263, + "grad_norm": 0.35099270939826965, + "learning_rate": 1.7006986946866707e-05, + "loss": 0.1349, + "step": 6280 + }, + { + "epoch": 1.0090667822130748, + "grad_norm": 
0.01564708724617958, + "learning_rate": 1.7034068136272547e-05, + "loss": 0.0046, + "step": 6290 + }, + { + "epoch": 1.0091209445918865, + "grad_norm": 0.08374343812465668, + "learning_rate": 1.7061149325678387e-05, + "loss": 0.0576, + "step": 6300 + }, + { + "epoch": 1.0091751069706982, + "grad_norm": 16.92888832092285, + "learning_rate": 1.708823051508422e-05, + "loss": 0.295, + "step": 6310 + }, + { + "epoch": 1.0092292693495097, + "grad_norm": 10.002669334411621, + "learning_rate": 1.711531170449006e-05, + "loss": 0.1686, + "step": 6320 + }, + { + "epoch": 1.0092834317283215, + "grad_norm": 0.8879705667495728, + "learning_rate": 1.71423928938959e-05, + "loss": 0.2505, + "step": 6330 + }, + { + "epoch": 1.0093375941071332, + "grad_norm": 0.5622044801712036, + "learning_rate": 1.716947408330174e-05, + "loss": 0.1138, + "step": 6340 + }, + { + "epoch": 1.009391756485945, + "grad_norm": 0.04202735796570778, + "learning_rate": 1.7196555272707578e-05, + "loss": 0.1883, + "step": 6350 + }, + { + "epoch": 1.0094459188647564, + "grad_norm": 0.05232328549027443, + "learning_rate": 1.7223636462113414e-05, + "loss": 0.0244, + "step": 6360 + }, + { + "epoch": 1.0095000812435682, + "grad_norm": 29.768434524536133, + "learning_rate": 1.7250717651519255e-05, + "loss": 0.0777, + "step": 6370 + }, + { + "epoch": 1.0095542436223799, + "grad_norm": 0.25223052501678467, + "learning_rate": 1.7277798840925095e-05, + "loss": 0.0807, + "step": 6380 + }, + { + "epoch": 1.0096084060011916, + "grad_norm": 0.10149019956588745, + "learning_rate": 1.7304880030330935e-05, + "loss": 0.0189, + "step": 6390 + }, + { + "epoch": 1.0096625683800033, + "grad_norm": 13.238442420959473, + "learning_rate": 1.733196121973677e-05, + "loss": 0.2988, + "step": 6400 + }, + { + "epoch": 1.0097167307588149, + "grad_norm": 0.20283780992031097, + "learning_rate": 1.735904240914261e-05, + "loss": 0.32, + "step": 6410 + }, + { + "epoch": 1.0097708931376266, + "grad_norm": 0.0046094972640275955, + "learning_rate": 1.738612359854845e-05, + "loss": 0.0296, + "step": 6420 + }, + { + "epoch": 1.0098250555164383, + "grad_norm": 2.21079683303833, + "learning_rate": 1.741320478795429e-05, + "loss": 0.1614, + "step": 6430 + }, + { + "epoch": 1.00987921789525, + "grad_norm": 0.01659194566309452, + "learning_rate": 1.7440285977360125e-05, + "loss": 0.0308, + "step": 6440 + }, + { + "epoch": 1.0099333802740615, + "grad_norm": 1.0018967390060425, + "learning_rate": 1.7467367166765965e-05, + "loss": 0.1626, + "step": 6450 + }, + { + "epoch": 1.0099875426528733, + "grad_norm": 18.80198097229004, + "learning_rate": 1.7494448356171802e-05, + "loss": 0.2218, + "step": 6460 + }, + { + "epoch": 1.010041705031685, + "grad_norm": 0.005072493571788073, + "learning_rate": 1.7521529545577642e-05, + "loss": 0.1464, + "step": 6470 + }, + { + "epoch": 1.0100958674104967, + "grad_norm": 0.009674482978880405, + "learning_rate": 1.7548610734983483e-05, + "loss": 0.1269, + "step": 6480 + }, + { + "epoch": 1.0101500297893082, + "grad_norm": 0.18282200396060944, + "learning_rate": 1.757569192438932e-05, + "loss": 0.1504, + "step": 6490 + }, + { + "epoch": 1.01020419216812, + "grad_norm": 0.03937043249607086, + "learning_rate": 1.760277311379516e-05, + "loss": 0.2068, + "step": 6500 + }, + { + "epoch": 1.0102583545469317, + "grad_norm": 0.053609561175107956, + "learning_rate": 1.7629854303200996e-05, + "loss": 0.0912, + "step": 6510 + }, + { + "epoch": 1.0103125169257434, + "grad_norm": 0.14519517123699188, + "learning_rate": 1.7656935492606836e-05, + "loss": 0.0896, + 
"step": 6520 + }, + { + "epoch": 1.0103666793045551, + "grad_norm": 0.452588826417923, + "learning_rate": 1.7684016682012673e-05, + "loss": 0.0891, + "step": 6530 + }, + { + "epoch": 1.0104208416833667, + "grad_norm": 4.576705455780029, + "learning_rate": 1.7711097871418513e-05, + "loss": 0.1449, + "step": 6540 + }, + { + "epoch": 1.0104750040621784, + "grad_norm": 0.2624431848526001, + "learning_rate": 1.7738179060824353e-05, + "loss": 0.1106, + "step": 6550 + }, + { + "epoch": 1.0105291664409901, + "grad_norm": 0.5083410739898682, + "learning_rate": 1.776526025023019e-05, + "loss": 0.148, + "step": 6560 + }, + { + "epoch": 1.0105833288198018, + "grad_norm": 0.5606547594070435, + "learning_rate": 1.779234143963603e-05, + "loss": 0.1122, + "step": 6570 + }, + { + "epoch": 1.0106374911986133, + "grad_norm": 8.397526741027832, + "learning_rate": 1.7819422629041867e-05, + "loss": 0.1233, + "step": 6580 + }, + { + "epoch": 1.010691653577425, + "grad_norm": 3.787290096282959, + "learning_rate": 1.7846503818447707e-05, + "loss": 0.096, + "step": 6590 + }, + { + "epoch": 1.0107458159562368, + "grad_norm": 10.594948768615723, + "learning_rate": 1.7873585007853547e-05, + "loss": 0.1062, + "step": 6600 + }, + { + "epoch": 1.0107999783350485, + "grad_norm": 0.4961230754852295, + "learning_rate": 1.7900666197259387e-05, + "loss": 0.1029, + "step": 6610 + }, + { + "epoch": 1.0108541407138603, + "grad_norm": 0.08628525584936142, + "learning_rate": 1.792774738666522e-05, + "loss": 0.1168, + "step": 6620 + }, + { + "epoch": 1.0109083030926718, + "grad_norm": 0.013310207985341549, + "learning_rate": 1.795482857607106e-05, + "loss": 0.1525, + "step": 6630 + }, + { + "epoch": 1.0109624654714835, + "grad_norm": 10.659051895141602, + "learning_rate": 1.79819097654769e-05, + "loss": 0.395, + "step": 6640 + }, + { + "epoch": 1.0110166278502952, + "grad_norm": 10.468491554260254, + "learning_rate": 1.800899095488274e-05, + "loss": 0.0887, + "step": 6650 + }, + { + "epoch": 1.011070790229107, + "grad_norm": 0.39261865615844727, + "learning_rate": 1.8036072144288578e-05, + "loss": 0.0711, + "step": 6660 + }, + { + "epoch": 1.0111249526079185, + "grad_norm": 7.732344150543213, + "learning_rate": 1.8063153333694415e-05, + "loss": 0.0768, + "step": 6670 + }, + { + "epoch": 1.0111791149867302, + "grad_norm": 0.07193596661090851, + "learning_rate": 1.8090234523100255e-05, + "loss": 0.1808, + "step": 6680 + }, + { + "epoch": 1.011233277365542, + "grad_norm": 0.9607293605804443, + "learning_rate": 1.8117315712506095e-05, + "loss": 0.1348, + "step": 6690 + }, + { + "epoch": 1.0112874397443536, + "grad_norm": 0.09517800807952881, + "learning_rate": 1.8144396901911935e-05, + "loss": 0.0979, + "step": 6700 + }, + { + "epoch": 1.0113416021231652, + "grad_norm": 5.6020026206970215, + "learning_rate": 1.8171478091317772e-05, + "loss": 0.1224, + "step": 6710 + }, + { + "epoch": 1.0113957645019769, + "grad_norm": 7.661608695983887, + "learning_rate": 1.819855928072361e-05, + "loss": 0.3071, + "step": 6720 + }, + { + "epoch": 1.0114499268807886, + "grad_norm": 0.18265685439109802, + "learning_rate": 1.822564047012945e-05, + "loss": 0.1061, + "step": 6730 + }, + { + "epoch": 1.0115040892596003, + "grad_norm": 0.19316133856773376, + "learning_rate": 1.825272165953529e-05, + "loss": 0.0979, + "step": 6740 + }, + { + "epoch": 1.011558251638412, + "grad_norm": 12.234869956970215, + "learning_rate": 1.8279802848941126e-05, + "loss": 0.063, + "step": 6750 + }, + { + "epoch": 1.0116124140172236, + "grad_norm": 0.9758610129356384, + 
"learning_rate": 1.8306884038346966e-05, + "loss": 0.1302, + "step": 6760 + }, + { + "epoch": 1.0116665763960353, + "grad_norm": 2.0391602516174316, + "learning_rate": 1.8333965227752803e-05, + "loss": 0.1961, + "step": 6770 + }, + { + "epoch": 1.011720738774847, + "grad_norm": 12.44857120513916, + "learning_rate": 1.8361046417158643e-05, + "loss": 0.0994, + "step": 6780 + }, + { + "epoch": 1.0117749011536588, + "grad_norm": 17.314586639404297, + "learning_rate": 1.8388127606564483e-05, + "loss": 0.1165, + "step": 6790 + }, + { + "epoch": 1.0118290635324703, + "grad_norm": 7.5439348220825195, + "learning_rate": 1.841520879597032e-05, + "loss": 0.1015, + "step": 6800 + }, + { + "epoch": 1.011883225911282, + "grad_norm": 5.168331623077393, + "learning_rate": 1.844228998537616e-05, + "loss": 0.1158, + "step": 6810 + }, + { + "epoch": 1.0119373882900937, + "grad_norm": 5.015535354614258, + "learning_rate": 1.8469371174781996e-05, + "loss": 0.0646, + "step": 6820 + }, + { + "epoch": 1.0119915506689054, + "grad_norm": 0.12766510248184204, + "learning_rate": 1.8496452364187837e-05, + "loss": 0.1355, + "step": 6830 + }, + { + "epoch": 1.012045713047717, + "grad_norm": 18.111997604370117, + "learning_rate": 1.8523533553593673e-05, + "loss": 0.3379, + "step": 6840 + }, + { + "epoch": 1.0120998754265287, + "grad_norm": 0.8048194050788879, + "learning_rate": 1.8550614742999513e-05, + "loss": 0.1267, + "step": 6850 + }, + { + "epoch": 1.0121540378053404, + "grad_norm": 0.12104889750480652, + "learning_rate": 1.8577695932405354e-05, + "loss": 0.1732, + "step": 6860 + }, + { + "epoch": 1.0122082001841521, + "grad_norm": 6.464033603668213, + "learning_rate": 1.860477712181119e-05, + "loss": 0.1919, + "step": 6870 + }, + { + "epoch": 1.0122623625629639, + "grad_norm": 7.577486991882324, + "learning_rate": 1.863185831121703e-05, + "loss": 0.152, + "step": 6880 + }, + { + "epoch": 1.0123165249417754, + "grad_norm": 4.441318511962891, + "learning_rate": 1.8658939500622867e-05, + "loss": 0.3375, + "step": 6890 + }, + { + "epoch": 1.012370687320587, + "grad_norm": 4.596330642700195, + "learning_rate": 1.8686020690028707e-05, + "loss": 0.1705, + "step": 6900 + }, + { + "epoch": 1.0124248496993988, + "grad_norm": 0.835213840007782, + "learning_rate": 1.8713101879434548e-05, + "loss": 0.1502, + "step": 6910 + }, + { + "epoch": 1.0124790120782106, + "grad_norm": 0.727605938911438, + "learning_rate": 1.8740183068840384e-05, + "loss": 0.1372, + "step": 6920 + }, + { + "epoch": 1.012533174457022, + "grad_norm": 0.3477530777454376, + "learning_rate": 1.876726425824622e-05, + "loss": 0.129, + "step": 6930 + }, + { + "epoch": 1.0125873368358338, + "grad_norm": 0.47689560055732727, + "learning_rate": 1.879434544765206e-05, + "loss": 0.1504, + "step": 6940 + }, + { + "epoch": 1.0126414992146455, + "grad_norm": 1.7074296474456787, + "learning_rate": 1.88214266370579e-05, + "loss": 0.0724, + "step": 6950 + }, + { + "epoch": 1.0126956615934573, + "grad_norm": 0.005806138273328543, + "learning_rate": 1.884850782646374e-05, + "loss": 0.1633, + "step": 6960 + }, + { + "epoch": 1.0127498239722688, + "grad_norm": 25.45844841003418, + "learning_rate": 1.8875589015869578e-05, + "loss": 0.2823, + "step": 6970 + }, + { + "epoch": 1.0128039863510805, + "grad_norm": 11.031665802001953, + "learning_rate": 1.8902670205275415e-05, + "loss": 0.1381, + "step": 6980 + }, + { + "epoch": 1.0128581487298922, + "grad_norm": 9.948840141296387, + "learning_rate": 1.8929751394681255e-05, + "loss": 0.2148, + "step": 6990 + }, + { + "epoch": 
1.012912311108704, + "grad_norm": 8.283822059631348, + "learning_rate": 1.8956832584087095e-05, + "loss": 0.3595, + "step": 7000 + }, + { + "epoch": 1.0129664734875157, + "grad_norm": 14.782109260559082, + "learning_rate": 1.8983913773492935e-05, + "loss": 0.2517, + "step": 7010 + }, + { + "epoch": 1.0130206358663272, + "grad_norm": 0.30920207500457764, + "learning_rate": 1.901099496289877e-05, + "loss": 0.0751, + "step": 7020 + }, + { + "epoch": 1.013074798245139, + "grad_norm": 0.05570843815803528, + "learning_rate": 1.903807615230461e-05, + "loss": 0.0228, + "step": 7030 + }, + { + "epoch": 1.0131289606239506, + "grad_norm": 0.01521608978509903, + "learning_rate": 1.906515734171045e-05, + "loss": 0.0343, + "step": 7040 + }, + { + "epoch": 1.0131831230027624, + "grad_norm": 10.315482139587402, + "learning_rate": 1.909223853111629e-05, + "loss": 0.0945, + "step": 7050 + }, + { + "epoch": 1.0132372853815739, + "grad_norm": 5.5912065505981445, + "learning_rate": 1.9119319720522126e-05, + "loss": 0.1083, + "step": 7060 + }, + { + "epoch": 1.0132914477603856, + "grad_norm": 0.3033939599990845, + "learning_rate": 1.9146400909927963e-05, + "loss": 0.0698, + "step": 7070 + }, + { + "epoch": 1.0133456101391973, + "grad_norm": 18.474132537841797, + "learning_rate": 1.9173482099333803e-05, + "loss": 0.3939, + "step": 7080 + }, + { + "epoch": 1.013399772518009, + "grad_norm": 13.49885082244873, + "learning_rate": 1.9200563288739643e-05, + "loss": 0.0933, + "step": 7090 + }, + { + "epoch": 1.0134539348968206, + "grad_norm": 0.14398282766342163, + "learning_rate": 1.9227644478145483e-05, + "loss": 0.2563, + "step": 7100 + }, + { + "epoch": 1.0135080972756323, + "grad_norm": 2.8907034397125244, + "learning_rate": 1.925472566755132e-05, + "loss": 0.1747, + "step": 7110 + }, + { + "epoch": 1.013562259654444, + "grad_norm": 5.453670501708984, + "learning_rate": 1.9281806856957157e-05, + "loss": 0.0884, + "step": 7120 + }, + { + "epoch": 1.0136164220332557, + "grad_norm": 0.3007754683494568, + "learning_rate": 1.9308888046362997e-05, + "loss": 0.1919, + "step": 7130 + }, + { + "epoch": 1.0136705844120675, + "grad_norm": 7.7074103355407715, + "learning_rate": 1.9335969235768837e-05, + "loss": 0.0765, + "step": 7140 + }, + { + "epoch": 1.013724746790879, + "grad_norm": 0.27512675523757935, + "learning_rate": 1.9363050425174674e-05, + "loss": 0.0607, + "step": 7150 + }, + { + "epoch": 1.0137789091696907, + "grad_norm": 0.05444017052650452, + "learning_rate": 1.9390131614580514e-05, + "loss": 0.1034, + "step": 7160 + }, + { + "epoch": 1.0138330715485024, + "grad_norm": 0.8474798798561096, + "learning_rate": 1.941721280398635e-05, + "loss": 0.1887, + "step": 7170 + }, + { + "epoch": 1.0138872339273142, + "grad_norm": 0.006067650858312845, + "learning_rate": 1.944429399339219e-05, + "loss": 0.098, + "step": 7180 + }, + { + "epoch": 1.0139413963061257, + "grad_norm": 0.2126474231481552, + "learning_rate": 1.947137518279803e-05, + "loss": 0.1628, + "step": 7190 + }, + { + "epoch": 1.0139955586849374, + "grad_norm": 0.07513108104467392, + "learning_rate": 1.9498456372203868e-05, + "loss": 0.0327, + "step": 7200 + }, + { + "epoch": 1.0140497210637491, + "grad_norm": 0.2871430814266205, + "learning_rate": 1.9525537561609708e-05, + "loss": 0.3656, + "step": 7210 + }, + { + "epoch": 1.0141038834425609, + "grad_norm": 4.570075511932373, + "learning_rate": 1.9552618751015544e-05, + "loss": 0.107, + "step": 7220 + }, + { + "epoch": 1.0141580458213724, + "grad_norm": 0.03759218752384186, + "learning_rate": 
1.9579699940421385e-05, + "loss": 0.0456, + "step": 7230 + }, + { + "epoch": 1.014212208200184, + "grad_norm": 5.416863918304443, + "learning_rate": 1.960678112982722e-05, + "loss": 0.1201, + "step": 7240 + }, + { + "epoch": 1.0142663705789958, + "grad_norm": 0.28136321902275085, + "learning_rate": 1.963386231923306e-05, + "loss": 0.2061, + "step": 7250 + }, + { + "epoch": 1.0143205329578076, + "grad_norm": 0.4026988446712494, + "learning_rate": 1.96609435086389e-05, + "loss": 0.0978, + "step": 7260 + }, + { + "epoch": 1.0143746953366193, + "grad_norm": 4.427429676055908, + "learning_rate": 1.968802469804474e-05, + "loss": 0.0579, + "step": 7270 + }, + { + "epoch": 1.0144288577154308, + "grad_norm": 0.022751016542315483, + "learning_rate": 1.971510588745058e-05, + "loss": 0.2273, + "step": 7280 + }, + { + "epoch": 1.0144830200942425, + "grad_norm": 0.06493276357650757, + "learning_rate": 1.9742187076856415e-05, + "loss": 0.0697, + "step": 7290 + }, + { + "epoch": 1.0145371824730542, + "grad_norm": 11.15445327758789, + "learning_rate": 1.9769268266262255e-05, + "loss": 0.1938, + "step": 7300 + }, + { + "epoch": 1.014591344851866, + "grad_norm": 0.03762941434979439, + "learning_rate": 1.9796349455668096e-05, + "loss": 0.1354, + "step": 7310 + }, + { + "epoch": 1.0146455072306775, + "grad_norm": 12.62499713897705, + "learning_rate": 1.9823430645073932e-05, + "loss": 0.0784, + "step": 7320 + }, + { + "epoch": 1.0146996696094892, + "grad_norm": 1.2045097351074219, + "learning_rate": 1.9850511834479772e-05, + "loss": 0.4911, + "step": 7330 + }, + { + "epoch": 1.014753831988301, + "grad_norm": 6.993307113647461, + "learning_rate": 1.987759302388561e-05, + "loss": 0.238, + "step": 7340 + }, + { + "epoch": 1.0148079943671127, + "grad_norm": 0.48213624954223633, + "learning_rate": 1.990467421329145e-05, + "loss": 0.2362, + "step": 7350 + }, + { + "epoch": 1.0148621567459242, + "grad_norm": 5.916738510131836, + "learning_rate": 1.993175540269729e-05, + "loss": 0.1767, + "step": 7360 + }, + { + "epoch": 1.014916319124736, + "grad_norm": 0.38521188497543335, + "learning_rate": 1.9958836592103126e-05, + "loss": 0.1069, + "step": 7370 + }, + { + "epoch": 1.0149704815035476, + "grad_norm": 3.668041467666626, + "learning_rate": 1.9985917781508963e-05, + "loss": 0.1523, + "step": 7380 + }, + { + "epoch": 1.0150246438823594, + "grad_norm": 4.568041801452637, + "learning_rate": 2.0012998970914803e-05, + "loss": 0.2457, + "step": 7390 + }, + { + "epoch": 1.015078806261171, + "grad_norm": 7.887481212615967, + "learning_rate": 2.0040080160320643e-05, + "loss": 0.1226, + "step": 7400 + }, + { + "epoch": 1.0151329686399826, + "grad_norm": 0.2643120288848877, + "learning_rate": 2.0067161349726483e-05, + "loss": 0.0762, + "step": 7410 + }, + { + "epoch": 1.0151871310187943, + "grad_norm": 0.5190443396568298, + "learning_rate": 2.009424253913232e-05, + "loss": 0.1033, + "step": 7420 + }, + { + "epoch": 1.015241293397606, + "grad_norm": 11.669818878173828, + "learning_rate": 2.0121323728538157e-05, + "loss": 0.1164, + "step": 7430 + }, + { + "epoch": 1.0152954557764178, + "grad_norm": 2.0806403160095215, + "learning_rate": 2.0148404917943997e-05, + "loss": 0.2898, + "step": 7440 + }, + { + "epoch": 1.0153496181552293, + "grad_norm": 0.06709034740924835, + "learning_rate": 2.0175486107349837e-05, + "loss": 0.0182, + "step": 7450 + }, + { + "epoch": 1.015403780534041, + "grad_norm": 0.09180652350187302, + "learning_rate": 2.0202567296755674e-05, + "loss": 0.0712, + "step": 7460 + }, + { + "epoch": 1.0154579429128527, + 
"grad_norm": 0.08993180096149445, + "learning_rate": 2.022964848616151e-05, + "loss": 0.1017, + "step": 7470 + }, + { + "epoch": 1.0155121052916645, + "grad_norm": 11.000114440917969, + "learning_rate": 2.025672967556735e-05, + "loss": 0.3406, + "step": 7480 + }, + { + "epoch": 1.0155662676704762, + "grad_norm": 10.08365249633789, + "learning_rate": 2.028381086497319e-05, + "loss": 0.2799, + "step": 7490 + }, + { + "epoch": 1.0156204300492877, + "grad_norm": 0.15721741318702698, + "learning_rate": 2.031089205437903e-05, + "loss": 0.136, + "step": 7500 + }, + { + "epoch": 1.0156745924280994, + "grad_norm": 8.62751579284668, + "learning_rate": 2.0337973243784868e-05, + "loss": 0.0993, + "step": 7510 + }, + { + "epoch": 1.0157287548069112, + "grad_norm": 4.546952247619629, + "learning_rate": 2.0365054433190705e-05, + "loss": 0.1065, + "step": 7520 + }, + { + "epoch": 1.0157829171857229, + "grad_norm": 0.24773645401000977, + "learning_rate": 2.0392135622596545e-05, + "loss": 0.1074, + "step": 7530 + }, + { + "epoch": 1.0158370795645344, + "grad_norm": 0.04197702184319496, + "learning_rate": 2.0419216812002385e-05, + "loss": 0.0425, + "step": 7540 + }, + { + "epoch": 1.0158912419433461, + "grad_norm": 5.01960563659668, + "learning_rate": 2.0446298001408225e-05, + "loss": 0.1002, + "step": 7550 + }, + { + "epoch": 1.0159454043221579, + "grad_norm": 11.243790626525879, + "learning_rate": 2.0473379190814062e-05, + "loss": 0.1409, + "step": 7560 + }, + { + "epoch": 1.0159995667009696, + "grad_norm": 0.5363051891326904, + "learning_rate": 2.05004603802199e-05, + "loss": 0.1671, + "step": 7570 + }, + { + "epoch": 1.016053729079781, + "grad_norm": 13.043757438659668, + "learning_rate": 2.052754156962574e-05, + "loss": 0.1264, + "step": 7580 + }, + { + "epoch": 1.0161078914585928, + "grad_norm": 6.241858005523682, + "learning_rate": 2.055462275903158e-05, + "loss": 0.087, + "step": 7590 + }, + { + "epoch": 1.0161620538374045, + "grad_norm": 10.587591171264648, + "learning_rate": 2.0581703948437416e-05, + "loss": 0.095, + "step": 7600 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 1.0537279844284058, + "learning_rate": 2.0608785137843256e-05, + "loss": 0.0918, + "step": 7610 + }, + { + "epoch": 1.016270378595028, + "grad_norm": 10.354434967041016, + "learning_rate": 2.0635866327249092e-05, + "loss": 0.1887, + "step": 7620 + }, + { + "epoch": 1.0163245409738395, + "grad_norm": 0.020901674404740334, + "learning_rate": 2.0662947516654933e-05, + "loss": 0.3045, + "step": 7630 + }, + { + "epoch": 1.0163787033526512, + "grad_norm": 13.504830360412598, + "learning_rate": 2.0690028706060773e-05, + "loss": 0.1023, + "step": 7640 + }, + { + "epoch": 1.016432865731463, + "grad_norm": 0.8160046935081482, + "learning_rate": 2.071710989546661e-05, + "loss": 0.2075, + "step": 7650 + }, + { + "epoch": 1.0164870281102747, + "grad_norm": 6.086042404174805, + "learning_rate": 2.074419108487245e-05, + "loss": 0.14, + "step": 7660 + }, + { + "epoch": 1.0165411904890862, + "grad_norm": 0.4264332354068756, + "learning_rate": 2.0771272274278286e-05, + "loss": 0.3165, + "step": 7670 + }, + { + "epoch": 1.016595352867898, + "grad_norm": 3.3225131034851074, + "learning_rate": 2.0798353463684127e-05, + "loss": 0.1243, + "step": 7680 + }, + { + "epoch": 1.0166495152467097, + "grad_norm": 6.449355602264404, + "learning_rate": 2.0825434653089963e-05, + "loss": 0.1387, + "step": 7690 + }, + { + "epoch": 1.0167036776255214, + "grad_norm": 2.0489187240600586, + "learning_rate": 2.0852515842495803e-05, + "loss": 0.0936, + "step": 
7700 + }, + { + "epoch": 1.016757840004333, + "grad_norm": 0.018114713951945305, + "learning_rate": 2.0879597031901644e-05, + "loss": 0.1476, + "step": 7710 + }, + { + "epoch": 1.0168120023831446, + "grad_norm": 0.07058751583099365, + "learning_rate": 2.090667822130748e-05, + "loss": 0.0099, + "step": 7720 + }, + { + "epoch": 1.0168661647619563, + "grad_norm": 0.014682876877486706, + "learning_rate": 2.093375941071332e-05, + "loss": 0.0623, + "step": 7730 + }, + { + "epoch": 1.016920327140768, + "grad_norm": 0.018200770020484924, + "learning_rate": 2.0960840600119157e-05, + "loss": 0.2034, + "step": 7740 + }, + { + "epoch": 1.0169744895195798, + "grad_norm": 0.5900371670722961, + "learning_rate": 2.0987921789524997e-05, + "loss": 0.0397, + "step": 7750 + }, + { + "epoch": 1.0170286518983913, + "grad_norm": 10.229544639587402, + "learning_rate": 2.1015002978930838e-05, + "loss": 0.1948, + "step": 7760 + }, + { + "epoch": 1.017082814277203, + "grad_norm": 0.5435572266578674, + "learning_rate": 2.1042084168336674e-05, + "loss": 0.0712, + "step": 7770 + }, + { + "epoch": 1.0171369766560148, + "grad_norm": 0.231843501329422, + "learning_rate": 2.106916535774251e-05, + "loss": 0.0305, + "step": 7780 + }, + { + "epoch": 1.0171911390348265, + "grad_norm": 0.12501852214336395, + "learning_rate": 2.109624654714835e-05, + "loss": 0.1102, + "step": 7790 + }, + { + "epoch": 1.017245301413638, + "grad_norm": 0.02919495292007923, + "learning_rate": 2.112332773655419e-05, + "loss": 0.1595, + "step": 7800 + }, + { + "epoch": 1.0172994637924497, + "grad_norm": 2.353459596633911, + "learning_rate": 2.115040892596003e-05, + "loss": 0.0845, + "step": 7810 + }, + { + "epoch": 1.0173536261712615, + "grad_norm": 0.03336772695183754, + "learning_rate": 2.1177490115365868e-05, + "loss": 0.0635, + "step": 7820 + }, + { + "epoch": 1.0174077885500732, + "grad_norm": 0.23539160192012787, + "learning_rate": 2.1204571304771705e-05, + "loss": 0.251, + "step": 7830 + }, + { + "epoch": 1.0174619509288847, + "grad_norm": 7.138901233673096, + "learning_rate": 2.1231652494177545e-05, + "loss": 0.2103, + "step": 7840 + }, + { + "epoch": 1.0175161133076964, + "grad_norm": 0.04235094413161278, + "learning_rate": 2.1258733683583385e-05, + "loss": 0.1297, + "step": 7850 + }, + { + "epoch": 1.0175702756865082, + "grad_norm": 7.181140422821045, + "learning_rate": 2.1285814872989225e-05, + "loss": 0.2085, + "step": 7860 + }, + { + "epoch": 1.0176244380653199, + "grad_norm": 25.974397659301758, + "learning_rate": 2.131289606239506e-05, + "loss": 0.1472, + "step": 7870 + }, + { + "epoch": 1.0176786004441316, + "grad_norm": 10.19543743133545, + "learning_rate": 2.13399772518009e-05, + "loss": 0.2377, + "step": 7880 + }, + { + "epoch": 1.0177327628229431, + "grad_norm": 7.249409198760986, + "learning_rate": 2.136705844120674e-05, + "loss": 0.3045, + "step": 7890 + }, + { + "epoch": 1.0177869252017548, + "grad_norm": 0.07230503112077713, + "learning_rate": 2.139413963061258e-05, + "loss": 0.0576, + "step": 7900 + }, + { + "epoch": 1.0178410875805666, + "grad_norm": 10.604240417480469, + "learning_rate": 2.1421220820018416e-05, + "loss": 0.3261, + "step": 7910 + }, + { + "epoch": 1.0178952499593783, + "grad_norm": 0.24538157880306244, + "learning_rate": 2.1448302009424253e-05, + "loss": 0.1279, + "step": 7920 + }, + { + "epoch": 1.0179494123381898, + "grad_norm": 0.43101370334625244, + "learning_rate": 2.1475383198830093e-05, + "loss": 0.0632, + "step": 7930 + }, + { + "epoch": 1.0180035747170015, + "grad_norm": 4.088959693908691, + 
"learning_rate": 2.1502464388235933e-05, + "loss": 0.0504, + "step": 7940 + }, + { + "epoch": 1.0180577370958133, + "grad_norm": 6.372555255889893, + "learning_rate": 2.1529545577641773e-05, + "loss": 0.1364, + "step": 7950 + }, + { + "epoch": 1.018111899474625, + "grad_norm": 0.08410994708538055, + "learning_rate": 2.155662676704761e-05, + "loss": 0.2924, + "step": 7960 + }, + { + "epoch": 1.0181660618534365, + "grad_norm": 7.97953462600708, + "learning_rate": 2.1583707956453447e-05, + "loss": 0.0705, + "step": 7970 + }, + { + "epoch": 1.0182202242322482, + "grad_norm": 1.021141767501831, + "learning_rate": 2.1610789145859287e-05, + "loss": 0.1114, + "step": 7980 + }, + { + "epoch": 1.01827438661106, + "grad_norm": 0.09889210015535355, + "learning_rate": 2.1637870335265127e-05, + "loss": 0.1554, + "step": 7990 + }, + { + "epoch": 1.0183285489898717, + "grad_norm": 0.16197384893894196, + "learning_rate": 2.1664951524670964e-05, + "loss": 0.1259, + "step": 8000 + }, + { + "epoch": 1.0183827113686834, + "grad_norm": 5.395924091339111, + "learning_rate": 2.1692032714076804e-05, + "loss": 0.1183, + "step": 8010 + }, + { + "epoch": 1.018436873747495, + "grad_norm": 0.252964049577713, + "learning_rate": 2.171911390348264e-05, + "loss": 0.1717, + "step": 8020 + }, + { + "epoch": 1.0184910361263066, + "grad_norm": 5.434632301330566, + "learning_rate": 2.174619509288848e-05, + "loss": 0.1501, + "step": 8030 + }, + { + "epoch": 1.0185451985051184, + "grad_norm": 3.6991426944732666, + "learning_rate": 2.177327628229432e-05, + "loss": 0.0886, + "step": 8040 + }, + { + "epoch": 1.01859936088393, + "grad_norm": 0.5846430659294128, + "learning_rate": 2.1800357471700158e-05, + "loss": 0.1866, + "step": 8050 + }, + { + "epoch": 1.0186535232627416, + "grad_norm": 0.04602121189236641, + "learning_rate": 2.1827438661105998e-05, + "loss": 0.0927, + "step": 8060 + }, + { + "epoch": 1.0187076856415533, + "grad_norm": 0.12850460410118103, + "learning_rate": 2.1854519850511834e-05, + "loss": 0.1047, + "step": 8070 + }, + { + "epoch": 1.018761848020365, + "grad_norm": 0.2103743702173233, + "learning_rate": 2.1881601039917675e-05, + "loss": 0.0861, + "step": 8080 + }, + { + "epoch": 1.0188160103991768, + "grad_norm": 0.061570312827825546, + "learning_rate": 2.190868222932351e-05, + "loss": 0.1902, + "step": 8090 + }, + { + "epoch": 1.0188701727779883, + "grad_norm": 7.8959832191467285, + "learning_rate": 2.193576341872935e-05, + "loss": 0.3573, + "step": 8100 + }, + { + "epoch": 1.0189243351568, + "grad_norm": 6.157942771911621, + "learning_rate": 2.196284460813519e-05, + "loss": 0.1414, + "step": 8110 + }, + { + "epoch": 1.0189784975356118, + "grad_norm": 0.08358870446681976, + "learning_rate": 2.198992579754103e-05, + "loss": 0.0808, + "step": 8120 + }, + { + "epoch": 1.0190326599144235, + "grad_norm": 0.0903225690126419, + "learning_rate": 2.201700698694687e-05, + "loss": 0.299, + "step": 8130 + }, + { + "epoch": 1.0190868222932352, + "grad_norm": 0.07028849422931671, + "learning_rate": 2.2044088176352705e-05, + "loss": 0.1023, + "step": 8140 + }, + { + "epoch": 1.0191409846720467, + "grad_norm": 8.503654479980469, + "learning_rate": 2.2071169365758545e-05, + "loss": 0.162, + "step": 8150 + }, + { + "epoch": 1.0191951470508585, + "grad_norm": 0.02293325401842594, + "learning_rate": 2.2098250555164386e-05, + "loss": 0.186, + "step": 8160 + }, + { + "epoch": 1.0192493094296702, + "grad_norm": 7.53859806060791, + "learning_rate": 2.2125331744570222e-05, + "loss": 0.2068, + "step": 8170 + }, + { + "epoch": 
1.019303471808482, + "grad_norm": 8.205107688903809, + "learning_rate": 2.215241293397606e-05, + "loss": 0.2085, + "step": 8180 + }, + { + "epoch": 1.0193576341872934, + "grad_norm": 2.0090134143829346, + "learning_rate": 2.21794941233819e-05, + "loss": 0.052, + "step": 8190 + }, + { + "epoch": 1.0194117965661051, + "grad_norm": 13.223217010498047, + "learning_rate": 2.220657531278774e-05, + "loss": 0.1141, + "step": 8200 + }, + { + "epoch": 1.0194659589449169, + "grad_norm": 0.7136548757553101, + "learning_rate": 2.223365650219358e-05, + "loss": 0.1952, + "step": 8210 + }, + { + "epoch": 1.0195201213237286, + "grad_norm": 0.10293681174516678, + "learning_rate": 2.2260737691599416e-05, + "loss": 0.1763, + "step": 8220 + }, + { + "epoch": 1.01957428370254, + "grad_norm": 4.607089996337891, + "learning_rate": 2.2287818881005253e-05, + "loss": 0.0617, + "step": 8230 + }, + { + "epoch": 1.0196284460813518, + "grad_norm": 0.1776260882616043, + "learning_rate": 2.2314900070411093e-05, + "loss": 0.0431, + "step": 8240 + }, + { + "epoch": 1.0196826084601636, + "grad_norm": 3.896257162094116, + "learning_rate": 2.2341981259816933e-05, + "loss": 0.0181, + "step": 8250 + }, + { + "epoch": 1.0197367708389753, + "grad_norm": 4.337433338165283, + "learning_rate": 2.2369062449222773e-05, + "loss": 0.1375, + "step": 8260 + }, + { + "epoch": 1.019790933217787, + "grad_norm": 11.14919662475586, + "learning_rate": 2.2396143638628607e-05, + "loss": 0.1347, + "step": 8270 + }, + { + "epoch": 1.0198450955965985, + "grad_norm": 22.3974552154541, + "learning_rate": 2.2423224828034447e-05, + "loss": 0.1681, + "step": 8280 + }, + { + "epoch": 1.0198992579754103, + "grad_norm": 4.319814205169678, + "learning_rate": 2.2450306017440287e-05, + "loss": 0.1999, + "step": 8290 + }, + { + "epoch": 1.019953420354222, + "grad_norm": 0.015763726085424423, + "learning_rate": 2.2477387206846127e-05, + "loss": 0.0423, + "step": 8300 + }, + { + "epoch": 1.0200075827330337, + "grad_norm": 1.5752935409545898, + "learning_rate": 2.2504468396251964e-05, + "loss": 0.2208, + "step": 8310 + }, + { + "epoch": 1.0200617451118452, + "grad_norm": 3.3785133361816406, + "learning_rate": 2.25315495856578e-05, + "loss": 0.1654, + "step": 8320 + }, + { + "epoch": 1.020115907490657, + "grad_norm": 0.029080014675855637, + "learning_rate": 2.255863077506364e-05, + "loss": 0.1165, + "step": 8330 + }, + { + "epoch": 1.0201700698694687, + "grad_norm": 11.94728946685791, + "learning_rate": 2.258571196446948e-05, + "loss": 0.1077, + "step": 8340 + }, + { + "epoch": 1.0202242322482804, + "grad_norm": 7.84234094619751, + "learning_rate": 2.261279315387532e-05, + "loss": 0.1044, + "step": 8350 + }, + { + "epoch": 1.0202783946270921, + "grad_norm": 0.34438666701316833, + "learning_rate": 2.2639874343281158e-05, + "loss": 0.2921, + "step": 8360 + }, + { + "epoch": 1.0203325570059036, + "grad_norm": 0.008178877644240856, + "learning_rate": 2.2666955532686995e-05, + "loss": 0.0488, + "step": 8370 + }, + { + "epoch": 1.0203867193847154, + "grad_norm": 6.5237250328063965, + "learning_rate": 2.2694036722092835e-05, + "loss": 0.1695, + "step": 8380 + }, + { + "epoch": 1.020440881763527, + "grad_norm": 0.015764087438583374, + "learning_rate": 2.2721117911498675e-05, + "loss": 0.1601, + "step": 8390 + }, + { + "epoch": 1.0204950441423388, + "grad_norm": 0.45134851336479187, + "learning_rate": 2.274819910090451e-05, + "loss": 0.0982, + "step": 8400 + }, + { + "epoch": 1.0205492065211503, + "grad_norm": 0.32048025727272034, + "learning_rate": 2.2775280290310352e-05, + 
"loss": 0.1322, + "step": 8410 + }, + { + "epoch": 1.020603368899962, + "grad_norm": 8.119511604309082, + "learning_rate": 2.280236147971619e-05, + "loss": 0.167, + "step": 8420 + }, + { + "epoch": 1.0206575312787738, + "grad_norm": 5.122572898864746, + "learning_rate": 2.282944266912203e-05, + "loss": 0.2361, + "step": 8430 + }, + { + "epoch": 1.0207116936575855, + "grad_norm": 6.747355937957764, + "learning_rate": 2.285652385852787e-05, + "loss": 0.4125, + "step": 8440 + }, + { + "epoch": 1.020765856036397, + "grad_norm": 0.8762272596359253, + "learning_rate": 2.2883605047933706e-05, + "loss": 0.0244, + "step": 8450 + }, + { + "epoch": 1.0208200184152088, + "grad_norm": 2.4492897987365723, + "learning_rate": 2.2910686237339546e-05, + "loss": 0.108, + "step": 8460 + }, + { + "epoch": 1.0208741807940205, + "grad_norm": 0.03791321441531181, + "learning_rate": 2.2937767426745382e-05, + "loss": 0.0148, + "step": 8470 + }, + { + "epoch": 1.0209283431728322, + "grad_norm": 25.633575439453125, + "learning_rate": 2.2964848616151223e-05, + "loss": 0.1511, + "step": 8480 + }, + { + "epoch": 1.020982505551644, + "grad_norm": 19.85538673400879, + "learning_rate": 2.299192980555706e-05, + "loss": 0.0616, + "step": 8490 + }, + { + "epoch": 1.0210366679304554, + "grad_norm": 9.335641860961914, + "learning_rate": 2.30190109949629e-05, + "loss": 0.3763, + "step": 8500 + }, + { + "epoch": 1.0210908303092672, + "grad_norm": 0.2644675076007843, + "learning_rate": 2.304609218436874e-05, + "loss": 0.4837, + "step": 8510 + }, + { + "epoch": 1.021144992688079, + "grad_norm": 12.390207290649414, + "learning_rate": 2.307317337377458e-05, + "loss": 0.1311, + "step": 8520 + }, + { + "epoch": 1.0211991550668906, + "grad_norm": 0.19486680626869202, + "learning_rate": 2.3100254563180416e-05, + "loss": 0.0087, + "step": 8530 + }, + { + "epoch": 1.0212533174457021, + "grad_norm": 0.1632789522409439, + "learning_rate": 2.3127335752586253e-05, + "loss": 0.0644, + "step": 8540 + }, + { + "epoch": 1.0213074798245139, + "grad_norm": 0.37199920415878296, + "learning_rate": 2.3154416941992093e-05, + "loss": 0.1776, + "step": 8550 + }, + { + "epoch": 1.0213616422033256, + "grad_norm": 7.954822540283203, + "learning_rate": 2.3181498131397934e-05, + "loss": 0.1156, + "step": 8560 + }, + { + "epoch": 1.0214158045821373, + "grad_norm": 0.11525817215442657, + "learning_rate": 2.3208579320803774e-05, + "loss": 0.2473, + "step": 8570 + }, + { + "epoch": 1.0214699669609488, + "grad_norm": 0.9104013442993164, + "learning_rate": 2.3235660510209607e-05, + "loss": 0.0459, + "step": 8580 + }, + { + "epoch": 1.0215241293397606, + "grad_norm": 0.1107078567147255, + "learning_rate": 2.3262741699615447e-05, + "loss": 0.1173, + "step": 8590 + }, + { + "epoch": 1.0215782917185723, + "grad_norm": 0.31767624616622925, + "learning_rate": 2.3289822889021287e-05, + "loss": 0.1617, + "step": 8600 + }, + { + "epoch": 1.021632454097384, + "grad_norm": 0.0891372412443161, + "learning_rate": 2.3316904078427127e-05, + "loss": 0.142, + "step": 8610 + }, + { + "epoch": 1.0216866164761957, + "grad_norm": 0.03791169077157974, + "learning_rate": 2.3343985267832964e-05, + "loss": 0.0285, + "step": 8620 + }, + { + "epoch": 1.0217407788550072, + "grad_norm": 0.08133705705404282, + "learning_rate": 2.33710664572388e-05, + "loss": 0.2108, + "step": 8630 + }, + { + "epoch": 1.021794941233819, + "grad_norm": 16.016483306884766, + "learning_rate": 2.339814764664464e-05, + "loss": 0.212, + "step": 8640 + }, + { + "epoch": 1.0218491036126307, + "grad_norm": 
4.583590507507324, + "learning_rate": 2.342522883605048e-05, + "loss": 0.2371, + "step": 8650 + }, + { + "epoch": 1.0219032659914424, + "grad_norm": 0.02521325647830963, + "learning_rate": 2.345231002545632e-05, + "loss": 0.1389, + "step": 8660 + }, + { + "epoch": 1.021957428370254, + "grad_norm": 19.35127067565918, + "learning_rate": 2.3479391214862158e-05, + "loss": 0.2013, + "step": 8670 + }, + { + "epoch": 1.0220115907490657, + "grad_norm": 0.008817694149911404, + "learning_rate": 2.3506472404267995e-05, + "loss": 0.1164, + "step": 8680 + }, + { + "epoch": 1.0220657531278774, + "grad_norm": 10.385764122009277, + "learning_rate": 2.3533553593673835e-05, + "loss": 0.1593, + "step": 8690 + }, + { + "epoch": 1.0221199155066891, + "grad_norm": 0.2148403823375702, + "learning_rate": 2.3560634783079675e-05, + "loss": 0.1551, + "step": 8700 + }, + { + "epoch": 1.0221740778855006, + "grad_norm": 0.01821153052151203, + "learning_rate": 2.3587715972485512e-05, + "loss": 0.1624, + "step": 8710 + }, + { + "epoch": 1.0222282402643124, + "grad_norm": 0.015685372054576874, + "learning_rate": 2.3614797161891352e-05, + "loss": 0.1721, + "step": 8720 + }, + { + "epoch": 1.022282402643124, + "grad_norm": 12.151284217834473, + "learning_rate": 2.364187835129719e-05, + "loss": 0.1997, + "step": 8730 + }, + { + "epoch": 1.0223365650219358, + "grad_norm": 5.7418293952941895, + "learning_rate": 2.366895954070303e-05, + "loss": 0.1643, + "step": 8740 + }, + { + "epoch": 1.0223907274007475, + "grad_norm": 12.88263988494873, + "learning_rate": 2.369604073010887e-05, + "loss": 0.0779, + "step": 8750 + }, + { + "epoch": 1.022444889779559, + "grad_norm": 0.08072952181100845, + "learning_rate": 2.3723121919514706e-05, + "loss": 0.0217, + "step": 8760 + }, + { + "epoch": 1.0224990521583708, + "grad_norm": 0.01277916133403778, + "learning_rate": 2.3750203108920546e-05, + "loss": 0.147, + "step": 8770 + }, + { + "epoch": 1.0225532145371825, + "grad_norm": 0.1694767326116562, + "learning_rate": 2.3777284298326383e-05, + "loss": 0.0921, + "step": 8780 + }, + { + "epoch": 1.0226073769159942, + "grad_norm": 0.17689557373523712, + "learning_rate": 2.3804365487732223e-05, + "loss": 0.1237, + "step": 8790 + }, + { + "epoch": 1.0226615392948057, + "grad_norm": 14.951048851013184, + "learning_rate": 2.383144667713806e-05, + "loss": 0.2809, + "step": 8800 + }, + { + "epoch": 1.0227157016736175, + "grad_norm": 0.1509571224451065, + "learning_rate": 2.38585278665439e-05, + "loss": 0.194, + "step": 8810 + }, + { + "epoch": 1.0227698640524292, + "grad_norm": 0.17340488731861115, + "learning_rate": 2.388560905594974e-05, + "loss": 0.1352, + "step": 8820 + }, + { + "epoch": 1.022824026431241, + "grad_norm": 0.14315247535705566, + "learning_rate": 2.3912690245355577e-05, + "loss": 0.1855, + "step": 8830 + }, + { + "epoch": 1.0228781888100524, + "grad_norm": 0.1092638373374939, + "learning_rate": 2.3939771434761417e-05, + "loss": 0.0599, + "step": 8840 + }, + { + "epoch": 1.0229323511888642, + "grad_norm": 0.14564770460128784, + "learning_rate": 2.3966852624167254e-05, + "loss": 0.1801, + "step": 8850 + }, + { + "epoch": 1.022986513567676, + "grad_norm": 7.556570053100586, + "learning_rate": 2.3993933813573094e-05, + "loss": 0.2225, + "step": 8860 + }, + { + "epoch": 1.0230406759464876, + "grad_norm": 0.22877050936222076, + "learning_rate": 2.4021015002978934e-05, + "loss": 0.1229, + "step": 8870 + }, + { + "epoch": 1.0230948383252993, + "grad_norm": 11.563535690307617, + "learning_rate": 2.404809619238477e-05, + "loss": 0.1832, + 
"step": 8880 + }, + { + "epoch": 1.0231490007041109, + "grad_norm": 0.5468963980674744, + "learning_rate": 2.4075177381790607e-05, + "loss": 0.2392, + "step": 8890 + }, + { + "epoch": 1.0232031630829226, + "grad_norm": 0.32981979846954346, + "learning_rate": 2.4102258571196447e-05, + "loss": 0.238, + "step": 8900 + }, + { + "epoch": 1.0232573254617343, + "grad_norm": 8.754648208618164, + "learning_rate": 2.4129339760602288e-05, + "loss": 0.2045, + "step": 8910 + }, + { + "epoch": 1.023311487840546, + "grad_norm": 0.023168964311480522, + "learning_rate": 2.4156420950008128e-05, + "loss": 0.1811, + "step": 8920 + }, + { + "epoch": 1.0233656502193575, + "grad_norm": 6.098969459533691, + "learning_rate": 2.4183502139413965e-05, + "loss": 0.284, + "step": 8930 + }, + { + "epoch": 1.0234198125981693, + "grad_norm": 0.3081972002983093, + "learning_rate": 2.42105833288198e-05, + "loss": 0.0904, + "step": 8940 + }, + { + "epoch": 1.023473974976981, + "grad_norm": 0.40178534388542175, + "learning_rate": 2.423766451822564e-05, + "loss": 0.1405, + "step": 8950 + }, + { + "epoch": 1.0235281373557927, + "grad_norm": 4.757392406463623, + "learning_rate": 2.426474570763148e-05, + "loss": 0.2248, + "step": 8960 + }, + { + "epoch": 1.0235822997346042, + "grad_norm": 0.06299714744091034, + "learning_rate": 2.429182689703732e-05, + "loss": 0.195, + "step": 8970 + }, + { + "epoch": 1.023636462113416, + "grad_norm": 0.07098275423049927, + "learning_rate": 2.431890808644316e-05, + "loss": 0.0643, + "step": 8980 + }, + { + "epoch": 1.0236906244922277, + "grad_norm": 5.918641567230225, + "learning_rate": 2.4345989275848995e-05, + "loss": 0.1776, + "step": 8990 + }, + { + "epoch": 1.0237447868710394, + "grad_norm": 4.775169849395752, + "learning_rate": 2.4373070465254835e-05, + "loss": 0.0938, + "step": 9000 + }, + { + "epoch": 1.0237989492498512, + "grad_norm": 3.719665765762329, + "learning_rate": 2.4400151654660675e-05, + "loss": 0.1518, + "step": 9010 + }, + { + "epoch": 1.0238531116286627, + "grad_norm": 4.596921443939209, + "learning_rate": 2.4427232844066512e-05, + "loss": 0.1177, + "step": 9020 + }, + { + "epoch": 1.0239072740074744, + "grad_norm": 0.2296004593372345, + "learning_rate": 2.445431403347235e-05, + "loss": 0.1572, + "step": 9030 + }, + { + "epoch": 1.0239614363862861, + "grad_norm": 0.12844622135162354, + "learning_rate": 2.448139522287819e-05, + "loss": 0.1742, + "step": 9040 + }, + { + "epoch": 1.0240155987650978, + "grad_norm": 0.02407272905111313, + "learning_rate": 2.450847641228403e-05, + "loss": 0.1085, + "step": 9050 + }, + { + "epoch": 1.0240697611439094, + "grad_norm": 0.38611647486686707, + "learning_rate": 2.453555760168987e-05, + "loss": 0.0699, + "step": 9060 + }, + { + "epoch": 1.024123923522721, + "grad_norm": 0.17095232009887695, + "learning_rate": 2.4562638791095706e-05, + "loss": 0.1555, + "step": 9070 + }, + { + "epoch": 1.0241780859015328, + "grad_norm": 0.016238434240221977, + "learning_rate": 2.4589719980501543e-05, + "loss": 0.1527, + "step": 9080 + }, + { + "epoch": 1.0242322482803445, + "grad_norm": 0.036795295774936676, + "learning_rate": 2.4616801169907383e-05, + "loss": 0.0868, + "step": 9090 + }, + { + "epoch": 1.024286410659156, + "grad_norm": 0.17608660459518433, + "learning_rate": 2.4643882359313223e-05, + "loss": 0.1053, + "step": 9100 + }, + { + "epoch": 1.0243405730379678, + "grad_norm": 0.11364918202161789, + "learning_rate": 2.467096354871906e-05, + "loss": 0.0914, + "step": 9110 + }, + { + "epoch": 1.0243947354167795, + "grad_norm": 13.97603988647461, + 
"learning_rate": 2.46980447381249e-05, + "loss": 0.1746, + "step": 9120 + }, + { + "epoch": 1.0244488977955912, + "grad_norm": 11.88137149810791, + "learning_rate": 2.4725125927530737e-05, + "loss": 0.1614, + "step": 9130 + }, + { + "epoch": 1.024503060174403, + "grad_norm": 0.004049554467201233, + "learning_rate": 2.4752207116936577e-05, + "loss": 0.039, + "step": 9140 + }, + { + "epoch": 1.0245572225532145, + "grad_norm": 0.08059156686067581, + "learning_rate": 2.4779288306342417e-05, + "loss": 0.0887, + "step": 9150 + }, + { + "epoch": 1.0246113849320262, + "grad_norm": 0.009483820758759975, + "learning_rate": 2.4806369495748254e-05, + "loss": 0.1704, + "step": 9160 + }, + { + "epoch": 1.024665547310838, + "grad_norm": 11.69779109954834, + "learning_rate": 2.4833450685154094e-05, + "loss": 0.1973, + "step": 9170 + }, + { + "epoch": 1.0247197096896496, + "grad_norm": 0.09558761864900589, + "learning_rate": 2.486053187455993e-05, + "loss": 0.0996, + "step": 9180 + }, + { + "epoch": 1.0247738720684612, + "grad_norm": 3.9016880989074707, + "learning_rate": 2.488761306396577e-05, + "loss": 0.1453, + "step": 9190 + }, + { + "epoch": 1.0248280344472729, + "grad_norm": 0.02212394028902054, + "learning_rate": 2.491469425337161e-05, + "loss": 0.0482, + "step": 9200 + }, + { + "epoch": 1.0248821968260846, + "grad_norm": 0.08401588350534439, + "learning_rate": 2.4941775442777448e-05, + "loss": 0.1775, + "step": 9210 + }, + { + "epoch": 1.0249363592048963, + "grad_norm": 8.838800430297852, + "learning_rate": 2.4968856632183288e-05, + "loss": 0.3945, + "step": 9220 + }, + { + "epoch": 1.0249905215837078, + "grad_norm": 1.5085456371307373, + "learning_rate": 2.4995937821589125e-05, + "loss": 0.1338, + "step": 9230 + }, + { + "epoch": 1.0250013540594702, + "eval_accuracy": 0.8530372305682561, + "eval_loss": 0.4428286850452423, + "eval_runtime": 116.0659, + "eval_samples_per_second": 26.382, + "eval_steps_per_second": 3.3, + "step": 9232 + }, + { + "epoch": 2.0000433299030496, + "grad_norm": 0.9716783761978149, + "learning_rate": 2.5023019010994965e-05, + "loss": 0.1418, + "step": 9240 + }, + { + "epoch": 2.000097492281861, + "grad_norm": 10.460701942443848, + "learning_rate": 2.50501002004008e-05, + "loss": 0.1551, + "step": 9250 + }, + { + "epoch": 2.0001516546606726, + "grad_norm": 10.114690780639648, + "learning_rate": 2.5077181389806642e-05, + "loss": 0.0855, + "step": 9260 + }, + { + "epoch": 2.0002058170394843, + "grad_norm": 0.10297568887472153, + "learning_rate": 2.510426257921248e-05, + "loss": 0.137, + "step": 9270 + }, + { + "epoch": 2.000259979418296, + "grad_norm": 3.387153387069702, + "learning_rate": 2.5131343768618322e-05, + "loss": 0.1331, + "step": 9280 + }, + { + "epoch": 2.0003141417971078, + "grad_norm": 0.02906339056789875, + "learning_rate": 2.515842495802416e-05, + "loss": 0.2252, + "step": 9290 + }, + { + "epoch": 2.0003683041759195, + "grad_norm": 4.739781856536865, + "learning_rate": 2.5185506147429992e-05, + "loss": 0.18, + "step": 9300 + }, + { + "epoch": 2.000422466554731, + "grad_norm": 3.947474718093872, + "learning_rate": 2.5212587336835836e-05, + "loss": 0.1263, + "step": 9310 + }, + { + "epoch": 2.000476628933543, + "grad_norm": 2.909573554992676, + "learning_rate": 2.5239668526241672e-05, + "loss": 0.1658, + "step": 9320 + }, + { + "epoch": 2.0005307913123542, + "grad_norm": 6.88187313079834, + "learning_rate": 2.5266749715647513e-05, + "loss": 0.1468, + "step": 9330 + }, + { + "epoch": 2.000584953691166, + "grad_norm": 0.7696195840835571, + "learning_rate": 
2.529383090505335e-05, + "loss": 0.0884, + "step": 9340 + }, + { + "epoch": 2.0006391160699777, + "grad_norm": 0.06073271855711937, + "learning_rate": 2.5320912094459186e-05, + "loss": 0.1701, + "step": 9350 + }, + { + "epoch": 2.0006932784487894, + "grad_norm": 3.351287364959717, + "learning_rate": 2.534799328386503e-05, + "loss": 0.2489, + "step": 9360 + }, + { + "epoch": 2.000747440827601, + "grad_norm": 8.807023048400879, + "learning_rate": 2.5375074473270866e-05, + "loss": 0.0693, + "step": 9370 + }, + { + "epoch": 2.000801603206413, + "grad_norm": 0.8698745369911194, + "learning_rate": 2.5402155662676706e-05, + "loss": 0.1136, + "step": 9380 + }, + { + "epoch": 2.0008557655852246, + "grad_norm": 4.817749500274658, + "learning_rate": 2.5429236852082543e-05, + "loss": 0.1272, + "step": 9390 + }, + { + "epoch": 2.0009099279640363, + "grad_norm": 0.006764612160623074, + "learning_rate": 2.545631804148838e-05, + "loss": 0.1489, + "step": 9400 + }, + { + "epoch": 2.000964090342848, + "grad_norm": 0.04145198315382004, + "learning_rate": 2.5483399230894223e-05, + "loss": 0.3713, + "step": 9410 + }, + { + "epoch": 2.0010182527216593, + "grad_norm": 5.142956733703613, + "learning_rate": 2.551048042030006e-05, + "loss": 0.0693, + "step": 9420 + }, + { + "epoch": 2.001072415100471, + "grad_norm": 10.56652545928955, + "learning_rate": 2.55375616097059e-05, + "loss": 0.2129, + "step": 9430 + }, + { + "epoch": 2.001126577479283, + "grad_norm": 0.4112027883529663, + "learning_rate": 2.5564642799111737e-05, + "loss": 0.1453, + "step": 9440 + }, + { + "epoch": 2.0011807398580945, + "grad_norm": 0.7817435264587402, + "learning_rate": 2.559172398851758e-05, + "loss": 0.0267, + "step": 9450 + }, + { + "epoch": 2.0012349022369063, + "grad_norm": 0.014489194378256798, + "learning_rate": 2.5618805177923417e-05, + "loss": 0.0328, + "step": 9460 + }, + { + "epoch": 2.001289064615718, + "grad_norm": 0.03358307108283043, + "learning_rate": 2.5645886367329254e-05, + "loss": 0.1083, + "step": 9470 + }, + { + "epoch": 2.0013432269945297, + "grad_norm": 0.007270703557878733, + "learning_rate": 2.5672967556735094e-05, + "loss": 0.0318, + "step": 9480 + }, + { + "epoch": 2.0013973893733414, + "grad_norm": 0.8556511998176575, + "learning_rate": 2.570004874614093e-05, + "loss": 0.2267, + "step": 9490 + }, + { + "epoch": 2.001451551752153, + "grad_norm": 1.014774203300476, + "learning_rate": 2.5727129935546775e-05, + "loss": 0.1716, + "step": 9500 + }, + { + "epoch": 2.0015057141309645, + "grad_norm": 0.4959245026111603, + "learning_rate": 2.575421112495261e-05, + "loss": 0.1146, + "step": 9510 + }, + { + "epoch": 2.001559876509776, + "grad_norm": 0.28834041953086853, + "learning_rate": 2.5781292314358445e-05, + "loss": 0.2007, + "step": 9520 + }, + { + "epoch": 2.001614038888588, + "grad_norm": 0.004446244798600674, + "learning_rate": 2.5808373503764288e-05, + "loss": 0.168, + "step": 9530 + }, + { + "epoch": 2.0016682012673996, + "grad_norm": 0.19174903631210327, + "learning_rate": 2.5835454693170125e-05, + "loss": 0.086, + "step": 9540 + }, + { + "epoch": 2.0017223636462114, + "grad_norm": 0.07788599282503128, + "learning_rate": 2.5862535882575965e-05, + "loss": 0.0184, + "step": 9550 + }, + { + "epoch": 2.001776526025023, + "grad_norm": 0.021126534789800644, + "learning_rate": 2.5889617071981802e-05, + "loss": 0.1273, + "step": 9560 + }, + { + "epoch": 2.001830688403835, + "grad_norm": 0.05743755027651787, + "learning_rate": 2.591669826138764e-05, + "loss": 0.0358, + "step": 9570 + }, + { + "epoch": 
2.0018848507826466, + "grad_norm": 7.864494800567627, + "learning_rate": 2.5943779450793482e-05, + "loss": 0.1947, + "step": 9580 + }, + { + "epoch": 2.001939013161458, + "grad_norm": 4.274430274963379, + "learning_rate": 2.597086064019932e-05, + "loss": 0.0592, + "step": 9590 + }, + { + "epoch": 2.0019931755402696, + "grad_norm": 12.919083595275879, + "learning_rate": 2.599794182960516e-05, + "loss": 0.319, + "step": 9600 + }, + { + "epoch": 2.0020473379190813, + "grad_norm": 0.0038401824422180653, + "learning_rate": 2.6025023019010996e-05, + "loss": 0.2001, + "step": 9610 + }, + { + "epoch": 2.002101500297893, + "grad_norm": 7.14784574508667, + "learning_rate": 2.6052104208416833e-05, + "loss": 0.0499, + "step": 9620 + }, + { + "epoch": 2.0021556626767048, + "grad_norm": 3.474597454071045, + "learning_rate": 2.6079185397822676e-05, + "loss": 0.2833, + "step": 9630 + }, + { + "epoch": 2.0022098250555165, + "grad_norm": 0.27092090249061584, + "learning_rate": 2.6106266587228513e-05, + "loss": 0.0535, + "step": 9640 + }, + { + "epoch": 2.002263987434328, + "grad_norm": 6.382439136505127, + "learning_rate": 2.6133347776634353e-05, + "loss": 0.1325, + "step": 9650 + }, + { + "epoch": 2.00231814981314, + "grad_norm": 1.0432744026184082, + "learning_rate": 2.616042896604019e-05, + "loss": 0.1884, + "step": 9660 + }, + { + "epoch": 2.0023723121919517, + "grad_norm": 0.17260316014289856, + "learning_rate": 2.6187510155446026e-05, + "loss": 0.0962, + "step": 9670 + }, + { + "epoch": 2.002426474570763, + "grad_norm": 6.967965602874756, + "learning_rate": 2.621459134485187e-05, + "loss": 0.255, + "step": 9680 + }, + { + "epoch": 2.0024806369495747, + "grad_norm": 9.667367935180664, + "learning_rate": 2.6241672534257707e-05, + "loss": 0.2021, + "step": 9690 + }, + { + "epoch": 2.0025347993283864, + "grad_norm": 0.17677615582942963, + "learning_rate": 2.6268753723663547e-05, + "loss": 0.1309, + "step": 9700 + }, + { + "epoch": 2.002588961707198, + "grad_norm": 7.379210948944092, + "learning_rate": 2.6295834913069384e-05, + "loss": 0.2298, + "step": 9710 + }, + { + "epoch": 2.00264312408601, + "grad_norm": 0.10350872576236725, + "learning_rate": 2.632291610247522e-05, + "loss": 0.0712, + "step": 9720 + }, + { + "epoch": 2.0026972864648216, + "grad_norm": 0.020184066146612167, + "learning_rate": 2.6349997291881064e-05, + "loss": 0.0819, + "step": 9730 + }, + { + "epoch": 2.0027514488436333, + "grad_norm": 4.150378227233887, + "learning_rate": 2.6377078481286897e-05, + "loss": 0.2302, + "step": 9740 + }, + { + "epoch": 2.002805611222445, + "grad_norm": 0.26181167364120483, + "learning_rate": 2.640415967069274e-05, + "loss": 0.1781, + "step": 9750 + }, + { + "epoch": 2.0028597736012568, + "grad_norm": 0.049532145261764526, + "learning_rate": 2.6431240860098578e-05, + "loss": 0.0075, + "step": 9760 + }, + { + "epoch": 2.002913935980068, + "grad_norm": 0.21926824748516083, + "learning_rate": 2.6458322049504414e-05, + "loss": 0.1319, + "step": 9770 + }, + { + "epoch": 2.00296809835888, + "grad_norm": 1.1541472673416138, + "learning_rate": 2.6485403238910254e-05, + "loss": 0.3023, + "step": 9780 + }, + { + "epoch": 2.0030222607376915, + "grad_norm": 0.32991182804107666, + "learning_rate": 2.651248442831609e-05, + "loss": 0.1048, + "step": 9790 + }, + { + "epoch": 2.0030764231165032, + "grad_norm": 1.7069029808044434, + "learning_rate": 2.6539565617721935e-05, + "loss": 0.0841, + "step": 9800 + }, + { + "epoch": 2.003130585495315, + "grad_norm": 5.662327766418457, + "learning_rate": 2.656664680712777e-05, + 
"loss": 0.062, + "step": 9810 + }, + { + "epoch": 2.0031847478741267, + "grad_norm": 0.188495472073555, + "learning_rate": 2.6593727996533608e-05, + "loss": 0.1534, + "step": 9820 + }, + { + "epoch": 2.0032389102529384, + "grad_norm": 7.603240013122559, + "learning_rate": 2.662080918593945e-05, + "loss": 0.055, + "step": 9830 + }, + { + "epoch": 2.00329307263175, + "grad_norm": 0.42159390449523926, + "learning_rate": 2.6647890375345285e-05, + "loss": 0.0971, + "step": 9840 + }, + { + "epoch": 2.0033472350105614, + "grad_norm": 0.08963049948215485, + "learning_rate": 2.667497156475113e-05, + "loss": 0.1695, + "step": 9850 + }, + { + "epoch": 2.003401397389373, + "grad_norm": 0.2405349761247635, + "learning_rate": 2.6702052754156965e-05, + "loss": 0.2016, + "step": 9860 + }, + { + "epoch": 2.003455559768185, + "grad_norm": 0.3134572207927704, + "learning_rate": 2.6729133943562802e-05, + "loss": 0.0496, + "step": 9870 + }, + { + "epoch": 2.0035097221469966, + "grad_norm": 0.010201583616435528, + "learning_rate": 2.6756215132968642e-05, + "loss": 0.0972, + "step": 9880 + }, + { + "epoch": 2.0035638845258084, + "grad_norm": 0.1624917984008789, + "learning_rate": 2.678329632237448e-05, + "loss": 0.1077, + "step": 9890 + }, + { + "epoch": 2.00361804690462, + "grad_norm": 0.37294724583625793, + "learning_rate": 2.6810377511780323e-05, + "loss": 0.1305, + "step": 9900 + }, + { + "epoch": 2.003672209283432, + "grad_norm": 10.69211483001709, + "learning_rate": 2.683745870118616e-05, + "loss": 0.3361, + "step": 9910 + }, + { + "epoch": 2.0037263716622435, + "grad_norm": 0.40617281198501587, + "learning_rate": 2.6864539890591993e-05, + "loss": 0.055, + "step": 9920 + }, + { + "epoch": 2.0037805340410553, + "grad_norm": 0.5242623090744019, + "learning_rate": 2.6891621079997836e-05, + "loss": 0.268, + "step": 9930 + }, + { + "epoch": 2.0038346964198666, + "grad_norm": 4.605323791503906, + "learning_rate": 2.6918702269403673e-05, + "loss": 0.0521, + "step": 9940 + }, + { + "epoch": 2.0038888587986783, + "grad_norm": 0.6376580595970154, + "learning_rate": 2.6945783458809517e-05, + "loss": 0.1595, + "step": 9950 + }, + { + "epoch": 2.00394302117749, + "grad_norm": 0.031810611486434937, + "learning_rate": 2.697286464821535e-05, + "loss": 0.2097, + "step": 9960 + }, + { + "epoch": 2.0039971835563017, + "grad_norm": 2.9264512062072754, + "learning_rate": 2.6999945837621187e-05, + "loss": 0.1124, + "step": 9970 + }, + { + "epoch": 2.0040513459351135, + "grad_norm": 0.9297757744789124, + "learning_rate": 2.702702702702703e-05, + "loss": 0.0895, + "step": 9980 + }, + { + "epoch": 2.004105508313925, + "grad_norm": 0.14892695844173431, + "learning_rate": 2.7054108216432867e-05, + "loss": 0.1184, + "step": 9990 + }, + { + "epoch": 2.004159670692737, + "grad_norm": 0.4015558362007141, + "learning_rate": 2.7081189405838707e-05, + "loss": 0.0719, + "step": 10000 + }, + { + "epoch": 2.0042138330715487, + "grad_norm": 12.244056701660156, + "learning_rate": 2.7108270595244544e-05, + "loss": 0.0613, + "step": 10010 + }, + { + "epoch": 2.0042679954503604, + "grad_norm": 4.659317970275879, + "learning_rate": 2.713535178465038e-05, + "loss": 0.1295, + "step": 10020 + }, + { + "epoch": 2.0043221578291717, + "grad_norm": 0.18125808238983154, + "learning_rate": 2.7162432974056224e-05, + "loss": 0.0956, + "step": 10030 + }, + { + "epoch": 2.0043763202079834, + "grad_norm": 0.0063599757850170135, + "learning_rate": 2.718951416346206e-05, + "loss": 0.1262, + "step": 10040 + }, + { + "epoch": 2.004430482586795, + "grad_norm": 
0.003455224446952343, + "learning_rate": 2.72165953528679e-05, + "loss": 0.0617, + "step": 10050 + }, + { + "epoch": 2.004484644965607, + "grad_norm": 0.038315873593091965, + "learning_rate": 2.7243676542273738e-05, + "loss": 0.1585, + "step": 10060 + }, + { + "epoch": 2.0045388073444186, + "grad_norm": 11.802255630493164, + "learning_rate": 2.7270757731679574e-05, + "loss": 0.1333, + "step": 10070 + }, + { + "epoch": 2.0045929697232303, + "grad_norm": 0.500488817691803, + "learning_rate": 2.7297838921085418e-05, + "loss": 0.1222, + "step": 10080 + }, + { + "epoch": 2.004647132102042, + "grad_norm": 0.003573393914848566, + "learning_rate": 2.7324920110491255e-05, + "loss": 0.0755, + "step": 10090 + }, + { + "epoch": 2.0047012944808538, + "grad_norm": 12.56084156036377, + "learning_rate": 2.7352001299897095e-05, + "loss": 0.1373, + "step": 10100 + }, + { + "epoch": 2.0047554568596655, + "grad_norm": 7.36091947555542, + "learning_rate": 2.737908248930293e-05, + "loss": 0.1859, + "step": 10110 + }, + { + "epoch": 2.004809619238477, + "grad_norm": 0.20629718899726868, + "learning_rate": 2.740616367870877e-05, + "loss": 0.0533, + "step": 10120 + }, + { + "epoch": 2.0048637816172885, + "grad_norm": 3.326402425765991, + "learning_rate": 2.7433244868114612e-05, + "loss": 0.2077, + "step": 10130 + }, + { + "epoch": 2.0049179439961002, + "grad_norm": 3.7939960956573486, + "learning_rate": 2.7460326057520445e-05, + "loss": 0.1977, + "step": 10140 + }, + { + "epoch": 2.004972106374912, + "grad_norm": 0.008700119331479073, + "learning_rate": 2.748740724692629e-05, + "loss": 0.055, + "step": 10150 + }, + { + "epoch": 2.0050262687537237, + "grad_norm": 14.322308540344238, + "learning_rate": 2.7514488436332126e-05, + "loss": 0.135, + "step": 10160 + }, + { + "epoch": 2.0050804311325354, + "grad_norm": 0.477851539850235, + "learning_rate": 2.7541569625737962e-05, + "loss": 0.1886, + "step": 10170 + }, + { + "epoch": 2.005134593511347, + "grad_norm": 11.93802261352539, + "learning_rate": 2.7568650815143802e-05, + "loss": 0.1544, + "step": 10180 + }, + { + "epoch": 2.005188755890159, + "grad_norm": 0.06855883449316025, + "learning_rate": 2.759573200454964e-05, + "loss": 0.0596, + "step": 10190 + }, + { + "epoch": 2.00524291826897, + "grad_norm": 0.09509439766407013, + "learning_rate": 2.7622813193955483e-05, + "loss": 0.1566, + "step": 10200 + }, + { + "epoch": 2.005297080647782, + "grad_norm": 0.3731818199157715, + "learning_rate": 2.764989438336132e-05, + "loss": 0.015, + "step": 10210 + }, + { + "epoch": 2.0053512430265936, + "grad_norm": 0.03535544499754906, + "learning_rate": 2.7676975572767156e-05, + "loss": 0.0121, + "step": 10220 + }, + { + "epoch": 2.0054054054054054, + "grad_norm": 6.925097942352295, + "learning_rate": 2.7704056762172996e-05, + "loss": 0.2433, + "step": 10230 + }, + { + "epoch": 2.005459567784217, + "grad_norm": 0.0748184323310852, + "learning_rate": 2.7731137951578833e-05, + "loss": 0.3171, + "step": 10240 + }, + { + "epoch": 2.005513730163029, + "grad_norm": 0.4119904935359955, + "learning_rate": 2.7758219140984677e-05, + "loss": 0.0501, + "step": 10250 + }, + { + "epoch": 2.0055678925418405, + "grad_norm": 0.1276809573173523, + "learning_rate": 2.7785300330390513e-05, + "loss": 0.1618, + "step": 10260 + }, + { + "epoch": 2.0056220549206523, + "grad_norm": 0.4462754726409912, + "learning_rate": 2.781238151979635e-05, + "loss": 0.0479, + "step": 10270 + }, + { + "epoch": 2.005676217299464, + "grad_norm": 0.10795959830284119, + "learning_rate": 2.783946270920219e-05, + "loss": 
0.0529, + "step": 10280 + }, + { + "epoch": 2.0057303796782753, + "grad_norm": 3.5766828060150146, + "learning_rate": 2.7866543898608027e-05, + "loss": 0.1484, + "step": 10290 + }, + { + "epoch": 2.005784542057087, + "grad_norm": 0.44350987672805786, + "learning_rate": 2.789362508801387e-05, + "loss": 0.2419, + "step": 10300 + }, + { + "epoch": 2.0058387044358987, + "grad_norm": 8.363729476928711, + "learning_rate": 2.7920706277419707e-05, + "loss": 0.236, + "step": 10310 + }, + { + "epoch": 2.0058928668147105, + "grad_norm": 1.1305174827575684, + "learning_rate": 2.794778746682554e-05, + "loss": 0.0483, + "step": 10320 + }, + { + "epoch": 2.005947029193522, + "grad_norm": 0.5103623270988464, + "learning_rate": 2.7974868656231384e-05, + "loss": 0.2507, + "step": 10330 + }, + { + "epoch": 2.006001191572334, + "grad_norm": 0.46423742175102234, + "learning_rate": 2.800194984563722e-05, + "loss": 0.0495, + "step": 10340 + }, + { + "epoch": 2.0060553539511456, + "grad_norm": 0.011112891137599945, + "learning_rate": 2.8029031035043065e-05, + "loss": 0.1458, + "step": 10350 + }, + { + "epoch": 2.0061095163299574, + "grad_norm": 7.482539653778076, + "learning_rate": 2.8056112224448898e-05, + "loss": 0.1277, + "step": 10360 + }, + { + "epoch": 2.006163678708769, + "grad_norm": 0.0029995981603860855, + "learning_rate": 2.8083193413854735e-05, + "loss": 0.0942, + "step": 10370 + }, + { + "epoch": 2.0062178410875804, + "grad_norm": 25.057388305664062, + "learning_rate": 2.8110274603260578e-05, + "loss": 0.2927, + "step": 10380 + }, + { + "epoch": 2.006272003466392, + "grad_norm": 0.24729081988334656, + "learning_rate": 2.8137355792666415e-05, + "loss": 0.0756, + "step": 10390 + }, + { + "epoch": 2.006326165845204, + "grad_norm": 0.18017950654029846, + "learning_rate": 2.8164436982072255e-05, + "loss": 0.0439, + "step": 10400 + }, + { + "epoch": 2.0063803282240156, + "grad_norm": 6.286605358123779, + "learning_rate": 2.8191518171478092e-05, + "loss": 0.0802, + "step": 10410 + }, + { + "epoch": 2.0064344906028273, + "grad_norm": 0.02570434845983982, + "learning_rate": 2.821859936088393e-05, + "loss": 0.0898, + "step": 10420 + }, + { + "epoch": 2.006488652981639, + "grad_norm": 0.7869524359703064, + "learning_rate": 2.8245680550289772e-05, + "loss": 0.0665, + "step": 10430 + }, + { + "epoch": 2.0065428153604508, + "grad_norm": 0.04381055384874344, + "learning_rate": 2.827276173969561e-05, + "loss": 0.0637, + "step": 10440 + }, + { + "epoch": 2.0065969777392625, + "grad_norm": 0.004375217482447624, + "learning_rate": 2.829984292910145e-05, + "loss": 0.0375, + "step": 10450 + }, + { + "epoch": 2.0066511401180738, + "grad_norm": 0.07726301997900009, + "learning_rate": 2.8326924118507286e-05, + "loss": 0.12, + "step": 10460 + }, + { + "epoch": 2.0067053024968855, + "grad_norm": 3.568556308746338, + "learning_rate": 2.8354005307913122e-05, + "loss": 0.0929, + "step": 10470 + }, + { + "epoch": 2.0067594648756972, + "grad_norm": 13.49373722076416, + "learning_rate": 2.8381086497318966e-05, + "loss": 0.1962, + "step": 10480 + }, + { + "epoch": 2.006813627254509, + "grad_norm": 0.02308129332959652, + "learning_rate": 2.8408167686724803e-05, + "loss": 0.2877, + "step": 10490 + }, + { + "epoch": 2.0068677896333207, + "grad_norm": 2.942129135131836, + "learning_rate": 2.8435248876130643e-05, + "loss": 0.0412, + "step": 10500 + }, + { + "epoch": 2.0069219520121324, + "grad_norm": 1.596834421157837, + "learning_rate": 2.846233006553648e-05, + "loss": 0.1063, + "step": 10510 + }, + { + "epoch": 2.006976114390944, + 
"grad_norm": 6.4013991355896, + "learning_rate": 2.8489411254942316e-05, + "loss": 0.146, + "step": 10520 + }, + { + "epoch": 2.007030276769756, + "grad_norm": 3.101732015609741, + "learning_rate": 2.851649244434816e-05, + "loss": 0.0753, + "step": 10530 + }, + { + "epoch": 2.0070844391485676, + "grad_norm": 0.6356239914894104, + "learning_rate": 2.8543573633753993e-05, + "loss": 0.1545, + "step": 10540 + }, + { + "epoch": 2.007138601527379, + "grad_norm": 0.04726947471499443, + "learning_rate": 2.8570654823159837e-05, + "loss": 0.0768, + "step": 10550 + }, + { + "epoch": 2.0071927639061906, + "grad_norm": 1.6258841753005981, + "learning_rate": 2.8597736012565674e-05, + "loss": 0.0614, + "step": 10560 + }, + { + "epoch": 2.0072469262850023, + "grad_norm": 0.07138931006193161, + "learning_rate": 2.862481720197151e-05, + "loss": 0.1699, + "step": 10570 + }, + { + "epoch": 2.007301088663814, + "grad_norm": 0.0028737366665154696, + "learning_rate": 2.865189839137735e-05, + "loss": 0.1134, + "step": 10580 + }, + { + "epoch": 2.007355251042626, + "grad_norm": 0.12745460867881775, + "learning_rate": 2.8678979580783187e-05, + "loss": 0.0768, + "step": 10590 + }, + { + "epoch": 2.0074094134214375, + "grad_norm": 0.5090658664703369, + "learning_rate": 2.870606077018903e-05, + "loss": 0.1304, + "step": 10600 + }, + { + "epoch": 2.0074635758002493, + "grad_norm": 0.025167297571897507, + "learning_rate": 2.8733141959594868e-05, + "loss": 0.0407, + "step": 10610 + }, + { + "epoch": 2.007517738179061, + "grad_norm": 0.14709678292274475, + "learning_rate": 2.8760223149000704e-05, + "loss": 0.1329, + "step": 10620 + }, + { + "epoch": 2.0075719005578727, + "grad_norm": 0.6223087906837463, + "learning_rate": 2.8787304338406544e-05, + "loss": 0.047, + "step": 10630 + }, + { + "epoch": 2.007626062936684, + "grad_norm": 4.937402725219727, + "learning_rate": 2.881438552781238e-05, + "loss": 0.1287, + "step": 10640 + }, + { + "epoch": 2.0076802253154957, + "grad_norm": 0.20888376235961914, + "learning_rate": 2.8841466717218225e-05, + "loss": 0.1114, + "step": 10650 + }, + { + "epoch": 2.0077343876943075, + "grad_norm": 3.705263614654541, + "learning_rate": 2.886854790662406e-05, + "loss": 0.276, + "step": 10660 + }, + { + "epoch": 2.007788550073119, + "grad_norm": 0.1653277575969696, + "learning_rate": 2.8895629096029898e-05, + "loss": 0.179, + "step": 10670 + }, + { + "epoch": 2.007842712451931, + "grad_norm": 0.08433392643928528, + "learning_rate": 2.892271028543574e-05, + "loss": 0.1848, + "step": 10680 + }, + { + "epoch": 2.0078968748307426, + "grad_norm": 5.9364333152771, + "learning_rate": 2.8949791474841575e-05, + "loss": 0.199, + "step": 10690 + }, + { + "epoch": 2.0079510372095544, + "grad_norm": 3.670397996902466, + "learning_rate": 2.897687266424742e-05, + "loss": 0.1678, + "step": 10700 + }, + { + "epoch": 2.008005199588366, + "grad_norm": 0.29517433047294617, + "learning_rate": 2.9003953853653255e-05, + "loss": 0.0538, + "step": 10710 + }, + { + "epoch": 2.0080593619671774, + "grad_norm": 5.216148853302002, + "learning_rate": 2.9031035043059092e-05, + "loss": 0.0962, + "step": 10720 + }, + { + "epoch": 2.008113524345989, + "grad_norm": 0.16172175109386444, + "learning_rate": 2.9058116232464932e-05, + "loss": 0.081, + "step": 10730 + }, + { + "epoch": 2.008167686724801, + "grad_norm": 3.384450912475586, + "learning_rate": 2.908519742187077e-05, + "loss": 0.0781, + "step": 10740 + }, + { + "epoch": 2.0082218491036126, + "grad_norm": 0.0442298986017704, + "learning_rate": 2.9112278611276613e-05, + 
"loss": 0.1177, + "step": 10750 + }, + { + "epoch": 2.0082760114824243, + "grad_norm": 0.14054176211357117, + "learning_rate": 2.9139359800682446e-05, + "loss": 0.1053, + "step": 10760 + }, + { + "epoch": 2.008330173861236, + "grad_norm": 0.01707112044095993, + "learning_rate": 2.9166440990088283e-05, + "loss": 0.0587, + "step": 10770 + }, + { + "epoch": 2.0083843362400478, + "grad_norm": 13.002603530883789, + "learning_rate": 2.9193522179494126e-05, + "loss": 0.3642, + "step": 10780 + }, + { + "epoch": 2.0084384986188595, + "grad_norm": 0.12390416115522385, + "learning_rate": 2.9220603368899963e-05, + "loss": 0.1937, + "step": 10790 + }, + { + "epoch": 2.008492660997671, + "grad_norm": 0.013050038367509842, + "learning_rate": 2.9247684558305803e-05, + "loss": 0.0416, + "step": 10800 + }, + { + "epoch": 2.0085468233764825, + "grad_norm": 0.8763691186904907, + "learning_rate": 2.927476574771164e-05, + "loss": 0.1869, + "step": 10810 + }, + { + "epoch": 2.008600985755294, + "grad_norm": 0.024156974628567696, + "learning_rate": 2.9301846937117477e-05, + "loss": 0.2468, + "step": 10820 + }, + { + "epoch": 2.008655148134106, + "grad_norm": 4.367465496063232, + "learning_rate": 2.932892812652332e-05, + "loss": 0.0589, + "step": 10830 + }, + { + "epoch": 2.0087093105129177, + "grad_norm": 3.3792953491210938, + "learning_rate": 2.9356009315929157e-05, + "loss": 0.0413, + "step": 10840 + }, + { + "epoch": 2.0087634728917294, + "grad_norm": 0.06176166608929634, + "learning_rate": 2.9383090505334997e-05, + "loss": 0.1339, + "step": 10850 + }, + { + "epoch": 2.008817635270541, + "grad_norm": 0.0355217345058918, + "learning_rate": 2.9410171694740834e-05, + "loss": 0.1042, + "step": 10860 + }, + { + "epoch": 2.008871797649353, + "grad_norm": 3.857578992843628, + "learning_rate": 2.943725288414667e-05, + "loss": 0.1628, + "step": 10870 + }, + { + "epoch": 2.0089259600281646, + "grad_norm": 7.932945728302002, + "learning_rate": 2.9464334073552514e-05, + "loss": 0.0862, + "step": 10880 + }, + { + "epoch": 2.0089801224069763, + "grad_norm": 0.03422662243247032, + "learning_rate": 2.949141526295835e-05, + "loss": 0.0405, + "step": 10890 + }, + { + "epoch": 2.0090342847857876, + "grad_norm": 1.6163532733917236, + "learning_rate": 2.951849645236419e-05, + "loss": 0.1242, + "step": 10900 + }, + { + "epoch": 2.0090884471645993, + "grad_norm": 6.539446830749512, + "learning_rate": 2.9545577641770028e-05, + "loss": 0.1816, + "step": 10910 + }, + { + "epoch": 2.009142609543411, + "grad_norm": 0.13296186923980713, + "learning_rate": 2.9572658831175864e-05, + "loss": 0.0916, + "step": 10920 + }, + { + "epoch": 2.009196771922223, + "grad_norm": 0.07593723386526108, + "learning_rate": 2.9599740020581708e-05, + "loss": 0.0969, + "step": 10930 + }, + { + "epoch": 2.0092509343010345, + "grad_norm": 9.49351978302002, + "learning_rate": 2.9626821209987545e-05, + "loss": 0.1828, + "step": 10940 + }, + { + "epoch": 2.0093050966798462, + "grad_norm": 0.7837936878204346, + "learning_rate": 2.9653902399393385e-05, + "loss": 0.1203, + "step": 10950 + }, + { + "epoch": 2.009359259058658, + "grad_norm": 21.198623657226562, + "learning_rate": 2.968098358879922e-05, + "loss": 0.0549, + "step": 10960 + }, + { + "epoch": 2.0094134214374697, + "grad_norm": 0.08162837475538254, + "learning_rate": 2.970806477820506e-05, + "loss": 0.0829, + "step": 10970 + }, + { + "epoch": 2.0094675838162814, + "grad_norm": 10.3756103515625, + "learning_rate": 2.97351459676109e-05, + "loss": 0.1975, + "step": 10980 + }, + { + "epoch": 2.0095217461950927, 
+ "grad_norm": 0.337631493806839, + "learning_rate": 2.9762227157016735e-05, + "loss": 0.087, + "step": 10990 + }, + { + "epoch": 2.0095759085739044, + "grad_norm": 0.03793372958898544, + "learning_rate": 2.978930834642258e-05, + "loss": 0.035, + "step": 11000 + }, + { + "epoch": 2.009630070952716, + "grad_norm": 0.05545588955283165, + "learning_rate": 2.9816389535828416e-05, + "loss": 0.0557, + "step": 11010 + }, + { + "epoch": 2.009684233331528, + "grad_norm": 0.0044649033807218075, + "learning_rate": 2.9843470725234252e-05, + "loss": 0.2827, + "step": 11020 + }, + { + "epoch": 2.0097383957103396, + "grad_norm": 0.13376222550868988, + "learning_rate": 2.9870551914640092e-05, + "loss": 0.2, + "step": 11030 + }, + { + "epoch": 2.0097925580891514, + "grad_norm": 0.1770903319120407, + "learning_rate": 2.989763310404593e-05, + "loss": 0.0895, + "step": 11040 + }, + { + "epoch": 2.009846720467963, + "grad_norm": 8.169044494628906, + "learning_rate": 2.9924714293451773e-05, + "loss": 0.1205, + "step": 11050 + }, + { + "epoch": 2.009900882846775, + "grad_norm": 0.05922679975628853, + "learning_rate": 2.995179548285761e-05, + "loss": 0.1069, + "step": 11060 + }, + { + "epoch": 2.009955045225586, + "grad_norm": 0.5171164274215698, + "learning_rate": 2.9978876672263446e-05, + "loss": 0.1233, + "step": 11070 + }, + { + "epoch": 2.010009207604398, + "grad_norm": 0.04098554700613022, + "learning_rate": 3.0005957861669286e-05, + "loss": 0.1855, + "step": 11080 + }, + { + "epoch": 2.0100633699832096, + "grad_norm": 3.871633768081665, + "learning_rate": 3.0033039051075123e-05, + "loss": 0.084, + "step": 11090 + }, + { + "epoch": 2.0101175323620213, + "grad_norm": 0.02645137347280979, + "learning_rate": 3.0060120240480967e-05, + "loss": 0.1187, + "step": 11100 + }, + { + "epoch": 2.010171694740833, + "grad_norm": 0.045646291226148605, + "learning_rate": 3.0087201429886803e-05, + "loss": 0.1114, + "step": 11110 + }, + { + "epoch": 2.0102258571196447, + "grad_norm": 0.09569908678531647, + "learning_rate": 3.011428261929264e-05, + "loss": 0.1096, + "step": 11120 + }, + { + "epoch": 2.0102800194984565, + "grad_norm": 0.011157400906085968, + "learning_rate": 3.014136380869848e-05, + "loss": 0.0654, + "step": 11130 + }, + { + "epoch": 2.010334181877268, + "grad_norm": 6.388122081756592, + "learning_rate": 3.0168444998104317e-05, + "loss": 0.1392, + "step": 11140 + }, + { + "epoch": 2.01038834425608, + "grad_norm": 0.35880550742149353, + "learning_rate": 3.019552618751016e-05, + "loss": 0.061, + "step": 11150 + }, + { + "epoch": 2.010442506634891, + "grad_norm": 0.13029682636260986, + "learning_rate": 3.0222607376915997e-05, + "loss": 0.0931, + "step": 11160 + }, + { + "epoch": 2.010496669013703, + "grad_norm": 0.1609259992837906, + "learning_rate": 3.024968856632183e-05, + "loss": 0.1594, + "step": 11170 + }, + { + "epoch": 2.0105508313925147, + "grad_norm": 0.5064067244529724, + "learning_rate": 3.0276769755727674e-05, + "loss": 0.0901, + "step": 11180 + }, + { + "epoch": 2.0106049937713264, + "grad_norm": 0.13915587961673737, + "learning_rate": 3.030385094513351e-05, + "loss": 0.1036, + "step": 11190 + }, + { + "epoch": 2.010659156150138, + "grad_norm": 9.69935417175293, + "learning_rate": 3.033093213453935e-05, + "loss": 0.2377, + "step": 11200 + }, + { + "epoch": 2.01071331852895, + "grad_norm": 4.727365016937256, + "learning_rate": 3.0358013323945188e-05, + "loss": 0.1729, + "step": 11210 + }, + { + "epoch": 2.0107674809077616, + "grad_norm": 5.615570068359375, + "learning_rate": 3.0385094513351025e-05, + 
"loss": 0.361, + "step": 11220 + }, + { + "epoch": 2.0108216432865733, + "grad_norm": 0.6495730876922607, + "learning_rate": 3.0412175702756868e-05, + "loss": 0.0579, + "step": 11230 + }, + { + "epoch": 2.010875805665385, + "grad_norm": 2.885205030441284, + "learning_rate": 3.0439256892162705e-05, + "loss": 0.1456, + "step": 11240 + }, + { + "epoch": 2.0109299680441963, + "grad_norm": 3.769213914871216, + "learning_rate": 3.0466338081568545e-05, + "loss": 0.1156, + "step": 11250 + }, + { + "epoch": 2.010984130423008, + "grad_norm": 0.0688047856092453, + "learning_rate": 3.0493419270974382e-05, + "loss": 0.0138, + "step": 11260 + }, + { + "epoch": 2.01103829280182, + "grad_norm": 0.013385484926402569, + "learning_rate": 3.052050046038022e-05, + "loss": 0.0372, + "step": 11270 + }, + { + "epoch": 2.0110924551806315, + "grad_norm": 0.4245552122592926, + "learning_rate": 3.054758164978606e-05, + "loss": 0.1187, + "step": 11280 + }, + { + "epoch": 2.0111466175594432, + "grad_norm": 0.4624890685081482, + "learning_rate": 3.0574662839191895e-05, + "loss": 0.0233, + "step": 11290 + }, + { + "epoch": 2.011200779938255, + "grad_norm": 7.814807415008545, + "learning_rate": 3.060174402859774e-05, + "loss": 0.2186, + "step": 11300 + }, + { + "epoch": 2.0112549423170667, + "grad_norm": 3.2424368858337402, + "learning_rate": 3.0628825218003576e-05, + "loss": 0.3006, + "step": 11310 + }, + { + "epoch": 2.0113091046958784, + "grad_norm": 0.016877666115760803, + "learning_rate": 3.065590640740941e-05, + "loss": 0.1728, + "step": 11320 + }, + { + "epoch": 2.0113632670746897, + "grad_norm": 0.7634553909301758, + "learning_rate": 3.0682987596815256e-05, + "loss": 0.2301, + "step": 11330 + }, + { + "epoch": 2.0114174294535014, + "grad_norm": 0.059350281953811646, + "learning_rate": 3.071006878622109e-05, + "loss": 0.0923, + "step": 11340 + }, + { + "epoch": 2.011471591832313, + "grad_norm": 0.018498072400689125, + "learning_rate": 3.0737149975626936e-05, + "loss": 0.1949, + "step": 11350 + }, + { + "epoch": 2.011525754211125, + "grad_norm": 6.875436782836914, + "learning_rate": 3.076423116503277e-05, + "loss": 0.1076, + "step": 11360 + }, + { + "epoch": 2.0115799165899366, + "grad_norm": 2.2236738204956055, + "learning_rate": 3.07913123544386e-05, + "loss": 0.053, + "step": 11370 + }, + { + "epoch": 2.0116340789687484, + "grad_norm": 2.0142152309417725, + "learning_rate": 3.0818393543844446e-05, + "loss": 0.0684, + "step": 11380 + }, + { + "epoch": 2.01168824134756, + "grad_norm": 0.05115390568971634, + "learning_rate": 3.084547473325028e-05, + "loss": 0.132, + "step": 11390 + }, + { + "epoch": 2.011742403726372, + "grad_norm": 5.471987724304199, + "learning_rate": 3.087255592265613e-05, + "loss": 0.1752, + "step": 11400 + }, + { + "epoch": 2.0117965661051835, + "grad_norm": 0.11497990041971207, + "learning_rate": 3.0899637112061964e-05, + "loss": 0.0429, + "step": 11410 + }, + { + "epoch": 2.011850728483995, + "grad_norm": 0.05921848490834236, + "learning_rate": 3.09267183014678e-05, + "loss": 0.0463, + "step": 11420 + }, + { + "epoch": 2.0119048908628065, + "grad_norm": 7.342184066772461, + "learning_rate": 3.0953799490873644e-05, + "loss": 0.152, + "step": 11430 + }, + { + "epoch": 2.0119590532416183, + "grad_norm": 10.405746459960938, + "learning_rate": 3.098088068027948e-05, + "loss": 0.1124, + "step": 11440 + }, + { + "epoch": 2.01201321562043, + "grad_norm": 0.041833117604255676, + "learning_rate": 3.100796186968532e-05, + "loss": 0.1557, + "step": 11450 + }, + { + "epoch": 2.0120673779992417, + 
"grad_norm": 0.09370303153991699, + "learning_rate": 3.1035043059091154e-05, + "loss": 0.0497, + "step": 11460 + }, + { + "epoch": 2.0121215403780535, + "grad_norm": 0.7666518688201904, + "learning_rate": 3.106212424849699e-05, + "loss": 0.1778, + "step": 11470 + }, + { + "epoch": 2.012175702756865, + "grad_norm": 0.014196276664733887, + "learning_rate": 3.1089205437902834e-05, + "loss": 0.113, + "step": 11480 + }, + { + "epoch": 2.012229865135677, + "grad_norm": 0.008772369474172592, + "learning_rate": 3.111628662730867e-05, + "loss": 0.0864, + "step": 11490 + }, + { + "epoch": 2.0122840275144886, + "grad_norm": 5.983575344085693, + "learning_rate": 3.1143367816714515e-05, + "loss": 0.1888, + "step": 11500 + }, + { + "epoch": 2.0123381898933, + "grad_norm": 0.008744437247514725, + "learning_rate": 3.117044900612035e-05, + "loss": 0.0965, + "step": 11510 + }, + { + "epoch": 2.0123923522721117, + "grad_norm": 6.296879291534424, + "learning_rate": 3.119753019552619e-05, + "loss": 0.1919, + "step": 11520 + }, + { + "epoch": 2.0124465146509234, + "grad_norm": 0.11504576355218887, + "learning_rate": 3.122461138493203e-05, + "loss": 0.0327, + "step": 11530 + }, + { + "epoch": 2.012500677029735, + "grad_norm": 0.007781173102557659, + "learning_rate": 3.125169257433787e-05, + "loss": 0.0161, + "step": 11540 + }, + { + "epoch": 2.012554839408547, + "grad_norm": 24.066261291503906, + "learning_rate": 3.1278773763743705e-05, + "loss": 0.0943, + "step": 11550 + }, + { + "epoch": 2.0126090017873586, + "grad_norm": 0.045939650386571884, + "learning_rate": 3.130585495314954e-05, + "loss": 0.2537, + "step": 11560 + }, + { + "epoch": 2.0126631641661703, + "grad_norm": 1.0928617715835571, + "learning_rate": 3.133293614255538e-05, + "loss": 0.1511, + "step": 11570 + }, + { + "epoch": 2.012717326544982, + "grad_norm": 0.47520214319229126, + "learning_rate": 3.136001733196122e-05, + "loss": 0.1094, + "step": 11580 + }, + { + "epoch": 2.0127714889237933, + "grad_norm": 5.671433925628662, + "learning_rate": 3.138709852136706e-05, + "loss": 0.1016, + "step": 11590 + }, + { + "epoch": 2.012825651302605, + "grad_norm": 5.238346576690674, + "learning_rate": 3.14141797107729e-05, + "loss": 0.1735, + "step": 11600 + }, + { + "epoch": 2.0128798136814168, + "grad_norm": 0.32381361722946167, + "learning_rate": 3.144126090017874e-05, + "loss": 0.127, + "step": 11610 + }, + { + "epoch": 2.0129339760602285, + "grad_norm": 0.00988776981830597, + "learning_rate": 3.1468342089584576e-05, + "loss": 0.0055, + "step": 11620 + }, + { + "epoch": 2.0129881384390402, + "grad_norm": 1.0862452983856201, + "learning_rate": 3.149542327899041e-05, + "loss": 0.0904, + "step": 11630 + }, + { + "epoch": 2.013042300817852, + "grad_norm": 7.95330810546875, + "learning_rate": 3.152250446839625e-05, + "loss": 0.1307, + "step": 11640 + }, + { + "epoch": 2.0130964631966637, + "grad_norm": 0.03485788777470589, + "learning_rate": 3.154958565780209e-05, + "loss": 0.2265, + "step": 11650 + }, + { + "epoch": 2.0131506255754754, + "grad_norm": 15.599664688110352, + "learning_rate": 3.157666684720793e-05, + "loss": 0.0938, + "step": 11660 + }, + { + "epoch": 2.013204787954287, + "grad_norm": 5.874507904052734, + "learning_rate": 3.1603748036613767e-05, + "loss": 0.1558, + "step": 11670 + }, + { + "epoch": 2.0132589503330984, + "grad_norm": 0.06738252937793732, + "learning_rate": 3.163082922601961e-05, + "loss": 0.154, + "step": 11680 + }, + { + "epoch": 2.01331311271191, + "grad_norm": 0.023996414616703987, + "learning_rate": 3.165791041542545e-05, + 
"loss": 0.2183, + "step": 11690 + }, + { + "epoch": 2.013367275090722, + "grad_norm": 5.887137413024902, + "learning_rate": 3.168499160483129e-05, + "loss": 0.2642, + "step": 11700 + }, + { + "epoch": 2.0134214374695336, + "grad_norm": 3.5323596000671387, + "learning_rate": 3.171207279423713e-05, + "loss": 0.1881, + "step": 11710 + }, + { + "epoch": 2.0134755998483453, + "grad_norm": 4.75913143157959, + "learning_rate": 3.1739153983642964e-05, + "loss": 0.0359, + "step": 11720 + }, + { + "epoch": 2.013529762227157, + "grad_norm": 0.1018490120768547, + "learning_rate": 3.17662351730488e-05, + "loss": 0.2276, + "step": 11730 + }, + { + "epoch": 2.013583924605969, + "grad_norm": 0.32170718908309937, + "learning_rate": 3.179331636245464e-05, + "loss": 0.1631, + "step": 11740 + }, + { + "epoch": 2.0136380869847805, + "grad_norm": 0.1316663920879364, + "learning_rate": 3.182039755186048e-05, + "loss": 0.1124, + "step": 11750 + }, + { + "epoch": 2.0136922493635923, + "grad_norm": 13.302003860473633, + "learning_rate": 3.184747874126632e-05, + "loss": 0.2132, + "step": 11760 + }, + { + "epoch": 2.0137464117424035, + "grad_norm": 5.1972808837890625, + "learning_rate": 3.1874559930672154e-05, + "loss": 0.0968, + "step": 11770 + }, + { + "epoch": 2.0138005741212153, + "grad_norm": 6.667443752288818, + "learning_rate": 3.1901641120078e-05, + "loss": 0.1238, + "step": 11780 + }, + { + "epoch": 2.013854736500027, + "grad_norm": 0.5926281809806824, + "learning_rate": 3.1928722309483835e-05, + "loss": 0.0725, + "step": 11790 + }, + { + "epoch": 2.0139088988788387, + "grad_norm": 9.00967025756836, + "learning_rate": 3.195580349888968e-05, + "loss": 0.1224, + "step": 11800 + }, + { + "epoch": 2.0139630612576505, + "grad_norm": 0.8129690885543823, + "learning_rate": 3.198288468829551e-05, + "loss": 0.1839, + "step": 11810 + }, + { + "epoch": 2.014017223636462, + "grad_norm": 0.7865707278251648, + "learning_rate": 3.2009965877701345e-05, + "loss": 0.1786, + "step": 11820 + }, + { + "epoch": 2.014071386015274, + "grad_norm": 29.91607093811035, + "learning_rate": 3.203704706710719e-05, + "loss": 0.1454, + "step": 11830 + }, + { + "epoch": 2.0141255483940856, + "grad_norm": 1.102994441986084, + "learning_rate": 3.2064128256513025e-05, + "loss": 0.2002, + "step": 11840 + }, + { + "epoch": 2.014179710772897, + "grad_norm": 0.03364730253815651, + "learning_rate": 3.209120944591887e-05, + "loss": 0.0879, + "step": 11850 + }, + { + "epoch": 2.0142338731517087, + "grad_norm": 0.03363491967320442, + "learning_rate": 3.2118290635324705e-05, + "loss": 0.0942, + "step": 11860 + }, + { + "epoch": 2.0142880355305204, + "grad_norm": 13.74741268157959, + "learning_rate": 3.214537182473054e-05, + "loss": 0.2446, + "step": 11870 + }, + { + "epoch": 2.014342197909332, + "grad_norm": 1.1004738807678223, + "learning_rate": 3.2172453014136386e-05, + "loss": 0.0576, + "step": 11880 + }, + { + "epoch": 2.014396360288144, + "grad_norm": 0.33240827918052673, + "learning_rate": 3.219953420354222e-05, + "loss": 0.0897, + "step": 11890 + }, + { + "epoch": 2.0144505226669556, + "grad_norm": 0.12291119992733002, + "learning_rate": 3.222661539294806e-05, + "loss": 0.0087, + "step": 11900 + }, + { + "epoch": 2.0145046850457673, + "grad_norm": 3.137362003326416, + "learning_rate": 3.2253696582353896e-05, + "loss": 0.0306, + "step": 11910 + }, + { + "epoch": 2.014558847424579, + "grad_norm": 0.1821805089712143, + "learning_rate": 3.228077777175973e-05, + "loss": 0.1337, + "step": 11920 + }, + { + "epoch": 2.0146130098033908, + "grad_norm": 
0.3822456896305084, + "learning_rate": 3.2307858961165576e-05, + "loss": 0.1117, + "step": 11930 + }, + { + "epoch": 2.014667172182202, + "grad_norm": 11.097694396972656, + "learning_rate": 3.233494015057141e-05, + "loss": 0.18, + "step": 11940 + }, + { + "epoch": 2.0147213345610138, + "grad_norm": 0.007438318338245153, + "learning_rate": 3.2362021339977257e-05, + "loss": 0.0679, + "step": 11950 + }, + { + "epoch": 2.0147754969398255, + "grad_norm": 0.17116063833236694, + "learning_rate": 3.238910252938309e-05, + "loss": 0.0994, + "step": 11960 + }, + { + "epoch": 2.0148296593186372, + "grad_norm": 0.3889947533607483, + "learning_rate": 3.241618371878893e-05, + "loss": 0.2003, + "step": 11970 + }, + { + "epoch": 2.014883821697449, + "grad_norm": 6.237257957458496, + "learning_rate": 3.2443264908194774e-05, + "loss": 0.1238, + "step": 11980 + }, + { + "epoch": 2.0149379840762607, + "grad_norm": 9.157485961914062, + "learning_rate": 3.2470346097600604e-05, + "loss": 0.1692, + "step": 11990 + }, + { + "epoch": 2.0149921464550724, + "grad_norm": 6.859687328338623, + "learning_rate": 3.249742728700645e-05, + "loss": 0.2418, + "step": 12000 + }, + { + "epoch": 2.015046308833884, + "grad_norm": 0.0047452677972614765, + "learning_rate": 3.2524508476412284e-05, + "loss": 0.069, + "step": 12010 + }, + { + "epoch": 2.015100471212696, + "grad_norm": 0.14347423613071442, + "learning_rate": 3.255158966581812e-05, + "loss": 0.0839, + "step": 12020 + }, + { + "epoch": 2.015154633591507, + "grad_norm": 0.3434869349002838, + "learning_rate": 3.2578670855223964e-05, + "loss": 0.2117, + "step": 12030 + }, + { + "epoch": 2.015208795970319, + "grad_norm": 0.5351919531822205, + "learning_rate": 3.26057520446298e-05, + "loss": 0.0301, + "step": 12040 + }, + { + "epoch": 2.0152629583491306, + "grad_norm": 8.491357803344727, + "learning_rate": 3.2632833234035644e-05, + "loss": 0.1099, + "step": 12050 + }, + { + "epoch": 2.0153171207279423, + "grad_norm": 1.2556184530258179, + "learning_rate": 3.265991442344148e-05, + "loss": 0.0873, + "step": 12060 + }, + { + "epoch": 2.015371283106754, + "grad_norm": 0.9146931767463684, + "learning_rate": 3.268699561284732e-05, + "loss": 0.0412, + "step": 12070 + }, + { + "epoch": 2.015425445485566, + "grad_norm": 0.22716519236564636, + "learning_rate": 3.2714076802253155e-05, + "loss": 0.2298, + "step": 12080 + }, + { + "epoch": 2.0154796078643775, + "grad_norm": 0.014685595408082008, + "learning_rate": 3.274115799165899e-05, + "loss": 0.1586, + "step": 12090 + }, + { + "epoch": 2.0155337702431892, + "grad_norm": 0.005086178425699472, + "learning_rate": 3.2768239181064835e-05, + "loss": 0.1578, + "step": 12100 + }, + { + "epoch": 2.015587932622001, + "grad_norm": 5.492428302764893, + "learning_rate": 3.279532037047067e-05, + "loss": 0.193, + "step": 12110 + }, + { + "epoch": 2.0156420950008123, + "grad_norm": 1.057099461555481, + "learning_rate": 3.282240155987651e-05, + "loss": 0.0939, + "step": 12120 + }, + { + "epoch": 2.015696257379624, + "grad_norm": 3.5471510887145996, + "learning_rate": 3.284948274928235e-05, + "loss": 0.1116, + "step": 12130 + }, + { + "epoch": 2.0157504197584357, + "grad_norm": 0.060382574796676636, + "learning_rate": 3.287656393868819e-05, + "loss": 0.164, + "step": 12140 + }, + { + "epoch": 2.0158045821372474, + "grad_norm": 6.153659820556641, + "learning_rate": 3.290364512809403e-05, + "loss": 0.1013, + "step": 12150 + }, + { + "epoch": 2.015858744516059, + "grad_norm": 0.009963717311620712, + "learning_rate": 3.293072631749987e-05, + "loss": 0.0965, 
+ "step": 12160 + }, + { + "epoch": 2.015912906894871, + "grad_norm": 6.834847450256348, + "learning_rate": 3.2957807506905706e-05, + "loss": 0.1863, + "step": 12170 + }, + { + "epoch": 2.0159670692736826, + "grad_norm": 0.18618595600128174, + "learning_rate": 3.298488869631154e-05, + "loss": 0.1773, + "step": 12180 + }, + { + "epoch": 2.0160212316524944, + "grad_norm": 0.08487674593925476, + "learning_rate": 3.301196988571738e-05, + "loss": 0.0113, + "step": 12190 + }, + { + "epoch": 2.0160753940313056, + "grad_norm": 7.019794940948486, + "learning_rate": 3.303905107512322e-05, + "loss": 0.3476, + "step": 12200 + }, + { + "epoch": 2.0161295564101174, + "grad_norm": 4.588277816772461, + "learning_rate": 3.306613226452906e-05, + "loss": 0.1623, + "step": 12210 + }, + { + "epoch": 2.016183718788929, + "grad_norm": 3.7869319915771484, + "learning_rate": 3.3093213453934896e-05, + "loss": 0.1004, + "step": 12220 + }, + { + "epoch": 2.016237881167741, + "grad_norm": 11.600396156311035, + "learning_rate": 3.312029464334074e-05, + "loss": 0.1217, + "step": 12230 + }, + { + "epoch": 2.0162920435465526, + "grad_norm": 0.5873305797576904, + "learning_rate": 3.3147375832746577e-05, + "loss": 0.1434, + "step": 12240 + }, + { + "epoch": 2.0163462059253643, + "grad_norm": 0.14558346569538116, + "learning_rate": 3.317445702215241e-05, + "loss": 0.0524, + "step": 12250 + }, + { + "epoch": 2.016400368304176, + "grad_norm": 6.326120376586914, + "learning_rate": 3.320153821155825e-05, + "loss": 0.0918, + "step": 12260 + }, + { + "epoch": 2.0164545306829877, + "grad_norm": 0.03391530364751816, + "learning_rate": 3.322861940096409e-05, + "loss": 0.0909, + "step": 12270 + }, + { + "epoch": 2.0165086930617995, + "grad_norm": 5.653451442718506, + "learning_rate": 3.325570059036993e-05, + "loss": 0.2405, + "step": 12280 + }, + { + "epoch": 2.0165628554406108, + "grad_norm": 0.01566486433148384, + "learning_rate": 3.328278177977577e-05, + "loss": 0.1573, + "step": 12290 + }, + { + "epoch": 2.0166170178194225, + "grad_norm": 3.9634594917297363, + "learning_rate": 3.330986296918161e-05, + "loss": 0.084, + "step": 12300 + }, + { + "epoch": 2.016671180198234, + "grad_norm": 7.251890659332275, + "learning_rate": 3.333694415858745e-05, + "loss": 0.1964, + "step": 12310 + }, + { + "epoch": 2.016725342577046, + "grad_norm": 2.808184862136841, + "learning_rate": 3.3364025347993284e-05, + "loss": 0.1561, + "step": 12320 + }, + { + "epoch": 2.0167795049558577, + "grad_norm": 0.12122959643602371, + "learning_rate": 3.339110653739913e-05, + "loss": 0.1117, + "step": 12330 + }, + { + "epoch": 2.0168336673346694, + "grad_norm": 0.11696730554103851, + "learning_rate": 3.3418187726804964e-05, + "loss": 0.0439, + "step": 12340 + }, + { + "epoch": 2.016887829713481, + "grad_norm": 2.1736838817596436, + "learning_rate": 3.34452689162108e-05, + "loss": 0.1595, + "step": 12350 + }, + { + "epoch": 2.016941992092293, + "grad_norm": 0.23164257407188416, + "learning_rate": 3.347235010561664e-05, + "loss": 0.1083, + "step": 12360 + }, + { + "epoch": 2.0169961544711046, + "grad_norm": 3.1530017852783203, + "learning_rate": 3.3499431295022475e-05, + "loss": 0.0552, + "step": 12370 + }, + { + "epoch": 2.017050316849916, + "grad_norm": 0.11436858773231506, + "learning_rate": 3.352651248442832e-05, + "loss": 0.3773, + "step": 12380 + }, + { + "epoch": 2.0171044792287276, + "grad_norm": 5.262417793273926, + "learning_rate": 3.3553593673834155e-05, + "loss": 0.2231, + "step": 12390 + }, + { + "epoch": 2.0171586416075393, + "grad_norm": 
0.4468846023082733, + "learning_rate": 3.358067486324e-05, + "loss": 0.0627, + "step": 12400 + }, + { + "epoch": 2.017212803986351, + "grad_norm": 0.010541246272623539, + "learning_rate": 3.3607756052645835e-05, + "loss": 0.1009, + "step": 12410 + }, + { + "epoch": 2.017266966365163, + "grad_norm": 0.008608734235167503, + "learning_rate": 3.363483724205167e-05, + "loss": 0.0946, + "step": 12420 + }, + { + "epoch": 2.0173211287439745, + "grad_norm": 0.8011439442634583, + "learning_rate": 3.366191843145751e-05, + "loss": 0.2022, + "step": 12430 + }, + { + "epoch": 2.0173752911227862, + "grad_norm": 0.18882989883422852, + "learning_rate": 3.3688999620863346e-05, + "loss": 0.1414, + "step": 12440 + }, + { + "epoch": 2.017429453501598, + "grad_norm": 0.1596672534942627, + "learning_rate": 3.371608081026919e-05, + "loss": 0.0748, + "step": 12450 + }, + { + "epoch": 2.0174836158804093, + "grad_norm": 0.15817615389823914, + "learning_rate": 3.3743161999675026e-05, + "loss": 0.1342, + "step": 12460 + }, + { + "epoch": 2.017537778259221, + "grad_norm": 0.1655571460723877, + "learning_rate": 3.377024318908086e-05, + "loss": 0.0795, + "step": 12470 + }, + { + "epoch": 2.0175919406380327, + "grad_norm": 0.09432656317949295, + "learning_rate": 3.3797324378486706e-05, + "loss": 0.1147, + "step": 12480 + }, + { + "epoch": 2.0176461030168444, + "grad_norm": 0.666968584060669, + "learning_rate": 3.382440556789254e-05, + "loss": 0.1331, + "step": 12490 + }, + { + "epoch": 2.017700265395656, + "grad_norm": 0.1401696503162384, + "learning_rate": 3.3851486757298386e-05, + "loss": 0.0474, + "step": 12500 + }, + { + "epoch": 2.017754427774468, + "grad_norm": 0.017631804570555687, + "learning_rate": 3.387856794670422e-05, + "loss": 0.092, + "step": 12510 + }, + { + "epoch": 2.0178085901532796, + "grad_norm": 0.24326394498348236, + "learning_rate": 3.390564913611006e-05, + "loss": 0.1211, + "step": 12520 + }, + { + "epoch": 2.0178627525320914, + "grad_norm": 0.32090145349502563, + "learning_rate": 3.3932730325515897e-05, + "loss": 0.0608, + "step": 12530 + }, + { + "epoch": 2.017916914910903, + "grad_norm": 3.0666465759277344, + "learning_rate": 3.395981151492173e-05, + "loss": 0.1465, + "step": 12540 + }, + { + "epoch": 2.0179710772897144, + "grad_norm": 0.020955899730324745, + "learning_rate": 3.398689270432758e-05, + "loss": 0.1616, + "step": 12550 + }, + { + "epoch": 2.018025239668526, + "grad_norm": 0.8219888806343079, + "learning_rate": 3.4013973893733414e-05, + "loss": 0.0489, + "step": 12560 + }, + { + "epoch": 2.018079402047338, + "grad_norm": 0.6329140067100525, + "learning_rate": 3.404105508313925e-05, + "loss": 0.1147, + "step": 12570 + }, + { + "epoch": 2.0181335644261496, + "grad_norm": 0.2484271228313446, + "learning_rate": 3.4068136272545094e-05, + "loss": 0.2944, + "step": 12580 + }, + { + "epoch": 2.0181877268049613, + "grad_norm": 3.284980058670044, + "learning_rate": 3.409521746195093e-05, + "loss": 0.1562, + "step": 12590 + }, + { + "epoch": 2.018241889183773, + "grad_norm": 0.16266091167926788, + "learning_rate": 3.4122298651356774e-05, + "loss": 0.3182, + "step": 12600 + }, + { + "epoch": 2.0182960515625847, + "grad_norm": 0.2192050963640213, + "learning_rate": 3.4149379840762604e-05, + "loss": 0.1349, + "step": 12610 + }, + { + "epoch": 2.0183502139413965, + "grad_norm": 0.177836075425148, + "learning_rate": 3.417646103016844e-05, + "loss": 0.1362, + "step": 12620 + }, + { + "epoch": 2.018404376320208, + "grad_norm": 0.5155708193778992, + "learning_rate": 3.4203542219574284e-05, + "loss": 
0.1857, + "step": 12630 + }, + { + "epoch": 2.0184585386990195, + "grad_norm": 5.01894998550415, + "learning_rate": 3.423062340898012e-05, + "loss": 0.122, + "step": 12640 + }, + { + "epoch": 2.018512701077831, + "grad_norm": 10.946656227111816, + "learning_rate": 3.4257704598385965e-05, + "loss": 0.1839, + "step": 12650 + }, + { + "epoch": 2.018566863456643, + "grad_norm": 0.7470043897628784, + "learning_rate": 3.42847857877918e-05, + "loss": 0.2174, + "step": 12660 + }, + { + "epoch": 2.0186210258354547, + "grad_norm": 0.22185447812080383, + "learning_rate": 3.431186697719764e-05, + "loss": 0.1961, + "step": 12670 + }, + { + "epoch": 2.0186751882142664, + "grad_norm": 3.125663995742798, + "learning_rate": 3.433894816660348e-05, + "loss": 0.1079, + "step": 12680 + }, + { + "epoch": 2.018729350593078, + "grad_norm": 0.022659117355942726, + "learning_rate": 3.436602935600932e-05, + "loss": 0.2176, + "step": 12690 + }, + { + "epoch": 2.01878351297189, + "grad_norm": 7.36314582824707, + "learning_rate": 3.4393110545415155e-05, + "loss": 0.0763, + "step": 12700 + }, + { + "epoch": 2.0188376753507016, + "grad_norm": 0.02283978834748268, + "learning_rate": 3.442019173482099e-05, + "loss": 0.0584, + "step": 12710 + }, + { + "epoch": 2.018891837729513, + "grad_norm": 0.013137114234268665, + "learning_rate": 3.444727292422683e-05, + "loss": 0.1565, + "step": 12720 + }, + { + "epoch": 2.0189460001083246, + "grad_norm": 0.15513737499713898, + "learning_rate": 3.447435411363267e-05, + "loss": 0.1368, + "step": 12730 + }, + { + "epoch": 2.0190001624871363, + "grad_norm": 5.747733116149902, + "learning_rate": 3.450143530303851e-05, + "loss": 0.1531, + "step": 12740 + }, + { + "epoch": 2.019054324865948, + "grad_norm": 5.40429162979126, + "learning_rate": 3.452851649244435e-05, + "loss": 0.1603, + "step": 12750 + }, + { + "epoch": 2.0191084872447598, + "grad_norm": 0.2590388059616089, + "learning_rate": 3.455559768185019e-05, + "loss": 0.0254, + "step": 12760 + }, + { + "epoch": 2.0191626496235715, + "grad_norm": 0.0041846889071166515, + "learning_rate": 3.4582678871256026e-05, + "loss": 0.106, + "step": 12770 + }, + { + "epoch": 2.0192168120023832, + "grad_norm": 0.23071369528770447, + "learning_rate": 3.460976006066187e-05, + "loss": 0.22, + "step": 12780 + }, + { + "epoch": 2.019270974381195, + "grad_norm": 0.008287441916763783, + "learning_rate": 3.4636841250067706e-05, + "loss": 0.0992, + "step": 12790 + }, + { + "epoch": 2.0193251367600067, + "grad_norm": 1.2267436981201172, + "learning_rate": 3.466392243947354e-05, + "loss": 0.206, + "step": 12800 + }, + { + "epoch": 2.019379299138818, + "grad_norm": 3.2710859775543213, + "learning_rate": 3.469100362887938e-05, + "loss": 0.171, + "step": 12810 + }, + { + "epoch": 2.0194334615176297, + "grad_norm": 9.140034675598145, + "learning_rate": 3.471808481828522e-05, + "loss": 0.1927, + "step": 12820 + }, + { + "epoch": 2.0194876238964414, + "grad_norm": 1.2403239011764526, + "learning_rate": 3.474516600769106e-05, + "loss": 0.1287, + "step": 12830 + }, + { + "epoch": 2.019541786275253, + "grad_norm": 1.9632463455200195, + "learning_rate": 3.47722471970969e-05, + "loss": 0.071, + "step": 12840 + }, + { + "epoch": 2.019595948654065, + "grad_norm": 3.312678337097168, + "learning_rate": 3.479932838650274e-05, + "loss": 0.055, + "step": 12850 + }, + { + "epoch": 2.0196501110328766, + "grad_norm": 0.006416921969503164, + "learning_rate": 3.482640957590858e-05, + "loss": 0.0867, + "step": 12860 + }, + { + "epoch": 2.0197042734116883, + "grad_norm": 
6.560697555541992, + "learning_rate": 3.4853490765314414e-05, + "loss": 0.2906, + "step": 12870 + }, + { + "epoch": 2.0197584357905, + "grad_norm": 0.18041515350341797, + "learning_rate": 3.488057195472025e-05, + "loss": 0.1017, + "step": 12880 + }, + { + "epoch": 2.019812598169312, + "grad_norm": 0.42519694566726685, + "learning_rate": 3.490765314412609e-05, + "loss": 0.137, + "step": 12890 + }, + { + "epoch": 2.019866760548123, + "grad_norm": 0.6923977732658386, + "learning_rate": 3.493473433353193e-05, + "loss": 0.156, + "step": 12900 + }, + { + "epoch": 2.019920922926935, + "grad_norm": 2.6313233375549316, + "learning_rate": 3.496181552293777e-05, + "loss": 0.0645, + "step": 12910 + }, + { + "epoch": 2.0199750853057465, + "grad_norm": 6.015663146972656, + "learning_rate": 3.4988896712343604e-05, + "loss": 0.2079, + "step": 12920 + }, + { + "epoch": 2.0200292476845583, + "grad_norm": 0.008254194632172585, + "learning_rate": 3.501597790174945e-05, + "loss": 0.1091, + "step": 12930 + }, + { + "epoch": 2.02008341006337, + "grad_norm": 0.5977756977081299, + "learning_rate": 3.5043059091155285e-05, + "loss": 0.3165, + "step": 12940 + }, + { + "epoch": 2.0201375724421817, + "grad_norm": 0.9801216125488281, + "learning_rate": 3.507014028056113e-05, + "loss": 0.0493, + "step": 12950 + }, + { + "epoch": 2.0201917348209935, + "grad_norm": 0.2772237956523895, + "learning_rate": 3.5097221469966965e-05, + "loss": 0.1041, + "step": 12960 + }, + { + "epoch": 2.020245897199805, + "grad_norm": 0.30428028106689453, + "learning_rate": 3.51243026593728e-05, + "loss": 0.065, + "step": 12970 + }, + { + "epoch": 2.0203000595786165, + "grad_norm": 0.3907930850982666, + "learning_rate": 3.515138384877864e-05, + "loss": 0.2733, + "step": 12980 + }, + { + "epoch": 2.020354221957428, + "grad_norm": 0.007508458103984594, + "learning_rate": 3.5178465038184475e-05, + "loss": 0.1538, + "step": 12990 + }, + { + "epoch": 2.02040838433624, + "grad_norm": 0.33050742745399475, + "learning_rate": 3.520554622759032e-05, + "loss": 0.0654, + "step": 13000 + }, + { + "epoch": 2.0204625467150517, + "grad_norm": 2.156100034713745, + "learning_rate": 3.5232627416996156e-05, + "loss": 0.1025, + "step": 13010 + }, + { + "epoch": 2.0205167090938634, + "grad_norm": 0.6146904230117798, + "learning_rate": 3.525970860640199e-05, + "loss": 0.1627, + "step": 13020 + }, + { + "epoch": 2.020570871472675, + "grad_norm": 8.243313789367676, + "learning_rate": 3.5286789795807836e-05, + "loss": 0.078, + "step": 13030 + }, + { + "epoch": 2.020625033851487, + "grad_norm": 0.306072860956192, + "learning_rate": 3.531387098521367e-05, + "loss": 0.1146, + "step": 13040 + }, + { + "epoch": 2.0206791962302986, + "grad_norm": 5.561939716339111, + "learning_rate": 3.534095217461951e-05, + "loss": 0.1796, + "step": 13050 + }, + { + "epoch": 2.0207333586091103, + "grad_norm": 0.22522930800914764, + "learning_rate": 3.5368033364025346e-05, + "loss": 0.104, + "step": 13060 + }, + { + "epoch": 2.0207875209879216, + "grad_norm": 0.30582940578460693, + "learning_rate": 3.539511455343118e-05, + "loss": 0.0442, + "step": 13070 + }, + { + "epoch": 2.0208416833667333, + "grad_norm": 3.2265381813049316, + "learning_rate": 3.5422195742837026e-05, + "loss": 0.2485, + "step": 13080 + }, + { + "epoch": 2.020895845745545, + "grad_norm": 0.6003962159156799, + "learning_rate": 3.544927693224286e-05, + "loss": 0.086, + "step": 13090 + }, + { + "epoch": 2.0209500081243568, + "grad_norm": 0.06173943728208542, + "learning_rate": 3.547635812164871e-05, + "loss": 0.0309, + "step": 
13100 + }, + { + "epoch": 2.0210041705031685, + "grad_norm": 0.1695268601179123, + "learning_rate": 3.5503439311054543e-05, + "loss": 0.1236, + "step": 13110 + }, + { + "epoch": 2.0210583328819802, + "grad_norm": 0.04186403378844261, + "learning_rate": 3.553052050046038e-05, + "loss": 0.0973, + "step": 13120 + }, + { + "epoch": 2.021112495260792, + "grad_norm": 0.010137605480849743, + "learning_rate": 3.5557601689866224e-05, + "loss": 0.1558, + "step": 13130 + }, + { + "epoch": 2.0211666576396037, + "grad_norm": 3.771512746810913, + "learning_rate": 3.558468287927206e-05, + "loss": 0.199, + "step": 13140 + }, + { + "epoch": 2.0212208200184154, + "grad_norm": 0.024002406746149063, + "learning_rate": 3.56117640686779e-05, + "loss": 0.2427, + "step": 13150 + }, + { + "epoch": 2.0212749823972267, + "grad_norm": 0.06763074547052383, + "learning_rate": 3.5638845258083734e-05, + "loss": 0.1638, + "step": 13160 + }, + { + "epoch": 2.0213291447760384, + "grad_norm": 0.7310402989387512, + "learning_rate": 3.566592644748957e-05, + "loss": 0.1034, + "step": 13170 + }, + { + "epoch": 2.02138330715485, + "grad_norm": 1.8712944984436035, + "learning_rate": 3.5693007636895414e-05, + "loss": 0.0324, + "step": 13180 + }, + { + "epoch": 2.021437469533662, + "grad_norm": 0.3658255934715271, + "learning_rate": 3.572008882630125e-05, + "loss": 0.1621, + "step": 13190 + }, + { + "epoch": 2.0214916319124736, + "grad_norm": 0.007922078482806683, + "learning_rate": 3.5747170015707095e-05, + "loss": 0.1283, + "step": 13200 + }, + { + "epoch": 2.0215457942912853, + "grad_norm": 0.014067059382796288, + "learning_rate": 3.577425120511293e-05, + "loss": 0.0445, + "step": 13210 + }, + { + "epoch": 2.021599956670097, + "grad_norm": 0.015515385195612907, + "learning_rate": 3.5801332394518775e-05, + "loss": 0.1653, + "step": 13220 + }, + { + "epoch": 2.021654119048909, + "grad_norm": 2.9219281673431396, + "learning_rate": 3.582841358392461e-05, + "loss": 0.3304, + "step": 13230 + }, + { + "epoch": 2.0217082814277205, + "grad_norm": 15.487249374389648, + "learning_rate": 3.585549477333044e-05, + "loss": 0.0658, + "step": 13240 + }, + { + "epoch": 2.021762443806532, + "grad_norm": 0.023174168542027473, + "learning_rate": 3.5882575962736285e-05, + "loss": 0.1669, + "step": 13250 + }, + { + "epoch": 2.0218166061853435, + "grad_norm": 0.9400174617767334, + "learning_rate": 3.590965715214212e-05, + "loss": 0.122, + "step": 13260 + }, + { + "epoch": 2.0218707685641553, + "grad_norm": 0.012265530414879322, + "learning_rate": 3.5936738341547965e-05, + "loss": 0.0667, + "step": 13270 + }, + { + "epoch": 2.021924930942967, + "grad_norm": 0.02808716893196106, + "learning_rate": 3.59638195309538e-05, + "loss": 0.1016, + "step": 13280 + }, + { + "epoch": 2.0219790933217787, + "grad_norm": 0.03222494572401047, + "learning_rate": 3.599090072035964e-05, + "loss": 0.0654, + "step": 13290 + }, + { + "epoch": 2.0220332557005904, + "grad_norm": 0.023331142961978912, + "learning_rate": 3.601798190976548e-05, + "loss": 0.2178, + "step": 13300 + }, + { + "epoch": 2.022087418079402, + "grad_norm": 0.09499950706958771, + "learning_rate": 3.604506309917132e-05, + "loss": 0.1212, + "step": 13310 + }, + { + "epoch": 2.022141580458214, + "grad_norm": 0.16351404786109924, + "learning_rate": 3.6072144288577156e-05, + "loss": 0.058, + "step": 13320 + }, + { + "epoch": 2.022195742837025, + "grad_norm": 3.3329696655273438, + "learning_rate": 3.609922547798299e-05, + "loss": 0.1361, + "step": 13330 + }, + { + "epoch": 2.022249905215837, + "grad_norm": 
6.433936595916748, + "learning_rate": 3.612630666738883e-05, + "loss": 0.0928, + "step": 13340 + }, + { + "epoch": 2.0223040675946486, + "grad_norm": 4.309000015258789, + "learning_rate": 3.615338785679467e-05, + "loss": 0.1405, + "step": 13350 + }, + { + "epoch": 2.0223582299734604, + "grad_norm": 0.013730677776038647, + "learning_rate": 3.618046904620051e-05, + "loss": 0.1613, + "step": 13360 + }, + { + "epoch": 2.022412392352272, + "grad_norm": 0.7588696479797363, + "learning_rate": 3.620755023560635e-05, + "loss": 0.0241, + "step": 13370 + }, + { + "epoch": 2.022466554731084, + "grad_norm": 0.002065176609903574, + "learning_rate": 3.623463142501219e-05, + "loss": 0.015, + "step": 13380 + }, + { + "epoch": 2.0225207171098956, + "grad_norm": 0.002614889293909073, + "learning_rate": 3.626171261441803e-05, + "loss": 0.2023, + "step": 13390 + }, + { + "epoch": 2.0225748794887073, + "grad_norm": 14.495290756225586, + "learning_rate": 3.628879380382387e-05, + "loss": 0.102, + "step": 13400 + }, + { + "epoch": 2.022629041867519, + "grad_norm": 8.994537353515625, + "learning_rate": 3.631587499322971e-05, + "loss": 0.1748, + "step": 13410 + }, + { + "epoch": 2.0226832042463303, + "grad_norm": 3.4395573139190674, + "learning_rate": 3.6342956182635544e-05, + "loss": 0.257, + "step": 13420 + }, + { + "epoch": 2.022737366625142, + "grad_norm": 0.3111759126186371, + "learning_rate": 3.637003737204138e-05, + "loss": 0.0935, + "step": 13430 + }, + { + "epoch": 2.0227915290039538, + "grad_norm": 1.0231740474700928, + "learning_rate": 3.639711856144722e-05, + "loss": 0.1297, + "step": 13440 + }, + { + "epoch": 2.0228456913827655, + "grad_norm": 0.0030491643119603395, + "learning_rate": 3.642419975085306e-05, + "loss": 0.0844, + "step": 13450 + }, + { + "epoch": 2.022899853761577, + "grad_norm": 7.325822353363037, + "learning_rate": 3.64512809402589e-05, + "loss": 0.2195, + "step": 13460 + }, + { + "epoch": 2.022954016140389, + "grad_norm": 0.1416507214307785, + "learning_rate": 3.647836212966474e-05, + "loss": 0.0615, + "step": 13470 + }, + { + "epoch": 2.0230081785192007, + "grad_norm": 0.08682950586080551, + "learning_rate": 3.650544331907058e-05, + "loss": 0.1504, + "step": 13480 + }, + { + "epoch": 2.0230623408980124, + "grad_norm": 0.0068215117789804935, + "learning_rate": 3.6532524508476415e-05, + "loss": 0.0571, + "step": 13490 + }, + { + "epoch": 2.023116503276824, + "grad_norm": 6.818014144897461, + "learning_rate": 3.655960569788225e-05, + "loss": 0.1345, + "step": 13500 + }, + { + "epoch": 2.0231706656556354, + "grad_norm": 6.38495397567749, + "learning_rate": 3.658668688728809e-05, + "loss": 0.178, + "step": 13510 + }, + { + "epoch": 2.023224828034447, + "grad_norm": 0.16442205011844635, + "learning_rate": 3.661376807669393e-05, + "loss": 0.2208, + "step": 13520 + }, + { + "epoch": 2.023278990413259, + "grad_norm": 7.203094482421875, + "learning_rate": 3.664084926609977e-05, + "loss": 0.0797, + "step": 13530 + }, + { + "epoch": 2.0233331527920706, + "grad_norm": 5.29758882522583, + "learning_rate": 3.6667930455505605e-05, + "loss": 0.5534, + "step": 13540 + }, + { + "epoch": 2.0233873151708823, + "grad_norm": 0.26195988059043884, + "learning_rate": 3.669501164491145e-05, + "loss": 0.0837, + "step": 13550 + }, + { + "epoch": 2.023441477549694, + "grad_norm": 0.02237279713153839, + "learning_rate": 3.6722092834317285e-05, + "loss": 0.0911, + "step": 13560 + }, + { + "epoch": 2.023495639928506, + "grad_norm": 0.04837825521826744, + "learning_rate": 3.674917402372313e-05, + "loss": 0.069, + 
"step": 13570 + }, + { + "epoch": 2.0235498023073175, + "grad_norm": 4.6657633781433105, + "learning_rate": 3.6776255213128966e-05, + "loss": 0.215, + "step": 13580 + }, + { + "epoch": 2.023603964686129, + "grad_norm": 0.19346962869167328, + "learning_rate": 3.68033364025348e-05, + "loss": 0.4189, + "step": 13590 + }, + { + "epoch": 2.0236581270649405, + "grad_norm": 6.103495121002197, + "learning_rate": 3.683041759194064e-05, + "loss": 0.1715, + "step": 13600 + }, + { + "epoch": 2.0237122894437523, + "grad_norm": 0.32517480850219727, + "learning_rate": 3.6857498781346476e-05, + "loss": 0.0464, + "step": 13610 + }, + { + "epoch": 2.023766451822564, + "grad_norm": 0.44383710622787476, + "learning_rate": 3.688457997075232e-05, + "loss": 0.1349, + "step": 13620 + }, + { + "epoch": 2.0238206142013757, + "grad_norm": 0.04815734177827835, + "learning_rate": 3.6911661160158156e-05, + "loss": 0.044, + "step": 13630 + }, + { + "epoch": 2.0238747765801874, + "grad_norm": 0.13865543901920319, + "learning_rate": 3.693874234956399e-05, + "loss": 0.1451, + "step": 13640 + }, + { + "epoch": 2.023928938958999, + "grad_norm": 0.6248111724853516, + "learning_rate": 3.6965823538969836e-05, + "loss": 0.0571, + "step": 13650 + }, + { + "epoch": 2.023983101337811, + "grad_norm": 0.005184170790016651, + "learning_rate": 3.699290472837567e-05, + "loss": 0.0586, + "step": 13660 + }, + { + "epoch": 2.0240372637166226, + "grad_norm": 0.04787658527493477, + "learning_rate": 3.701998591778152e-05, + "loss": 0.1335, + "step": 13670 + }, + { + "epoch": 2.024091426095434, + "grad_norm": 0.16856802999973297, + "learning_rate": 3.704706710718735e-05, + "loss": 0.1537, + "step": 13680 + }, + { + "epoch": 2.0241455884742456, + "grad_norm": 0.8261680603027344, + "learning_rate": 3.7074148296593183e-05, + "loss": 0.2705, + "step": 13690 + }, + { + "epoch": 2.0241997508530574, + "grad_norm": 0.22349123656749725, + "learning_rate": 3.710122948599903e-05, + "loss": 0.0841, + "step": 13700 + }, + { + "epoch": 2.024253913231869, + "grad_norm": 0.007204626686871052, + "learning_rate": 3.7128310675404864e-05, + "loss": 0.1228, + "step": 13710 + }, + { + "epoch": 2.024308075610681, + "grad_norm": 9.543985366821289, + "learning_rate": 3.715539186481071e-05, + "loss": 0.172, + "step": 13720 + }, + { + "epoch": 2.0243622379894926, + "grad_norm": 3.902230978012085, + "learning_rate": 3.7182473054216544e-05, + "loss": 0.0423, + "step": 13730 + }, + { + "epoch": 2.0244164003683043, + "grad_norm": 0.12203951179981232, + "learning_rate": 3.720955424362238e-05, + "loss": 0.1951, + "step": 13740 + }, + { + "epoch": 2.024470562747116, + "grad_norm": 11.332357406616211, + "learning_rate": 3.7236635433028224e-05, + "loss": 0.1704, + "step": 13750 + }, + { + "epoch": 2.0245247251259277, + "grad_norm": 0.08056975156068802, + "learning_rate": 3.726371662243406e-05, + "loss": 0.1295, + "step": 13760 + }, + { + "epoch": 2.024578887504739, + "grad_norm": 0.01079555694013834, + "learning_rate": 3.72907978118399e-05, + "loss": 0.1455, + "step": 13770 + }, + { + "epoch": 2.0246330498835508, + "grad_norm": 0.01123076118528843, + "learning_rate": 3.7317879001245735e-05, + "loss": 0.0099, + "step": 13780 + }, + { + "epoch": 2.0246872122623625, + "grad_norm": 6.710317134857178, + "learning_rate": 3.734496019065157e-05, + "loss": 0.2998, + "step": 13790 + }, + { + "epoch": 2.024741374641174, + "grad_norm": 0.7923845648765564, + "learning_rate": 3.7372041380057415e-05, + "loss": 0.161, + "step": 13800 + }, + { + "epoch": 2.024795537019986, + "grad_norm": 
0.6541853547096252, + "learning_rate": 3.739912256946325e-05, + "loss": 0.0714, + "step": 13810 + }, + { + "epoch": 2.0248496993987977, + "grad_norm": 0.401120662689209, + "learning_rate": 3.7426203758869095e-05, + "loss": 0.1716, + "step": 13820 + }, + { + "epoch": 2.0249038617776094, + "grad_norm": 2.8608338832855225, + "learning_rate": 3.745328494827493e-05, + "loss": 0.1601, + "step": 13830 + }, + { + "epoch": 2.024958024156421, + "grad_norm": 0.022359520196914673, + "learning_rate": 3.748036613768077e-05, + "loss": 0.0482, + "step": 13840 + }, + { + "epoch": 2.0250013540594702, + "eval_accuracy": 0.8249510124101894, + "eval_loss": 0.38916346430778503, + "eval_runtime": 116.4289, + "eval_samples_per_second": 26.299, + "eval_steps_per_second": 3.29, + "step": 13848 + }, + { + "epoch": 3.000010832475762, + "grad_norm": 0.04362250119447708, + "learning_rate": 3.750744732708661e-05, + "loss": 0.0774, + "step": 13850 + }, + { + "epoch": 3.000064994854574, + "grad_norm": 0.0164005346596241, + "learning_rate": 3.753452851649244e-05, + "loss": 0.0117, + "step": 13860 + }, + { + "epoch": 3.0001191572333856, + "grad_norm": 0.06705018877983093, + "learning_rate": 3.7561609705898286e-05, + "loss": 0.0695, + "step": 13870 + }, + { + "epoch": 3.0001733196121974, + "grad_norm": 0.2872704267501831, + "learning_rate": 3.758869089530412e-05, + "loss": 0.0278, + "step": 13880 + }, + { + "epoch": 3.000227481991009, + "grad_norm": 0.2471306174993515, + "learning_rate": 3.761577208470996e-05, + "loss": 0.1863, + "step": 13890 + }, + { + "epoch": 3.000281644369821, + "grad_norm": 4.044432163238525, + "learning_rate": 3.76428532741158e-05, + "loss": 0.1821, + "step": 13900 + }, + { + "epoch": 3.0003358067486325, + "grad_norm": 0.2968287765979767, + "learning_rate": 3.766993446352164e-05, + "loss": 0.0807, + "step": 13910 + }, + { + "epoch": 3.0003899691274443, + "grad_norm": 5.592907428741455, + "learning_rate": 3.769701565292748e-05, + "loss": 0.1293, + "step": 13920 + }, + { + "epoch": 3.0004441315062556, + "grad_norm": 11.229927062988281, + "learning_rate": 3.772409684233332e-05, + "loss": 0.2937, + "step": 13930 + }, + { + "epoch": 3.0004982938850673, + "grad_norm": 3.5362038612365723, + "learning_rate": 3.7751178031739156e-05, + "loss": 0.0861, + "step": 13940 + }, + { + "epoch": 3.000552456263879, + "grad_norm": 0.06821081042289734, + "learning_rate": 3.777825922114499e-05, + "loss": 0.0223, + "step": 13950 + }, + { + "epoch": 3.0006066186426907, + "grad_norm": 0.06330364942550659, + "learning_rate": 3.780534041055083e-05, + "loss": 0.0648, + "step": 13960 + }, + { + "epoch": 3.0006607810215025, + "grad_norm": 8.54532527923584, + "learning_rate": 3.7832421599956674e-05, + "loss": 0.0513, + "step": 13970 + }, + { + "epoch": 3.000714943400314, + "grad_norm": 3.106491804122925, + "learning_rate": 3.785950278936251e-05, + "loss": 0.1193, + "step": 13980 + }, + { + "epoch": 3.000769105779126, + "grad_norm": 4.965819835662842, + "learning_rate": 3.788658397876835e-05, + "loss": 0.1418, + "step": 13990 + }, + { + "epoch": 3.0008232681579377, + "grad_norm": 7.625452518463135, + "learning_rate": 3.791366516817419e-05, + "loss": 0.2203, + "step": 14000 + }, + { + "epoch": 3.0008774305367494, + "grad_norm": 0.8365005850791931, + "learning_rate": 3.794074635758003e-05, + "loss": 0.171, + "step": 14010 + }, + { + "epoch": 3.0009315929155607, + "grad_norm": 7.312850475311279, + "learning_rate": 3.796782754698587e-05, + "loss": 0.1805, + "step": 14020 + }, + { + "epoch": 3.0009857552943724, + "grad_norm": 
0.7580282092094421, + "learning_rate": 3.799490873639171e-05, + "loss": 0.1462, + "step": 14030 + }, + { + "epoch": 3.001039917673184, + "grad_norm": 0.010218721814453602, + "learning_rate": 3.802198992579754e-05, + "loss": 0.0376, + "step": 14040 + }, + { + "epoch": 3.001094080051996, + "grad_norm": 0.14862680435180664, + "learning_rate": 3.804907111520338e-05, + "loss": 0.1245, + "step": 14050 + }, + { + "epoch": 3.0011482424308076, + "grad_norm": 1.1603198051452637, + "learning_rate": 3.807615230460922e-05, + "loss": 0.1051, + "step": 14060 + }, + { + "epoch": 3.0012024048096193, + "grad_norm": 0.06616222858428955, + "learning_rate": 3.810323349401506e-05, + "loss": 0.0679, + "step": 14070 + }, + { + "epoch": 3.001256567188431, + "grad_norm": 0.22682209312915802, + "learning_rate": 3.81303146834209e-05, + "loss": 0.0488, + "step": 14080 + }, + { + "epoch": 3.0013107295672428, + "grad_norm": 0.6821897029876709, + "learning_rate": 3.8157395872826735e-05, + "loss": 0.1405, + "step": 14090 + }, + { + "epoch": 3.0013648919460545, + "grad_norm": 0.002249367069453001, + "learning_rate": 3.818447706223258e-05, + "loss": 0.0875, + "step": 14100 + }, + { + "epoch": 3.0014190543248658, + "grad_norm": 0.0059583187103271484, + "learning_rate": 3.8211558251638415e-05, + "loss": 0.1928, + "step": 14110 + }, + { + "epoch": 3.0014732167036775, + "grad_norm": 0.738783061504364, + "learning_rate": 3.823863944104425e-05, + "loss": 0.077, + "step": 14120 + }, + { + "epoch": 3.0015273790824892, + "grad_norm": 0.05391351133584976, + "learning_rate": 3.826572063045009e-05, + "loss": 0.0575, + "step": 14130 + }, + { + "epoch": 3.001581541461301, + "grad_norm": 10.350727081298828, + "learning_rate": 3.8292801819855925e-05, + "loss": 0.0826, + "step": 14140 + }, + { + "epoch": 3.0016357038401127, + "grad_norm": 0.3127734065055847, + "learning_rate": 3.831988300926177e-05, + "loss": 0.0404, + "step": 14150 + }, + { + "epoch": 3.0016898662189244, + "grad_norm": 0.09456849843263626, + "learning_rate": 3.8346964198667606e-05, + "loss": 0.2412, + "step": 14160 + }, + { + "epoch": 3.001744028597736, + "grad_norm": 0.20611435174942017, + "learning_rate": 3.837404538807345e-05, + "loss": 0.0259, + "step": 14170 + }, + { + "epoch": 3.001798190976548, + "grad_norm": 14.493871688842773, + "learning_rate": 3.8401126577479286e-05, + "loss": 0.0529, + "step": 14180 + }, + { + "epoch": 3.001852353355359, + "grad_norm": 0.012571786530315876, + "learning_rate": 3.842820776688512e-05, + "loss": 0.1394, + "step": 14190 + }, + { + "epoch": 3.001906515734171, + "grad_norm": 10.262550354003906, + "learning_rate": 3.8455288956290966e-05, + "loss": 0.3059, + "step": 14200 + }, + { + "epoch": 3.0019606781129826, + "grad_norm": 0.4705824553966522, + "learning_rate": 3.84823701456968e-05, + "loss": 0.1502, + "step": 14210 + }, + { + "epoch": 3.0020148404917943, + "grad_norm": 0.060341522097587585, + "learning_rate": 3.850945133510264e-05, + "loss": 0.0368, + "step": 14220 + }, + { + "epoch": 3.002069002870606, + "grad_norm": 0.23280911147594452, + "learning_rate": 3.8536532524508476e-05, + "loss": 0.1068, + "step": 14230 + }, + { + "epoch": 3.002123165249418, + "grad_norm": 2.736137866973877, + "learning_rate": 3.856361371391431e-05, + "loss": 0.0081, + "step": 14240 + }, + { + "epoch": 3.0021773276282295, + "grad_norm": 14.995973587036133, + "learning_rate": 3.859069490332016e-05, + "loss": 0.1889, + "step": 14250 + }, + { + "epoch": 3.0022314900070413, + "grad_norm": 0.008164318278431892, + "learning_rate": 3.8617776092725994e-05, + 
"loss": 0.1447, + "step": 14260 + }, + { + "epoch": 3.002285652385853, + "grad_norm": 0.0023840402718633413, + "learning_rate": 3.864485728213184e-05, + "loss": 0.0709, + "step": 14270 + }, + { + "epoch": 3.0023398147646643, + "grad_norm": 5.669618606567383, + "learning_rate": 3.8671938471537674e-05, + "loss": 0.1945, + "step": 14280 + }, + { + "epoch": 3.002393977143476, + "grad_norm": 7.262461185455322, + "learning_rate": 3.869901966094351e-05, + "loss": 0.1111, + "step": 14290 + }, + { + "epoch": 3.0024481395222877, + "grad_norm": 7.714439868927002, + "learning_rate": 3.872610085034935e-05, + "loss": 0.0893, + "step": 14300 + }, + { + "epoch": 3.0025023019010995, + "grad_norm": 2.9693408012390137, + "learning_rate": 3.8753182039755184e-05, + "loss": 0.0577, + "step": 14310 + }, + { + "epoch": 3.002556464279911, + "grad_norm": 0.1917165219783783, + "learning_rate": 3.878026322916103e-05, + "loss": 0.0716, + "step": 14320 + }, + { + "epoch": 3.002610626658723, + "grad_norm": 0.27298974990844727, + "learning_rate": 3.8807344418566864e-05, + "loss": 0.1273, + "step": 14330 + }, + { + "epoch": 3.0026647890375346, + "grad_norm": 0.4067623019218445, + "learning_rate": 3.88344256079727e-05, + "loss": 0.1211, + "step": 14340 + }, + { + "epoch": 3.0027189514163464, + "grad_norm": 3.616288423538208, + "learning_rate": 3.8861506797378545e-05, + "loss": 0.2349, + "step": 14350 + }, + { + "epoch": 3.002773113795158, + "grad_norm": 9.005189895629883, + "learning_rate": 3.888858798678438e-05, + "loss": 0.2168, + "step": 14360 + }, + { + "epoch": 3.0028272761739694, + "grad_norm": 0.7161813378334045, + "learning_rate": 3.8915669176190225e-05, + "loss": 0.1601, + "step": 14370 + }, + { + "epoch": 3.002881438552781, + "grad_norm": 0.08812485635280609, + "learning_rate": 3.894275036559606e-05, + "loss": 0.0682, + "step": 14380 + }, + { + "epoch": 3.002935600931593, + "grad_norm": 12.620969772338867, + "learning_rate": 3.89698315550019e-05, + "loss": 0.0691, + "step": 14390 + }, + { + "epoch": 3.0029897633104046, + "grad_norm": 0.06622711569070816, + "learning_rate": 3.8996912744407735e-05, + "loss": 0.282, + "step": 14400 + }, + { + "epoch": 3.0030439256892163, + "grad_norm": 0.029284248128533363, + "learning_rate": 3.902399393381357e-05, + "loss": 0.3501, + "step": 14410 + }, + { + "epoch": 3.003098088068028, + "grad_norm": 9.817947387695312, + "learning_rate": 3.9051075123219415e-05, + "loss": 0.0428, + "step": 14420 + }, + { + "epoch": 3.0031522504468398, + "grad_norm": 0.13313326239585876, + "learning_rate": 3.907815631262525e-05, + "loss": 0.1325, + "step": 14430 + }, + { + "epoch": 3.0032064128256515, + "grad_norm": 0.055329691618680954, + "learning_rate": 3.910523750203109e-05, + "loss": 0.1392, + "step": 14440 + }, + { + "epoch": 3.0032605752044628, + "grad_norm": 0.3811231553554535, + "learning_rate": 3.913231869143693e-05, + "loss": 0.0659, + "step": 14450 + }, + { + "epoch": 3.0033147375832745, + "grad_norm": 7.773914813995361, + "learning_rate": 3.915939988084277e-05, + "loss": 0.1188, + "step": 14460 + }, + { + "epoch": 3.0033688999620862, + "grad_norm": 0.05268823727965355, + "learning_rate": 3.918648107024861e-05, + "loss": 0.2653, + "step": 14470 + }, + { + "epoch": 3.003423062340898, + "grad_norm": 3.170468807220459, + "learning_rate": 3.921356225965444e-05, + "loss": 0.1033, + "step": 14480 + }, + { + "epoch": 3.0034772247197097, + "grad_norm": 0.10087042301893234, + "learning_rate": 3.924064344906028e-05, + "loss": 0.1718, + "step": 14490 + }, + { + "epoch": 3.0035313870985214, + 
"grad_norm": 0.41437727212905884, + "learning_rate": 3.926772463846612e-05, + "loss": 0.0594, + "step": 14500 + }, + { + "epoch": 3.003585549477333, + "grad_norm": 3.0519485473632812, + "learning_rate": 3.929480582787196e-05, + "loss": 0.1265, + "step": 14510 + }, + { + "epoch": 3.003639711856145, + "grad_norm": 0.10549195855855942, + "learning_rate": 3.93218870172778e-05, + "loss": 0.032, + "step": 14520 + }, + { + "epoch": 3.0036938742349566, + "grad_norm": 0.007695755455642939, + "learning_rate": 3.934896820668364e-05, + "loss": 0.0604, + "step": 14530 + }, + { + "epoch": 3.003748036613768, + "grad_norm": 5.661778450012207, + "learning_rate": 3.937604939608948e-05, + "loss": 0.162, + "step": 14540 + }, + { + "epoch": 3.0038021989925796, + "grad_norm": 0.34003743529319763, + "learning_rate": 3.940313058549532e-05, + "loss": 0.1079, + "step": 14550 + }, + { + "epoch": 3.0038563613713913, + "grad_norm": 0.10481598973274231, + "learning_rate": 3.943021177490116e-05, + "loss": 0.0467, + "step": 14560 + }, + { + "epoch": 3.003910523750203, + "grad_norm": 3.7295217514038086, + "learning_rate": 3.9457292964306994e-05, + "loss": 0.2537, + "step": 14570 + }, + { + "epoch": 3.003964686129015, + "grad_norm": 2.518439531326294, + "learning_rate": 3.948437415371283e-05, + "loss": 0.0468, + "step": 14580 + }, + { + "epoch": 3.0040188485078265, + "grad_norm": 0.061987631022930145, + "learning_rate": 3.951145534311867e-05, + "loss": 0.1971, + "step": 14590 + }, + { + "epoch": 3.0040730108866383, + "grad_norm": 0.2965956926345825, + "learning_rate": 3.953853653252451e-05, + "loss": 0.1315, + "step": 14600 + }, + { + "epoch": 3.00412717326545, + "grad_norm": 0.05713273584842682, + "learning_rate": 3.956561772193035e-05, + "loss": 0.1046, + "step": 14610 + }, + { + "epoch": 3.0041813356442617, + "grad_norm": 0.38882923126220703, + "learning_rate": 3.959269891133619e-05, + "loss": 0.0806, + "step": 14620 + }, + { + "epoch": 3.004235498023073, + "grad_norm": 15.70897102355957, + "learning_rate": 3.961978010074203e-05, + "loss": 0.1365, + "step": 14630 + }, + { + "epoch": 3.0042896604018847, + "grad_norm": 0.02958429977297783, + "learning_rate": 3.9646861290147865e-05, + "loss": 0.0691, + "step": 14640 + }, + { + "epoch": 3.0043438227806964, + "grad_norm": 0.22968076169490814, + "learning_rate": 3.967394247955371e-05, + "loss": 0.0586, + "step": 14650 + }, + { + "epoch": 3.004397985159508, + "grad_norm": 0.291593074798584, + "learning_rate": 3.9701023668959545e-05, + "loss": 0.0756, + "step": 14660 + }, + { + "epoch": 3.00445214753832, + "grad_norm": 10.852106094360352, + "learning_rate": 3.972810485836538e-05, + "loss": 0.1043, + "step": 14670 + }, + { + "epoch": 3.0045063099171316, + "grad_norm": 0.05304556339979172, + "learning_rate": 3.975518604777122e-05, + "loss": 0.0837, + "step": 14680 + }, + { + "epoch": 3.0045604722959434, + "grad_norm": 1.1212427616119385, + "learning_rate": 3.9782267237177055e-05, + "loss": 0.0866, + "step": 14690 + }, + { + "epoch": 3.004614634674755, + "grad_norm": 0.11061805486679077, + "learning_rate": 3.98093484265829e-05, + "loss": 0.1138, + "step": 14700 + }, + { + "epoch": 3.0046687970535664, + "grad_norm": 10.716434478759766, + "learning_rate": 3.9836429615988735e-05, + "loss": 0.1073, + "step": 14710 + }, + { + "epoch": 3.004722959432378, + "grad_norm": 0.1788860261440277, + "learning_rate": 3.986351080539458e-05, + "loss": 0.0868, + "step": 14720 + }, + { + "epoch": 3.00477712181119, + "grad_norm": 0.007481349166482687, + "learning_rate": 3.9890591994800416e-05, + 
"loss": 0.1913, + "step": 14730 + }, + { + "epoch": 3.0048312841900016, + "grad_norm": 0.09738175570964813, + "learning_rate": 3.991767318420625e-05, + "loss": 0.176, + "step": 14740 + }, + { + "epoch": 3.0048854465688133, + "grad_norm": 1.0490262508392334, + "learning_rate": 3.994475437361209e-05, + "loss": 0.215, + "step": 14750 + }, + { + "epoch": 3.004939608947625, + "grad_norm": 1.976493000984192, + "learning_rate": 3.9971835563017926e-05, + "loss": 0.0775, + "step": 14760 + }, + { + "epoch": 3.0049937713264367, + "grad_norm": 0.30605271458625793, + "learning_rate": 3.999891675242377e-05, + "loss": 0.1984, + "step": 14770 + }, + { + "epoch": 3.0050479337052485, + "grad_norm": 2.9043052196502686, + "learning_rate": 4.0025997941829606e-05, + "loss": 0.3612, + "step": 14780 + }, + { + "epoch": 3.00510209608406, + "grad_norm": 4.6057868003845215, + "learning_rate": 4.005307913123544e-05, + "loss": 0.126, + "step": 14790 + }, + { + "epoch": 3.0051562584628715, + "grad_norm": 4.801567077636719, + "learning_rate": 4.0080160320641287e-05, + "loss": 0.0797, + "step": 14800 + }, + { + "epoch": 3.005210420841683, + "grad_norm": 2.2707631587982178, + "learning_rate": 4.010724151004712e-05, + "loss": 0.1211, + "step": 14810 + }, + { + "epoch": 3.005264583220495, + "grad_norm": 0.00974002480506897, + "learning_rate": 4.013432269945297e-05, + "loss": 0.0222, + "step": 14820 + }, + { + "epoch": 3.0053187455993067, + "grad_norm": 2.823479413986206, + "learning_rate": 4.0161403888858804e-05, + "loss": 0.0741, + "step": 14830 + }, + { + "epoch": 3.0053729079781184, + "grad_norm": 6.2621541023254395, + "learning_rate": 4.018848507826464e-05, + "loss": 0.2379, + "step": 14840 + }, + { + "epoch": 3.00542707035693, + "grad_norm": 0.1730659455060959, + "learning_rate": 4.021556626767048e-05, + "loss": 0.0932, + "step": 14850 + }, + { + "epoch": 3.005481232735742, + "grad_norm": 4.850111961364746, + "learning_rate": 4.0242647457076314e-05, + "loss": 0.1403, + "step": 14860 + }, + { + "epoch": 3.0055353951145536, + "grad_norm": 0.13457810878753662, + "learning_rate": 4.026972864648216e-05, + "loss": 0.0399, + "step": 14870 + }, + { + "epoch": 3.0055895574933653, + "grad_norm": 0.16804611682891846, + "learning_rate": 4.0296809835887994e-05, + "loss": 0.2292, + "step": 14880 + }, + { + "epoch": 3.0056437198721766, + "grad_norm": 3.268326997756958, + "learning_rate": 4.032389102529383e-05, + "loss": 0.1984, + "step": 14890 + }, + { + "epoch": 3.0056978822509883, + "grad_norm": 4.031891345977783, + "learning_rate": 4.0350972214699674e-05, + "loss": 0.07, + "step": 14900 + }, + { + "epoch": 3.0057520446298, + "grad_norm": 0.1646047830581665, + "learning_rate": 4.037805340410551e-05, + "loss": 0.1579, + "step": 14910 + }, + { + "epoch": 3.005806207008612, + "grad_norm": 0.39961081743240356, + "learning_rate": 4.040513459351135e-05, + "loss": 0.0329, + "step": 14920 + }, + { + "epoch": 3.0058603693874235, + "grad_norm": 0.2764289677143097, + "learning_rate": 4.0432215782917185e-05, + "loss": 0.0188, + "step": 14930 + }, + { + "epoch": 3.0059145317662352, + "grad_norm": 5.788010597229004, + "learning_rate": 4.045929697232302e-05, + "loss": 0.0339, + "step": 14940 + }, + { + "epoch": 3.005968694145047, + "grad_norm": 0.15621539950370789, + "learning_rate": 4.0486378161728865e-05, + "loss": 0.1505, + "step": 14950 + }, + { + "epoch": 3.0060228565238587, + "grad_norm": 0.005087927915155888, + "learning_rate": 4.05134593511347e-05, + "loss": 0.1292, + "step": 14960 + }, + { + "epoch": 3.00607701890267, + "grad_norm": 
0.01382511854171753, + "learning_rate": 4.0540540540540545e-05, + "loss": 0.0378, + "step": 14970 + }, + { + "epoch": 3.0061311812814817, + "grad_norm": 6.103155612945557, + "learning_rate": 4.056762172994638e-05, + "loss": 0.1697, + "step": 14980 + }, + { + "epoch": 3.0061853436602934, + "grad_norm": 0.2239512950181961, + "learning_rate": 4.059470291935222e-05, + "loss": 0.1791, + "step": 14990 + }, + { + "epoch": 3.006239506039105, + "grad_norm": 0.3707486093044281, + "learning_rate": 4.062178410875806e-05, + "loss": 0.0581, + "step": 15000 + }, + { + "epoch": 3.006293668417917, + "grad_norm": 0.17592556774616241, + "learning_rate": 4.06488652981639e-05, + "loss": 0.0301, + "step": 15010 + }, + { + "epoch": 3.0063478307967286, + "grad_norm": 14.720727920532227, + "learning_rate": 4.0675946487569736e-05, + "loss": 0.205, + "step": 15020 + }, + { + "epoch": 3.0064019931755404, + "grad_norm": 0.0956425592303276, + "learning_rate": 4.070302767697557e-05, + "loss": 0.061, + "step": 15030 + }, + { + "epoch": 3.006456155554352, + "grad_norm": 2.979625701904297, + "learning_rate": 4.073010886638141e-05, + "loss": 0.1182, + "step": 15040 + }, + { + "epoch": 3.006510317933164, + "grad_norm": 3.436140537261963, + "learning_rate": 4.075719005578725e-05, + "loss": 0.1137, + "step": 15050 + }, + { + "epoch": 3.006564480311975, + "grad_norm": 10.336283683776855, + "learning_rate": 4.078427124519309e-05, + "loss": 0.2412, + "step": 15060 + }, + { + "epoch": 3.006618642690787, + "grad_norm": 3.771433115005493, + "learning_rate": 4.081135243459893e-05, + "loss": 0.115, + "step": 15070 + }, + { + "epoch": 3.0066728050695986, + "grad_norm": 0.025701863691210747, + "learning_rate": 4.083843362400477e-05, + "loss": 0.1302, + "step": 15080 + }, + { + "epoch": 3.0067269674484103, + "grad_norm": 4.515552520751953, + "learning_rate": 4.0865514813410607e-05, + "loss": 0.1068, + "step": 15090 + }, + { + "epoch": 3.006781129827222, + "grad_norm": 3.0689358711242676, + "learning_rate": 4.089259600281645e-05, + "loss": 0.14, + "step": 15100 + }, + { + "epoch": 3.0068352922060337, + "grad_norm": 13.40284538269043, + "learning_rate": 4.091967719222228e-05, + "loss": 0.0544, + "step": 15110 + }, + { + "epoch": 3.0068894545848455, + "grad_norm": 0.07778024673461914, + "learning_rate": 4.0946758381628124e-05, + "loss": 0.0842, + "step": 15120 + }, + { + "epoch": 3.006943616963657, + "grad_norm": 0.05222177505493164, + "learning_rate": 4.097383957103396e-05, + "loss": 0.0898, + "step": 15130 + }, + { + "epoch": 3.006997779342469, + "grad_norm": 0.13338585197925568, + "learning_rate": 4.10009207604398e-05, + "loss": 0.0306, + "step": 15140 + }, + { + "epoch": 3.00705194172128, + "grad_norm": 0.17777805030345917, + "learning_rate": 4.102800194984564e-05, + "loss": 0.1356, + "step": 15150 + }, + { + "epoch": 3.007106104100092, + "grad_norm": 8.297799110412598, + "learning_rate": 4.105508313925148e-05, + "loss": 0.2151, + "step": 15160 + }, + { + "epoch": 3.0071602664789037, + "grad_norm": 0.23784416913986206, + "learning_rate": 4.108216432865732e-05, + "loss": 0.1043, + "step": 15170 + }, + { + "epoch": 3.0072144288577154, + "grad_norm": 0.3223084509372711, + "learning_rate": 4.110924551806316e-05, + "loss": 0.1091, + "step": 15180 + }, + { + "epoch": 3.007268591236527, + "grad_norm": 0.36064040660858154, + "learning_rate": 4.1136326707468994e-05, + "loss": 0.0866, + "step": 15190 + }, + { + "epoch": 3.007322753615339, + "grad_norm": 0.2945515811443329, + "learning_rate": 4.116340789687483e-05, + "loss": 0.0361, + "step": 
15200 + }, + { + "epoch": 3.0073769159941506, + "grad_norm": 0.010701441206037998, + "learning_rate": 4.119048908628067e-05, + "loss": 0.0904, + "step": 15210 + }, + { + "epoch": 3.0074310783729623, + "grad_norm": 2.9697086811065674, + "learning_rate": 4.121757027568651e-05, + "loss": 0.0223, + "step": 15220 + }, + { + "epoch": 3.007485240751774, + "grad_norm": 4.075450897216797, + "learning_rate": 4.124465146509235e-05, + "loss": 0.311, + "step": 15230 + }, + { + "epoch": 3.0075394031305853, + "grad_norm": 4.2815704345703125, + "learning_rate": 4.1271732654498185e-05, + "loss": 0.2574, + "step": 15240 + }, + { + "epoch": 3.007593565509397, + "grad_norm": 0.05922488495707512, + "learning_rate": 4.129881384390403e-05, + "loss": 0.0454, + "step": 15250 + }, + { + "epoch": 3.0076477278882088, + "grad_norm": 0.09268099069595337, + "learning_rate": 4.1325895033309865e-05, + "loss": 0.1542, + "step": 15260 + }, + { + "epoch": 3.0077018902670205, + "grad_norm": 0.1427162140607834, + "learning_rate": 4.135297622271571e-05, + "loss": 0.032, + "step": 15270 + }, + { + "epoch": 3.0077560526458322, + "grad_norm": 10.935617446899414, + "learning_rate": 4.1380057412121546e-05, + "loss": 0.2105, + "step": 15280 + }, + { + "epoch": 3.007810215024644, + "grad_norm": 0.14691020548343658, + "learning_rate": 4.1407138601527376e-05, + "loss": 0.0449, + "step": 15290 + }, + { + "epoch": 3.0078643774034557, + "grad_norm": 0.34223976731300354, + "learning_rate": 4.143421979093322e-05, + "loss": 0.0396, + "step": 15300 + }, + { + "epoch": 3.0079185397822674, + "grad_norm": 3.233823537826538, + "learning_rate": 4.1461300980339056e-05, + "loss": 0.2708, + "step": 15310 + }, + { + "epoch": 3.0079727021610787, + "grad_norm": 0.07635565102100372, + "learning_rate": 4.14883821697449e-05, + "loss": 0.1347, + "step": 15320 + }, + { + "epoch": 3.0080268645398904, + "grad_norm": 0.2779875695705414, + "learning_rate": 4.1515463359150736e-05, + "loss": 0.0992, + "step": 15330 + }, + { + "epoch": 3.008081026918702, + "grad_norm": 0.17887729406356812, + "learning_rate": 4.154254454855657e-05, + "loss": 0.1357, + "step": 15340 + }, + { + "epoch": 3.008135189297514, + "grad_norm": 7.691494464874268, + "learning_rate": 4.1569625737962416e-05, + "loss": 0.1259, + "step": 15350 + }, + { + "epoch": 3.0081893516763256, + "grad_norm": 4.823867321014404, + "learning_rate": 4.159670692736825e-05, + "loss": 0.1141, + "step": 15360 + }, + { + "epoch": 3.0082435140551373, + "grad_norm": 0.11640796810388565, + "learning_rate": 4.162378811677409e-05, + "loss": 0.1172, + "step": 15370 + }, + { + "epoch": 3.008297676433949, + "grad_norm": 0.084653340280056, + "learning_rate": 4.1650869306179927e-05, + "loss": 0.1498, + "step": 15380 + }, + { + "epoch": 3.008351838812761, + "grad_norm": 0.007792215328663588, + "learning_rate": 4.167795049558576e-05, + "loss": 0.09, + "step": 15390 + }, + { + "epoch": 3.0084060011915725, + "grad_norm": 14.134629249572754, + "learning_rate": 4.170503168499161e-05, + "loss": 0.076, + "step": 15400 + }, + { + "epoch": 3.008460163570384, + "grad_norm": 0.017401285469532013, + "learning_rate": 4.1732112874397444e-05, + "loss": 0.0881, + "step": 15410 + }, + { + "epoch": 3.0085143259491955, + "grad_norm": 0.0016060079215094447, + "learning_rate": 4.175919406380329e-05, + "loss": 0.038, + "step": 15420 + }, + { + "epoch": 3.0085684883280073, + "grad_norm": 1.0422897338867188, + "learning_rate": 4.1786275253209124e-05, + "loss": 0.3276, + "step": 15430 + }, + { + "epoch": 3.008622650706819, + "grad_norm": 
8.370077133178711, + "learning_rate": 4.181335644261496e-05, + "loss": 0.0993, + "step": 15440 + }, + { + "epoch": 3.0086768130856307, + "grad_norm": 0.6887727975845337, + "learning_rate": 4.1840437632020804e-05, + "loss": 0.0364, + "step": 15450 + }, + { + "epoch": 3.0087309754644425, + "grad_norm": 0.6129658222198486, + "learning_rate": 4.186751882142664e-05, + "loss": 0.046, + "step": 15460 + }, + { + "epoch": 3.008785137843254, + "grad_norm": 0.3956260085105896, + "learning_rate": 4.189460001083248e-05, + "loss": 0.1194, + "step": 15470 + }, + { + "epoch": 3.008839300222066, + "grad_norm": 7.874311447143555, + "learning_rate": 4.1921681200238314e-05, + "loss": 0.1805, + "step": 15480 + }, + { + "epoch": 3.0088934626008776, + "grad_norm": 12.417441368103027, + "learning_rate": 4.194876238964415e-05, + "loss": 0.1161, + "step": 15490 + }, + { + "epoch": 3.008947624979689, + "grad_norm": 0.3751140832901001, + "learning_rate": 4.1975843579049995e-05, + "loss": 0.1551, + "step": 15500 + }, + { + "epoch": 3.0090017873585007, + "grad_norm": 0.0021130333188921213, + "learning_rate": 4.200292476845583e-05, + "loss": 0.0414, + "step": 15510 + }, + { + "epoch": 3.0090559497373124, + "grad_norm": 0.04432280361652374, + "learning_rate": 4.2030005957861675e-05, + "loss": 0.1135, + "step": 15520 + }, + { + "epoch": 3.009110112116124, + "grad_norm": 1.223649024963379, + "learning_rate": 4.205708714726751e-05, + "loss": 0.2283, + "step": 15530 + }, + { + "epoch": 3.009164274494936, + "grad_norm": 0.21858809888362885, + "learning_rate": 4.208416833667335e-05, + "loss": 0.1168, + "step": 15540 + }, + { + "epoch": 3.0092184368737476, + "grad_norm": 0.23651650547981262, + "learning_rate": 4.2111249526079185e-05, + "loss": 0.0869, + "step": 15550 + }, + { + "epoch": 3.0092725992525593, + "grad_norm": 8.000663757324219, + "learning_rate": 4.213833071548502e-05, + "loss": 0.129, + "step": 15560 + }, + { + "epoch": 3.009326761631371, + "grad_norm": 0.05047451704740524, + "learning_rate": 4.2165411904890866e-05, + "loss": 0.173, + "step": 15570 + }, + { + "epoch": 3.0093809240101823, + "grad_norm": 0.11185944080352783, + "learning_rate": 4.21924930942967e-05, + "loss": 0.1759, + "step": 15580 + }, + { + "epoch": 3.009435086388994, + "grad_norm": 0.0885884165763855, + "learning_rate": 4.221957428370254e-05, + "loss": 0.0765, + "step": 15590 + }, + { + "epoch": 3.0094892487678058, + "grad_norm": 9.1130952835083, + "learning_rate": 4.224665547310838e-05, + "loss": 0.1664, + "step": 15600 + }, + { + "epoch": 3.0095434111466175, + "grad_norm": 0.5247887372970581, + "learning_rate": 4.227373666251422e-05, + "loss": 0.0843, + "step": 15610 + }, + { + "epoch": 3.0095975735254292, + "grad_norm": 2.71105694770813, + "learning_rate": 4.230081785192006e-05, + "loss": 0.151, + "step": 15620 + }, + { + "epoch": 3.009651735904241, + "grad_norm": 0.12476892024278641, + "learning_rate": 4.23278990413259e-05, + "loss": 0.0589, + "step": 15630 + }, + { + "epoch": 3.0097058982830527, + "grad_norm": 0.08227730542421341, + "learning_rate": 4.2354980230731736e-05, + "loss": 0.0985, + "step": 15640 + }, + { + "epoch": 3.0097600606618644, + "grad_norm": 0.3867718577384949, + "learning_rate": 4.238206142013757e-05, + "loss": 0.0211, + "step": 15650 + }, + { + "epoch": 3.009814223040676, + "grad_norm": 0.041501402854919434, + "learning_rate": 4.240914260954341e-05, + "loss": 0.0341, + "step": 15660 + }, + { + "epoch": 3.0098683854194874, + "grad_norm": 2.825124502182007, + "learning_rate": 4.2436223798949253e-05, + "loss": 0.1101, + 
"step": 15670 + }, + { + "epoch": 3.009922547798299, + "grad_norm": 0.29688572883605957, + "learning_rate": 4.246330498835509e-05, + "loss": 0.1384, + "step": 15680 + }, + { + "epoch": 3.009976710177111, + "grad_norm": 0.9343748092651367, + "learning_rate": 4.249038617776093e-05, + "loss": 0.0669, + "step": 15690 + }, + { + "epoch": 3.0100308725559226, + "grad_norm": 10.568252563476562, + "learning_rate": 4.251746736716677e-05, + "loss": 0.1704, + "step": 15700 + }, + { + "epoch": 3.0100850349347343, + "grad_norm": 7.774553298950195, + "learning_rate": 4.254454855657261e-05, + "loss": 0.0753, + "step": 15710 + }, + { + "epoch": 3.010139197313546, + "grad_norm": 0.9834293127059937, + "learning_rate": 4.257162974597845e-05, + "loss": 0.0664, + "step": 15720 + }, + { + "epoch": 3.010193359692358, + "grad_norm": 0.06420604139566422, + "learning_rate": 4.259871093538428e-05, + "loss": 0.1591, + "step": 15730 + }, + { + "epoch": 3.0102475220711695, + "grad_norm": 5.790508270263672, + "learning_rate": 4.262579212479012e-05, + "loss": 0.1037, + "step": 15740 + }, + { + "epoch": 3.0103016844499813, + "grad_norm": 0.4389277696609497, + "learning_rate": 4.265287331419596e-05, + "loss": 0.1643, + "step": 15750 + }, + { + "epoch": 3.0103558468287925, + "grad_norm": 0.30258509516716003, + "learning_rate": 4.26799545036018e-05, + "loss": 0.0737, + "step": 15760 + }, + { + "epoch": 3.0104100092076043, + "grad_norm": 2.7156267166137695, + "learning_rate": 4.270703569300764e-05, + "loss": 0.1703, + "step": 15770 + }, + { + "epoch": 3.010464171586416, + "grad_norm": 4.854014873504639, + "learning_rate": 4.273411688241348e-05, + "loss": 0.1504, + "step": 15780 + }, + { + "epoch": 3.0105183339652277, + "grad_norm": 0.10174641013145447, + "learning_rate": 4.2761198071819315e-05, + "loss": 0.1628, + "step": 15790 + }, + { + "epoch": 3.0105724963440395, + "grad_norm": 0.2836451828479767, + "learning_rate": 4.278827926122516e-05, + "loss": 0.0126, + "step": 15800 + }, + { + "epoch": 3.010626658722851, + "grad_norm": 0.47493860125541687, + "learning_rate": 4.2815360450630995e-05, + "loss": 0.1163, + "step": 15810 + }, + { + "epoch": 3.010680821101663, + "grad_norm": 11.169921875, + "learning_rate": 4.284244164003683e-05, + "loss": 0.0599, + "step": 15820 + }, + { + "epoch": 3.0107349834804746, + "grad_norm": 3.6606993675231934, + "learning_rate": 4.286952282944267e-05, + "loss": 0.1592, + "step": 15830 + }, + { + "epoch": 3.010789145859286, + "grad_norm": 4.0488810539245605, + "learning_rate": 4.2896604018848505e-05, + "loss": 0.152, + "step": 15840 + }, + { + "epoch": 3.0108433082380976, + "grad_norm": 0.5507377982139587, + "learning_rate": 4.292368520825435e-05, + "loss": 0.1429, + "step": 15850 + }, + { + "epoch": 3.0108974706169094, + "grad_norm": 2.4000182151794434, + "learning_rate": 4.2950766397660186e-05, + "loss": 0.1442, + "step": 15860 + }, + { + "epoch": 3.010951632995721, + "grad_norm": 0.3415244519710541, + "learning_rate": 4.297784758706603e-05, + "loss": 0.0949, + "step": 15870 + }, + { + "epoch": 3.011005795374533, + "grad_norm": 3.2792739868164062, + "learning_rate": 4.3004928776471866e-05, + "loss": 0.1031, + "step": 15880 + }, + { + "epoch": 3.0110599577533446, + "grad_norm": 0.3103080093860626, + "learning_rate": 4.30320099658777e-05, + "loss": 0.0387, + "step": 15890 + }, + { + "epoch": 3.0111141201321563, + "grad_norm": 0.0281240101903677, + "learning_rate": 4.3059091155283546e-05, + "loss": 0.1345, + "step": 15900 + }, + { + "epoch": 3.011168282510968, + "grad_norm": 0.2159939855337143, + 
"learning_rate": 4.3086172344689376e-05, + "loss": 0.1458, + "step": 15910 + }, + { + "epoch": 3.0112224448897797, + "grad_norm": 1.752315878868103, + "learning_rate": 4.311325353409522e-05, + "loss": 0.2327, + "step": 15920 + }, + { + "epoch": 3.011276607268591, + "grad_norm": 0.7996095418930054, + "learning_rate": 4.3140334723501056e-05, + "loss": 0.0588, + "step": 15930 + }, + { + "epoch": 3.0113307696474028, + "grad_norm": 0.4094845950603485, + "learning_rate": 4.316741591290689e-05, + "loss": 0.0717, + "step": 15940 + }, + { + "epoch": 3.0113849320262145, + "grad_norm": 0.0354730486869812, + "learning_rate": 4.319449710231274e-05, + "loss": 0.0815, + "step": 15950 + }, + { + "epoch": 3.011439094405026, + "grad_norm": 0.29858535528182983, + "learning_rate": 4.3221578291718573e-05, + "loss": 0.0436, + "step": 15960 + }, + { + "epoch": 3.011493256783838, + "grad_norm": 0.016336528584361076, + "learning_rate": 4.324865948112442e-05, + "loss": 0.1485, + "step": 15970 + }, + { + "epoch": 3.0115474191626497, + "grad_norm": 0.0021617072634398937, + "learning_rate": 4.3275740670530254e-05, + "loss": 0.0448, + "step": 15980 + }, + { + "epoch": 3.0116015815414614, + "grad_norm": 0.083122119307518, + "learning_rate": 4.330282185993609e-05, + "loss": 0.224, + "step": 15990 + }, + { + "epoch": 3.011655743920273, + "grad_norm": 0.023303255438804626, + "learning_rate": 4.332990304934193e-05, + "loss": 0.2966, + "step": 16000 + }, + { + "epoch": 3.011709906299085, + "grad_norm": 5.809020519256592, + "learning_rate": 4.3356984238747764e-05, + "loss": 0.122, + "step": 16010 + }, + { + "epoch": 3.011764068677896, + "grad_norm": 3.536181688308716, + "learning_rate": 4.338406542815361e-05, + "loss": 0.1743, + "step": 16020 + }, + { + "epoch": 3.011818231056708, + "grad_norm": 0.5422411561012268, + "learning_rate": 4.3411146617559444e-05, + "loss": 0.0773, + "step": 16030 + }, + { + "epoch": 3.0118723934355196, + "grad_norm": 0.030679097399115562, + "learning_rate": 4.343822780696528e-05, + "loss": 0.0371, + "step": 16040 + }, + { + "epoch": 3.0119265558143313, + "grad_norm": 0.25890886783599854, + "learning_rate": 4.3465308996371125e-05, + "loss": 0.0486, + "step": 16050 + }, + { + "epoch": 3.011980718193143, + "grad_norm": 0.01007203757762909, + "learning_rate": 4.349239018577696e-05, + "loss": 0.0569, + "step": 16060 + }, + { + "epoch": 3.012034880571955, + "grad_norm": 0.09665495157241821, + "learning_rate": 4.3519471375182805e-05, + "loss": 0.1844, + "step": 16070 + }, + { + "epoch": 3.0120890429507665, + "grad_norm": 0.8126301765441895, + "learning_rate": 4.354655256458864e-05, + "loss": 0.2403, + "step": 16080 + }, + { + "epoch": 3.0121432053295782, + "grad_norm": 0.19559815526008606, + "learning_rate": 4.357363375399448e-05, + "loss": 0.0867, + "step": 16090 + }, + { + "epoch": 3.01219736770839, + "grad_norm": 2.5418992042541504, + "learning_rate": 4.3600714943400315e-05, + "loss": 0.1174, + "step": 16100 + }, + { + "epoch": 3.0122515300872013, + "grad_norm": 3.302356719970703, + "learning_rate": 4.362779613280615e-05, + "loss": 0.1322, + "step": 16110 + }, + { + "epoch": 3.012305692466013, + "grad_norm": 0.2784973084926605, + "learning_rate": 4.3654877322211995e-05, + "loss": 0.1459, + "step": 16120 + }, + { + "epoch": 3.0123598548448247, + "grad_norm": 0.31471264362335205, + "learning_rate": 4.368195851161783e-05, + "loss": 0.0226, + "step": 16130 + }, + { + "epoch": 3.0124140172236364, + "grad_norm": 0.016427630558609962, + "learning_rate": 4.370903970102367e-05, + "loss": 0.1968, + "step": 16140 
+ }, + { + "epoch": 3.012468179602448, + "grad_norm": 0.252187043428421, + "learning_rate": 4.373612089042951e-05, + "loss": 0.1133, + "step": 16150 + }, + { + "epoch": 3.01252234198126, + "grad_norm": 0.0732557401061058, + "learning_rate": 4.376320207983535e-05, + "loss": 0.1408, + "step": 16160 + }, + { + "epoch": 3.0125765043600716, + "grad_norm": 0.003108927281573415, + "learning_rate": 4.3790283269241186e-05, + "loss": 0.0482, + "step": 16170 + }, + { + "epoch": 3.0126306667388834, + "grad_norm": 0.1890098601579666, + "learning_rate": 4.381736445864702e-05, + "loss": 0.0387, + "step": 16180 + }, + { + "epoch": 3.0126848291176946, + "grad_norm": 0.33962318301200867, + "learning_rate": 4.384444564805286e-05, + "loss": 0.046, + "step": 16190 + }, + { + "epoch": 3.0127389914965064, + "grad_norm": 1.3511384725570679, + "learning_rate": 4.38715268374587e-05, + "loss": 0.0763, + "step": 16200 + }, + { + "epoch": 3.012793153875318, + "grad_norm": 2.8948028087615967, + "learning_rate": 4.389860802686454e-05, + "loss": 0.1156, + "step": 16210 + }, + { + "epoch": 3.01284731625413, + "grad_norm": 0.23460617661476135, + "learning_rate": 4.392568921627038e-05, + "loss": 0.1041, + "step": 16220 + }, + { + "epoch": 3.0129014786329416, + "grad_norm": 0.9791744351387024, + "learning_rate": 4.395277040567622e-05, + "loss": 0.2606, + "step": 16230 + }, + { + "epoch": 3.0129556410117533, + "grad_norm": 0.858026385307312, + "learning_rate": 4.397985159508206e-05, + "loss": 0.1332, + "step": 16240 + }, + { + "epoch": 3.013009803390565, + "grad_norm": 0.37122416496276855, + "learning_rate": 4.40069327844879e-05, + "loss": 0.0837, + "step": 16250 + }, + { + "epoch": 3.0130639657693767, + "grad_norm": 0.1300388127565384, + "learning_rate": 4.403401397389374e-05, + "loss": 0.1398, + "step": 16260 + }, + { + "epoch": 3.0131181281481885, + "grad_norm": 0.2015957236289978, + "learning_rate": 4.4061095163299574e-05, + "loss": 0.0085, + "step": 16270 + }, + { + "epoch": 3.0131722905269998, + "grad_norm": 0.2948000431060791, + "learning_rate": 4.408817635270541e-05, + "loss": 0.0241, + "step": 16280 + }, + { + "epoch": 3.0132264529058115, + "grad_norm": 0.019296329468488693, + "learning_rate": 4.411525754211125e-05, + "loss": 0.1724, + "step": 16290 + }, + { + "epoch": 3.013280615284623, + "grad_norm": 0.09786015003919601, + "learning_rate": 4.414233873151709e-05, + "loss": 0.0652, + "step": 16300 + }, + { + "epoch": 3.013334777663435, + "grad_norm": 0.15805961191654205, + "learning_rate": 4.416941992092293e-05, + "loss": 0.1345, + "step": 16310 + }, + { + "epoch": 3.0133889400422467, + "grad_norm": 13.473389625549316, + "learning_rate": 4.419650111032877e-05, + "loss": 0.2036, + "step": 16320 + }, + { + "epoch": 3.0134431024210584, + "grad_norm": 0.08977416157722473, + "learning_rate": 4.422358229973461e-05, + "loss": 0.1793, + "step": 16330 + }, + { + "epoch": 3.01349726479987, + "grad_norm": 4.0233941078186035, + "learning_rate": 4.4250663489140445e-05, + "loss": 0.2258, + "step": 16340 + }, + { + "epoch": 3.013551427178682, + "grad_norm": 0.20007306337356567, + "learning_rate": 4.427774467854628e-05, + "loss": 0.1361, + "step": 16350 + }, + { + "epoch": 3.0136055895574936, + "grad_norm": 5.7641520500183105, + "learning_rate": 4.430482586795212e-05, + "loss": 0.0569, + "step": 16360 + }, + { + "epoch": 3.013659751936305, + "grad_norm": 0.12171123176813126, + "learning_rate": 4.433190705735796e-05, + "loss": 0.0954, + "step": 16370 + }, + { + "epoch": 3.0137139143151166, + "grad_norm": 0.01036800816655159, + 
"learning_rate": 4.43589882467638e-05, + "loss": 0.1114, + "step": 16380 + }, + { + "epoch": 3.0137680766939283, + "grad_norm": 0.32323595881462097, + "learning_rate": 4.4386069436169635e-05, + "loss": 0.093, + "step": 16390 + }, + { + "epoch": 3.01382223907274, + "grad_norm": 0.5887606143951416, + "learning_rate": 4.441315062557548e-05, + "loss": 0.1023, + "step": 16400 + }, + { + "epoch": 3.013876401451552, + "grad_norm": 0.01656387560069561, + "learning_rate": 4.4440231814981315e-05, + "loss": 0.0122, + "step": 16410 + }, + { + "epoch": 3.0139305638303635, + "grad_norm": 0.21218939125537872, + "learning_rate": 4.446731300438716e-05, + "loss": 0.2016, + "step": 16420 + }, + { + "epoch": 3.0139847262091752, + "grad_norm": 0.01316329836845398, + "learning_rate": 4.4494394193792996e-05, + "loss": 0.1003, + "step": 16430 + }, + { + "epoch": 3.014038888587987, + "grad_norm": 2.438732862472534, + "learning_rate": 4.452147538319883e-05, + "loss": 0.1187, + "step": 16440 + }, + { + "epoch": 3.0140930509667982, + "grad_norm": 0.0031685144640505314, + "learning_rate": 4.454855657260467e-05, + "loss": 0.1729, + "step": 16450 + }, + { + "epoch": 3.01414721334561, + "grad_norm": 5.6914286613464355, + "learning_rate": 4.4575637762010506e-05, + "loss": 0.1998, + "step": 16460 + }, + { + "epoch": 3.0142013757244217, + "grad_norm": 0.17893433570861816, + "learning_rate": 4.460271895141635e-05, + "loss": 0.1861, + "step": 16470 + }, + { + "epoch": 3.0142555381032334, + "grad_norm": 2.4760589599609375, + "learning_rate": 4.4629800140822186e-05, + "loss": 0.131, + "step": 16480 + }, + { + "epoch": 3.014309700482045, + "grad_norm": 0.7349247932434082, + "learning_rate": 4.465688133022802e-05, + "loss": 0.086, + "step": 16490 + }, + { + "epoch": 3.014363862860857, + "grad_norm": 0.002368457615375519, + "learning_rate": 4.4683962519633866e-05, + "loss": 0.0966, + "step": 16500 + }, + { + "epoch": 3.0144180252396686, + "grad_norm": 3.377166271209717, + "learning_rate": 4.47110437090397e-05, + "loss": 0.1441, + "step": 16510 + }, + { + "epoch": 3.0144721876184803, + "grad_norm": 2.513995409011841, + "learning_rate": 4.473812489844555e-05, + "loss": 0.0676, + "step": 16520 + }, + { + "epoch": 3.014526349997292, + "grad_norm": 0.5543758273124695, + "learning_rate": 4.4765206087851383e-05, + "loss": 0.0821, + "step": 16530 + }, + { + "epoch": 3.0145805123761034, + "grad_norm": 5.844802379608154, + "learning_rate": 4.4792287277257213e-05, + "loss": 0.3799, + "step": 16540 + }, + { + "epoch": 3.014634674754915, + "grad_norm": 0.9753679037094116, + "learning_rate": 4.481936846666306e-05, + "loss": 0.066, + "step": 16550 + }, + { + "epoch": 3.014688837133727, + "grad_norm": 0.0510251447558403, + "learning_rate": 4.4846449656068894e-05, + "loss": 0.097, + "step": 16560 + }, + { + "epoch": 3.0147429995125385, + "grad_norm": 0.06536215543746948, + "learning_rate": 4.487353084547474e-05, + "loss": 0.0384, + "step": 16570 + }, + { + "epoch": 3.0147971618913503, + "grad_norm": 2.357060194015503, + "learning_rate": 4.4900612034880574e-05, + "loss": 0.1072, + "step": 16580 + }, + { + "epoch": 3.014851324270162, + "grad_norm": 0.503730297088623, + "learning_rate": 4.492769322428641e-05, + "loss": 0.1318, + "step": 16590 + }, + { + "epoch": 3.0149054866489737, + "grad_norm": 1.1331027746200562, + "learning_rate": 4.4954774413692254e-05, + "loss": 0.1532, + "step": 16600 + }, + { + "epoch": 3.0149596490277855, + "grad_norm": 0.8510584831237793, + "learning_rate": 4.498185560309809e-05, + "loss": 0.1382, + "step": 16610 + }, + { + 
"epoch": 3.015013811406597, + "grad_norm": 0.023254141211509705, + "learning_rate": 4.500893679250393e-05, + "loss": 0.035, + "step": 16620 + }, + { + "epoch": 3.0150679737854085, + "grad_norm": 5.347555160522461, + "learning_rate": 4.5036017981909765e-05, + "loss": 0.2276, + "step": 16630 + }, + { + "epoch": 3.01512213616422, + "grad_norm": 0.39231637120246887, + "learning_rate": 4.50630991713156e-05, + "loss": 0.0912, + "step": 16640 + }, + { + "epoch": 3.015176298543032, + "grad_norm": 0.15837788581848145, + "learning_rate": 4.5090180360721445e-05, + "loss": 0.1193, + "step": 16650 + }, + { + "epoch": 3.0152304609218437, + "grad_norm": 0.011441843584179878, + "learning_rate": 4.511726155012728e-05, + "loss": 0.0654, + "step": 16660 + }, + { + "epoch": 3.0152846233006554, + "grad_norm": 0.17611144483089447, + "learning_rate": 4.5144342739533125e-05, + "loss": 0.1295, + "step": 16670 + }, + { + "epoch": 3.015338785679467, + "grad_norm": 6.975933074951172, + "learning_rate": 4.517142392893896e-05, + "loss": 0.1277, + "step": 16680 + }, + { + "epoch": 3.015392948058279, + "grad_norm": 2.396185874938965, + "learning_rate": 4.51985051183448e-05, + "loss": 0.3129, + "step": 16690 + }, + { + "epoch": 3.0154471104370906, + "grad_norm": 0.5407237410545349, + "learning_rate": 4.522558630775064e-05, + "loss": 0.1201, + "step": 16700 + }, + { + "epoch": 3.015501272815902, + "grad_norm": 6.317454814910889, + "learning_rate": 4.525266749715648e-05, + "loss": 0.2434, + "step": 16710 + }, + { + "epoch": 3.0155554351947136, + "grad_norm": 0.5912926197052002, + "learning_rate": 4.5279748686562316e-05, + "loss": 0.1754, + "step": 16720 + }, + { + "epoch": 3.0156095975735253, + "grad_norm": 0.11523130536079407, + "learning_rate": 4.530682987596815e-05, + "loss": 0.0681, + "step": 16730 + }, + { + "epoch": 3.015663759952337, + "grad_norm": 0.010466706939041615, + "learning_rate": 4.533391106537399e-05, + "loss": 0.0421, + "step": 16740 + }, + { + "epoch": 3.0157179223311488, + "grad_norm": 0.01121821440756321, + "learning_rate": 4.536099225477983e-05, + "loss": 0.1405, + "step": 16750 + }, + { + "epoch": 3.0157720847099605, + "grad_norm": 1.6320679187774658, + "learning_rate": 4.538807344418567e-05, + "loss": 0.1238, + "step": 16760 + }, + { + "epoch": 3.0158262470887722, + "grad_norm": 3.155487060546875, + "learning_rate": 4.541515463359151e-05, + "loss": 0.1011, + "step": 16770 + }, + { + "epoch": 3.015880409467584, + "grad_norm": 3.3086116313934326, + "learning_rate": 4.544223582299735e-05, + "loss": 0.0636, + "step": 16780 + }, + { + "epoch": 3.0159345718463957, + "grad_norm": 0.10845986753702164, + "learning_rate": 4.5469317012403186e-05, + "loss": 0.1364, + "step": 16790 + }, + { + "epoch": 3.015988734225207, + "grad_norm": 0.06531304121017456, + "learning_rate": 4.549639820180902e-05, + "loss": 0.1091, + "step": 16800 + }, + { + "epoch": 3.0160428966040187, + "grad_norm": 4.298405170440674, + "learning_rate": 4.552347939121486e-05, + "loss": 0.096, + "step": 16810 + }, + { + "epoch": 3.0160970589828304, + "grad_norm": 14.53825855255127, + "learning_rate": 4.5550560580620704e-05, + "loss": 0.1613, + "step": 16820 + }, + { + "epoch": 3.016151221361642, + "grad_norm": 6.179991722106934, + "learning_rate": 4.557764177002654e-05, + "loss": 0.197, + "step": 16830 + }, + { + "epoch": 3.016205383740454, + "grad_norm": 0.5761241912841797, + "learning_rate": 4.560472295943238e-05, + "loss": 0.2248, + "step": 16840 + }, + { + "epoch": 3.0162595461192656, + "grad_norm": 1.3884644508361816, + "learning_rate": 
4.563180414883822e-05, + "loss": 0.1579, + "step": 16850 + }, + { + "epoch": 3.0163137084980773, + "grad_norm": 0.1635373830795288, + "learning_rate": 4.565888533824406e-05, + "loss": 0.0696, + "step": 16860 + }, + { + "epoch": 3.016367870876889, + "grad_norm": 0.18328924477100372, + "learning_rate": 4.56859665276499e-05, + "loss": 0.1172, + "step": 16870 + }, + { + "epoch": 3.016422033255701, + "grad_norm": 0.05044291540980339, + "learning_rate": 4.571304771705574e-05, + "loss": 0.2455, + "step": 16880 + }, + { + "epoch": 3.016476195634512, + "grad_norm": 0.16251514852046967, + "learning_rate": 4.5740128906461574e-05, + "loss": 0.1616, + "step": 16890 + }, + { + "epoch": 3.016530358013324, + "grad_norm": 0.48677077889442444, + "learning_rate": 4.576721009586741e-05, + "loss": 0.1646, + "step": 16900 + }, + { + "epoch": 3.0165845203921355, + "grad_norm": 0.14539079368114471, + "learning_rate": 4.579429128527325e-05, + "loss": 0.1182, + "step": 16910 + }, + { + "epoch": 3.0166386827709473, + "grad_norm": 0.12363772839307785, + "learning_rate": 4.582137247467909e-05, + "loss": 0.2126, + "step": 16920 + }, + { + "epoch": 3.016692845149759, + "grad_norm": 3.596465826034546, + "learning_rate": 4.584845366408493e-05, + "loss": 0.191, + "step": 16930 + }, + { + "epoch": 3.0167470075285707, + "grad_norm": 0.35254189372062683, + "learning_rate": 4.5875534853490765e-05, + "loss": 0.1011, + "step": 16940 + }, + { + "epoch": 3.0168011699073825, + "grad_norm": 0.5105412006378174, + "learning_rate": 4.590261604289661e-05, + "loss": 0.0946, + "step": 16950 + }, + { + "epoch": 3.016855332286194, + "grad_norm": 3.3415486812591553, + "learning_rate": 4.5929697232302445e-05, + "loss": 0.22, + "step": 16960 + }, + { + "epoch": 3.016909494665006, + "grad_norm": 0.4780622124671936, + "learning_rate": 4.595677842170829e-05, + "loss": 0.0622, + "step": 16970 + }, + { + "epoch": 3.016963657043817, + "grad_norm": 3.3567075729370117, + "learning_rate": 4.598385961111412e-05, + "loss": 0.259, + "step": 16980 + }, + { + "epoch": 3.017017819422629, + "grad_norm": 7.764454364776611, + "learning_rate": 4.601094080051996e-05, + "loss": 0.234, + "step": 16990 + }, + { + "epoch": 3.0170719818014406, + "grad_norm": 2.26466703414917, + "learning_rate": 4.60380219899258e-05, + "loss": 0.1286, + "step": 17000 + }, + { + "epoch": 3.0171261441802524, + "grad_norm": 0.29700368642807007, + "learning_rate": 4.6065103179331636e-05, + "loss": 0.1511, + "step": 17010 + }, + { + "epoch": 3.017180306559064, + "grad_norm": 4.51762580871582, + "learning_rate": 4.609218436873748e-05, + "loss": 0.1577, + "step": 17020 + }, + { + "epoch": 3.017234468937876, + "grad_norm": 0.12636373937129974, + "learning_rate": 4.6119265558143316e-05, + "loss": 0.1014, + "step": 17030 + }, + { + "epoch": 3.0172886313166876, + "grad_norm": 0.21909190714359283, + "learning_rate": 4.614634674754916e-05, + "loss": 0.1006, + "step": 17040 + }, + { + "epoch": 3.0173427936954993, + "grad_norm": 1.8623734712600708, + "learning_rate": 4.6173427936954996e-05, + "loss": 0.0741, + "step": 17050 + }, + { + "epoch": 3.0173969560743106, + "grad_norm": 0.01384083740413189, + "learning_rate": 4.620050912636083e-05, + "loss": 0.0976, + "step": 17060 + }, + { + "epoch": 3.0174511184531223, + "grad_norm": 0.08327919989824295, + "learning_rate": 4.622759031576667e-05, + "loss": 0.0265, + "step": 17070 + }, + { + "epoch": 3.017505280831934, + "grad_norm": 0.05928775668144226, + "learning_rate": 4.6254671505172506e-05, + "loss": 0.0562, + "step": 17080 + }, + { + "epoch": 
3.0175594432107458, + "grad_norm": 0.23723214864730835, + "learning_rate": 4.628175269457835e-05, + "loss": 0.3056, + "step": 17090 + }, + { + "epoch": 3.0176136055895575, + "grad_norm": 0.21459893882274628, + "learning_rate": 4.630883388398419e-05, + "loss": 0.1846, + "step": 17100 + }, + { + "epoch": 3.017667767968369, + "grad_norm": 3.8925817012786865, + "learning_rate": 4.6335915073390024e-05, + "loss": 0.0673, + "step": 17110 + }, + { + "epoch": 3.017721930347181, + "grad_norm": 0.4905264973640442, + "learning_rate": 4.636299626279587e-05, + "loss": 0.0711, + "step": 17120 + }, + { + "epoch": 3.0177760927259927, + "grad_norm": 8.644896507263184, + "learning_rate": 4.6390077452201704e-05, + "loss": 0.205, + "step": 17130 + }, + { + "epoch": 3.0178302551048044, + "grad_norm": 0.11822371929883957, + "learning_rate": 4.641715864160755e-05, + "loss": 0.1133, + "step": 17140 + }, + { + "epoch": 3.0178844174836157, + "grad_norm": 0.40622442960739136, + "learning_rate": 4.6444239831013384e-05, + "loss": 0.081, + "step": 17150 + }, + { + "epoch": 3.0179385798624274, + "grad_norm": 0.052125025540590286, + "learning_rate": 4.6471321020419214e-05, + "loss": 0.1439, + "step": 17160 + }, + { + "epoch": 3.017992742241239, + "grad_norm": 0.025274742394685745, + "learning_rate": 4.649840220982506e-05, + "loss": 0.0863, + "step": 17170 + }, + { + "epoch": 3.018046904620051, + "grad_norm": 0.2672179639339447, + "learning_rate": 4.6525483399230894e-05, + "loss": 0.0858, + "step": 17180 + }, + { + "epoch": 3.0181010669988626, + "grad_norm": 0.45881927013397217, + "learning_rate": 4.655256458863674e-05, + "loss": 0.086, + "step": 17190 + }, + { + "epoch": 3.0181552293776743, + "grad_norm": 0.22623072564601898, + "learning_rate": 4.6579645778042575e-05, + "loss": 0.1412, + "step": 17200 + }, + { + "epoch": 3.018209391756486, + "grad_norm": 6.412494659423828, + "learning_rate": 4.660672696744841e-05, + "loss": 0.1205, + "step": 17210 + }, + { + "epoch": 3.018263554135298, + "grad_norm": 7.306236267089844, + "learning_rate": 4.6633808156854255e-05, + "loss": 0.11, + "step": 17220 + }, + { + "epoch": 3.0183177165141095, + "grad_norm": 0.11428675055503845, + "learning_rate": 4.666088934626009e-05, + "loss": 0.2256, + "step": 17230 + }, + { + "epoch": 3.018371878892921, + "grad_norm": 0.07358516752719879, + "learning_rate": 4.668797053566593e-05, + "loss": 0.0846, + "step": 17240 + }, + { + "epoch": 3.0184260412717325, + "grad_norm": 1.0470863580703735, + "learning_rate": 4.6715051725071765e-05, + "loss": 0.1528, + "step": 17250 + }, + { + "epoch": 3.0184802036505443, + "grad_norm": 2.728452444076538, + "learning_rate": 4.67421329144776e-05, + "loss": 0.1092, + "step": 17260 + }, + { + "epoch": 3.018534366029356, + "grad_norm": 0.07114117592573166, + "learning_rate": 4.6769214103883445e-05, + "loss": 0.1181, + "step": 17270 + }, + { + "epoch": 3.0185885284081677, + "grad_norm": 0.3502472937107086, + "learning_rate": 4.679629529328928e-05, + "loss": 0.1448, + "step": 17280 + }, + { + "epoch": 3.0186426907869794, + "grad_norm": 3.669137954711914, + "learning_rate": 4.6823376482695126e-05, + "loss": 0.2413, + "step": 17290 + }, + { + "epoch": 3.018696853165791, + "grad_norm": 0.01150873489677906, + "learning_rate": 4.685045767210096e-05, + "loss": 0.0546, + "step": 17300 + }, + { + "epoch": 3.018751015544603, + "grad_norm": 0.014094672165811062, + "learning_rate": 4.68775388615068e-05, + "loss": 0.1006, + "step": 17310 + }, + { + "epoch": 3.018805177923414, + "grad_norm": 0.023676840588450432, + "learning_rate": 
4.690462005091264e-05, + "loss": 0.1062, + "step": 17320 + }, + { + "epoch": 3.018859340302226, + "grad_norm": 0.1445838212966919, + "learning_rate": 4.693170124031848e-05, + "loss": 0.056, + "step": 17330 + }, + { + "epoch": 3.0189135026810376, + "grad_norm": 0.005047122482210398, + "learning_rate": 4.6958782429724316e-05, + "loss": 0.1255, + "step": 17340 + }, + { + "epoch": 3.0189676650598494, + "grad_norm": 0.21651357412338257, + "learning_rate": 4.698586361913015e-05, + "loss": 0.0812, + "step": 17350 + }, + { + "epoch": 3.019021827438661, + "grad_norm": 0.23011456429958344, + "learning_rate": 4.701294480853599e-05, + "loss": 0.1193, + "step": 17360 + }, + { + "epoch": 3.019075989817473, + "grad_norm": 2.5323476791381836, + "learning_rate": 4.704002599794183e-05, + "loss": 0.132, + "step": 17370 + }, + { + "epoch": 3.0191301521962846, + "grad_norm": 10.300464630126953, + "learning_rate": 4.706710718734767e-05, + "loss": 0.1085, + "step": 17380 + }, + { + "epoch": 3.0191843145750963, + "grad_norm": 2.245537042617798, + "learning_rate": 4.7094188376753514e-05, + "loss": 0.0635, + "step": 17390 + }, + { + "epoch": 3.019238476953908, + "grad_norm": 0.0027965763583779335, + "learning_rate": 4.712126956615935e-05, + "loss": 0.0941, + "step": 17400 + }, + { + "epoch": 3.0192926393327193, + "grad_norm": 8.790120124816895, + "learning_rate": 4.714835075556519e-05, + "loss": 0.2586, + "step": 17410 + }, + { + "epoch": 3.019346801711531, + "grad_norm": 3.0327627658843994, + "learning_rate": 4.7175431944971024e-05, + "loss": 0.2499, + "step": 17420 + }, + { + "epoch": 3.0194009640903428, + "grad_norm": 0.2888825535774231, + "learning_rate": 4.720251313437686e-05, + "loss": 0.2366, + "step": 17430 + }, + { + "epoch": 3.0194551264691545, + "grad_norm": 0.76970374584198, + "learning_rate": 4.7229594323782704e-05, + "loss": 0.0497, + "step": 17440 + }, + { + "epoch": 3.019509288847966, + "grad_norm": 0.015148009173572063, + "learning_rate": 4.725667551318854e-05, + "loss": 0.2203, + "step": 17450 + }, + { + "epoch": 3.019563451226778, + "grad_norm": 0.9461076259613037, + "learning_rate": 4.728375670259438e-05, + "loss": 0.052, + "step": 17460 + }, + { + "epoch": 3.0196176136055897, + "grad_norm": 3.115302801132202, + "learning_rate": 4.731083789200022e-05, + "loss": 0.2117, + "step": 17470 + }, + { + "epoch": 3.0196717759844014, + "grad_norm": 0.06717372685670853, + "learning_rate": 4.733791908140606e-05, + "loss": 0.1005, + "step": 17480 + }, + { + "epoch": 3.019725938363213, + "grad_norm": 1.257912278175354, + "learning_rate": 4.73650002708119e-05, + "loss": 0.0659, + "step": 17490 + }, + { + "epoch": 3.0197801007420244, + "grad_norm": 0.5377878546714783, + "learning_rate": 4.739208146021774e-05, + "loss": 0.1372, + "step": 17500 + }, + { + "epoch": 3.019834263120836, + "grad_norm": 0.15887551009655, + "learning_rate": 4.7419162649623575e-05, + "loss": 0.1006, + "step": 17510 + }, + { + "epoch": 3.019888425499648, + "grad_norm": 0.2621247470378876, + "learning_rate": 4.744624383902941e-05, + "loss": 0.041, + "step": 17520 + }, + { + "epoch": 3.0199425878784596, + "grad_norm": 5.055777072906494, + "learning_rate": 4.747332502843525e-05, + "loss": 0.0804, + "step": 17530 + }, + { + "epoch": 3.0199967502572713, + "grad_norm": 0.03970037028193474, + "learning_rate": 4.750040621784109e-05, + "loss": 0.1209, + "step": 17540 + }, + { + "epoch": 3.020050912636083, + "grad_norm": 0.005300465039908886, + "learning_rate": 4.752748740724693e-05, + "loss": 0.0705, + "step": 17550 + }, + { + "epoch": 
3.020105075014895, + "grad_norm": 0.6804311871528625, + "learning_rate": 4.7554568596652765e-05, + "loss": 0.083, + "step": 17560 + }, + { + "epoch": 3.0201592373937065, + "grad_norm": 0.15702199935913086, + "learning_rate": 4.758164978605861e-05, + "loss": 0.2341, + "step": 17570 + }, + { + "epoch": 3.020213399772518, + "grad_norm": 0.025629345327615738, + "learning_rate": 4.7608730975464446e-05, + "loss": 0.0359, + "step": 17580 + }, + { + "epoch": 3.0202675621513295, + "grad_norm": 5.236415386199951, + "learning_rate": 4.763581216487029e-05, + "loss": 0.1238, + "step": 17590 + }, + { + "epoch": 3.0203217245301412, + "grad_norm": 0.5973139405250549, + "learning_rate": 4.766289335427612e-05, + "loss": 0.1348, + "step": 17600 + }, + { + "epoch": 3.020375886908953, + "grad_norm": 0.2426173985004425, + "learning_rate": 4.7689974543681956e-05, + "loss": 0.1032, + "step": 17610 + }, + { + "epoch": 3.0204300492877647, + "grad_norm": 0.48615503311157227, + "learning_rate": 4.77170557330878e-05, + "loss": 0.1557, + "step": 17620 + }, + { + "epoch": 3.0204842116665764, + "grad_norm": 14.191229820251465, + "learning_rate": 4.7744136922493636e-05, + "loss": 0.0993, + "step": 17630 + }, + { + "epoch": 3.020538374045388, + "grad_norm": 3.7056796550750732, + "learning_rate": 4.777121811189948e-05, + "loss": 0.1458, + "step": 17640 + }, + { + "epoch": 3.0205925364242, + "grad_norm": 0.03734838590025902, + "learning_rate": 4.7798299301305317e-05, + "loss": 0.0199, + "step": 17650 + }, + { + "epoch": 3.0206466988030116, + "grad_norm": 0.04578607156872749, + "learning_rate": 4.782538049071115e-05, + "loss": 0.0955, + "step": 17660 + }, + { + "epoch": 3.020700861181823, + "grad_norm": 2.064796209335327, + "learning_rate": 4.7852461680117e-05, + "loss": 0.1621, + "step": 17670 + }, + { + "epoch": 3.0207550235606346, + "grad_norm": 0.36234286427497864, + "learning_rate": 4.7879542869522834e-05, + "loss": 0.1121, + "step": 17680 + }, + { + "epoch": 3.0208091859394464, + "grad_norm": 9.361454963684082, + "learning_rate": 4.790662405892867e-05, + "loss": 0.2226, + "step": 17690 + }, + { + "epoch": 3.020863348318258, + "grad_norm": 0.2652874290943146, + "learning_rate": 4.793370524833451e-05, + "loss": 0.1619, + "step": 17700 + }, + { + "epoch": 3.02091751069707, + "grad_norm": 16.041234970092773, + "learning_rate": 4.7960786437740344e-05, + "loss": 0.0673, + "step": 17710 + }, + { + "epoch": 3.0209716730758815, + "grad_norm": 3.3534371852874756, + "learning_rate": 4.798786762714619e-05, + "loss": 0.1685, + "step": 17720 + }, + { + "epoch": 3.0210258354546933, + "grad_norm": 0.18332384526729584, + "learning_rate": 4.8014948816552024e-05, + "loss": 0.1402, + "step": 17730 + }, + { + "epoch": 3.021079997833505, + "grad_norm": 6.829606056213379, + "learning_rate": 4.804203000595787e-05, + "loss": 0.0322, + "step": 17740 + }, + { + "epoch": 3.0211341602123167, + "grad_norm": 5.901334285736084, + "learning_rate": 4.8069111195363704e-05, + "loss": 0.4203, + "step": 17750 + }, + { + "epoch": 3.021188322591128, + "grad_norm": 0.06271854043006897, + "learning_rate": 4.809619238476954e-05, + "loss": 0.2452, + "step": 17760 + }, + { + "epoch": 3.0212424849699397, + "grad_norm": 3.2797224521636963, + "learning_rate": 4.8123273574175385e-05, + "loss": 0.0354, + "step": 17770 + }, + { + "epoch": 3.0212966473487515, + "grad_norm": 0.14072281122207642, + "learning_rate": 4.8150354763581215e-05, + "loss": 0.0465, + "step": 17780 + }, + { + "epoch": 3.021350809727563, + "grad_norm": 0.04392288997769356, + "learning_rate": 
4.817743595298706e-05, + "loss": 0.1615, + "step": 17790 + }, + { + "epoch": 3.021404972106375, + "grad_norm": 2.3903791904449463, + "learning_rate": 4.8204517142392895e-05, + "loss": 0.1632, + "step": 17800 + }, + { + "epoch": 3.0214591344851867, + "grad_norm": 13.995230674743652, + "learning_rate": 4.823159833179873e-05, + "loss": 0.1171, + "step": 17810 + }, + { + "epoch": 3.0215132968639984, + "grad_norm": 0.04884158819913864, + "learning_rate": 4.8258679521204575e-05, + "loss": 0.0913, + "step": 17820 + }, + { + "epoch": 3.02156745924281, + "grad_norm": 2.8413665294647217, + "learning_rate": 4.828576071061041e-05, + "loss": 0.1135, + "step": 17830 + }, + { + "epoch": 3.0216216216216214, + "grad_norm": 0.014439503662288189, + "learning_rate": 4.8312841900016256e-05, + "loss": 0.0861, + "step": 17840 + }, + { + "epoch": 3.021675784000433, + "grad_norm": 0.19261011481285095, + "learning_rate": 4.833992308942209e-05, + "loss": 0.19, + "step": 17850 + }, + { + "epoch": 3.021729946379245, + "grad_norm": 7.859121322631836, + "learning_rate": 4.836700427882793e-05, + "loss": 0.2199, + "step": 17860 + }, + { + "epoch": 3.0217841087580566, + "grad_norm": 2.843081474304199, + "learning_rate": 4.8394085468233766e-05, + "loss": 0.2403, + "step": 17870 + }, + { + "epoch": 3.0218382711368683, + "grad_norm": 4.81859827041626, + "learning_rate": 4.84211666576396e-05, + "loss": 0.2333, + "step": 17880 + }, + { + "epoch": 3.02189243351568, + "grad_norm": 2.8537704944610596, + "learning_rate": 4.8448247847045446e-05, + "loss": 0.1411, + "step": 17890 + }, + { + "epoch": 3.0219465958944918, + "grad_norm": 1.453966736793518, + "learning_rate": 4.847532903645128e-05, + "loss": 0.1618, + "step": 17900 + }, + { + "epoch": 3.0220007582733035, + "grad_norm": 0.2959138751029968, + "learning_rate": 4.850241022585712e-05, + "loss": 0.1225, + "step": 17910 + }, + { + "epoch": 3.0220549206521152, + "grad_norm": 1.358945369720459, + "learning_rate": 4.852949141526296e-05, + "loss": 0.1296, + "step": 17920 + }, + { + "epoch": 3.0221090830309265, + "grad_norm": 0.007559897378087044, + "learning_rate": 4.85565726046688e-05, + "loss": 0.0198, + "step": 17930 + }, + { + "epoch": 3.0221632454097382, + "grad_norm": 2.441443920135498, + "learning_rate": 4.858365379407464e-05, + "loss": 0.0516, + "step": 17940 + }, + { + "epoch": 3.02221740778855, + "grad_norm": 0.2199961543083191, + "learning_rate": 4.861073498348048e-05, + "loss": 0.0875, + "step": 17950 + }, + { + "epoch": 3.0222715701673617, + "grad_norm": 0.09471318870782852, + "learning_rate": 4.863781617288632e-05, + "loss": 0.2394, + "step": 17960 + }, + { + "epoch": 3.0223257325461734, + "grad_norm": 2.5406835079193115, + "learning_rate": 4.8664897362292154e-05, + "loss": 0.1475, + "step": 17970 + }, + { + "epoch": 3.022379894924985, + "grad_norm": 17.552364349365234, + "learning_rate": 4.869197855169799e-05, + "loss": 0.1066, + "step": 17980 + }, + { + "epoch": 3.022434057303797, + "grad_norm": 0.0019268222386017442, + "learning_rate": 4.8719059741103834e-05, + "loss": 0.1176, + "step": 17990 + }, + { + "epoch": 3.0224882196826086, + "grad_norm": 5.661135673522949, + "learning_rate": 4.874614093050967e-05, + "loss": 0.0622, + "step": 18000 + }, + { + "epoch": 3.0225423820614203, + "grad_norm": 1.1341850757598877, + "learning_rate": 4.877322211991551e-05, + "loss": 0.1473, + "step": 18010 + }, + { + "epoch": 3.0225965444402316, + "grad_norm": 0.9709892868995667, + "learning_rate": 4.880030330932135e-05, + "loss": 0.236, + "step": 18020 + }, + { + "epoch": 
3.0226507068190434, + "grad_norm": 0.07000138610601425, + "learning_rate": 4.882738449872719e-05, + "loss": 0.0735, + "step": 18030 + }, + { + "epoch": 3.022704869197855, + "grad_norm": 0.46052175760269165, + "learning_rate": 4.8854465688133024e-05, + "loss": 0.1435, + "step": 18040 + }, + { + "epoch": 3.022759031576667, + "grad_norm": 0.14378465712070465, + "learning_rate": 4.888154687753886e-05, + "loss": 0.0632, + "step": 18050 + }, + { + "epoch": 3.0228131939554785, + "grad_norm": 0.01188215333968401, + "learning_rate": 4.89086280669447e-05, + "loss": 0.0604, + "step": 18060 + }, + { + "epoch": 3.0228673563342903, + "grad_norm": 8.992758750915527, + "learning_rate": 4.893570925635054e-05, + "loss": 0.0636, + "step": 18070 + }, + { + "epoch": 3.022921518713102, + "grad_norm": 0.02854861132800579, + "learning_rate": 4.896279044575638e-05, + "loss": 0.1487, + "step": 18080 + }, + { + "epoch": 3.0229756810919137, + "grad_norm": 0.03130706027150154, + "learning_rate": 4.898987163516222e-05, + "loss": 0.0943, + "step": 18090 + }, + { + "epoch": 3.0230298434707255, + "grad_norm": 1.218820571899414, + "learning_rate": 4.901695282456806e-05, + "loss": 0.1538, + "step": 18100 + }, + { + "epoch": 3.0230840058495367, + "grad_norm": 0.05349326133728027, + "learning_rate": 4.9044034013973895e-05, + "loss": 0.1022, + "step": 18110 + }, + { + "epoch": 3.0231381682283485, + "grad_norm": 1.7356195449829102, + "learning_rate": 4.907111520337974e-05, + "loss": 0.028, + "step": 18120 + }, + { + "epoch": 3.02319233060716, + "grad_norm": 3.127291202545166, + "learning_rate": 4.9098196392785576e-05, + "loss": 0.1298, + "step": 18130 + }, + { + "epoch": 3.023246492985972, + "grad_norm": 0.08414258807897568, + "learning_rate": 4.912527758219141e-05, + "loss": 0.2466, + "step": 18140 + }, + { + "epoch": 3.0233006553647837, + "grad_norm": 8.527313232421875, + "learning_rate": 4.915235877159725e-05, + "loss": 0.1386, + "step": 18150 + }, + { + "epoch": 3.0233548177435954, + "grad_norm": 10.05178165435791, + "learning_rate": 4.9179439961003086e-05, + "loss": 0.1447, + "step": 18160 + }, + { + "epoch": 3.023408980122407, + "grad_norm": 0.09049778431653976, + "learning_rate": 4.920652115040893e-05, + "loss": 0.2474, + "step": 18170 + }, + { + "epoch": 3.023463142501219, + "grad_norm": 0.35119229555130005, + "learning_rate": 4.9233602339814766e-05, + "loss": 0.071, + "step": 18180 + }, + { + "epoch": 3.02351730488003, + "grad_norm": 0.8239344954490662, + "learning_rate": 4.926068352922061e-05, + "loss": 0.0355, + "step": 18190 + }, + { + "epoch": 3.023571467258842, + "grad_norm": 0.0943911001086235, + "learning_rate": 4.9287764718626446e-05, + "loss": 0.1768, + "step": 18200 + }, + { + "epoch": 3.0236256296376536, + "grad_norm": 1.7887928485870361, + "learning_rate": 4.931484590803228e-05, + "loss": 0.1003, + "step": 18210 + }, + { + "epoch": 3.0236797920164653, + "grad_norm": 0.017379336059093475, + "learning_rate": 4.934192709743812e-05, + "loss": 0.0417, + "step": 18220 + }, + { + "epoch": 3.023733954395277, + "grad_norm": 5.620433330535889, + "learning_rate": 4.9369008286843957e-05, + "loss": 0.0371, + "step": 18230 + }, + { + "epoch": 3.0237881167740888, + "grad_norm": 0.07633041590452194, + "learning_rate": 4.93960894762498e-05, + "loss": 0.0635, + "step": 18240 + }, + { + "epoch": 3.0238422791529005, + "grad_norm": 0.13998937606811523, + "learning_rate": 4.942317066565564e-05, + "loss": 0.0881, + "step": 18250 + }, + { + "epoch": 3.023896441531712, + "grad_norm": 0.1340373158454895, + "learning_rate": 
4.9450251855061474e-05, + "loss": 0.1111, + "step": 18260 + }, + { + "epoch": 3.023950603910524, + "grad_norm": 0.03118700161576271, + "learning_rate": 4.947733304446732e-05, + "loss": 0.113, + "step": 18270 + }, + { + "epoch": 3.0240047662893352, + "grad_norm": 0.018209943547844887, + "learning_rate": 4.9504414233873154e-05, + "loss": 0.1296, + "step": 18280 + }, + { + "epoch": 3.024058928668147, + "grad_norm": 0.0021859805565327406, + "learning_rate": 4.9531495423279e-05, + "loss": 0.0291, + "step": 18290 + }, + { + "epoch": 3.0241130910469587, + "grad_norm": 0.004888099152594805, + "learning_rate": 4.9558576612684834e-05, + "loss": 0.0637, + "step": 18300 + }, + { + "epoch": 3.0241672534257704, + "grad_norm": 0.04342144355177879, + "learning_rate": 4.958565780209067e-05, + "loss": 0.1672, + "step": 18310 + }, + { + "epoch": 3.024221415804582, + "grad_norm": 0.006838384084403515, + "learning_rate": 4.961273899149651e-05, + "loss": 0.1432, + "step": 18320 + }, + { + "epoch": 3.024275578183394, + "grad_norm": 0.0923570841550827, + "learning_rate": 4.9639820180902344e-05, + "loss": 0.1504, + "step": 18330 + }, + { + "epoch": 3.0243297405622056, + "grad_norm": 0.0909096747636795, + "learning_rate": 4.966690137030819e-05, + "loss": 0.0391, + "step": 18340 + }, + { + "epoch": 3.0243839029410173, + "grad_norm": 6.799511909484863, + "learning_rate": 4.9693982559714025e-05, + "loss": 0.1233, + "step": 18350 + }, + { + "epoch": 3.024438065319829, + "grad_norm": 0.47758403420448303, + "learning_rate": 4.972106374911986e-05, + "loss": 0.0984, + "step": 18360 + }, + { + "epoch": 3.0244922276986403, + "grad_norm": 0.7072124481201172, + "learning_rate": 4.9748144938525705e-05, + "loss": 0.334, + "step": 18370 + }, + { + "epoch": 3.024546390077452, + "grad_norm": 0.6049679517745972, + "learning_rate": 4.977522612793154e-05, + "loss": 0.0833, + "step": 18380 + }, + { + "epoch": 3.024600552456264, + "grad_norm": 0.1295345276594162, + "learning_rate": 4.9802307317337385e-05, + "loss": 0.0672, + "step": 18390 + }, + { + "epoch": 3.0246547148350755, + "grad_norm": 25.98175048828125, + "learning_rate": 4.982938850674322e-05, + "loss": 0.4845, + "step": 18400 + }, + { + "epoch": 3.0247088772138873, + "grad_norm": 0.483500599861145, + "learning_rate": 4.985646969614905e-05, + "loss": 0.0528, + "step": 18410 + }, + { + "epoch": 3.024763039592699, + "grad_norm": 2.3386178016662598, + "learning_rate": 4.9883550885554896e-05, + "loss": 0.1348, + "step": 18420 + }, + { + "epoch": 3.0248172019715107, + "grad_norm": 5.613415241241455, + "learning_rate": 4.991063207496073e-05, + "loss": 0.0574, + "step": 18430 + }, + { + "epoch": 3.0248713643503224, + "grad_norm": 4.685136795043945, + "learning_rate": 4.9937713264366576e-05, + "loss": 0.2964, + "step": 18440 + }, + { + "epoch": 3.0249255267291337, + "grad_norm": 0.0694117471575737, + "learning_rate": 4.996479445377241e-05, + "loss": 0.0619, + "step": 18450 + }, + { + "epoch": 3.0249796891079455, + "grad_norm": 0.09913896024227142, + "learning_rate": 4.999187564317825e-05, + "loss": 0.0496, + "step": 18460 + }, + { + "epoch": 3.0250013540594702, + "eval_accuracy": 0.8017635532331809, + "eval_loss": 0.6797383427619934, + "eval_runtime": 117.4438, + "eval_samples_per_second": 26.072, + "eval_steps_per_second": 3.261, + "step": 18464 + }, + { + "epoch": 4.000032497427287, + "grad_norm": 2.414024829864502, + "learning_rate": 4.999789368526844e-05, + "loss": 0.0415, + "step": 18470 + }, + { + "epoch": 4.000086659806099, + "grad_norm": 16.926599502563477, + "learning_rate": 
4.999488466422334e-05, + "loss": 0.1521, + "step": 18480 + }, + { + "epoch": 4.00014082218491, + "grad_norm": 4.317107677459717, + "learning_rate": 4.999187564317825e-05, + "loss": 0.1559, + "step": 18490 + }, + { + "epoch": 4.000194984563722, + "grad_norm": 1.1466176509857178, + "learning_rate": 4.9988866622133156e-05, + "loss": 0.2736, + "step": 18500 + }, + { + "epoch": 4.000249146942534, + "grad_norm": 0.23055508732795715, + "learning_rate": 4.998585760108806e-05, + "loss": 0.112, + "step": 18510 + }, + { + "epoch": 4.000303309321345, + "grad_norm": 6.304684638977051, + "learning_rate": 4.9982848580042975e-05, + "loss": 0.1133, + "step": 18520 + }, + { + "epoch": 4.000357471700157, + "grad_norm": 0.03901456668972969, + "learning_rate": 4.9979839558997874e-05, + "loss": 0.0549, + "step": 18530 + }, + { + "epoch": 4.000411634078969, + "grad_norm": 0.03190433606505394, + "learning_rate": 4.997683053795279e-05, + "loss": 0.1017, + "step": 18540 + }, + { + "epoch": 4.000465796457781, + "grad_norm": 0.018570702522993088, + "learning_rate": 4.9973821516907694e-05, + "loss": 0.2601, + "step": 18550 + }, + { + "epoch": 4.000519958836592, + "grad_norm": 1.4546805620193481, + "learning_rate": 4.99708124958626e-05, + "loss": 0.0996, + "step": 18560 + }, + { + "epoch": 4.000574121215403, + "grad_norm": 0.3574541509151459, + "learning_rate": 4.9967803474817506e-05, + "loss": 0.0825, + "step": 18570 + }, + { + "epoch": 4.0006282835942155, + "grad_norm": 0.415255069732666, + "learning_rate": 4.996479445377241e-05, + "loss": 0.0742, + "step": 18580 + }, + { + "epoch": 4.000682445973027, + "grad_norm": 0.020412951707839966, + "learning_rate": 4.996178543272732e-05, + "loss": 0.0776, + "step": 18590 + }, + { + "epoch": 4.000736608351839, + "grad_norm": 7.841737270355225, + "learning_rate": 4.9958776411682225e-05, + "loss": 0.1583, + "step": 18600 + }, + { + "epoch": 4.00079077073065, + "grad_norm": 4.155407428741455, + "learning_rate": 4.995576739063713e-05, + "loss": 0.0359, + "step": 18610 + }, + { + "epoch": 4.000844933109462, + "grad_norm": 0.3471609354019165, + "learning_rate": 4.995275836959204e-05, + "loss": 0.1272, + "step": 18620 + }, + { + "epoch": 4.000899095488274, + "grad_norm": 0.2239498794078827, + "learning_rate": 4.994974934854695e-05, + "loss": 0.0615, + "step": 18630 + }, + { + "epoch": 4.000953257867086, + "grad_norm": 0.681570827960968, + "learning_rate": 4.994674032750185e-05, + "loss": 0.1015, + "step": 18640 + }, + { + "epoch": 4.001007420245897, + "grad_norm": 0.7507639527320862, + "learning_rate": 4.9943731306456757e-05, + "loss": 0.0834, + "step": 18650 + }, + { + "epoch": 4.0010615826247085, + "grad_norm": 15.327187538146973, + "learning_rate": 4.994072228541167e-05, + "loss": 0.2837, + "step": 18660 + }, + { + "epoch": 4.001115745003521, + "grad_norm": 0.15494488179683685, + "learning_rate": 4.9937713264366576e-05, + "loss": 0.098, + "step": 18670 + }, + { + "epoch": 4.001169907382332, + "grad_norm": 0.037692535668611526, + "learning_rate": 4.9934704243321475e-05, + "loss": 0.0647, + "step": 18680 + }, + { + "epoch": 4.001224069761144, + "grad_norm": 0.9781171679496765, + "learning_rate": 4.993169522227639e-05, + "loss": 0.0327, + "step": 18690 + }, + { + "epoch": 4.001278232139955, + "grad_norm": 0.004741961602121592, + "learning_rate": 4.9928686201231295e-05, + "loss": 0.0606, + "step": 18700 + }, + { + "epoch": 4.0013323945187675, + "grad_norm": 0.004651352297514677, + "learning_rate": 4.99256771801862e-05, + "loss": 0.13, + "step": 18710 + }, + { + "epoch": 
4.001386556897579, + "grad_norm": 0.022536195814609528, + "learning_rate": 4.992266815914111e-05, + "loss": 0.0276, + "step": 18720 + }, + { + "epoch": 4.001440719276391, + "grad_norm": 0.1908181607723236, + "learning_rate": 4.9919659138096013e-05, + "loss": 0.1071, + "step": 18730 + }, + { + "epoch": 4.001494881655202, + "grad_norm": 3.735893964767456, + "learning_rate": 4.991665011705092e-05, + "loss": 0.1135, + "step": 18740 + }, + { + "epoch": 4.001549044034014, + "grad_norm": 1.8586093187332153, + "learning_rate": 4.9913641096005826e-05, + "loss": 0.0676, + "step": 18750 + }, + { + "epoch": 4.001603206412826, + "grad_norm": 13.507389068603516, + "learning_rate": 4.991063207496073e-05, + "loss": 0.0782, + "step": 18760 + }, + { + "epoch": 4.001657368791637, + "grad_norm": 0.002966654021292925, + "learning_rate": 4.9907623053915645e-05, + "loss": 0.1683, + "step": 18770 + }, + { + "epoch": 4.001711531170449, + "grad_norm": 0.001692447578534484, + "learning_rate": 4.990461403287055e-05, + "loss": 0.0112, + "step": 18780 + }, + { + "epoch": 4.0017656935492605, + "grad_norm": 0.3190646171569824, + "learning_rate": 4.990160501182545e-05, + "loss": 0.1923, + "step": 18790 + }, + { + "epoch": 4.001819855928073, + "grad_norm": 4.713146686553955, + "learning_rate": 4.9898595990780364e-05, + "loss": 0.0596, + "step": 18800 + }, + { + "epoch": 4.001874018306884, + "grad_norm": 0.035573508590459824, + "learning_rate": 4.989558696973527e-05, + "loss": 0.0831, + "step": 18810 + }, + { + "epoch": 4.001928180685696, + "grad_norm": 1.322647213935852, + "learning_rate": 4.989257794869018e-05, + "loss": 0.086, + "step": 18820 + }, + { + "epoch": 4.001982343064507, + "grad_norm": 0.008738083764910698, + "learning_rate": 4.988956892764508e-05, + "loss": 0.1274, + "step": 18830 + }, + { + "epoch": 4.002036505443319, + "grad_norm": 5.168344974517822, + "learning_rate": 4.988655990659999e-05, + "loss": 0.2261, + "step": 18840 + }, + { + "epoch": 4.002090667822131, + "grad_norm": 0.17926345765590668, + "learning_rate": 4.9883550885554896e-05, + "loss": 0.1296, + "step": 18850 + }, + { + "epoch": 4.002144830200942, + "grad_norm": 4.2772908210754395, + "learning_rate": 4.988054186450981e-05, + "loss": 0.0948, + "step": 18860 + }, + { + "epoch": 4.002198992579754, + "grad_norm": 0.25212562084198, + "learning_rate": 4.987753284346471e-05, + "loss": 0.0096, + "step": 18870 + }, + { + "epoch": 4.002253154958566, + "grad_norm": 0.912562906742096, + "learning_rate": 4.9874523822419614e-05, + "loss": 0.1388, + "step": 18880 + }, + { + "epoch": 4.002307317337378, + "grad_norm": 0.025678643956780434, + "learning_rate": 4.987151480137453e-05, + "loss": 0.0422, + "step": 18890 + }, + { + "epoch": 4.002361479716189, + "grad_norm": 0.9006250500679016, + "learning_rate": 4.986850578032943e-05, + "loss": 0.4493, + "step": 18900 + }, + { + "epoch": 4.002415642095001, + "grad_norm": 0.04530886188149452, + "learning_rate": 4.986549675928433e-05, + "loss": 0.1649, + "step": 18910 + }, + { + "epoch": 4.0024698044738125, + "grad_norm": 0.15238983929157257, + "learning_rate": 4.9862487738239246e-05, + "loss": 0.1778, + "step": 18920 + }, + { + "epoch": 4.002523966852624, + "grad_norm": 0.038510702550411224, + "learning_rate": 4.985947871719415e-05, + "loss": 0.0755, + "step": 18930 + }, + { + "epoch": 4.002578129231436, + "grad_norm": 3.1564793586730957, + "learning_rate": 4.985646969614905e-05, + "loss": 0.2796, + "step": 18940 + }, + { + "epoch": 4.002632291610247, + "grad_norm": 8.351426124572754, + "learning_rate": 
4.9853460675103965e-05, + "loss": 0.1896, + "step": 18950 + }, + { + "epoch": 4.002686453989059, + "grad_norm": 4.061903476715088, + "learning_rate": 4.985045165405887e-05, + "loss": 0.1856, + "step": 18960 + }, + { + "epoch": 4.002740616367871, + "grad_norm": 2.876166820526123, + "learning_rate": 4.984744263301378e-05, + "loss": 0.046, + "step": 18970 + }, + { + "epoch": 4.002794778746683, + "grad_norm": 1.9677259922027588, + "learning_rate": 4.9844433611968684e-05, + "loss": 0.1074, + "step": 18980 + }, + { + "epoch": 4.002848941125494, + "grad_norm": 0.014843023382127285, + "learning_rate": 4.984142459092359e-05, + "loss": 0.0766, + "step": 18990 + }, + { + "epoch": 4.002903103504306, + "grad_norm": 0.4828489422798157, + "learning_rate": 4.9838415569878496e-05, + "loss": 0.1129, + "step": 19000 + }, + { + "epoch": 4.002957265883118, + "grad_norm": 0.05584312975406647, + "learning_rate": 4.983540654883341e-05, + "loss": 0.0163, + "step": 19010 + }, + { + "epoch": 4.003011428261929, + "grad_norm": 0.014520134776830673, + "learning_rate": 4.983239752778831e-05, + "loss": 0.0103, + "step": 19020 + }, + { + "epoch": 4.003065590640741, + "grad_norm": 0.0629306361079216, + "learning_rate": 4.982938850674322e-05, + "loss": 0.0425, + "step": 19030 + }, + { + "epoch": 4.003119753019552, + "grad_norm": 0.12161201238632202, + "learning_rate": 4.982637948569813e-05, + "loss": 0.1275, + "step": 19040 + }, + { + "epoch": 4.0031739153983645, + "grad_norm": 16.23068618774414, + "learning_rate": 4.982337046465303e-05, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 4.003228077777176, + "grad_norm": 0.006428275257349014, + "learning_rate": 4.982036144360794e-05, + "loss": 0.1954, + "step": 19060 + }, + { + "epoch": 4.003282240155988, + "grad_norm": 0.8172194957733154, + "learning_rate": 4.981735242256285e-05, + "loss": 0.1374, + "step": 19070 + }, + { + "epoch": 4.003336402534799, + "grad_norm": 4.7951202392578125, + "learning_rate": 4.9814343401517753e-05, + "loss": 0.2381, + "step": 19080 + }, + { + "epoch": 4.0033905649136114, + "grad_norm": 0.332155704498291, + "learning_rate": 4.981133438047266e-05, + "loss": 0.1203, + "step": 19090 + }, + { + "epoch": 4.003444727292423, + "grad_norm": 0.1551278978586197, + "learning_rate": 4.9808325359427566e-05, + "loss": 0.2352, + "step": 19100 + }, + { + "epoch": 4.003498889671234, + "grad_norm": 0.6135412454605103, + "learning_rate": 4.980531633838247e-05, + "loss": 0.0497, + "step": 19110 + }, + { + "epoch": 4.003553052050046, + "grad_norm": 6.141516208648682, + "learning_rate": 4.9802307317337385e-05, + "loss": 0.0865, + "step": 19120 + }, + { + "epoch": 4.0036072144288575, + "grad_norm": 0.013867524452507496, + "learning_rate": 4.9799298296292285e-05, + "loss": 0.207, + "step": 19130 + }, + { + "epoch": 4.00366137680767, + "grad_norm": 0.1417069435119629, + "learning_rate": 4.979628927524719e-05, + "loss": 0.0609, + "step": 19140 + }, + { + "epoch": 4.003715539186481, + "grad_norm": 7.034412860870361, + "learning_rate": 4.9793280254202104e-05, + "loss": 0.2122, + "step": 19150 + }, + { + "epoch": 4.003769701565293, + "grad_norm": 0.045723769813776016, + "learning_rate": 4.979027123315701e-05, + "loss": 0.1708, + "step": 19160 + }, + { + "epoch": 4.003823863944104, + "grad_norm": 4.881666660308838, + "learning_rate": 4.978726221211191e-05, + "loss": 0.1312, + "step": 19170 + }, + { + "epoch": 4.003878026322916, + "grad_norm": 0.0894942358136177, + "learning_rate": 4.978425319106682e-05, + "loss": 0.1005, + "step": 19180 + }, + { + "epoch": 
4.003932188701728, + "grad_norm": 0.11654670536518097, + "learning_rate": 4.978124417002173e-05, + "loss": 0.0998, + "step": 19190 + }, + { + "epoch": 4.003986351080539, + "grad_norm": 0.11550813913345337, + "learning_rate": 4.977823514897663e-05, + "loss": 0.0636, + "step": 19200 + }, + { + "epoch": 4.004040513459351, + "grad_norm": 0.015611570328474045, + "learning_rate": 4.977522612793154e-05, + "loss": 0.1033, + "step": 19210 + }, + { + "epoch": 4.004094675838163, + "grad_norm": 0.02385098673403263, + "learning_rate": 4.977221710688645e-05, + "loss": 0.1586, + "step": 19220 + }, + { + "epoch": 4.004148838216975, + "grad_norm": 0.2910217046737671, + "learning_rate": 4.9769208085841354e-05, + "loss": 0.12, + "step": 19230 + }, + { + "epoch": 4.004203000595786, + "grad_norm": 0.0591222420334816, + "learning_rate": 4.976619906479626e-05, + "loss": 0.062, + "step": 19240 + }, + { + "epoch": 4.004257162974598, + "grad_norm": 12.848390579223633, + "learning_rate": 4.976319004375117e-05, + "loss": 0.0619, + "step": 19250 + }, + { + "epoch": 4.0043113253534095, + "grad_norm": 0.013956123031675816, + "learning_rate": 4.976018102270607e-05, + "loss": 0.0897, + "step": 19260 + }, + { + "epoch": 4.004365487732221, + "grad_norm": 2.7877771854400635, + "learning_rate": 4.9757172001660986e-05, + "loss": 0.0646, + "step": 19270 + }, + { + "epoch": 4.004419650111033, + "grad_norm": 0.14100952446460724, + "learning_rate": 4.9754162980615886e-05, + "loss": 0.0193, + "step": 19280 + }, + { + "epoch": 4.004473812489844, + "grad_norm": 0.521274983882904, + "learning_rate": 4.97511539595708e-05, + "loss": 0.1129, + "step": 19290 + }, + { + "epoch": 4.004527974868656, + "grad_norm": 3.8746345043182373, + "learning_rate": 4.9748144938525705e-05, + "loss": 0.145, + "step": 19300 + }, + { + "epoch": 4.004582137247468, + "grad_norm": 0.5460067391395569, + "learning_rate": 4.974513591748061e-05, + "loss": 0.1267, + "step": 19310 + }, + { + "epoch": 4.00463629962628, + "grad_norm": 0.16507567465305328, + "learning_rate": 4.974212689643552e-05, + "loss": 0.055, + "step": 19320 + }, + { + "epoch": 4.004690462005091, + "grad_norm": 0.003414560342207551, + "learning_rate": 4.9739117875390424e-05, + "loss": 0.146, + "step": 19330 + }, + { + "epoch": 4.004744624383903, + "grad_norm": 0.018147358670830727, + "learning_rate": 4.973610885434533e-05, + "loss": 0.009, + "step": 19340 + }, + { + "epoch": 4.004798786762715, + "grad_norm": 2.501927375793457, + "learning_rate": 4.9733099833300236e-05, + "loss": 0.0336, + "step": 19350 + }, + { + "epoch": 4.004852949141526, + "grad_norm": 0.22212505340576172, + "learning_rate": 4.973009081225514e-05, + "loss": 0.0084, + "step": 19360 + }, + { + "epoch": 4.004907111520338, + "grad_norm": 0.8729001879692078, + "learning_rate": 4.972708179121005e-05, + "loss": 0.071, + "step": 19370 + }, + { + "epoch": 4.004961273899149, + "grad_norm": 0.00218696566298604, + "learning_rate": 4.972407277016496e-05, + "loss": 0.0372, + "step": 19380 + }, + { + "epoch": 4.0050154362779615, + "grad_norm": 5.673499584197998, + "learning_rate": 4.972106374911986e-05, + "loss": 0.1782, + "step": 19390 + }, + { + "epoch": 4.005069598656773, + "grad_norm": 0.03593791276216507, + "learning_rate": 4.971805472807477e-05, + "loss": 0.0694, + "step": 19400 + }, + { + "epoch": 4.005123761035585, + "grad_norm": 0.03905562683939934, + "learning_rate": 4.971504570702968e-05, + "loss": 0.0428, + "step": 19410 + }, + { + "epoch": 4.005177923414396, + "grad_norm": 3.0355420112609863, + "learning_rate": 
4.971203668598459e-05, + "loss": 0.1872, + "step": 19420 + }, + { + "epoch": 4.005232085793208, + "grad_norm": 0.6066368818283081, + "learning_rate": 4.9709027664939487e-05, + "loss": 0.1841, + "step": 19430 + }, + { + "epoch": 4.00528624817202, + "grad_norm": 5.393535137176514, + "learning_rate": 4.97060186438944e-05, + "loss": 0.0691, + "step": 19440 + }, + { + "epoch": 4.005340410550831, + "grad_norm": 2.327395439147949, + "learning_rate": 4.9703009622849306e-05, + "loss": 0.1927, + "step": 19450 + }, + { + "epoch": 4.005394572929643, + "grad_norm": 3.166475772857666, + "learning_rate": 4.970000060180421e-05, + "loss": 0.147, + "step": 19460 + }, + { + "epoch": 4.0054487353084545, + "grad_norm": 0.17505989968776703, + "learning_rate": 4.969699158075912e-05, + "loss": 0.0521, + "step": 19470 + }, + { + "epoch": 4.005502897687267, + "grad_norm": 0.4284748435020447, + "learning_rate": 4.9693982559714025e-05, + "loss": 0.1212, + "step": 19480 + }, + { + "epoch": 4.005557060066078, + "grad_norm": 0.26634082198143005, + "learning_rate": 4.969097353866893e-05, + "loss": 0.1912, + "step": 19490 + }, + { + "epoch": 4.00561122244489, + "grad_norm": 12.750648498535156, + "learning_rate": 4.968796451762384e-05, + "loss": 0.1168, + "step": 19500 + }, + { + "epoch": 4.005665384823701, + "grad_norm": 0.017974775284528732, + "learning_rate": 4.9684955496578744e-05, + "loss": 0.045, + "step": 19510 + }, + { + "epoch": 4.0057195472025136, + "grad_norm": 5.168978691101074, + "learning_rate": 4.968194647553365e-05, + "loss": 0.0749, + "step": 19520 + }, + { + "epoch": 4.005773709581325, + "grad_norm": 0.07613505423069, + "learning_rate": 4.967893745448856e-05, + "loss": 0.044, + "step": 19530 + }, + { + "epoch": 4.005827871960136, + "grad_norm": 0.08396238088607788, + "learning_rate": 4.967592843344346e-05, + "loss": 0.1547, + "step": 19540 + }, + { + "epoch": 4.005882034338948, + "grad_norm": 0.31114229559898376, + "learning_rate": 4.9672919412398375e-05, + "loss": 0.2283, + "step": 19550 + }, + { + "epoch": 4.00593619671776, + "grad_norm": 12.921367645263672, + "learning_rate": 4.966991039135328e-05, + "loss": 0.0873, + "step": 19560 + }, + { + "epoch": 4.005990359096572, + "grad_norm": 0.5162879824638367, + "learning_rate": 4.966690137030819e-05, + "loss": 0.0965, + "step": 19570 + }, + { + "epoch": 4.006044521475383, + "grad_norm": 0.25772425532341003, + "learning_rate": 4.9663892349263094e-05, + "loss": 0.1406, + "step": 19580 + }, + { + "epoch": 4.006098683854195, + "grad_norm": 0.48874539136886597, + "learning_rate": 4.9660883328218e-05, + "loss": 0.232, + "step": 19590 + }, + { + "epoch": 4.0061528462330065, + "grad_norm": 7.377795696258545, + "learning_rate": 4.965787430717291e-05, + "loss": 0.0511, + "step": 19600 + }, + { + "epoch": 4.006207008611819, + "grad_norm": 9.581839561462402, + "learning_rate": 4.965486528612781e-05, + "loss": 0.2198, + "step": 19610 + }, + { + "epoch": 4.00626117099063, + "grad_norm": 1.4403495788574219, + "learning_rate": 4.965185626508272e-05, + "loss": 0.1784, + "step": 19620 + }, + { + "epoch": 4.006315333369441, + "grad_norm": 0.014752495102584362, + "learning_rate": 4.9648847244037626e-05, + "loss": 0.055, + "step": 19630 + }, + { + "epoch": 4.006369495748253, + "grad_norm": 0.4542013108730316, + "learning_rate": 4.964583822299254e-05, + "loss": 0.2159, + "step": 19640 + }, + { + "epoch": 4.006423658127065, + "grad_norm": 6.223991394042969, + "learning_rate": 4.964282920194744e-05, + "loss": 0.2193, + "step": 19650 + }, + { + "epoch": 4.006477820505877, + 
"grad_norm": 6.211861610412598, + "learning_rate": 4.9639820180902344e-05, + "loss": 0.0701, + "step": 19660 + }, + { + "epoch": 4.006531982884688, + "grad_norm": 2.3662142753601074, + "learning_rate": 4.963681115985726e-05, + "loss": 0.0425, + "step": 19670 + }, + { + "epoch": 4.0065861452635, + "grad_norm": 0.10043216496706009, + "learning_rate": 4.9633802138812164e-05, + "loss": 0.1455, + "step": 19680 + }, + { + "epoch": 4.006640307642312, + "grad_norm": 0.38855499029159546, + "learning_rate": 4.963079311776706e-05, + "loss": 0.1088, + "step": 19690 + }, + { + "epoch": 4.006694470021123, + "grad_norm": 7.430174350738525, + "learning_rate": 4.9627784096721976e-05, + "loss": 0.1845, + "step": 19700 + }, + { + "epoch": 4.006748632399935, + "grad_norm": 0.3013147711753845, + "learning_rate": 4.962477507567688e-05, + "loss": 0.0747, + "step": 19710 + }, + { + "epoch": 4.006802794778746, + "grad_norm": 0.06464339792728424, + "learning_rate": 4.962176605463179e-05, + "loss": 0.1009, + "step": 19720 + }, + { + "epoch": 4.0068569571575585, + "grad_norm": 0.2361489087343216, + "learning_rate": 4.9618757033586695e-05, + "loss": 0.045, + "step": 19730 + }, + { + "epoch": 4.00691111953637, + "grad_norm": 2.8399882316589355, + "learning_rate": 4.96157480125416e-05, + "loss": 0.1411, + "step": 19740 + }, + { + "epoch": 4.006965281915182, + "grad_norm": 0.5867394208908081, + "learning_rate": 4.961273899149651e-05, + "loss": 0.1527, + "step": 19750 + }, + { + "epoch": 4.007019444293993, + "grad_norm": 0.6647461652755737, + "learning_rate": 4.960972997045142e-05, + "loss": 0.1247, + "step": 19760 + }, + { + "epoch": 4.007073606672805, + "grad_norm": 0.14577510952949524, + "learning_rate": 4.960672094940632e-05, + "loss": 0.0864, + "step": 19770 + }, + { + "epoch": 4.007127769051617, + "grad_norm": 0.08160528540611267, + "learning_rate": 4.9603711928361227e-05, + "loss": 0.0442, + "step": 19780 + }, + { + "epoch": 4.007181931430428, + "grad_norm": 0.03239111602306366, + "learning_rate": 4.960070290731614e-05, + "loss": 0.0685, + "step": 19790 + }, + { + "epoch": 4.00723609380924, + "grad_norm": 0.023503005504608154, + "learning_rate": 4.959769388627104e-05, + "loss": 0.015, + "step": 19800 + }, + { + "epoch": 4.0072902561880515, + "grad_norm": 0.00668123597279191, + "learning_rate": 4.959468486522595e-05, + "loss": 0.0405, + "step": 19810 + }, + { + "epoch": 4.007344418566864, + "grad_norm": 5.363600730895996, + "learning_rate": 4.959167584418086e-05, + "loss": 0.1444, + "step": 19820 + }, + { + "epoch": 4.007398580945675, + "grad_norm": 4.3501410484313965, + "learning_rate": 4.9588666823135765e-05, + "loss": 0.1308, + "step": 19830 + }, + { + "epoch": 4.007452743324487, + "grad_norm": 0.01971019245684147, + "learning_rate": 4.958565780209067e-05, + "loss": 0.0062, + "step": 19840 + }, + { + "epoch": 4.007506905703298, + "grad_norm": 0.32933494448661804, + "learning_rate": 4.958264878104558e-05, + "loss": 0.0276, + "step": 19850 + }, + { + "epoch": 4.0075610680821105, + "grad_norm": 1.8562648296356201, + "learning_rate": 4.9579639760000484e-05, + "loss": 0.1362, + "step": 19860 + }, + { + "epoch": 4.007615230460922, + "grad_norm": 4.021598815917969, + "learning_rate": 4.957663073895539e-05, + "loss": 0.1585, + "step": 19870 + }, + { + "epoch": 4.007669392839733, + "grad_norm": 0.6957274079322815, + "learning_rate": 4.9573621717910296e-05, + "loss": 0.0763, + "step": 19880 + }, + { + "epoch": 4.007723555218545, + "grad_norm": 0.015504605136811733, + "learning_rate": 4.95706126968652e-05, + "loss": 0.0259, 
+ "step": 19890 + }, + { + "epoch": 4.007777717597357, + "grad_norm": 0.062293797731399536, + "learning_rate": 4.9567603675820115e-05, + "loss": 0.303, + "step": 19900 + }, + { + "epoch": 4.007831879976169, + "grad_norm": 9.77768325805664, + "learning_rate": 4.956459465477502e-05, + "loss": 0.1466, + "step": 19910 + }, + { + "epoch": 4.00788604235498, + "grad_norm": 0.010823491960763931, + "learning_rate": 4.956158563372992e-05, + "loss": 0.0483, + "step": 19920 + }, + { + "epoch": 4.007940204733792, + "grad_norm": 17.707420349121094, + "learning_rate": 4.9558576612684834e-05, + "loss": 0.1616, + "step": 19930 + }, + { + "epoch": 4.0079943671126035, + "grad_norm": 0.023961266502738, + "learning_rate": 4.955556759163974e-05, + "loss": 0.0777, + "step": 19940 + }, + { + "epoch": 4.008048529491416, + "grad_norm": 0.05856113135814667, + "learning_rate": 4.955255857059464e-05, + "loss": 0.1237, + "step": 19950 + }, + { + "epoch": 4.008102691870227, + "grad_norm": 0.338380366563797, + "learning_rate": 4.954954954954955e-05, + "loss": 0.1551, + "step": 19960 + }, + { + "epoch": 4.008156854249038, + "grad_norm": 0.26535850763320923, + "learning_rate": 4.954654052850446e-05, + "loss": 0.0755, + "step": 19970 + }, + { + "epoch": 4.00821101662785, + "grad_norm": 0.2479545921087265, + "learning_rate": 4.9543531507459366e-05, + "loss": 0.0361, + "step": 19980 + }, + { + "epoch": 4.008265179006662, + "grad_norm": 0.03225356712937355, + "learning_rate": 4.954052248641427e-05, + "loss": 0.106, + "step": 19990 + }, + { + "epoch": 4.008319341385474, + "grad_norm": 0.08907619118690491, + "learning_rate": 4.953751346536918e-05, + "loss": 0.1404, + "step": 20000 + }, + { + "epoch": 4.008373503764285, + "grad_norm": 0.2465740442276001, + "learning_rate": 4.9534504444324084e-05, + "loss": 0.063, + "step": 20010 + }, + { + "epoch": 4.008427666143097, + "grad_norm": 0.010303778573870659, + "learning_rate": 4.9531495423279e-05, + "loss": 0.075, + "step": 20020 + }, + { + "epoch": 4.008481828521909, + "grad_norm": 0.05414590612053871, + "learning_rate": 4.95284864022339e-05, + "loss": 0.0767, + "step": 20030 + }, + { + "epoch": 4.008535990900721, + "grad_norm": 0.26204702258110046, + "learning_rate": 4.95254773811888e-05, + "loss": 0.0654, + "step": 20040 + }, + { + "epoch": 4.008590153279532, + "grad_norm": 0.014013085514307022, + "learning_rate": 4.9522468360143716e-05, + "loss": 0.2428, + "step": 20050 + }, + { + "epoch": 4.008644315658343, + "grad_norm": 0.28510603308677673, + "learning_rate": 4.951945933909862e-05, + "loss": 0.0678, + "step": 20060 + }, + { + "epoch": 4.0086984780371555, + "grad_norm": 0.011717161163687706, + "learning_rate": 4.951645031805353e-05, + "loss": 0.117, + "step": 20070 + }, + { + "epoch": 4.008752640415967, + "grad_norm": 0.005221355706453323, + "learning_rate": 4.9513441297008435e-05, + "loss": 0.1357, + "step": 20080 + }, + { + "epoch": 4.008806802794779, + "grad_norm": 1.9939466714859009, + "learning_rate": 4.951043227596334e-05, + "loss": 0.0592, + "step": 20090 + }, + { + "epoch": 4.00886096517359, + "grad_norm": 0.005369182210415602, + "learning_rate": 4.950742325491825e-05, + "loss": 0.1594, + "step": 20100 + }, + { + "epoch": 4.008915127552402, + "grad_norm": 0.1724676638841629, + "learning_rate": 4.9504414233873154e-05, + "loss": 0.1201, + "step": 20110 + }, + { + "epoch": 4.008969289931214, + "grad_norm": 0.9298872351646423, + "learning_rate": 4.950140521282806e-05, + "loss": 0.1036, + "step": 20120 + }, + { + "epoch": 4.009023452310026, + "grad_norm": 0.09636041522026062, 
+ "learning_rate": 4.9498396191782966e-05, + "loss": 0.0759, + "step": 20130 + }, + { + "epoch": 4.009077614688837, + "grad_norm": 14.375079154968262, + "learning_rate": 4.949538717073787e-05, + "loss": 0.1207, + "step": 20140 + }, + { + "epoch": 4.0091317770676484, + "grad_norm": 2.4124436378479004, + "learning_rate": 4.949237814969278e-05, + "loss": 0.1163, + "step": 20150 + }, + { + "epoch": 4.009185939446461, + "grad_norm": 5.42016077041626, + "learning_rate": 4.948936912864769e-05, + "loss": 0.0535, + "step": 20160 + }, + { + "epoch": 4.009240101825272, + "grad_norm": 2.96366548538208, + "learning_rate": 4.94863601076026e-05, + "loss": 0.0783, + "step": 20170 + }, + { + "epoch": 4.009294264204084, + "grad_norm": 7.45358419418335, + "learning_rate": 4.94833510865575e-05, + "loss": 0.011, + "step": 20180 + }, + { + "epoch": 4.009348426582895, + "grad_norm": 0.7821863889694214, + "learning_rate": 4.948034206551241e-05, + "loss": 0.0883, + "step": 20190 + }, + { + "epoch": 4.0094025889617075, + "grad_norm": 0.0028281717095524073, + "learning_rate": 4.947733304446732e-05, + "loss": 0.0862, + "step": 20200 + }, + { + "epoch": 4.009456751340519, + "grad_norm": 0.32263264060020447, + "learning_rate": 4.9474324023422223e-05, + "loss": 0.1571, + "step": 20210 + }, + { + "epoch": 4.009510913719331, + "grad_norm": 0.04448610544204712, + "learning_rate": 4.947131500237713e-05, + "loss": 0.061, + "step": 20220 + }, + { + "epoch": 4.009565076098142, + "grad_norm": 7.941728115081787, + "learning_rate": 4.9468305981332036e-05, + "loss": 0.2162, + "step": 20230 + }, + { + "epoch": 4.009619238476954, + "grad_norm": 0.36206552386283875, + "learning_rate": 4.946529696028694e-05, + "loss": 0.0798, + "step": 20240 + }, + { + "epoch": 4.009673400855766, + "grad_norm": 0.008992808870971203, + "learning_rate": 4.946228793924185e-05, + "loss": 0.0463, + "step": 20250 + }, + { + "epoch": 4.009727563234577, + "grad_norm": 0.5550539493560791, + "learning_rate": 4.9459278918196755e-05, + "loss": 0.1765, + "step": 20260 + }, + { + "epoch": 4.009781725613389, + "grad_norm": 0.13605596125125885, + "learning_rate": 4.945626989715166e-05, + "loss": 0.1746, + "step": 20270 + }, + { + "epoch": 4.0098358879922005, + "grad_norm": 0.21545808017253876, + "learning_rate": 4.9453260876106574e-05, + "loss": 0.0483, + "step": 20280 + }, + { + "epoch": 4.009890050371013, + "grad_norm": 1.832812786102295, + "learning_rate": 4.9450251855061474e-05, + "loss": 0.1298, + "step": 20290 + }, + { + "epoch": 4.009944212749824, + "grad_norm": 0.02019290253520012, + "learning_rate": 4.944724283401638e-05, + "loss": 0.0893, + "step": 20300 + }, + { + "epoch": 4.009998375128635, + "grad_norm": 0.10162672400474548, + "learning_rate": 4.944423381297129e-05, + "loss": 0.0857, + "step": 20310 + }, + { + "epoch": 4.010052537507447, + "grad_norm": 0.12668593227863312, + "learning_rate": 4.94412247919262e-05, + "loss": 0.0513, + "step": 20320 + }, + { + "epoch": 4.010106699886259, + "grad_norm": 0.0724068209528923, + "learning_rate": 4.9438215770881106e-05, + "loss": 0.2312, + "step": 20330 + }, + { + "epoch": 4.010160862265071, + "grad_norm": 3.6326205730438232, + "learning_rate": 4.943520674983601e-05, + "loss": 0.0857, + "step": 20340 + }, + { + "epoch": 4.010215024643882, + "grad_norm": 0.28930938243865967, + "learning_rate": 4.943219772879092e-05, + "loss": 0.0955, + "step": 20350 + }, + { + "epoch": 4.010269187022694, + "grad_norm": 0.02364981733262539, + "learning_rate": 4.9429188707745824e-05, + "loss": 0.1074, + "step": 20360 + }, + { + 
"epoch": 4.010323349401506, + "grad_norm": 5.586335182189941, + "learning_rate": 4.942617968670073e-05, + "loss": 0.1812, + "step": 20370 + }, + { + "epoch": 4.010377511780318, + "grad_norm": 6.402353286743164, + "learning_rate": 4.942317066565564e-05, + "loss": 0.0407, + "step": 20380 + }, + { + "epoch": 4.010431674159129, + "grad_norm": 8.2615327835083, + "learning_rate": 4.942016164461054e-05, + "loss": 0.1315, + "step": 20390 + }, + { + "epoch": 4.01048583653794, + "grad_norm": 0.4087466895580292, + "learning_rate": 4.941715262356545e-05, + "loss": 0.1017, + "step": 20400 + }, + { + "epoch": 4.0105399989167525, + "grad_norm": 2.0877861976623535, + "learning_rate": 4.9414143602520356e-05, + "loss": 0.0354, + "step": 20410 + }, + { + "epoch": 4.010594161295564, + "grad_norm": 0.2586577832698822, + "learning_rate": 4.941113458147527e-05, + "loss": 0.1856, + "step": 20420 + }, + { + "epoch": 4.010648323674376, + "grad_norm": 0.08032501488924026, + "learning_rate": 4.9408125560430175e-05, + "loss": 0.1227, + "step": 20430 + }, + { + "epoch": 4.010702486053187, + "grad_norm": 1.4941450357437134, + "learning_rate": 4.9405116539385075e-05, + "loss": 0.0453, + "step": 20440 + }, + { + "epoch": 4.010756648431999, + "grad_norm": 12.392464637756348, + "learning_rate": 4.940210751833999e-05, + "loss": 0.0803, + "step": 20450 + }, + { + "epoch": 4.010810810810811, + "grad_norm": 0.5691604018211365, + "learning_rate": 4.9399098497294894e-05, + "loss": 0.1466, + "step": 20460 + }, + { + "epoch": 4.010864973189623, + "grad_norm": 0.06762626767158508, + "learning_rate": 4.93960894762498e-05, + "loss": 0.0949, + "step": 20470 + }, + { + "epoch": 4.010919135568434, + "grad_norm": 0.11472900956869125, + "learning_rate": 4.9393080455204706e-05, + "loss": 0.0227, + "step": 20480 + }, + { + "epoch": 4.010973297947245, + "grad_norm": 3.4768459796905518, + "learning_rate": 4.939007143415961e-05, + "loss": 0.1375, + "step": 20490 + }, + { + "epoch": 4.011027460326058, + "grad_norm": 0.09275023639202118, + "learning_rate": 4.938706241311452e-05, + "loss": 0.1467, + "step": 20500 + }, + { + "epoch": 4.011081622704869, + "grad_norm": 4.655501365661621, + "learning_rate": 4.938405339206943e-05, + "loss": 0.0898, + "step": 20510 + }, + { + "epoch": 4.011135785083681, + "grad_norm": 0.12196100503206253, + "learning_rate": 4.938104437102433e-05, + "loss": 0.0136, + "step": 20520 + }, + { + "epoch": 4.011189947462492, + "grad_norm": 8.579094886779785, + "learning_rate": 4.937803534997924e-05, + "loss": 0.2661, + "step": 20530 + }, + { + "epoch": 4.0112441098413045, + "grad_norm": 2.2260215282440186, + "learning_rate": 4.937502632893415e-05, + "loss": 0.2921, + "step": 20540 + }, + { + "epoch": 4.011298272220116, + "grad_norm": 1.0013906955718994, + "learning_rate": 4.937201730788905e-05, + "loss": 0.1379, + "step": 20550 + }, + { + "epoch": 4.011352434598928, + "grad_norm": 0.2984519600868225, + "learning_rate": 4.9369008286843957e-05, + "loss": 0.0964, + "step": 20560 + }, + { + "epoch": 4.011406596977739, + "grad_norm": 0.18774928152561188, + "learning_rate": 4.936599926579887e-05, + "loss": 0.0457, + "step": 20570 + }, + { + "epoch": 4.0114607593565506, + "grad_norm": 0.014345025643706322, + "learning_rate": 4.9362990244753776e-05, + "loss": 0.0623, + "step": 20580 + }, + { + "epoch": 4.011514921735363, + "grad_norm": 0.11583065241575241, + "learning_rate": 4.935998122370868e-05, + "loss": 0.1418, + "step": 20590 + }, + { + "epoch": 4.011569084114174, + "grad_norm": 0.6500497460365295, + "learning_rate": 
4.935697220266359e-05, + "loss": 0.2182, + "step": 20600 + }, + { + "epoch": 4.011623246492986, + "grad_norm": 0.3418170213699341, + "learning_rate": 4.9353963181618495e-05, + "loss": 0.0513, + "step": 20610 + }, + { + "epoch": 4.0116774088717975, + "grad_norm": 10.049148559570312, + "learning_rate": 4.93509541605734e-05, + "loss": 0.1255, + "step": 20620 + }, + { + "epoch": 4.01173157125061, + "grad_norm": 6.365511417388916, + "learning_rate": 4.934794513952831e-05, + "loss": 0.1141, + "step": 20630 + }, + { + "epoch": 4.011785733629421, + "grad_norm": 0.009516280144453049, + "learning_rate": 4.9344936118483214e-05, + "loss": 0.0089, + "step": 20640 + }, + { + "epoch": 4.011839896008233, + "grad_norm": 0.058772191405296326, + "learning_rate": 4.934192709743812e-05, + "loss": 0.2499, + "step": 20650 + }, + { + "epoch": 4.011894058387044, + "grad_norm": 0.3563263416290283, + "learning_rate": 4.933891807639303e-05, + "loss": 0.0659, + "step": 20660 + }, + { + "epoch": 4.011948220765856, + "grad_norm": 0.31730377674102783, + "learning_rate": 4.933590905534793e-05, + "loss": 0.1168, + "step": 20670 + }, + { + "epoch": 4.012002383144668, + "grad_norm": 40.3726692199707, + "learning_rate": 4.9332900034302845e-05, + "loss": 0.1589, + "step": 20680 + }, + { + "epoch": 4.012056545523479, + "grad_norm": 7.876128673553467, + "learning_rate": 4.932989101325775e-05, + "loss": 0.1265, + "step": 20690 + }, + { + "epoch": 4.012110707902291, + "grad_norm": 0.09422269463539124, + "learning_rate": 4.932688199221265e-05, + "loss": 0.011, + "step": 20700 + }, + { + "epoch": 4.012164870281103, + "grad_norm": 0.009282371029257774, + "learning_rate": 4.9323872971167564e-05, + "loss": 0.0394, + "step": 20710 + }, + { + "epoch": 4.012219032659915, + "grad_norm": 8.744529724121094, + "learning_rate": 4.932086395012247e-05, + "loss": 0.0962, + "step": 20720 + }, + { + "epoch": 4.012273195038726, + "grad_norm": 0.035774052143096924, + "learning_rate": 4.931785492907738e-05, + "loss": 0.3414, + "step": 20730 + }, + { + "epoch": 4.012327357417538, + "grad_norm": 0.33228352665901184, + "learning_rate": 4.931484590803228e-05, + "loss": 0.1202, + "step": 20740 + }, + { + "epoch": 4.0123815197963495, + "grad_norm": 7.60584831237793, + "learning_rate": 4.931183688698719e-05, + "loss": 0.1769, + "step": 20750 + }, + { + "epoch": 4.012435682175161, + "grad_norm": 3.1742947101593018, + "learning_rate": 4.9308827865942096e-05, + "loss": 0.2025, + "step": 20760 + }, + { + "epoch": 4.012489844553973, + "grad_norm": 2.0100109577178955, + "learning_rate": 4.930581884489701e-05, + "loss": 0.066, + "step": 20770 + }, + { + "epoch": 4.012544006932784, + "grad_norm": 0.021521223708987236, + "learning_rate": 4.930280982385191e-05, + "loss": 0.1145, + "step": 20780 + }, + { + "epoch": 4.012598169311596, + "grad_norm": 0.1110144704580307, + "learning_rate": 4.9299800802806814e-05, + "loss": 0.0501, + "step": 20790 + }, + { + "epoch": 4.012652331690408, + "grad_norm": 5.472123146057129, + "learning_rate": 4.929679178176173e-05, + "loss": 0.1121, + "step": 20800 + }, + { + "epoch": 4.01270649406922, + "grad_norm": 0.10475726425647736, + "learning_rate": 4.929378276071663e-05, + "loss": 0.2847, + "step": 20810 + }, + { + "epoch": 4.012760656448031, + "grad_norm": 0.12739945948123932, + "learning_rate": 4.929077373967153e-05, + "loss": 0.1221, + "step": 20820 + }, + { + "epoch": 4.012814818826842, + "grad_norm": 0.23958002030849457, + "learning_rate": 4.9287764718626446e-05, + "loss": 0.1051, + "step": 20830 + }, + { + "epoch": 
4.012868981205655, + "grad_norm": 0.48908060789108276, + "learning_rate": 4.928475569758135e-05, + "loss": 0.1142, + "step": 20840 + }, + { + "epoch": 4.012923143584466, + "grad_norm": 0.1510150283575058, + "learning_rate": 4.928174667653626e-05, + "loss": 0.0459, + "step": 20850 + }, + { + "epoch": 4.012977305963278, + "grad_norm": 5.219150543212891, + "learning_rate": 4.9278737655491165e-05, + "loss": 0.1215, + "step": 20860 + }, + { + "epoch": 4.013031468342089, + "grad_norm": 0.005715701263397932, + "learning_rate": 4.927572863444607e-05, + "loss": 0.008, + "step": 20870 + }, + { + "epoch": 4.0130856307209015, + "grad_norm": 0.015824109315872192, + "learning_rate": 4.927271961340098e-05, + "loss": 0.1827, + "step": 20880 + }, + { + "epoch": 4.013139793099713, + "grad_norm": 0.34636402130126953, + "learning_rate": 4.9269710592355884e-05, + "loss": 0.0646, + "step": 20890 + }, + { + "epoch": 4.013193955478525, + "grad_norm": 2.046964168548584, + "learning_rate": 4.926670157131079e-05, + "loss": 0.1669, + "step": 20900 + }, + { + "epoch": 4.013248117857336, + "grad_norm": 1.3688335418701172, + "learning_rate": 4.9263692550265697e-05, + "loss": 0.0205, + "step": 20910 + }, + { + "epoch": 4.0133022802361475, + "grad_norm": 0.09101493656635284, + "learning_rate": 4.926068352922061e-05, + "loss": 0.1178, + "step": 20920 + }, + { + "epoch": 4.01335644261496, + "grad_norm": 0.002914185868576169, + "learning_rate": 4.925767450817551e-05, + "loss": 0.0792, + "step": 20930 + }, + { + "epoch": 4.013410604993771, + "grad_norm": 0.3033078908920288, + "learning_rate": 4.925466548713042e-05, + "loss": 0.0755, + "step": 20940 + }, + { + "epoch": 4.013464767372583, + "grad_norm": 8.88992977142334, + "learning_rate": 4.925165646608533e-05, + "loss": 0.1275, + "step": 20950 + }, + { + "epoch": 4.0135189297513945, + "grad_norm": 0.3274344205856323, + "learning_rate": 4.924864744504023e-05, + "loss": 0.0129, + "step": 20960 + }, + { + "epoch": 4.013573092130207, + "grad_norm": 0.21457439661026, + "learning_rate": 4.924563842399514e-05, + "loss": 0.0921, + "step": 20970 + }, + { + "epoch": 4.013627254509018, + "grad_norm": 0.005853466689586639, + "learning_rate": 4.924262940295005e-05, + "loss": 0.029, + "step": 20980 + }, + { + "epoch": 4.01368141688783, + "grad_norm": 0.013005976565182209, + "learning_rate": 4.9239620381904954e-05, + "loss": 0.161, + "step": 20990 + }, + { + "epoch": 4.013735579266641, + "grad_norm": 0.18741463124752045, + "learning_rate": 4.923661136085986e-05, + "loss": 0.0778, + "step": 21000 + }, + { + "epoch": 4.013789741645453, + "grad_norm": 0.1269441843032837, + "learning_rate": 4.9233602339814766e-05, + "loss": 0.1282, + "step": 21010 + }, + { + "epoch": 4.013843904024265, + "grad_norm": 7.628439426422119, + "learning_rate": 4.923059331876967e-05, + "loss": 0.221, + "step": 21020 + }, + { + "epoch": 4.013898066403076, + "grad_norm": 3.682340621948242, + "learning_rate": 4.9227584297724585e-05, + "loss": 0.0872, + "step": 21030 + }, + { + "epoch": 4.013952228781888, + "grad_norm": 0.16413074731826782, + "learning_rate": 4.9224575276679485e-05, + "loss": 0.1775, + "step": 21040 + }, + { + "epoch": 4.0140063911607, + "grad_norm": 0.6787016987800598, + "learning_rate": 4.922156625563439e-05, + "loss": 0.0845, + "step": 21050 + }, + { + "epoch": 4.014060553539512, + "grad_norm": 0.47196441888809204, + "learning_rate": 4.9218557234589304e-05, + "loss": 0.1619, + "step": 21060 + }, + { + "epoch": 4.014114715918323, + "grad_norm": 0.4079420268535614, + "learning_rate": 
4.921554821354421e-05, + "loss": 0.0478, + "step": 21070 + }, + { + "epoch": 4.014168878297135, + "grad_norm": 4.026284694671631, + "learning_rate": 4.921253919249911e-05, + "loss": 0.0537, + "step": 21080 + }, + { + "epoch": 4.0142230406759465, + "grad_norm": 6.775891304016113, + "learning_rate": 4.920953017145402e-05, + "loss": 0.1636, + "step": 21090 + }, + { + "epoch": 4.014277203054758, + "grad_norm": 7.466149806976318, + "learning_rate": 4.920652115040893e-05, + "loss": 0.0684, + "step": 21100 + }, + { + "epoch": 4.01433136543357, + "grad_norm": 6.3954644203186035, + "learning_rate": 4.9203512129363836e-05, + "loss": 0.2522, + "step": 21110 + }, + { + "epoch": 4.014385527812381, + "grad_norm": 0.015296237543225288, + "learning_rate": 4.920050310831874e-05, + "loss": 0.1574, + "step": 21120 + }, + { + "epoch": 4.014439690191193, + "grad_norm": 0.5040444135665894, + "learning_rate": 4.919749408727365e-05, + "loss": 0.1776, + "step": 21130 + }, + { + "epoch": 4.014493852570005, + "grad_norm": 3.2161293029785156, + "learning_rate": 4.9194485066228554e-05, + "loss": 0.1066, + "step": 21140 + }, + { + "epoch": 4.014548014948817, + "grad_norm": 0.13269804418087006, + "learning_rate": 4.919147604518346e-05, + "loss": 0.0753, + "step": 21150 + }, + { + "epoch": 4.014602177327628, + "grad_norm": 0.023134218528866768, + "learning_rate": 4.918846702413837e-05, + "loss": 0.0827, + "step": 21160 + }, + { + "epoch": 4.01465633970644, + "grad_norm": 0.15080119669437408, + "learning_rate": 4.918545800309327e-05, + "loss": 0.0782, + "step": 21170 + }, + { + "epoch": 4.014710502085252, + "grad_norm": 0.004043443128466606, + "learning_rate": 4.9182448982048186e-05, + "loss": 0.0572, + "step": 21180 + }, + { + "epoch": 4.014764664464063, + "grad_norm": 0.20364971458911896, + "learning_rate": 4.9179439961003086e-05, + "loss": 0.346, + "step": 21190 + }, + { + "epoch": 4.014818826842875, + "grad_norm": 0.7727216482162476, + "learning_rate": 4.9176430939958e-05, + "loss": 0.0966, + "step": 21200 + }, + { + "epoch": 4.014872989221686, + "grad_norm": 0.14724190533161163, + "learning_rate": 4.9173421918912905e-05, + "loss": 0.2058, + "step": 21210 + }, + { + "epoch": 4.0149271516004985, + "grad_norm": 0.7948965430259705, + "learning_rate": 4.917041289786781e-05, + "loss": 0.0116, + "step": 21220 + }, + { + "epoch": 4.01498131397931, + "grad_norm": 0.048997655510902405, + "learning_rate": 4.916740387682272e-05, + "loss": 0.1363, + "step": 21230 + }, + { + "epoch": 4.015035476358122, + "grad_norm": 0.14751692116260529, + "learning_rate": 4.9164394855777624e-05, + "loss": 0.0795, + "step": 21240 + }, + { + "epoch": 4.015089638736933, + "grad_norm": 0.012117450125515461, + "learning_rate": 4.916138583473253e-05, + "loss": 0.1869, + "step": 21250 + }, + { + "epoch": 4.015143801115745, + "grad_norm": 0.23664957284927368, + "learning_rate": 4.9158376813687436e-05, + "loss": 0.1495, + "step": 21260 + }, + { + "epoch": 4.015197963494557, + "grad_norm": 0.05587494745850563, + "learning_rate": 4.915536779264234e-05, + "loss": 0.0573, + "step": 21270 + }, + { + "epoch": 4.015252125873368, + "grad_norm": 0.5605996251106262, + "learning_rate": 4.915235877159725e-05, + "loss": 0.0863, + "step": 21280 + }, + { + "epoch": 4.01530628825218, + "grad_norm": 0.9025561213493347, + "learning_rate": 4.914934975055216e-05, + "loss": 0.1864, + "step": 21290 + }, + { + "epoch": 4.0153604506309915, + "grad_norm": 0.3901229202747345, + "learning_rate": 4.914634072950706e-05, + "loss": 0.109, + "step": 21300 + }, + { + "epoch": 
4.015414613009804, + "grad_norm": 0.019383734092116356, + "learning_rate": 4.914333170846197e-05, + "loss": 0.0504, + "step": 21310 + }, + { + "epoch": 4.015468775388615, + "grad_norm": 13.325583457946777, + "learning_rate": 4.914032268741688e-05, + "loss": 0.1033, + "step": 21320 + }, + { + "epoch": 4.015522937767427, + "grad_norm": 0.9941532611846924, + "learning_rate": 4.913731366637179e-05, + "loss": 0.0151, + "step": 21330 + }, + { + "epoch": 4.015577100146238, + "grad_norm": 9.357792854309082, + "learning_rate": 4.913430464532669e-05, + "loss": 0.1718, + "step": 21340 + }, + { + "epoch": 4.0156312625250505, + "grad_norm": 0.1573568880558014, + "learning_rate": 4.91312956242816e-05, + "loss": 0.1381, + "step": 21350 + }, + { + "epoch": 4.015685424903862, + "grad_norm": 0.033529773354530334, + "learning_rate": 4.9128286603236506e-05, + "loss": 0.1491, + "step": 21360 + }, + { + "epoch": 4.015739587282673, + "grad_norm": 0.018396204337477684, + "learning_rate": 4.912527758219141e-05, + "loss": 0.1108, + "step": 21370 + }, + { + "epoch": 4.015793749661485, + "grad_norm": 0.12777338922023773, + "learning_rate": 4.912226856114632e-05, + "loss": 0.0877, + "step": 21380 + }, + { + "epoch": 4.015847912040297, + "grad_norm": 0.21939116716384888, + "learning_rate": 4.9119259540101225e-05, + "loss": 0.0802, + "step": 21390 + }, + { + "epoch": 4.015902074419109, + "grad_norm": 4.042576789855957, + "learning_rate": 4.911625051905613e-05, + "loss": 0.2204, + "step": 21400 + }, + { + "epoch": 4.01595623679792, + "grad_norm": 1.5049537420272827, + "learning_rate": 4.911324149801104e-05, + "loss": 0.1578, + "step": 21410 + }, + { + "epoch": 4.016010399176732, + "grad_norm": 0.01732448861002922, + "learning_rate": 4.9110232476965944e-05, + "loss": 0.248, + "step": 21420 + }, + { + "epoch": 4.0160645615555435, + "grad_norm": 1.2878692150115967, + "learning_rate": 4.910722345592085e-05, + "loss": 0.198, + "step": 21430 + }, + { + "epoch": 4.016118723934355, + "grad_norm": 0.01911083050072193, + "learning_rate": 4.910421443487576e-05, + "loss": 0.0752, + "step": 21440 + }, + { + "epoch": 4.016172886313167, + "grad_norm": 0.18372976779937744, + "learning_rate": 4.910120541383066e-05, + "loss": 0.13, + "step": 21450 + }, + { + "epoch": 4.016227048691978, + "grad_norm": 1.7545762062072754, + "learning_rate": 4.9098196392785576e-05, + "loss": 0.0427, + "step": 21460 + }, + { + "epoch": 4.01628121107079, + "grad_norm": 11.18275260925293, + "learning_rate": 4.909518737174048e-05, + "loss": 0.1588, + "step": 21470 + }, + { + "epoch": 4.016335373449602, + "grad_norm": 0.009969675913453102, + "learning_rate": 4.909217835069539e-05, + "loss": 0.097, + "step": 21480 + }, + { + "epoch": 4.016389535828414, + "grad_norm": 0.03603946045041084, + "learning_rate": 4.9089169329650294e-05, + "loss": 0.2155, + "step": 21490 + }, + { + "epoch": 4.016443698207225, + "grad_norm": 1.8307228088378906, + "learning_rate": 4.90861603086052e-05, + "loss": 0.1556, + "step": 21500 + }, + { + "epoch": 4.016497860586037, + "grad_norm": 0.11819703876972198, + "learning_rate": 4.908315128756011e-05, + "loss": 0.082, + "step": 21510 + }, + { + "epoch": 4.016552022964849, + "grad_norm": 1.736847996711731, + "learning_rate": 4.908014226651502e-05, + "loss": 0.0537, + "step": 21520 + }, + { + "epoch": 4.01660618534366, + "grad_norm": 1.1089755296707153, + "learning_rate": 4.907713324546992e-05, + "loss": 0.1223, + "step": 21530 + }, + { + "epoch": 4.016660347722472, + "grad_norm": 0.13680098950862885, + "learning_rate": 4.9074124224424826e-05, 
+ "loss": 0.1069, + "step": 21540 + }, + { + "epoch": 4.016714510101283, + "grad_norm": 0.040904298424720764, + "learning_rate": 4.907111520337974e-05, + "loss": 0.0646, + "step": 21550 + }, + { + "epoch": 4.0167686724800955, + "grad_norm": 0.46527424454689026, + "learning_rate": 4.906810618233464e-05, + "loss": 0.1089, + "step": 21560 + }, + { + "epoch": 4.016822834858907, + "grad_norm": 0.4080806076526642, + "learning_rate": 4.9065097161289545e-05, + "loss": 0.1635, + "step": 21570 + }, + { + "epoch": 4.016876997237719, + "grad_norm": 0.028091954067349434, + "learning_rate": 4.906208814024446e-05, + "loss": 0.136, + "step": 21580 + }, + { + "epoch": 4.01693115961653, + "grad_norm": 0.23661060631275177, + "learning_rate": 4.9059079119199364e-05, + "loss": 0.1097, + "step": 21590 + }, + { + "epoch": 4.016985321995342, + "grad_norm": 0.2407005876302719, + "learning_rate": 4.905607009815426e-05, + "loss": 0.072, + "step": 21600 + }, + { + "epoch": 4.017039484374154, + "grad_norm": 5.208584308624268, + "learning_rate": 4.9053061077109176e-05, + "loss": 0.1255, + "step": 21610 + }, + { + "epoch": 4.017093646752965, + "grad_norm": 0.17831429839134216, + "learning_rate": 4.905005205606408e-05, + "loss": 0.1465, + "step": 21620 + }, + { + "epoch": 4.017147809131777, + "grad_norm": 0.34978318214416504, + "learning_rate": 4.904704303501899e-05, + "loss": 0.0334, + "step": 21630 + }, + { + "epoch": 4.017201971510588, + "grad_norm": 0.9893481731414795, + "learning_rate": 4.9044034013973895e-05, + "loss": 0.1159, + "step": 21640 + }, + { + "epoch": 4.017256133889401, + "grad_norm": 2.6577346324920654, + "learning_rate": 4.90410249929288e-05, + "loss": 0.26, + "step": 21650 + }, + { + "epoch": 4.017310296268212, + "grad_norm": 0.44679999351501465, + "learning_rate": 4.903801597188371e-05, + "loss": 0.0126, + "step": 21660 + }, + { + "epoch": 4.017364458647024, + "grad_norm": 0.09567216038703918, + "learning_rate": 4.903500695083862e-05, + "loss": 0.0902, + "step": 21670 + }, + { + "epoch": 4.017418621025835, + "grad_norm": 8.207633972167969, + "learning_rate": 4.903199792979352e-05, + "loss": 0.1608, + "step": 21680 + }, + { + "epoch": 4.0174727834046475, + "grad_norm": 0.006763566751033068, + "learning_rate": 4.902898890874843e-05, + "loss": 0.0731, + "step": 21690 + }, + { + "epoch": 4.017526945783459, + "grad_norm": 13.2515230178833, + "learning_rate": 4.902597988770334e-05, + "loss": 0.0853, + "step": 21700 + }, + { + "epoch": 4.01758110816227, + "grad_norm": 0.3281399607658386, + "learning_rate": 4.902297086665824e-05, + "loss": 0.0799, + "step": 21710 + }, + { + "epoch": 4.017635270541082, + "grad_norm": 0.23662588000297546, + "learning_rate": 4.901996184561315e-05, + "loss": 0.022, + "step": 21720 + }, + { + "epoch": 4.0176894329198936, + "grad_norm": 0.4758821427822113, + "learning_rate": 4.901695282456806e-05, + "loss": 0.098, + "step": 21730 + }, + { + "epoch": 4.017743595298706, + "grad_norm": 0.23756511509418488, + "learning_rate": 4.9013943803522965e-05, + "loss": 0.0823, + "step": 21740 + }, + { + "epoch": 4.017797757677517, + "grad_norm": 2.6645593643188477, + "learning_rate": 4.901093478247787e-05, + "loss": 0.0857, + "step": 21750 + }, + { + "epoch": 4.017851920056329, + "grad_norm": 0.7737169861793518, + "learning_rate": 4.900792576143278e-05, + "loss": 0.1371, + "step": 21760 + }, + { + "epoch": 4.0179060824351405, + "grad_norm": 2.660740375518799, + "learning_rate": 4.9004916740387684e-05, + "loss": 0.0464, + "step": 21770 + }, + { + "epoch": 4.017960244813953, + "grad_norm": 
0.006789573933929205, + "learning_rate": 4.90019077193426e-05, + "loss": 0.0087, + "step": 21780 + }, + { + "epoch": 4.018014407192764, + "grad_norm": 0.042319707572460175, + "learning_rate": 4.8998898698297496e-05, + "loss": 0.3223, + "step": 21790 + }, + { + "epoch": 4.018068569571575, + "grad_norm": 0.15767371654510498, + "learning_rate": 4.89958896772524e-05, + "loss": 0.1227, + "step": 21800 + }, + { + "epoch": 4.018122731950387, + "grad_norm": 0.25718769431114197, + "learning_rate": 4.8992880656207315e-05, + "loss": 0.0117, + "step": 21810 + }, + { + "epoch": 4.018176894329199, + "grad_norm": 0.018889695405960083, + "learning_rate": 4.898987163516222e-05, + "loss": 0.0404, + "step": 21820 + }, + { + "epoch": 4.018231056708011, + "grad_norm": 0.25821447372436523, + "learning_rate": 4.898686261411712e-05, + "loss": 0.0602, + "step": 21830 + }, + { + "epoch": 4.018285219086822, + "grad_norm": 0.18688346445560455, + "learning_rate": 4.8983853593072034e-05, + "loss": 0.0773, + "step": 21840 + }, + { + "epoch": 4.018339381465634, + "grad_norm": 0.13267897069454193, + "learning_rate": 4.898084457202694e-05, + "loss": 0.0649, + "step": 21850 + }, + { + "epoch": 4.018393543844446, + "grad_norm": 0.03527243807911873, + "learning_rate": 4.897783555098184e-05, + "loss": 0.1004, + "step": 21860 + }, + { + "epoch": 4.018447706223258, + "grad_norm": 0.27236276865005493, + "learning_rate": 4.897482652993675e-05, + "loss": 0.0812, + "step": 21870 + }, + { + "epoch": 4.018501868602069, + "grad_norm": 0.1143566444516182, + "learning_rate": 4.897181750889166e-05, + "loss": 0.1976, + "step": 21880 + }, + { + "epoch": 4.01855603098088, + "grad_norm": 0.12065441161394119, + "learning_rate": 4.8968808487846566e-05, + "loss": 0.0569, + "step": 21890 + }, + { + "epoch": 4.0186101933596925, + "grad_norm": 0.06908392906188965, + "learning_rate": 4.896579946680147e-05, + "loss": 0.0724, + "step": 21900 + }, + { + "epoch": 4.018664355738504, + "grad_norm": 0.3146020770072937, + "learning_rate": 4.896279044575638e-05, + "loss": 0.1299, + "step": 21910 + }, + { + "epoch": 4.018718518117316, + "grad_norm": 0.1055571585893631, + "learning_rate": 4.8959781424711284e-05, + "loss": 0.1087, + "step": 21920 + }, + { + "epoch": 4.018772680496127, + "grad_norm": 0.29834744334220886, + "learning_rate": 4.89567724036662e-05, + "loss": 0.1324, + "step": 21930 + }, + { + "epoch": 4.018826842874939, + "grad_norm": 0.31864356994628906, + "learning_rate": 4.89537633826211e-05, + "loss": 0.0306, + "step": 21940 + }, + { + "epoch": 4.018881005253751, + "grad_norm": 0.12343823164701462, + "learning_rate": 4.8950754361576e-05, + "loss": 0.1048, + "step": 21950 + }, + { + "epoch": 4.018935167632563, + "grad_norm": 0.1785653680562973, + "learning_rate": 4.8947745340530916e-05, + "loss": 0.0867, + "step": 21960 + }, + { + "epoch": 4.018989330011374, + "grad_norm": 0.21253785490989685, + "learning_rate": 4.894473631948582e-05, + "loss": 0.2523, + "step": 21970 + }, + { + "epoch": 4.019043492390185, + "grad_norm": 0.4470615088939667, + "learning_rate": 4.894172729844073e-05, + "loss": 0.0714, + "step": 21980 + }, + { + "epoch": 4.019097654768998, + "grad_norm": 0.12981542944908142, + "learning_rate": 4.8938718277395635e-05, + "loss": 0.0914, + "step": 21990 + }, + { + "epoch": 4.019151817147809, + "grad_norm": 0.01714794710278511, + "learning_rate": 4.893570925635054e-05, + "loss": 0.0612, + "step": 22000 + }, + { + "epoch": 4.019205979526621, + "grad_norm": 1.750002145767212, + "learning_rate": 4.893270023530545e-05, + "loss": 0.1294, + 
"step": 22010 + }, + { + "epoch": 4.019260141905432, + "grad_norm": 0.8858515024185181, + "learning_rate": 4.8929691214260354e-05, + "loss": 0.0388, + "step": 22020 + }, + { + "epoch": 4.0193143042842445, + "grad_norm": 0.6906415224075317, + "learning_rate": 4.892668219321526e-05, + "loss": 0.0376, + "step": 22030 + }, + { + "epoch": 4.019368466663056, + "grad_norm": 7.736617565155029, + "learning_rate": 4.892367317217017e-05, + "loss": 0.0905, + "step": 22040 + }, + { + "epoch": 4.019422629041867, + "grad_norm": 0.2186422049999237, + "learning_rate": 4.892066415112507e-05, + "loss": 0.1899, + "step": 22050 + }, + { + "epoch": 4.019476791420679, + "grad_norm": 7.022078514099121, + "learning_rate": 4.891765513007998e-05, + "loss": 0.1809, + "step": 22060 + }, + { + "epoch": 4.0195309537994905, + "grad_norm": 0.37548863887786865, + "learning_rate": 4.891464610903489e-05, + "loss": 0.0279, + "step": 22070 + }, + { + "epoch": 4.019585116178303, + "grad_norm": 0.5639292001724243, + "learning_rate": 4.89116370879898e-05, + "loss": 0.0632, + "step": 22080 + }, + { + "epoch": 4.019639278557114, + "grad_norm": 0.48432859778404236, + "learning_rate": 4.89086280669447e-05, + "loss": 0.1174, + "step": 22090 + }, + { + "epoch": 4.019693440935926, + "grad_norm": 0.06022549793124199, + "learning_rate": 4.890561904589961e-05, + "loss": 0.0708, + "step": 22100 + }, + { + "epoch": 4.0197476033147375, + "grad_norm": 0.01200170535594225, + "learning_rate": 4.890261002485452e-05, + "loss": 0.0732, + "step": 22110 + }, + { + "epoch": 4.01980176569355, + "grad_norm": 10.299477577209473, + "learning_rate": 4.8899601003809424e-05, + "loss": 0.0854, + "step": 22120 + }, + { + "epoch": 4.019855928072361, + "grad_norm": 0.019068151712417603, + "learning_rate": 4.889659198276433e-05, + "loss": 0.1555, + "step": 22130 + }, + { + "epoch": 4.019910090451172, + "grad_norm": 0.5740295648574829, + "learning_rate": 4.8893582961719236e-05, + "loss": 0.018, + "step": 22140 + }, + { + "epoch": 4.019964252829984, + "grad_norm": 5.503551959991455, + "learning_rate": 4.889057394067414e-05, + "loss": 0.1317, + "step": 22150 + }, + { + "epoch": 4.020018415208796, + "grad_norm": 0.004557761363685131, + "learning_rate": 4.888756491962905e-05, + "loss": 0.0601, + "step": 22160 + }, + { + "epoch": 4.020072577587608, + "grad_norm": 2.7110471725463867, + "learning_rate": 4.8884555898583955e-05, + "loss": 0.3007, + "step": 22170 + }, + { + "epoch": 4.020126739966419, + "grad_norm": 0.39913371205329895, + "learning_rate": 4.888154687753886e-05, + "loss": 0.0603, + "step": 22180 + }, + { + "epoch": 4.020180902345231, + "grad_norm": 0.5915253758430481, + "learning_rate": 4.8878537856493774e-05, + "loss": 0.1135, + "step": 22190 + }, + { + "epoch": 4.020235064724043, + "grad_norm": 0.02582080475986004, + "learning_rate": 4.8875528835448674e-05, + "loss": 0.1448, + "step": 22200 + }, + { + "epoch": 4.020289227102855, + "grad_norm": 0.039457470178604126, + "learning_rate": 4.887251981440359e-05, + "loss": 0.1344, + "step": 22210 + }, + { + "epoch": 4.020343389481666, + "grad_norm": 4.864310264587402, + "learning_rate": 4.886951079335849e-05, + "loss": 0.1303, + "step": 22220 + }, + { + "epoch": 4.020397551860477, + "grad_norm": 3.495102882385254, + "learning_rate": 4.88665017723134e-05, + "loss": 0.2073, + "step": 22230 + }, + { + "epoch": 4.0204517142392895, + "grad_norm": 0.14408384263515472, + "learning_rate": 4.8863492751268306e-05, + "loss": 0.1017, + "step": 22240 + }, + { + "epoch": 4.020505876618101, + "grad_norm": 0.5452383756637573, + 
"learning_rate": 4.886048373022321e-05, + "loss": 0.0668, + "step": 22250 + }, + { + "epoch": 4.020560038996913, + "grad_norm": 0.03062097169458866, + "learning_rate": 4.885747470917812e-05, + "loss": 0.1873, + "step": 22260 + }, + { + "epoch": 4.020614201375724, + "grad_norm": 0.06789854913949966, + "learning_rate": 4.8854465688133024e-05, + "loss": 0.0803, + "step": 22270 + }, + { + "epoch": 4.020668363754536, + "grad_norm": 6.473040580749512, + "learning_rate": 4.885145666708793e-05, + "loss": 0.1353, + "step": 22280 + }, + { + "epoch": 4.020722526133348, + "grad_norm": 0.053658537566661835, + "learning_rate": 4.884844764604284e-05, + "loss": 0.105, + "step": 22290 + }, + { + "epoch": 4.02077668851216, + "grad_norm": 2.4618897438049316, + "learning_rate": 4.884543862499775e-05, + "loss": 0.1223, + "step": 22300 + }, + { + "epoch": 4.020830850890971, + "grad_norm": 0.9409419298171997, + "learning_rate": 4.884242960395265e-05, + "loss": 0.0253, + "step": 22310 + }, + { + "epoch": 4.020885013269782, + "grad_norm": 0.2160395383834839, + "learning_rate": 4.8839420582907556e-05, + "loss": 0.0104, + "step": 22320 + }, + { + "epoch": 4.020939175648595, + "grad_norm": 0.017867572605609894, + "learning_rate": 4.883641156186247e-05, + "loss": 0.066, + "step": 22330 + }, + { + "epoch": 4.020993338027406, + "grad_norm": 0.28589797019958496, + "learning_rate": 4.8833402540817375e-05, + "loss": 0.0607, + "step": 22340 + }, + { + "epoch": 4.021047500406218, + "grad_norm": 0.19391418993473053, + "learning_rate": 4.8830393519772275e-05, + "loss": 0.0455, + "step": 22350 + }, + { + "epoch": 4.021101662785029, + "grad_norm": 0.11820780485868454, + "learning_rate": 4.882738449872719e-05, + "loss": 0.2249, + "step": 22360 + }, + { + "epoch": 4.0211558251638415, + "grad_norm": 0.5589813590049744, + "learning_rate": 4.8824375477682094e-05, + "loss": 0.1166, + "step": 22370 + }, + { + "epoch": 4.021209987542653, + "grad_norm": 0.10497388243675232, + "learning_rate": 4.8821366456637e-05, + "loss": 0.0986, + "step": 22380 + }, + { + "epoch": 4.021264149921465, + "grad_norm": 0.029140399768948555, + "learning_rate": 4.8818357435591907e-05, + "loss": 0.0486, + "step": 22390 + }, + { + "epoch": 4.021318312300276, + "grad_norm": 0.20632213354110718, + "learning_rate": 4.881534841454681e-05, + "loss": 0.1302, + "step": 22400 + }, + { + "epoch": 4.0213724746790875, + "grad_norm": 2.858090877532959, + "learning_rate": 4.881233939350172e-05, + "loss": 0.1235, + "step": 22410 + }, + { + "epoch": 4.0214266370579, + "grad_norm": 3.4513728618621826, + "learning_rate": 4.880933037245663e-05, + "loss": 0.097, + "step": 22420 + }, + { + "epoch": 4.021480799436711, + "grad_norm": 0.07569991797208786, + "learning_rate": 4.880632135141153e-05, + "loss": 0.0737, + "step": 22430 + }, + { + "epoch": 4.021534961815523, + "grad_norm": 0.015084769576787949, + "learning_rate": 4.880331233036644e-05, + "loss": 0.0604, + "step": 22440 + }, + { + "epoch": 4.0215891241943345, + "grad_norm": 0.4937070310115814, + "learning_rate": 4.880030330932135e-05, + "loss": 0.0193, + "step": 22450 + }, + { + "epoch": 4.021643286573147, + "grad_norm": 0.02155827358365059, + "learning_rate": 4.879729428827625e-05, + "loss": 0.0762, + "step": 22460 + }, + { + "epoch": 4.021697448951958, + "grad_norm": 0.14123331010341644, + "learning_rate": 4.8794285267231163e-05, + "loss": 0.122, + "step": 22470 + }, + { + "epoch": 4.02175161133077, + "grad_norm": 0.004313413053750992, + "learning_rate": 4.879127624618607e-05, + "loss": 0.1071, + "step": 22480 + }, + { + 
"epoch": 4.021805773709581, + "grad_norm": 0.3619861602783203, + "learning_rate": 4.8788267225140976e-05, + "loss": 0.0348, + "step": 22490 + }, + { + "epoch": 4.021859936088393, + "grad_norm": 0.3255338668823242, + "learning_rate": 4.878525820409588e-05, + "loss": 0.1618, + "step": 22500 + }, + { + "epoch": 4.021914098467205, + "grad_norm": 22.269550323486328, + "learning_rate": 4.878224918305079e-05, + "loss": 0.1581, + "step": 22510 + }, + { + "epoch": 4.021968260846016, + "grad_norm": 0.05274924263358116, + "learning_rate": 4.8779240162005695e-05, + "loss": 0.0846, + "step": 22520 + }, + { + "epoch": 4.022022423224828, + "grad_norm": 0.24644149839878082, + "learning_rate": 4.87762311409606e-05, + "loss": 0.0996, + "step": 22530 + }, + { + "epoch": 4.02207658560364, + "grad_norm": 3.1648788452148438, + "learning_rate": 4.877322211991551e-05, + "loss": 0.1079, + "step": 22540 + }, + { + "epoch": 4.022130747982452, + "grad_norm": 0.5905427932739258, + "learning_rate": 4.8770213098870414e-05, + "loss": 0.0326, + "step": 22550 + }, + { + "epoch": 4.022184910361263, + "grad_norm": 0.015308168716728687, + "learning_rate": 4.876720407782533e-05, + "loss": 0.2147, + "step": 22560 + }, + { + "epoch": 4.022239072740074, + "grad_norm": 0.043390024453401566, + "learning_rate": 4.876419505678023e-05, + "loss": 0.0714, + "step": 22570 + }, + { + "epoch": 4.0222932351188865, + "grad_norm": 79.53964233398438, + "learning_rate": 4.876118603573513e-05, + "loss": 0.1245, + "step": 22580 + }, + { + "epoch": 4.022347397497698, + "grad_norm": 0.023645443841814995, + "learning_rate": 4.8758177014690046e-05, + "loss": 0.0117, + "step": 22590 + }, + { + "epoch": 4.02240155987651, + "grad_norm": 0.0016468754038214684, + "learning_rate": 4.875516799364495e-05, + "loss": 0.2781, + "step": 22600 + }, + { + "epoch": 4.022455722255321, + "grad_norm": 1.2154135704040527, + "learning_rate": 4.875215897259985e-05, + "loss": 0.1347, + "step": 22610 + }, + { + "epoch": 4.022509884634133, + "grad_norm": 0.40332961082458496, + "learning_rate": 4.8749149951554764e-05, + "loss": 0.0821, + "step": 22620 + }, + { + "epoch": 4.022564047012945, + "grad_norm": 0.4294419586658478, + "learning_rate": 4.874614093050967e-05, + "loss": 0.0722, + "step": 22630 + }, + { + "epoch": 4.022618209391757, + "grad_norm": 6.433352947235107, + "learning_rate": 4.874313190946458e-05, + "loss": 0.0895, + "step": 22640 + }, + { + "epoch": 4.022672371770568, + "grad_norm": 4.478430271148682, + "learning_rate": 4.874012288841948e-05, + "loss": 0.2444, + "step": 22650 + }, + { + "epoch": 4.022726534149379, + "grad_norm": 0.15660782158374786, + "learning_rate": 4.873711386737439e-05, + "loss": 0.2279, + "step": 22660 + }, + { + "epoch": 4.022780696528192, + "grad_norm": 0.6104583144187927, + "learning_rate": 4.8734104846329296e-05, + "loss": 0.0689, + "step": 22670 + }, + { + "epoch": 4.022834858907003, + "grad_norm": 1.889777660369873, + "learning_rate": 4.873109582528421e-05, + "loss": 0.1718, + "step": 22680 + }, + { + "epoch": 4.022889021285815, + "grad_norm": 0.32070276141166687, + "learning_rate": 4.872808680423911e-05, + "loss": 0.0403, + "step": 22690 + }, + { + "epoch": 4.022943183664626, + "grad_norm": 0.012801401317119598, + "learning_rate": 4.8725077783194015e-05, + "loss": 0.1355, + "step": 22700 + }, + { + "epoch": 4.0229973460434385, + "grad_norm": 0.008033066987991333, + "learning_rate": 4.872206876214893e-05, + "loss": 0.0426, + "step": 22710 + }, + { + "epoch": 4.02305150842225, + "grad_norm": 0.009862185455858707, + "learning_rate": 
4.8719059741103834e-05, + "loss": 0.2023, + "step": 22720 + }, + { + "epoch": 4.023105670801062, + "grad_norm": 0.198494553565979, + "learning_rate": 4.871605072005874e-05, + "loss": 0.0935, + "step": 22730 + }, + { + "epoch": 4.023159833179873, + "grad_norm": 0.23313017189502716, + "learning_rate": 4.8713041699013646e-05, + "loss": 0.0669, + "step": 22740 + }, + { + "epoch": 4.0232139955586845, + "grad_norm": 0.013955236412584782, + "learning_rate": 4.871003267796855e-05, + "loss": 0.0968, + "step": 22750 + }, + { + "epoch": 4.023268157937497, + "grad_norm": 0.19470463693141937, + "learning_rate": 4.870702365692346e-05, + "loss": 0.0406, + "step": 22760 + }, + { + "epoch": 4.023322320316308, + "grad_norm": 28.275535583496094, + "learning_rate": 4.8704014635878365e-05, + "loss": 0.3303, + "step": 22770 + }, + { + "epoch": 4.02337648269512, + "grad_norm": 0.04342326894402504, + "learning_rate": 4.870100561483327e-05, + "loss": 0.0291, + "step": 22780 + }, + { + "epoch": 4.0234306450739314, + "grad_norm": 0.03497331589460373, + "learning_rate": 4.869799659378818e-05, + "loss": 0.1151, + "step": 22790 + }, + { + "epoch": 4.023484807452744, + "grad_norm": 2.065477132797241, + "learning_rate": 4.8694987572743084e-05, + "loss": 0.1543, + "step": 22800 + }, + { + "epoch": 4.023538969831555, + "grad_norm": 0.013909186236560345, + "learning_rate": 4.869197855169799e-05, + "loss": 0.0857, + "step": 22810 + }, + { + "epoch": 4.023593132210367, + "grad_norm": 6.793869972229004, + "learning_rate": 4.8688969530652903e-05, + "loss": 0.3612, + "step": 22820 + }, + { + "epoch": 4.023647294589178, + "grad_norm": 0.21175992488861084, + "learning_rate": 4.868596050960781e-05, + "loss": 0.1112, + "step": 22830 + }, + { + "epoch": 4.02370145696799, + "grad_norm": 0.06721527874469757, + "learning_rate": 4.868295148856271e-05, + "loss": 0.2054, + "step": 22840 + }, + { + "epoch": 4.023755619346802, + "grad_norm": 0.03923126682639122, + "learning_rate": 4.867994246751762e-05, + "loss": 0.0149, + "step": 22850 + }, + { + "epoch": 4.023809781725613, + "grad_norm": 0.23451553285121918, + "learning_rate": 4.867693344647253e-05, + "loss": 0.0371, + "step": 22860 + }, + { + "epoch": 4.023863944104425, + "grad_norm": 0.30591922998428345, + "learning_rate": 4.8673924425427435e-05, + "loss": 0.0093, + "step": 22870 + }, + { + "epoch": 4.023918106483237, + "grad_norm": 0.549356997013092, + "learning_rate": 4.867091540438234e-05, + "loss": 0.0812, + "step": 22880 + }, + { + "epoch": 4.023972268862049, + "grad_norm": 0.022375542670488358, + "learning_rate": 4.866790638333725e-05, + "loss": 0.0782, + "step": 22890 + }, + { + "epoch": 4.02402643124086, + "grad_norm": 0.11172296851873398, + "learning_rate": 4.8664897362292154e-05, + "loss": 0.0106, + "step": 22900 + }, + { + "epoch": 4.024080593619672, + "grad_norm": 0.1890357881784439, + "learning_rate": 4.866188834124706e-05, + "loss": 0.0077, + "step": 22910 + }, + { + "epoch": 4.0241347559984835, + "grad_norm": 6.5473151206970215, + "learning_rate": 4.8658879320201966e-05, + "loss": 0.1709, + "step": 22920 + }, + { + "epoch": 4.024188918377295, + "grad_norm": 0.35083773732185364, + "learning_rate": 4.865587029915687e-05, + "loss": 0.207, + "step": 22930 + }, + { + "epoch": 4.024243080756107, + "grad_norm": 0.07285748422145844, + "learning_rate": 4.8652861278111785e-05, + "loss": 0.2109, + "step": 22940 + }, + { + "epoch": 4.024297243134918, + "grad_norm": 0.5260762572288513, + "learning_rate": 4.8649852257066685e-05, + "loss": 0.1342, + "step": 22950 + }, + { + "epoch": 
4.02435140551373, + "grad_norm": 0.07195799052715302, + "learning_rate": 4.864684323602159e-05, + "loss": 0.1179, + "step": 22960 + }, + { + "epoch": 4.024405567892542, + "grad_norm": 0.08950089663267136, + "learning_rate": 4.8643834214976504e-05, + "loss": 0.0382, + "step": 22970 + }, + { + "epoch": 4.024459730271354, + "grad_norm": 0.011024805717170238, + "learning_rate": 4.864082519393141e-05, + "loss": 0.0852, + "step": 22980 + }, + { + "epoch": 4.024513892650165, + "grad_norm": 0.1610771119594574, + "learning_rate": 4.863781617288632e-05, + "loss": 0.108, + "step": 22990 + }, + { + "epoch": 4.024568055028977, + "grad_norm": 0.7711285352706909, + "learning_rate": 4.863480715184122e-05, + "loss": 0.1484, + "step": 23000 + }, + { + "epoch": 4.024622217407789, + "grad_norm": 4.497324466705322, + "learning_rate": 4.863179813079613e-05, + "loss": 0.2335, + "step": 23010 + }, + { + "epoch": 4.0246763797866, + "grad_norm": 0.2517387568950653, + "learning_rate": 4.8628789109751036e-05, + "loss": 0.1193, + "step": 23020 + }, + { + "epoch": 4.024730542165412, + "grad_norm": 0.005432984791696072, + "learning_rate": 4.862578008870594e-05, + "loss": 0.146, + "step": 23030 + }, + { + "epoch": 4.024784704544223, + "grad_norm": 0.002979324432089925, + "learning_rate": 4.862277106766085e-05, + "loss": 0.0762, + "step": 23040 + }, + { + "epoch": 4.0248388669230355, + "grad_norm": 0.01589818485081196, + "learning_rate": 4.8619762046615755e-05, + "loss": 0.1759, + "step": 23050 + }, + { + "epoch": 4.024893029301847, + "grad_norm": 1.6227318048477173, + "learning_rate": 4.861675302557066e-05, + "loss": 0.3382, + "step": 23060 + }, + { + "epoch": 4.024947191680659, + "grad_norm": 0.21389798820018768, + "learning_rate": 4.861374400452557e-05, + "loss": 0.4476, + "step": 23070 + }, + { + "epoch": 4.02500135405947, + "grad_norm": 0.6682267189025879, + "learning_rate": 4.861073498348048e-05, + "loss": 0.061, + "step": 23080 + }, + { + "epoch": 4.02500135405947, + "eval_accuracy": 0.8001306335728282, + "eval_loss": 0.5792274475097656, + "eval_runtime": 117.7513, + "eval_samples_per_second": 26.004, + "eval_steps_per_second": 3.253, + "step": 23080 + }, + { + "epoch": 5.000054162378811, + "grad_norm": 0.10053475946187973, + "learning_rate": 4.8607725962435386e-05, + "loss": 0.0147, + "step": 23090 + }, + { + "epoch": 5.0001083247576235, + "grad_norm": 0.03318730741739273, + "learning_rate": 4.8604716941390286e-05, + "loss": 0.1048, + "step": 23100 + }, + { + "epoch": 5.000162487136435, + "grad_norm": 4.706642150878906, + "learning_rate": 4.86017079203452e-05, + "loss": 0.1701, + "step": 23110 + }, + { + "epoch": 5.000216649515247, + "grad_norm": 0.2578330338001251, + "learning_rate": 4.8598698899300105e-05, + "loss": 0.1175, + "step": 23120 + }, + { + "epoch": 5.000270811894058, + "grad_norm": 5.4305100440979, + "learning_rate": 4.859568987825501e-05, + "loss": 0.1476, + "step": 23130 + }, + { + "epoch": 5.00032497427287, + "grad_norm": 14.04636001586914, + "learning_rate": 4.859268085720992e-05, + "loss": 0.1394, + "step": 23140 + }, + { + "epoch": 5.000379136651682, + "grad_norm": 0.30776703357696533, + "learning_rate": 4.8589671836164824e-05, + "loss": 0.0347, + "step": 23150 + }, + { + "epoch": 5.000433299030494, + "grad_norm": 0.038998011499643326, + "learning_rate": 4.858666281511973e-05, + "loss": 0.098, + "step": 23160 + }, + { + "epoch": 5.000487461409305, + "grad_norm": 0.46544063091278076, + "learning_rate": 4.858365379407464e-05, + "loss": 0.0316, + "step": 23170 + }, + { + "epoch": 5.000541623788116, 
+ "grad_norm": 0.023437367752194405, + "learning_rate": 4.858064477302954e-05, + "loss": 0.046, + "step": 23180 + }, + { + "epoch": 5.000595786166929, + "grad_norm": 0.02137090638279915, + "learning_rate": 4.857763575198445e-05, + "loss": 0.1072, + "step": 23190 + }, + { + "epoch": 5.00064994854574, + "grad_norm": 4.397226810455322, + "learning_rate": 4.857462673093936e-05, + "loss": 0.1164, + "step": 23200 + }, + { + "epoch": 5.000704110924552, + "grad_norm": 0.35850590467453003, + "learning_rate": 4.857161770989426e-05, + "loss": 0.1166, + "step": 23210 + }, + { + "epoch": 5.000758273303363, + "grad_norm": 3.0780534744262695, + "learning_rate": 4.856860868884917e-05, + "loss": 0.1232, + "step": 23220 + }, + { + "epoch": 5.0008124356821755, + "grad_norm": 2.670628309249878, + "learning_rate": 4.856559966780408e-05, + "loss": 0.0695, + "step": 23230 + }, + { + "epoch": 5.000866598060987, + "grad_norm": 0.5444124341011047, + "learning_rate": 4.856259064675899e-05, + "loss": 0.1845, + "step": 23240 + }, + { + "epoch": 5.000920760439799, + "grad_norm": 0.4598838984966278, + "learning_rate": 4.8559581625713894e-05, + "loss": 0.1668, + "step": 23250 + }, + { + "epoch": 5.00097492281861, + "grad_norm": 0.06909613311290741, + "learning_rate": 4.85565726046688e-05, + "loss": 0.1015, + "step": 23260 + }, + { + "epoch": 5.0010290851974215, + "grad_norm": 0.0928112268447876, + "learning_rate": 4.8553563583623706e-05, + "loss": 0.0782, + "step": 23270 + }, + { + "epoch": 5.001083247576234, + "grad_norm": 0.43567878007888794, + "learning_rate": 4.855055456257861e-05, + "loss": 0.0874, + "step": 23280 + }, + { + "epoch": 5.001137409955045, + "grad_norm": 2.1101479530334473, + "learning_rate": 4.854754554153352e-05, + "loss": 0.0589, + "step": 23290 + }, + { + "epoch": 5.001191572333857, + "grad_norm": 0.5080452561378479, + "learning_rate": 4.8544536520488425e-05, + "loss": 0.0728, + "step": 23300 + }, + { + "epoch": 5.001245734712668, + "grad_norm": 0.016088400036096573, + "learning_rate": 4.854152749944333e-05, + "loss": 0.0136, + "step": 23310 + }, + { + "epoch": 5.001299897091481, + "grad_norm": 0.13387644290924072, + "learning_rate": 4.8538518478398244e-05, + "loss": 0.1076, + "step": 23320 + }, + { + "epoch": 5.001354059470292, + "grad_norm": 0.013219181448221207, + "learning_rate": 4.8535509457353144e-05, + "loss": 0.0037, + "step": 23330 + }, + { + "epoch": 5.001408221849104, + "grad_norm": 0.02018587850034237, + "learning_rate": 4.853250043630806e-05, + "loss": 0.1727, + "step": 23340 + }, + { + "epoch": 5.001462384227915, + "grad_norm": 5.111493110656738, + "learning_rate": 4.852949141526296e-05, + "loss": 0.1159, + "step": 23350 + }, + { + "epoch": 5.001516546606727, + "grad_norm": 0.1939477026462555, + "learning_rate": 4.852648239421786e-05, + "loss": 0.0502, + "step": 23360 + }, + { + "epoch": 5.001570708985539, + "grad_norm": 3.594850778579712, + "learning_rate": 4.8523473373172776e-05, + "loss": 0.1628, + "step": 23370 + }, + { + "epoch": 5.00162487136435, + "grad_norm": 6.701674461364746, + "learning_rate": 4.852046435212768e-05, + "loss": 0.2241, + "step": 23380 + }, + { + "epoch": 5.001679033743162, + "grad_norm": 0.8746975660324097, + "learning_rate": 4.851745533108259e-05, + "loss": 0.0888, + "step": 23390 + }, + { + "epoch": 5.0017331961219735, + "grad_norm": 4.2296648025512695, + "learning_rate": 4.8514446310037494e-05, + "loss": 0.0787, + "step": 23400 + }, + { + "epoch": 5.001787358500786, + "grad_norm": 0.028977608308196068, + "learning_rate": 4.85114372889924e-05, + "loss": 
0.1868, + "step": 23410 + }, + { + "epoch": 5.001841520879597, + "grad_norm": 0.2387576699256897, + "learning_rate": 4.850842826794731e-05, + "loss": 0.012, + "step": 23420 + }, + { + "epoch": 5.001895683258408, + "grad_norm": 0.05468031391501427, + "learning_rate": 4.850541924690222e-05, + "loss": 0.0904, + "step": 23430 + }, + { + "epoch": 5.0019498456372204, + "grad_norm": 5.511716842651367, + "learning_rate": 4.850241022585712e-05, + "loss": 0.2238, + "step": 23440 + }, + { + "epoch": 5.002004008016032, + "grad_norm": 0.1594473421573639, + "learning_rate": 4.8499401204812026e-05, + "loss": 0.1994, + "step": 23450 + }, + { + "epoch": 5.002058170394844, + "grad_norm": 8.772860527038574, + "learning_rate": 4.849639218376694e-05, + "loss": 0.1118, + "step": 23460 + }, + { + "epoch": 5.002112332773655, + "grad_norm": 0.05568993464112282, + "learning_rate": 4.8493383162721845e-05, + "loss": 0.0539, + "step": 23470 + }, + { + "epoch": 5.002166495152467, + "grad_norm": 0.05566523224115372, + "learning_rate": 4.8490374141676745e-05, + "loss": 0.0457, + "step": 23480 + }, + { + "epoch": 5.002220657531279, + "grad_norm": 0.251263290643692, + "learning_rate": 4.848736512063166e-05, + "loss": 0.1307, + "step": 23490 + }, + { + "epoch": 5.002274819910091, + "grad_norm": 0.16484303772449493, + "learning_rate": 4.8484356099586564e-05, + "loss": 0.1121, + "step": 23500 + }, + { + "epoch": 5.002328982288902, + "grad_norm": 0.10979129374027252, + "learning_rate": 4.848134707854147e-05, + "loss": 0.0287, + "step": 23510 + }, + { + "epoch": 5.002383144667713, + "grad_norm": 0.004061123356223106, + "learning_rate": 4.8478338057496377e-05, + "loss": 0.0055, + "step": 23520 + }, + { + "epoch": 5.002437307046526, + "grad_norm": 0.04279875010251999, + "learning_rate": 4.847532903645128e-05, + "loss": 0.0075, + "step": 23530 + }, + { + "epoch": 5.002491469425337, + "grad_norm": 0.007183169946074486, + "learning_rate": 4.847232001540619e-05, + "loss": 0.0074, + "step": 23540 + }, + { + "epoch": 5.002545631804149, + "grad_norm": 10.648181915283203, + "learning_rate": 4.8469310994361095e-05, + "loss": 0.2217, + "step": 23550 + }, + { + "epoch": 5.00259979418296, + "grad_norm": 8.18816089630127, + "learning_rate": 4.8466301973316e-05, + "loss": 0.144, + "step": 23560 + }, + { + "epoch": 5.0026539565617725, + "grad_norm": 0.14592042565345764, + "learning_rate": 4.846329295227091e-05, + "loss": 0.089, + "step": 23570 + }, + { + "epoch": 5.002708118940584, + "grad_norm": 0.18751507997512817, + "learning_rate": 4.846028393122582e-05, + "loss": 0.1812, + "step": 23580 + }, + { + "epoch": 5.002762281319396, + "grad_norm": 0.014582816511392593, + "learning_rate": 4.845727491018072e-05, + "loss": 0.0313, + "step": 23590 + }, + { + "epoch": 5.002816443698207, + "grad_norm": 3.4727611541748047, + "learning_rate": 4.8454265889135633e-05, + "loss": 0.0436, + "step": 23600 + }, + { + "epoch": 5.0028706060770185, + "grad_norm": 0.012187139131128788, + "learning_rate": 4.845125686809054e-05, + "loss": 0.0757, + "step": 23610 + }, + { + "epoch": 5.002924768455831, + "grad_norm": 0.09357541799545288, + "learning_rate": 4.8448247847045446e-05, + "loss": 0.0849, + "step": 23620 + }, + { + "epoch": 5.002978930834642, + "grad_norm": 0.05516833811998367, + "learning_rate": 4.844523882600035e-05, + "loss": 0.0112, + "step": 23630 + }, + { + "epoch": 5.003033093213454, + "grad_norm": 0.0370270274579525, + "learning_rate": 4.844222980495526e-05, + "loss": 0.0501, + "step": 23640 + }, + { + "epoch": 5.003087255592265, + "grad_norm": 
0.017647666856646538, + "learning_rate": 4.8439220783910165e-05, + "loss": 0.0584, + "step": 23650 + }, + { + "epoch": 5.003141417971078, + "grad_norm": 0.012116649188101292, + "learning_rate": 4.843621176286507e-05, + "loss": 0.0327, + "step": 23660 + }, + { + "epoch": 5.003195580349889, + "grad_norm": 8.966796875, + "learning_rate": 4.843320274181998e-05, + "loss": 0.1055, + "step": 23670 + }, + { + "epoch": 5.003249742728701, + "grad_norm": 0.004937412682920694, + "learning_rate": 4.8430193720774884e-05, + "loss": 0.0026, + "step": 23680 + }, + { + "epoch": 5.003303905107512, + "grad_norm": 1.6481629610061646, + "learning_rate": 4.84271846997298e-05, + "loss": 0.0551, + "step": 23690 + }, + { + "epoch": 5.003358067486324, + "grad_norm": 0.3035014569759369, + "learning_rate": 4.8424175678684696e-05, + "loss": 0.0578, + "step": 23700 + }, + { + "epoch": 5.003412229865136, + "grad_norm": 0.005401875823736191, + "learning_rate": 4.84211666576396e-05, + "loss": 0.039, + "step": 23710 + }, + { + "epoch": 5.003466392243947, + "grad_norm": 0.38294270634651184, + "learning_rate": 4.8418157636594516e-05, + "loss": 0.1014, + "step": 23720 + }, + { + "epoch": 5.003520554622759, + "grad_norm": 4.578383445739746, + "learning_rate": 4.841514861554942e-05, + "loss": 0.0696, + "step": 23730 + }, + { + "epoch": 5.0035747170015705, + "grad_norm": 0.009441632777452469, + "learning_rate": 4.841213959450432e-05, + "loss": 0.1437, + "step": 23740 + }, + { + "epoch": 5.003628879380383, + "grad_norm": 0.004103015176951885, + "learning_rate": 4.8409130573459234e-05, + "loss": 0.0235, + "step": 23750 + }, + { + "epoch": 5.003683041759194, + "grad_norm": 0.04950791597366333, + "learning_rate": 4.840612155241414e-05, + "loss": 0.0125, + "step": 23760 + }, + { + "epoch": 5.003737204138006, + "grad_norm": 13.385335922241211, + "learning_rate": 4.840311253136905e-05, + "loss": 0.2142, + "step": 23770 + }, + { + "epoch": 5.003791366516817, + "grad_norm": 2.47572922706604, + "learning_rate": 4.840010351032395e-05, + "loss": 0.1785, + "step": 23780 + }, + { + "epoch": 5.003845528895629, + "grad_norm": 0.036901604384183884, + "learning_rate": 4.839709448927886e-05, + "loss": 0.0633, + "step": 23790 + }, + { + "epoch": 5.003899691274441, + "grad_norm": 0.6003131866455078, + "learning_rate": 4.8394085468233766e-05, + "loss": 0.1797, + "step": 23800 + }, + { + "epoch": 5.003953853653252, + "grad_norm": 0.2633151710033417, + "learning_rate": 4.839107644718867e-05, + "loss": 0.0332, + "step": 23810 + }, + { + "epoch": 5.004008016032064, + "grad_norm": 0.0812859982252121, + "learning_rate": 4.838806742614358e-05, + "loss": 0.074, + "step": 23820 + }, + { + "epoch": 5.004062178410876, + "grad_norm": 0.2477843314409256, + "learning_rate": 4.8385058405098485e-05, + "loss": 0.0065, + "step": 23830 + }, + { + "epoch": 5.004116340789688, + "grad_norm": 0.23124955594539642, + "learning_rate": 4.83820493840534e-05, + "loss": 0.2486, + "step": 23840 + }, + { + "epoch": 5.004170503168499, + "grad_norm": 0.27669358253479004, + "learning_rate": 4.83790403630083e-05, + "loss": 0.113, + "step": 23850 + }, + { + "epoch": 5.004224665547311, + "grad_norm": 0.17472614347934723, + "learning_rate": 4.837603134196321e-05, + "loss": 0.0859, + "step": 23860 + }, + { + "epoch": 5.0042788279261226, + "grad_norm": 0.08573763817548752, + "learning_rate": 4.8373022320918116e-05, + "loss": 0.0579, + "step": 23870 + }, + { + "epoch": 5.004332990304934, + "grad_norm": 0.07014349102973938, + "learning_rate": 4.837001329987302e-05, + "loss": 0.0138, + "step": 
23880 + }, + { + "epoch": 5.004387152683746, + "grad_norm": 0.05062032490968704, + "learning_rate": 4.836700427882793e-05, + "loss": 0.075, + "step": 23890 + }, + { + "epoch": 5.004441315062557, + "grad_norm": 0.021257571876049042, + "learning_rate": 4.8363995257782835e-05, + "loss": 0.0487, + "step": 23900 + }, + { + "epoch": 5.0044954774413695, + "grad_norm": 0.005915232002735138, + "learning_rate": 4.836098623673774e-05, + "loss": 0.1504, + "step": 23910 + }, + { + "epoch": 5.004549639820181, + "grad_norm": 0.1261223703622818, + "learning_rate": 4.8357977215692655e-05, + "loss": 0.0763, + "step": 23920 + }, + { + "epoch": 5.004603802198993, + "grad_norm": 2.9814727306365967, + "learning_rate": 4.8354968194647554e-05, + "loss": 0.0866, + "step": 23930 + }, + { + "epoch": 5.004657964577804, + "grad_norm": 4.606469631195068, + "learning_rate": 4.835195917360246e-05, + "loss": 0.1603, + "step": 23940 + }, + { + "epoch": 5.0047121269566155, + "grad_norm": 0.23409461975097656, + "learning_rate": 4.8348950152557373e-05, + "loss": 0.0318, + "step": 23950 + }, + { + "epoch": 5.004766289335428, + "grad_norm": 3.171405076980591, + "learning_rate": 4.834594113151227e-05, + "loss": 0.1955, + "step": 23960 + }, + { + "epoch": 5.004820451714239, + "grad_norm": 0.3875008821487427, + "learning_rate": 4.834293211046718e-05, + "loss": 0.0473, + "step": 23970 + }, + { + "epoch": 5.004874614093051, + "grad_norm": 0.08878286927938461, + "learning_rate": 4.833992308942209e-05, + "loss": 0.0396, + "step": 23980 + }, + { + "epoch": 5.004928776471862, + "grad_norm": 0.0047308714129030704, + "learning_rate": 4.8336914068377e-05, + "loss": 0.0809, + "step": 23990 + }, + { + "epoch": 5.004982938850675, + "grad_norm": 0.21639186143875122, + "learning_rate": 4.83339050473319e-05, + "loss": 0.2332, + "step": 24000 + }, + { + "epoch": 5.005037101229486, + "grad_norm": 0.08161355555057526, + "learning_rate": 4.833089602628681e-05, + "loss": 0.0098, + "step": 24010 + }, + { + "epoch": 5.005091263608298, + "grad_norm": 0.22811567783355713, + "learning_rate": 4.832788700524172e-05, + "loss": 0.0883, + "step": 24020 + }, + { + "epoch": 5.005145425987109, + "grad_norm": 0.2321818321943283, + "learning_rate": 4.8324877984196624e-05, + "loss": 0.1542, + "step": 24030 + }, + { + "epoch": 5.005199588365921, + "grad_norm": 2.385770559310913, + "learning_rate": 4.832186896315153e-05, + "loss": 0.0771, + "step": 24040 + }, + { + "epoch": 5.005253750744733, + "grad_norm": 3.6843602657318115, + "learning_rate": 4.8318859942106436e-05, + "loss": 0.0609, + "step": 24050 + }, + { + "epoch": 5.005307913123544, + "grad_norm": 0.0032299081794917583, + "learning_rate": 4.831585092106134e-05, + "loss": 0.0056, + "step": 24060 + }, + { + "epoch": 5.005362075502356, + "grad_norm": 9.159615516662598, + "learning_rate": 4.8312841900016256e-05, + "loss": 0.1306, + "step": 24070 + }, + { + "epoch": 5.0054162378811675, + "grad_norm": 2.7787563800811768, + "learning_rate": 4.8309832878971155e-05, + "loss": 0.0692, + "step": 24080 + }, + { + "epoch": 5.00547040025998, + "grad_norm": 0.08222964406013489, + "learning_rate": 4.830682385792606e-05, + "loss": 0.0475, + "step": 24090 + }, + { + "epoch": 5.005524562638791, + "grad_norm": 0.021351628005504608, + "learning_rate": 4.8303814836880974e-05, + "loss": 0.0452, + "step": 24100 + }, + { + "epoch": 5.005578725017603, + "grad_norm": 0.3673562705516815, + "learning_rate": 4.8300805815835874e-05, + "loss": 0.1019, + "step": 24110 + }, + { + "epoch": 5.005632887396414, + "grad_norm": 0.028677664697170258, 
+ "learning_rate": 4.829779679479079e-05, + "loss": 0.0529, + "step": 24120 + }, + { + "epoch": 5.005687049775226, + "grad_norm": 0.7139677405357361, + "learning_rate": 4.829478777374569e-05, + "loss": 0.0749, + "step": 24130 + }, + { + "epoch": 5.005741212154038, + "grad_norm": 0.014328844845294952, + "learning_rate": 4.82917787527006e-05, + "loss": 0.0264, + "step": 24140 + }, + { + "epoch": 5.005795374532849, + "grad_norm": 0.0022321443539112806, + "learning_rate": 4.8288769731655506e-05, + "loss": 0.0748, + "step": 24150 + }, + { + "epoch": 5.005849536911661, + "grad_norm": 0.10690213739871979, + "learning_rate": 4.828576071061041e-05, + "loss": 0.3193, + "step": 24160 + }, + { + "epoch": 5.005903699290473, + "grad_norm": 4.431240558624268, + "learning_rate": 4.828275168956532e-05, + "loss": 0.1678, + "step": 24170 + }, + { + "epoch": 5.005957861669285, + "grad_norm": 0.02620316483080387, + "learning_rate": 4.827974266852023e-05, + "loss": 0.1204, + "step": 24180 + }, + { + "epoch": 5.006012024048096, + "grad_norm": 0.004614606965333223, + "learning_rate": 4.827673364747513e-05, + "loss": 0.0083, + "step": 24190 + }, + { + "epoch": 5.006066186426908, + "grad_norm": 4.4434123039245605, + "learning_rate": 4.827372462643004e-05, + "loss": 0.114, + "step": 24200 + }, + { + "epoch": 5.0061203488057195, + "grad_norm": 3.2499330043792725, + "learning_rate": 4.827071560538495e-05, + "loss": 0.3523, + "step": 24210 + }, + { + "epoch": 5.006174511184531, + "grad_norm": 0.3036500811576843, + "learning_rate": 4.8267706584339856e-05, + "loss": 0.0419, + "step": 24220 + }, + { + "epoch": 5.006228673563343, + "grad_norm": 0.4316418468952179, + "learning_rate": 4.8264697563294756e-05, + "loss": 0.1602, + "step": 24230 + }, + { + "epoch": 5.006282835942154, + "grad_norm": 0.3001784682273865, + "learning_rate": 4.826168854224967e-05, + "loss": 0.1127, + "step": 24240 + }, + { + "epoch": 5.0063369983209665, + "grad_norm": 0.27350637316703796, + "learning_rate": 4.8258679521204575e-05, + "loss": 0.1341, + "step": 24250 + }, + { + "epoch": 5.006391160699778, + "grad_norm": 0.1829669326543808, + "learning_rate": 4.8255670500159475e-05, + "loss": 0.0374, + "step": 24260 + }, + { + "epoch": 5.00644532307859, + "grad_norm": 0.17174440622329712, + "learning_rate": 4.825266147911439e-05, + "loss": 0.0508, + "step": 24270 + }, + { + "epoch": 5.006499485457401, + "grad_norm": 0.8979014158248901, + "learning_rate": 4.8249652458069294e-05, + "loss": 0.0211, + "step": 24280 + }, + { + "epoch": 5.006553647836213, + "grad_norm": 24.24631690979004, + "learning_rate": 4.82466434370242e-05, + "loss": 0.1191, + "step": 24290 + }, + { + "epoch": 5.006607810215025, + "grad_norm": 8.028286933898926, + "learning_rate": 4.8243634415979107e-05, + "loss": 0.1129, + "step": 24300 + }, + { + "epoch": 5.006661972593836, + "grad_norm": 3.0853583812713623, + "learning_rate": 4.824062539493401e-05, + "loss": 0.0638, + "step": 24310 + }, + { + "epoch": 5.006716134972648, + "grad_norm": 0.041716668754816055, + "learning_rate": 4.823761637388892e-05, + "loss": 0.1965, + "step": 24320 + }, + { + "epoch": 5.006770297351459, + "grad_norm": 4.784609317779541, + "learning_rate": 4.823460735284383e-05, + "loss": 0.337, + "step": 24330 + }, + { + "epoch": 5.006824459730272, + "grad_norm": 0.1090875044465065, + "learning_rate": 4.823159833179873e-05, + "loss": 0.1057, + "step": 24340 + }, + { + "epoch": 5.006878622109083, + "grad_norm": 0.1340867578983307, + "learning_rate": 4.822858931075364e-05, + "loss": 0.0186, + "step": 24350 + }, + { + 
"epoch": 5.006932784487895, + "grad_norm": 0.10647327452898026, + "learning_rate": 4.822558028970855e-05, + "loss": 0.0501, + "step": 24360 + }, + { + "epoch": 5.006986946866706, + "grad_norm": 0.009162059053778648, + "learning_rate": 4.822257126866346e-05, + "loss": 0.1368, + "step": 24370 + }, + { + "epoch": 5.0070411092455185, + "grad_norm": 0.10855554044246674, + "learning_rate": 4.8219562247618364e-05, + "loss": 0.3913, + "step": 24380 + }, + { + "epoch": 5.00709527162433, + "grad_norm": 0.5719864964485168, + "learning_rate": 4.821655322657327e-05, + "loss": 0.2041, + "step": 24390 + }, + { + "epoch": 5.007149434003141, + "grad_norm": 0.042532868683338165, + "learning_rate": 4.8213544205528176e-05, + "loss": 0.098, + "step": 24400 + }, + { + "epoch": 5.007203596381953, + "grad_norm": 0.36181753873825073, + "learning_rate": 4.821053518448308e-05, + "loss": 0.0887, + "step": 24410 + }, + { + "epoch": 5.0072577587607645, + "grad_norm": 1.381920576095581, + "learning_rate": 4.820752616343799e-05, + "loss": 0.1128, + "step": 24420 + }, + { + "epoch": 5.007311921139577, + "grad_norm": 0.027058007195591927, + "learning_rate": 4.8204517142392895e-05, + "loss": 0.062, + "step": 24430 + }, + { + "epoch": 5.007366083518388, + "grad_norm": 0.7234348654747009, + "learning_rate": 4.820150812134781e-05, + "loss": 0.0546, + "step": 24440 + }, + { + "epoch": 5.0074202458972, + "grad_norm": 0.14839115738868713, + "learning_rate": 4.819849910030271e-05, + "loss": 0.0809, + "step": 24450 + }, + { + "epoch": 5.007474408276011, + "grad_norm": 0.0034227012656629086, + "learning_rate": 4.8195490079257614e-05, + "loss": 0.1274, + "step": 24460 + }, + { + "epoch": 5.007528570654824, + "grad_norm": 3.587024450302124, + "learning_rate": 4.819248105821253e-05, + "loss": 0.0852, + "step": 24470 + }, + { + "epoch": 5.007582733033635, + "grad_norm": 0.28219160437583923, + "learning_rate": 4.818947203716743e-05, + "loss": 0.0566, + "step": 24480 + }, + { + "epoch": 5.007636895412446, + "grad_norm": 32.369850158691406, + "learning_rate": 4.818646301612233e-05, + "loss": 0.1045, + "step": 24490 + }, + { + "epoch": 5.007691057791258, + "grad_norm": 0.1929645836353302, + "learning_rate": 4.8183453995077246e-05, + "loss": 0.0555, + "step": 24500 + }, + { + "epoch": 5.00774522017007, + "grad_norm": 0.035508837550878525, + "learning_rate": 4.818044497403215e-05, + "loss": 0.0789, + "step": 24510 + }, + { + "epoch": 5.007799382548882, + "grad_norm": 0.06273870170116425, + "learning_rate": 4.817743595298706e-05, + "loss": 0.1258, + "step": 24520 + }, + { + "epoch": 5.007853544927693, + "grad_norm": 0.011900629848241806, + "learning_rate": 4.8174426931941964e-05, + "loss": 0.0523, + "step": 24530 + }, + { + "epoch": 5.007907707306505, + "grad_norm": 0.5132324695587158, + "learning_rate": 4.817141791089687e-05, + "loss": 0.0265, + "step": 24540 + }, + { + "epoch": 5.0079618696853165, + "grad_norm": 0.0030487836338579655, + "learning_rate": 4.816840888985178e-05, + "loss": 0.257, + "step": 24550 + }, + { + "epoch": 5.008016032064128, + "grad_norm": 0.45865926146507263, + "learning_rate": 4.816539986880668e-05, + "loss": 0.0167, + "step": 24560 + }, + { + "epoch": 5.00807019444294, + "grad_norm": 0.005923417862504721, + "learning_rate": 4.816239084776159e-05, + "loss": 0.1787, + "step": 24570 + }, + { + "epoch": 5.008124356821751, + "grad_norm": 0.0405096597969532, + "learning_rate": 4.8159381826716496e-05, + "loss": 0.1425, + "step": 24580 + }, + { + "epoch": 5.0081785192005634, + "grad_norm": 0.008342016488313675, + 
"learning_rate": 4.815637280567141e-05, + "loss": 0.0234, + "step": 24590 + }, + { + "epoch": 5.008232681579375, + "grad_norm": 7.34586763381958, + "learning_rate": 4.815336378462631e-05, + "loss": 0.0624, + "step": 24600 + }, + { + "epoch": 5.008286843958187, + "grad_norm": 17.09014129638672, + "learning_rate": 4.8150354763581215e-05, + "loss": 0.2206, + "step": 24610 + }, + { + "epoch": 5.008341006336998, + "grad_norm": 0.39842987060546875, + "learning_rate": 4.814734574253613e-05, + "loss": 0.0617, + "step": 24620 + }, + { + "epoch": 5.00839516871581, + "grad_norm": 0.1934039294719696, + "learning_rate": 4.8144336721491034e-05, + "loss": 0.1554, + "step": 24630 + }, + { + "epoch": 5.008449331094622, + "grad_norm": 0.9438275098800659, + "learning_rate": 4.814132770044594e-05, + "loss": 0.0706, + "step": 24640 + }, + { + "epoch": 5.008503493473433, + "grad_norm": 0.15135619044303894, + "learning_rate": 4.8138318679400847e-05, + "loss": 0.0336, + "step": 24650 + }, + { + "epoch": 5.008557655852245, + "grad_norm": 0.043725594878196716, + "learning_rate": 4.813530965835575e-05, + "loss": 0.1196, + "step": 24660 + }, + { + "epoch": 5.008611818231056, + "grad_norm": 4.828185558319092, + "learning_rate": 4.813230063731066e-05, + "loss": 0.2298, + "step": 24670 + }, + { + "epoch": 5.008665980609869, + "grad_norm": 0.03350524976849556, + "learning_rate": 4.8129291616265565e-05, + "loss": 0.0928, + "step": 24680 + }, + { + "epoch": 5.00872014298868, + "grad_norm": 0.025951137766242027, + "learning_rate": 4.812628259522047e-05, + "loss": 0.1412, + "step": 24690 + }, + { + "epoch": 5.008774305367492, + "grad_norm": 0.13679836690425873, + "learning_rate": 4.8123273574175385e-05, + "loss": 0.1097, + "step": 24700 + }, + { + "epoch": 5.008828467746303, + "grad_norm": 0.1920901983976364, + "learning_rate": 4.8120264553130284e-05, + "loss": 0.1194, + "step": 24710 + }, + { + "epoch": 5.0088826301251155, + "grad_norm": 0.24997176229953766, + "learning_rate": 4.811725553208519e-05, + "loss": 0.2492, + "step": 24720 + }, + { + "epoch": 5.008936792503927, + "grad_norm": 0.04635058343410492, + "learning_rate": 4.8114246511040104e-05, + "loss": 0.0416, + "step": 24730 + }, + { + "epoch": 5.008990954882738, + "grad_norm": 0.17008723318576813, + "learning_rate": 4.811123748999501e-05, + "loss": 0.1318, + "step": 24740 + }, + { + "epoch": 5.00904511726155, + "grad_norm": 0.006133070681244135, + "learning_rate": 4.810822846894991e-05, + "loss": 0.1164, + "step": 24750 + }, + { + "epoch": 5.0090992796403615, + "grad_norm": 2.9006032943725586, + "learning_rate": 4.810521944790482e-05, + "loss": 0.0769, + "step": 24760 + }, + { + "epoch": 5.009153442019174, + "grad_norm": 0.12409084290266037, + "learning_rate": 4.810221042685973e-05, + "loss": 0.1303, + "step": 24770 + }, + { + "epoch": 5.009207604397985, + "grad_norm": 0.5886746048927307, + "learning_rate": 4.8099201405814635e-05, + "loss": 0.1455, + "step": 24780 + }, + { + "epoch": 5.009261766776797, + "grad_norm": 2.6347432136535645, + "learning_rate": 4.809619238476954e-05, + "loss": 0.1621, + "step": 24790 + }, + { + "epoch": 5.009315929155608, + "grad_norm": 0.160436749458313, + "learning_rate": 4.809318336372445e-05, + "loss": 0.1311, + "step": 24800 + }, + { + "epoch": 5.009370091534421, + "grad_norm": 0.008572841063141823, + "learning_rate": 4.8090174342679354e-05, + "loss": 0.1035, + "step": 24810 + }, + { + "epoch": 5.009424253913232, + "grad_norm": 0.1430208683013916, + "learning_rate": 4.808716532163427e-05, + "loss": 0.0425, + "step": 24820 + }, + { + 
"epoch": 5.009478416292043, + "grad_norm": 1.0036622285842896, + "learning_rate": 4.8084156300589166e-05, + "loss": 0.1751, + "step": 24830 + }, + { + "epoch": 5.009532578670855, + "grad_norm": 0.034965816885232925, + "learning_rate": 4.808114727954407e-05, + "loss": 0.0885, + "step": 24840 + }, + { + "epoch": 5.009586741049667, + "grad_norm": 0.09972193837165833, + "learning_rate": 4.8078138258498986e-05, + "loss": 0.0477, + "step": 24850 + }, + { + "epoch": 5.009640903428479, + "grad_norm": 0.025997698307037354, + "learning_rate": 4.8075129237453885e-05, + "loss": 0.0934, + "step": 24860 + }, + { + "epoch": 5.00969506580729, + "grad_norm": 0.08531764149665833, + "learning_rate": 4.807212021640879e-05, + "loss": 0.0706, + "step": 24870 + }, + { + "epoch": 5.009749228186102, + "grad_norm": 0.014863788150250912, + "learning_rate": 4.8069111195363704e-05, + "loss": 0.2309, + "step": 24880 + }, + { + "epoch": 5.0098033905649135, + "grad_norm": 18.943050384521484, + "learning_rate": 4.806610217431861e-05, + "loss": 0.0794, + "step": 24890 + }, + { + "epoch": 5.009857552943726, + "grad_norm": 0.0749753937125206, + "learning_rate": 4.806309315327352e-05, + "loss": 0.03, + "step": 24900 + }, + { + "epoch": 5.009911715322537, + "grad_norm": 5.757457733154297, + "learning_rate": 4.806008413222842e-05, + "loss": 0.1122, + "step": 24910 + }, + { + "epoch": 5.009965877701348, + "grad_norm": 0.30351829528808594, + "learning_rate": 4.805707511118333e-05, + "loss": 0.1438, + "step": 24920 + }, + { + "epoch": 5.01002004008016, + "grad_norm": 0.25530192255973816, + "learning_rate": 4.8054066090138236e-05, + "loss": 0.1824, + "step": 24930 + }, + { + "epoch": 5.010074202458972, + "grad_norm": 1.8027150630950928, + "learning_rate": 4.805105706909314e-05, + "loss": 0.1456, + "step": 24940 + }, + { + "epoch": 5.010128364837784, + "grad_norm": 0.311943382024765, + "learning_rate": 4.804804804804805e-05, + "loss": 0.1071, + "step": 24950 + }, + { + "epoch": 5.010182527216595, + "grad_norm": 0.012373005971312523, + "learning_rate": 4.804503902700296e-05, + "loss": 0.0403, + "step": 24960 + }, + { + "epoch": 5.010236689595407, + "grad_norm": 0.3876262903213501, + "learning_rate": 4.804203000595787e-05, + "loss": 0.0679, + "step": 24970 + }, + { + "epoch": 5.010290851974219, + "grad_norm": 0.08306006342172623, + "learning_rate": 4.803902098491277e-05, + "loss": 0.1619, + "step": 24980 + }, + { + "epoch": 5.010345014353031, + "grad_norm": 0.08293802291154861, + "learning_rate": 4.803601196386768e-05, + "loss": 0.1047, + "step": 24990 + }, + { + "epoch": 5.010399176731842, + "grad_norm": 0.03446601331233978, + "learning_rate": 4.8033002942822586e-05, + "loss": 0.0087, + "step": 25000 + }, + { + "epoch": 5.010453339110653, + "grad_norm": 0.7328843474388123, + "learning_rate": 4.8029993921777486e-05, + "loss": 0.1731, + "step": 25010 + }, + { + "epoch": 5.0105075014894656, + "grad_norm": 0.030025742948055267, + "learning_rate": 4.80269849007324e-05, + "loss": 0.0692, + "step": 25020 + }, + { + "epoch": 5.010561663868277, + "grad_norm": 5.425129413604736, + "learning_rate": 4.8023975879687305e-05, + "loss": 0.1386, + "step": 25030 + }, + { + "epoch": 5.010615826247089, + "grad_norm": 0.015178671106696129, + "learning_rate": 4.802096685864221e-05, + "loss": 0.1022, + "step": 25040 + }, + { + "epoch": 5.0106699886259, + "grad_norm": 0.10853561758995056, + "learning_rate": 4.801795783759712e-05, + "loss": 0.0842, + "step": 25050 + }, + { + "epoch": 5.0107241510047125, + "grad_norm": 0.005476293619722128, + "learning_rate": 
4.8014948816552024e-05, + "loss": 0.0822, + "step": 25060 + }, + { + "epoch": 5.010778313383524, + "grad_norm": 0.11097882688045502, + "learning_rate": 4.801193979550693e-05, + "loss": 0.123, + "step": 25070 + }, + { + "epoch": 5.010832475762336, + "grad_norm": 0.00694697443395853, + "learning_rate": 4.8008930774461843e-05, + "loss": 0.0946, + "step": 25080 + }, + { + "epoch": 5.010886638141147, + "grad_norm": 0.04307837784290314, + "learning_rate": 4.800592175341674e-05, + "loss": 0.1055, + "step": 25090 + }, + { + "epoch": 5.0109408005199585, + "grad_norm": 10.512062072753906, + "learning_rate": 4.800291273237165e-05, + "loss": 0.049, + "step": 25100 + }, + { + "epoch": 5.010994962898771, + "grad_norm": 0.029278550297021866, + "learning_rate": 4.799990371132656e-05, + "loss": 0.1478, + "step": 25110 + }, + { + "epoch": 5.011049125277582, + "grad_norm": 1.8350075483322144, + "learning_rate": 4.799689469028147e-05, + "loss": 0.1221, + "step": 25120 + }, + { + "epoch": 5.011103287656394, + "grad_norm": 0.00484894635155797, + "learning_rate": 4.799388566923637e-05, + "loss": 0.2036, + "step": 25130 + }, + { + "epoch": 5.011157450035205, + "grad_norm": 0.016310539096593857, + "learning_rate": 4.799087664819128e-05, + "loss": 0.2952, + "step": 25140 + }, + { + "epoch": 5.011211612414018, + "grad_norm": 0.062355782836675644, + "learning_rate": 4.798786762714619e-05, + "loss": 0.0625, + "step": 25150 + }, + { + "epoch": 5.011265774792829, + "grad_norm": 0.008176767267286777, + "learning_rate": 4.7984858606101094e-05, + "loss": 0.0274, + "step": 25160 + }, + { + "epoch": 5.01131993717164, + "grad_norm": 0.08699031174182892, + "learning_rate": 4.7981849585056e-05, + "loss": 0.1155, + "step": 25170 + }, + { + "epoch": 5.011374099550452, + "grad_norm": 0.3026621639728546, + "learning_rate": 4.7978840564010906e-05, + "loss": 0.0167, + "step": 25180 + }, + { + "epoch": 5.011428261929264, + "grad_norm": 0.020805813372135162, + "learning_rate": 4.797583154296581e-05, + "loss": 0.0228, + "step": 25190 + }, + { + "epoch": 5.011482424308076, + "grad_norm": 4.114956378936768, + "learning_rate": 4.797282252192072e-05, + "loss": 0.0218, + "step": 25200 + }, + { + "epoch": 5.011536586686887, + "grad_norm": 0.35447824001312256, + "learning_rate": 4.7969813500875625e-05, + "loss": 0.177, + "step": 25210 + }, + { + "epoch": 5.011590749065699, + "grad_norm": 0.07965609431266785, + "learning_rate": 4.796680447983054e-05, + "loss": 0.3759, + "step": 25220 + }, + { + "epoch": 5.0116449114445105, + "grad_norm": 2.843907117843628, + "learning_rate": 4.7963795458785444e-05, + "loss": 0.1681, + "step": 25230 + }, + { + "epoch": 5.011699073823323, + "grad_norm": 5.002478122711182, + "learning_rate": 4.7960786437740344e-05, + "loss": 0.1619, + "step": 25240 + }, + { + "epoch": 5.011753236202134, + "grad_norm": 0.5401621460914612, + "learning_rate": 4.795777741669526e-05, + "loss": 0.0151, + "step": 25250 + }, + { + "epoch": 5.011807398580945, + "grad_norm": 6.027167320251465, + "learning_rate": 4.795476839565016e-05, + "loss": 0.1151, + "step": 25260 + }, + { + "epoch": 5.011861560959757, + "grad_norm": 3.913090705871582, + "learning_rate": 4.795175937460507e-05, + "loss": 0.0446, + "step": 25270 + }, + { + "epoch": 5.011915723338569, + "grad_norm": 0.08999086171388626, + "learning_rate": 4.7948750353559976e-05, + "loss": 0.1701, + "step": 25280 + }, + { + "epoch": 5.011969885717381, + "grad_norm": 1.9693036079406738, + "learning_rate": 4.794574133251488e-05, + "loss": 0.1619, + "step": 25290 + }, + { + "epoch": 
5.012024048096192, + "grad_norm": 2.7444474697113037, + "learning_rate": 4.794273231146979e-05, + "loss": 0.208, + "step": 25300 + }, + { + "epoch": 5.012078210475004, + "grad_norm": 0.13573680818080902, + "learning_rate": 4.7939723290424695e-05, + "loss": 0.0759, + "step": 25310 + }, + { + "epoch": 5.012132372853816, + "grad_norm": 0.4329802095890045, + "learning_rate": 4.79367142693796e-05, + "loss": 0.0669, + "step": 25320 + }, + { + "epoch": 5.012186535232628, + "grad_norm": 0.42868420481681824, + "learning_rate": 4.793370524833451e-05, + "loss": 0.0718, + "step": 25330 + }, + { + "epoch": 5.012240697611439, + "grad_norm": 0.37501588463783264, + "learning_rate": 4.793069622728942e-05, + "loss": 0.1423, + "step": 25340 + }, + { + "epoch": 5.01229485999025, + "grad_norm": 1.7185322046279907, + "learning_rate": 4.792768720624432e-05, + "loss": 0.0772, + "step": 25350 + }, + { + "epoch": 5.0123490223690625, + "grad_norm": 0.17014987766742706, + "learning_rate": 4.7924678185199226e-05, + "loss": 0.0667, + "step": 25360 + }, + { + "epoch": 5.012403184747874, + "grad_norm": 0.132149800658226, + "learning_rate": 4.792166916415414e-05, + "loss": 0.0112, + "step": 25370 + }, + { + "epoch": 5.012457347126686, + "grad_norm": 0.22136355936527252, + "learning_rate": 4.7918660143109045e-05, + "loss": 0.0603, + "step": 25380 + }, + { + "epoch": 5.012511509505497, + "grad_norm": 0.020315716043114662, + "learning_rate": 4.7915651122063945e-05, + "loss": 0.0382, + "step": 25390 + }, + { + "epoch": 5.0125656718843095, + "grad_norm": 0.0801909789443016, + "learning_rate": 4.791264210101886e-05, + "loss": 0.0044, + "step": 25400 + }, + { + "epoch": 5.012619834263121, + "grad_norm": 0.19526921212673187, + "learning_rate": 4.7909633079973764e-05, + "loss": 0.0843, + "step": 25410 + }, + { + "epoch": 5.012673996641933, + "grad_norm": 0.3069625198841095, + "learning_rate": 4.790662405892867e-05, + "loss": 0.0082, + "step": 25420 + }, + { + "epoch": 5.012728159020744, + "grad_norm": 0.0021004974842071533, + "learning_rate": 4.790361503788358e-05, + "loss": 0.0635, + "step": 25430 + }, + { + "epoch": 5.0127823213995555, + "grad_norm": 0.9010891914367676, + "learning_rate": 4.790060601683848e-05, + "loss": 0.0501, + "step": 25440 + }, + { + "epoch": 5.012836483778368, + "grad_norm": 0.06537572294473648, + "learning_rate": 4.789759699579339e-05, + "loss": 0.0639, + "step": 25450 + }, + { + "epoch": 5.012890646157179, + "grad_norm": 0.0032653710804879665, + "learning_rate": 4.7894587974748295e-05, + "loss": 0.0343, + "step": 25460 + }, + { + "epoch": 5.012944808535991, + "grad_norm": 0.002843608846887946, + "learning_rate": 4.78915789537032e-05, + "loss": 0.1437, + "step": 25470 + }, + { + "epoch": 5.012998970914802, + "grad_norm": 4.339361190795898, + "learning_rate": 4.7888569932658115e-05, + "loss": 0.2499, + "step": 25480 + }, + { + "epoch": 5.013053133293615, + "grad_norm": 1.311684012413025, + "learning_rate": 4.788556091161302e-05, + "loss": 0.1004, + "step": 25490 + }, + { + "epoch": 5.013107295672426, + "grad_norm": 4.656996726989746, + "learning_rate": 4.788255189056792e-05, + "loss": 0.1784, + "step": 25500 + }, + { + "epoch": 5.013161458051238, + "grad_norm": 0.10018052160739899, + "learning_rate": 4.7879542869522834e-05, + "loss": 0.0838, + "step": 25510 + }, + { + "epoch": 5.013215620430049, + "grad_norm": 0.11592897772789001, + "learning_rate": 4.787653384847774e-05, + "loss": 0.1605, + "step": 25520 + }, + { + "epoch": 5.013269782808861, + "grad_norm": 3.2427215576171875, + "learning_rate": 
4.7873524827432646e-05, + "loss": 0.044, + "step": 25530 + }, + { + "epoch": 5.013323945187673, + "grad_norm": 0.017027391120791435, + "learning_rate": 4.787051580638755e-05, + "loss": 0.0593, + "step": 25540 + }, + { + "epoch": 5.013378107566484, + "grad_norm": 3.1883881092071533, + "learning_rate": 4.786750678534246e-05, + "loss": 0.305, + "step": 25550 + }, + { + "epoch": 5.013432269945296, + "grad_norm": 2.8145198822021484, + "learning_rate": 4.7864497764297365e-05, + "loss": 0.0488, + "step": 25560 + }, + { + "epoch": 5.0134864323241075, + "grad_norm": 1.2213400602340698, + "learning_rate": 4.786148874325228e-05, + "loss": 0.0559, + "step": 25570 + }, + { + "epoch": 5.01354059470292, + "grad_norm": 0.0039237444289028645, + "learning_rate": 4.785847972220718e-05, + "loss": 0.1057, + "step": 25580 + }, + { + "epoch": 5.013594757081731, + "grad_norm": 23.003049850463867, + "learning_rate": 4.7855470701162084e-05, + "loss": 0.0445, + "step": 25590 + }, + { + "epoch": 5.013648919460543, + "grad_norm": 9.112311363220215, + "learning_rate": 4.7852461680117e-05, + "loss": 0.1201, + "step": 25600 + }, + { + "epoch": 5.013703081839354, + "grad_norm": 9.574174880981445, + "learning_rate": 4.7849452659071896e-05, + "loss": 0.2453, + "step": 25610 + }, + { + "epoch": 5.013757244218166, + "grad_norm": 0.37774935364723206, + "learning_rate": 4.78464436380268e-05, + "loss": 0.0091, + "step": 25620 + }, + { + "epoch": 5.013811406596978, + "grad_norm": 2.6149210929870605, + "learning_rate": 4.7843434616981716e-05, + "loss": 0.1292, + "step": 25630 + }, + { + "epoch": 5.013865568975789, + "grad_norm": 0.07810110598802567, + "learning_rate": 4.784042559593662e-05, + "loss": 0.1526, + "step": 25640 + }, + { + "epoch": 5.013919731354601, + "grad_norm": 0.15396420657634735, + "learning_rate": 4.783741657489153e-05, + "loss": 0.0245, + "step": 25650 + }, + { + "epoch": 5.013973893733413, + "grad_norm": 0.21620331704616547, + "learning_rate": 4.7834407553846434e-05, + "loss": 0.0728, + "step": 25660 + }, + { + "epoch": 5.014028056112225, + "grad_norm": 2.870718479156494, + "learning_rate": 4.783139853280134e-05, + "loss": 0.1608, + "step": 25670 + }, + { + "epoch": 5.014082218491036, + "grad_norm": 3.472691059112549, + "learning_rate": 4.782838951175625e-05, + "loss": 0.1089, + "step": 25680 + }, + { + "epoch": 5.014136380869847, + "grad_norm": 0.45885026454925537, + "learning_rate": 4.782538049071115e-05, + "loss": 0.0703, + "step": 25690 + }, + { + "epoch": 5.0141905432486595, + "grad_norm": 0.02234150469303131, + "learning_rate": 4.782237146966606e-05, + "loss": 0.1372, + "step": 25700 + }, + { + "epoch": 5.014244705627471, + "grad_norm": 3.8338167667388916, + "learning_rate": 4.7819362448620966e-05, + "loss": 0.1621, + "step": 25710 + }, + { + "epoch": 5.014298868006283, + "grad_norm": 0.17110705375671387, + "learning_rate": 4.781635342757588e-05, + "loss": 0.1268, + "step": 25720 + }, + { + "epoch": 5.014353030385094, + "grad_norm": 0.14436228573322296, + "learning_rate": 4.781334440653078e-05, + "loss": 0.0438, + "step": 25730 + }, + { + "epoch": 5.0144071927639065, + "grad_norm": 0.19990749657154083, + "learning_rate": 4.781033538548569e-05, + "loss": 0.2418, + "step": 25740 + }, + { + "epoch": 5.014461355142718, + "grad_norm": 3.888319730758667, + "learning_rate": 4.78073263644406e-05, + "loss": 0.1678, + "step": 25750 + }, + { + "epoch": 5.01451551752153, + "grad_norm": 0.014463570900261402, + "learning_rate": 4.78043173433955e-05, + "loss": 0.0954, + "step": 25760 + }, + { + "epoch": 
5.014569679900341, + "grad_norm": 0.518078625202179, + "learning_rate": 4.780130832235041e-05, + "loss": 0.0158, + "step": 25770 + }, + { + "epoch": 5.0146238422791525, + "grad_norm": 1.910826563835144, + "learning_rate": 4.7798299301305317e-05, + "loss": 0.0952, + "step": 25780 + }, + { + "epoch": 5.014678004657965, + "grad_norm": 9.225374221801758, + "learning_rate": 4.779529028026022e-05, + "loss": 0.1657, + "step": 25790 + }, + { + "epoch": 5.014732167036776, + "grad_norm": 0.2738267481327057, + "learning_rate": 4.779228125921513e-05, + "loss": 0.1022, + "step": 25800 + }, + { + "epoch": 5.014786329415588, + "grad_norm": 3.3728020191192627, + "learning_rate": 4.7789272238170035e-05, + "loss": 0.1473, + "step": 25810 + }, + { + "epoch": 5.014840491794399, + "grad_norm": 1.490346074104309, + "learning_rate": 4.778626321712494e-05, + "loss": 0.056, + "step": 25820 + }, + { + "epoch": 5.014894654173212, + "grad_norm": 0.21727782487869263, + "learning_rate": 4.7783254196079855e-05, + "loss": 0.0828, + "step": 25830 + }, + { + "epoch": 5.014948816552023, + "grad_norm": 0.2605508863925934, + "learning_rate": 4.7780245175034754e-05, + "loss": 0.0533, + "step": 25840 + }, + { + "epoch": 5.015002978930835, + "grad_norm": 0.041430871933698654, + "learning_rate": 4.777723615398966e-05, + "loss": 0.1352, + "step": 25850 + }, + { + "epoch": 5.015057141309646, + "grad_norm": 0.2740143835544586, + "learning_rate": 4.7774227132944574e-05, + "loss": 0.1312, + "step": 25860 + }, + { + "epoch": 5.015111303688458, + "grad_norm": 0.3777243494987488, + "learning_rate": 4.777121811189948e-05, + "loss": 0.1253, + "step": 25870 + }, + { + "epoch": 5.01516546606727, + "grad_norm": 0.28751203417778015, + "learning_rate": 4.776820909085438e-05, + "loss": 0.1547, + "step": 25880 + }, + { + "epoch": 5.015219628446081, + "grad_norm": 0.3449265658855438, + "learning_rate": 4.776520006980929e-05, + "loss": 0.0428, + "step": 25890 + }, + { + "epoch": 5.015273790824893, + "grad_norm": 0.005621649324893951, + "learning_rate": 4.77621910487642e-05, + "loss": 0.0209, + "step": 25900 + }, + { + "epoch": 5.0153279532037045, + "grad_norm": 0.003992537502199411, + "learning_rate": 4.7759182027719105e-05, + "loss": 0.0891, + "step": 25910 + }, + { + "epoch": 5.015382115582517, + "grad_norm": 0.8608254790306091, + "learning_rate": 4.775617300667401e-05, + "loss": 0.1363, + "step": 25920 + }, + { + "epoch": 5.015436277961328, + "grad_norm": 0.13044537603855133, + "learning_rate": 4.775316398562892e-05, + "loss": 0.0882, + "step": 25930 + }, + { + "epoch": 5.01549044034014, + "grad_norm": 0.1842847317457199, + "learning_rate": 4.7750154964583824e-05, + "loss": 0.0237, + "step": 25940 + }, + { + "epoch": 5.015544602718951, + "grad_norm": 0.10070805251598358, + "learning_rate": 4.774714594353873e-05, + "loss": 0.0329, + "step": 25950 + }, + { + "epoch": 5.015598765097763, + "grad_norm": 0.14235879480838776, + "learning_rate": 4.7744136922493636e-05, + "loss": 0.1777, + "step": 25960 + }, + { + "epoch": 5.015652927476575, + "grad_norm": 3.162381410598755, + "learning_rate": 4.774112790144854e-05, + "loss": 0.245, + "step": 25970 + }, + { + "epoch": 5.015707089855386, + "grad_norm": 1.7650758028030396, + "learning_rate": 4.7738118880403456e-05, + "loss": 0.1624, + "step": 25980 + }, + { + "epoch": 5.015761252234198, + "grad_norm": 0.170909121632576, + "learning_rate": 4.7735109859358355e-05, + "loss": 0.0629, + "step": 25990 + }, + { + "epoch": 5.01581541461301, + "grad_norm": 1.6411267518997192, + "learning_rate": 
4.773210083831327e-05, + "loss": 0.0684, + "step": 26000 + }, + { + "epoch": 5.015869576991822, + "grad_norm": 0.6120477318763733, + "learning_rate": 4.7729091817268174e-05, + "loss": 0.0585, + "step": 26010 + }, + { + "epoch": 5.015923739370633, + "grad_norm": 0.10125768929719925, + "learning_rate": 4.772608279622308e-05, + "loss": 0.056, + "step": 26020 + }, + { + "epoch": 5.015977901749445, + "grad_norm": 4.127563953399658, + "learning_rate": 4.772307377517799e-05, + "loss": 0.0508, + "step": 26030 + }, + { + "epoch": 5.0160320641282565, + "grad_norm": 0.1745188683271408, + "learning_rate": 4.772006475413289e-05, + "loss": 0.0675, + "step": 26040 + }, + { + "epoch": 5.016086226507068, + "grad_norm": 0.20925545692443848, + "learning_rate": 4.77170557330878e-05, + "loss": 0.0859, + "step": 26050 + }, + { + "epoch": 5.01614038888588, + "grad_norm": 0.9341488480567932, + "learning_rate": 4.7714046712042706e-05, + "loss": 0.0797, + "step": 26060 + }, + { + "epoch": 5.016194551264691, + "grad_norm": 0.05284073203802109, + "learning_rate": 4.771103769099761e-05, + "loss": 0.0023, + "step": 26070 + }, + { + "epoch": 5.016248713643503, + "grad_norm": 0.37844350934028625, + "learning_rate": 4.770802866995252e-05, + "loss": 0.1778, + "step": 26080 + }, + { + "epoch": 5.016302876022315, + "grad_norm": 0.13624897599220276, + "learning_rate": 4.770501964890743e-05, + "loss": 0.086, + "step": 26090 + }, + { + "epoch": 5.016357038401127, + "grad_norm": 37.35713195800781, + "learning_rate": 4.770201062786233e-05, + "loss": 0.0613, + "step": 26100 + }, + { + "epoch": 5.016411200779938, + "grad_norm": 2.7200071811676025, + "learning_rate": 4.769900160681724e-05, + "loss": 0.178, + "step": 26110 + }, + { + "epoch": 5.01646536315875, + "grad_norm": 1.3890860080718994, + "learning_rate": 4.769599258577215e-05, + "loss": 0.0653, + "step": 26120 + }, + { + "epoch": 5.016519525537562, + "grad_norm": 0.17049361765384674, + "learning_rate": 4.7692983564727057e-05, + "loss": 0.0191, + "step": 26130 + }, + { + "epoch": 5.016573687916373, + "grad_norm": 0.004461994394659996, + "learning_rate": 4.7689974543681956e-05, + "loss": 0.1019, + "step": 26140 + }, + { + "epoch": 5.016627850295185, + "grad_norm": 0.0465693324804306, + "learning_rate": 4.768696552263687e-05, + "loss": 0.0536, + "step": 26150 + }, + { + "epoch": 5.016682012673996, + "grad_norm": 0.004405955318361521, + "learning_rate": 4.7683956501591775e-05, + "loss": 0.0379, + "step": 26160 + }, + { + "epoch": 5.0167361750528086, + "grad_norm": 1.5557804107666016, + "learning_rate": 4.768094748054668e-05, + "loss": 0.0373, + "step": 26170 + }, + { + "epoch": 5.01679033743162, + "grad_norm": 6.621648788452148, + "learning_rate": 4.767793845950159e-05, + "loss": 0.2723, + "step": 26180 + }, + { + "epoch": 5.016844499810432, + "grad_norm": 0.057805709540843964, + "learning_rate": 4.7674929438456494e-05, + "loss": 0.0692, + "step": 26190 + }, + { + "epoch": 5.016898662189243, + "grad_norm": 1.4731545448303223, + "learning_rate": 4.76719204174114e-05, + "loss": 0.095, + "step": 26200 + }, + { + "epoch": 5.0169528245680555, + "grad_norm": 1.4684443473815918, + "learning_rate": 4.766891139636631e-05, + "loss": 0.1103, + "step": 26210 + }, + { + "epoch": 5.017006986946867, + "grad_norm": 0.005878448020666838, + "learning_rate": 4.766590237532121e-05, + "loss": 0.0865, + "step": 26220 + }, + { + "epoch": 5.017061149325678, + "grad_norm": 0.013669965788722038, + "learning_rate": 4.766289335427612e-05, + "loss": 0.0812, + "step": 26230 + }, + { + "epoch": 
5.01711531170449, + "grad_norm": 6.667762756347656, + "learning_rate": 4.765988433323103e-05, + "loss": 0.1504, + "step": 26240 + }, + { + "epoch": 5.0171694740833015, + "grad_norm": 0.2800846993923187, + "learning_rate": 4.765687531218593e-05, + "loss": 0.0161, + "step": 26250 + }, + { + "epoch": 5.017223636462114, + "grad_norm": 0.165145605802536, + "learning_rate": 4.7653866291140845e-05, + "loss": 0.1562, + "step": 26260 + }, + { + "epoch": 5.017277798840925, + "grad_norm": 0.00900730025023222, + "learning_rate": 4.765085727009575e-05, + "loss": 0.0157, + "step": 26270 + }, + { + "epoch": 5.017331961219737, + "grad_norm": 0.02713088132441044, + "learning_rate": 4.764784824905066e-05, + "loss": 0.0228, + "step": 26280 + }, + { + "epoch": 5.017386123598548, + "grad_norm": 0.01783573627471924, + "learning_rate": 4.7644839228005564e-05, + "loss": 0.0748, + "step": 26290 + }, + { + "epoch": 5.01744028597736, + "grad_norm": 0.012673270888626575, + "learning_rate": 4.764183020696047e-05, + "loss": 0.0584, + "step": 26300 + }, + { + "epoch": 5.017494448356172, + "grad_norm": 0.05771521478891373, + "learning_rate": 4.7638821185915376e-05, + "loss": 0.0596, + "step": 26310 + }, + { + "epoch": 5.017548610734983, + "grad_norm": 0.14820700883865356, + "learning_rate": 4.763581216487029e-05, + "loss": 0.1924, + "step": 26320 + }, + { + "epoch": 5.017602773113795, + "grad_norm": 0.008521138690412045, + "learning_rate": 4.763280314382519e-05, + "loss": 0.0838, + "step": 26330 + }, + { + "epoch": 5.017656935492607, + "grad_norm": 0.15519413352012634, + "learning_rate": 4.7629794122780095e-05, + "loss": 0.0931, + "step": 26340 + }, + { + "epoch": 5.017711097871419, + "grad_norm": 0.02063065953552723, + "learning_rate": 4.762678510173501e-05, + "loss": 0.1145, + "step": 26350 + }, + { + "epoch": 5.01776526025023, + "grad_norm": 0.013227908872067928, + "learning_rate": 4.762377608068991e-05, + "loss": 0.04, + "step": 26360 + }, + { + "epoch": 5.017819422629042, + "grad_norm": 0.27403995394706726, + "learning_rate": 4.7620767059644814e-05, + "loss": 0.0338, + "step": 26370 + }, + { + "epoch": 5.0178735850078535, + "grad_norm": 1.4129784107208252, + "learning_rate": 4.761775803859973e-05, + "loss": 0.0108, + "step": 26380 + }, + { + "epoch": 5.017927747386665, + "grad_norm": 0.03841264173388481, + "learning_rate": 4.761474901755463e-05, + "loss": 0.0519, + "step": 26390 + }, + { + "epoch": 5.017981909765477, + "grad_norm": 0.13467177748680115, + "learning_rate": 4.761173999650953e-05, + "loss": 0.1363, + "step": 26400 + }, + { + "epoch": 5.018036072144288, + "grad_norm": 0.2941958010196686, + "learning_rate": 4.7608730975464446e-05, + "loss": 0.034, + "step": 26410 + }, + { + "epoch": 5.0180902345231, + "grad_norm": 0.06898830831050873, + "learning_rate": 4.760572195441935e-05, + "loss": 0.0971, + "step": 26420 + }, + { + "epoch": 5.018144396901912, + "grad_norm": 0.2464761883020401, + "learning_rate": 4.760271293337426e-05, + "loss": 0.0868, + "step": 26430 + }, + { + "epoch": 5.018198559280724, + "grad_norm": 0.3398682475090027, + "learning_rate": 4.7599703912329165e-05, + "loss": 0.0338, + "step": 26440 + }, + { + "epoch": 5.018252721659535, + "grad_norm": 12.83751392364502, + "learning_rate": 4.759669489128407e-05, + "loss": 0.0986, + "step": 26450 + }, + { + "epoch": 5.018306884038347, + "grad_norm": 0.13941989839076996, + "learning_rate": 4.759368587023898e-05, + "loss": 0.0783, + "step": 26460 + }, + { + "epoch": 5.018361046417159, + "grad_norm": 6.777514934539795, + "learning_rate": 
4.759067684919389e-05, + "loss": 0.0632, + "step": 26470 + }, + { + "epoch": 5.01841520879597, + "grad_norm": 0.019531074911355972, + "learning_rate": 4.758766782814879e-05, + "loss": 0.1018, + "step": 26480 + }, + { + "epoch": 5.018469371174782, + "grad_norm": 0.42210790514945984, + "learning_rate": 4.7584658807103696e-05, + "loss": 0.0594, + "step": 26490 + }, + { + "epoch": 5.018523533553593, + "grad_norm": 0.012375107035040855, + "learning_rate": 4.758164978605861e-05, + "loss": 0.1182, + "step": 26500 + }, + { + "epoch": 5.0185776959324055, + "grad_norm": 0.13618142902851105, + "learning_rate": 4.757864076501351e-05, + "loss": 0.1243, + "step": 26510 + }, + { + "epoch": 5.018631858311217, + "grad_norm": 0.6310217380523682, + "learning_rate": 4.757563174396842e-05, + "loss": 0.044, + "step": 26520 + }, + { + "epoch": 5.018686020690029, + "grad_norm": 0.033977020531892776, + "learning_rate": 4.757262272292333e-05, + "loss": 0.2149, + "step": 26530 + }, + { + "epoch": 5.01874018306884, + "grad_norm": 4.737452507019043, + "learning_rate": 4.7569613701878234e-05, + "loss": 0.1122, + "step": 26540 + }, + { + "epoch": 5.0187943454476525, + "grad_norm": 2.1419548988342285, + "learning_rate": 4.756660468083314e-05, + "loss": 0.1306, + "step": 26550 + }, + { + "epoch": 5.018848507826464, + "grad_norm": 5.230221271514893, + "learning_rate": 4.756359565978805e-05, + "loss": 0.0972, + "step": 26560 + }, + { + "epoch": 5.018902670205275, + "grad_norm": 0.11410851776599884, + "learning_rate": 4.756058663874295e-05, + "loss": 0.0386, + "step": 26570 + }, + { + "epoch": 5.018956832584087, + "grad_norm": 0.08211657404899597, + "learning_rate": 4.7557577617697866e-05, + "loss": 0.1279, + "step": 26580 + }, + { + "epoch": 5.0190109949628985, + "grad_norm": 0.012507806532084942, + "learning_rate": 4.7554568596652765e-05, + "loss": 0.0328, + "step": 26590 + }, + { + "epoch": 5.019065157341711, + "grad_norm": 10.47273063659668, + "learning_rate": 4.755155957560767e-05, + "loss": 0.0695, + "step": 26600 + }, + { + "epoch": 5.019119319720522, + "grad_norm": 0.006996387615799904, + "learning_rate": 4.7548550554562585e-05, + "loss": 0.1476, + "step": 26610 + }, + { + "epoch": 5.019173482099334, + "grad_norm": 0.13761992752552032, + "learning_rate": 4.754554153351749e-05, + "loss": 0.2216, + "step": 26620 + }, + { + "epoch": 5.019227644478145, + "grad_norm": 0.5833784937858582, + "learning_rate": 4.754253251247239e-05, + "loss": 0.146, + "step": 26630 + }, + { + "epoch": 5.019281806856958, + "grad_norm": 0.10556651651859283, + "learning_rate": 4.7539523491427304e-05, + "loss": 0.0513, + "step": 26640 + }, + { + "epoch": 5.019335969235769, + "grad_norm": 0.13427098095417023, + "learning_rate": 4.753651447038221e-05, + "loss": 0.1371, + "step": 26650 + }, + { + "epoch": 5.01939013161458, + "grad_norm": 8.228912353515625, + "learning_rate": 4.753350544933711e-05, + "loss": 0.0981, + "step": 26660 + }, + { + "epoch": 5.019444293993392, + "grad_norm": 0.04103434085845947, + "learning_rate": 4.753049642829202e-05, + "loss": 0.1129, + "step": 26670 + }, + { + "epoch": 5.019498456372204, + "grad_norm": 0.7701200842857361, + "learning_rate": 4.752748740724693e-05, + "loss": 0.0555, + "step": 26680 + }, + { + "epoch": 5.019552618751016, + "grad_norm": 0.009225839748978615, + "learning_rate": 4.7524478386201835e-05, + "loss": 0.0131, + "step": 26690 + }, + { + "epoch": 5.019606781129827, + "grad_norm": 0.7400829792022705, + "learning_rate": 4.752146936515674e-05, + "loss": 0.1394, + "step": 26700 + }, + { + "epoch": 
5.019660943508639, + "grad_norm": 4.026261329650879, + "learning_rate": 4.751846034411165e-05, + "loss": 0.096, + "step": 26710 + }, + { + "epoch": 5.0197151058874505, + "grad_norm": 0.39146098494529724, + "learning_rate": 4.7515451323066554e-05, + "loss": 0.0483, + "step": 26720 + }, + { + "epoch": 5.019769268266263, + "grad_norm": 0.8356143236160278, + "learning_rate": 4.751244230202147e-05, + "loss": 0.1126, + "step": 26730 + }, + { + "epoch": 5.019823430645074, + "grad_norm": 0.06516045331954956, + "learning_rate": 4.7509433280976366e-05, + "loss": 0.1545, + "step": 26740 + }, + { + "epoch": 5.019877593023885, + "grad_norm": 0.031146200373768806, + "learning_rate": 4.750642425993127e-05, + "loss": 0.0679, + "step": 26750 + }, + { + "epoch": 5.019931755402697, + "grad_norm": 3.8488540649414062, + "learning_rate": 4.7503415238886186e-05, + "loss": 0.1363, + "step": 26760 + }, + { + "epoch": 5.019985917781509, + "grad_norm": 0.13079802691936493, + "learning_rate": 4.750040621784109e-05, + "loss": 0.0683, + "step": 26770 + }, + { + "epoch": 5.020040080160321, + "grad_norm": 12.568288803100586, + "learning_rate": 4.7497397196796e-05, + "loss": 0.1094, + "step": 26780 + }, + { + "epoch": 5.020094242539132, + "grad_norm": 0.5422631502151489, + "learning_rate": 4.7494388175750905e-05, + "loss": 0.0862, + "step": 26790 + }, + { + "epoch": 5.020148404917944, + "grad_norm": 0.7471862435340881, + "learning_rate": 4.749137915470581e-05, + "loss": 0.1112, + "step": 26800 + }, + { + "epoch": 5.020202567296756, + "grad_norm": 0.03440138325095177, + "learning_rate": 4.748837013366072e-05, + "loss": 0.2116, + "step": 26810 + }, + { + "epoch": 5.020256729675567, + "grad_norm": 0.4361610412597656, + "learning_rate": 4.748536111261562e-05, + "loss": 0.0922, + "step": 26820 + }, + { + "epoch": 5.020310892054379, + "grad_norm": 0.06834501028060913, + "learning_rate": 4.748235209157053e-05, + "loss": 0.1797, + "step": 26830 + }, + { + "epoch": 5.02036505443319, + "grad_norm": 1.3442074060440063, + "learning_rate": 4.747934307052544e-05, + "loss": 0.0523, + "step": 26840 + }, + { + "epoch": 5.0204192168120025, + "grad_norm": 0.05395922437310219, + "learning_rate": 4.747633404948034e-05, + "loss": 0.0136, + "step": 26850 + }, + { + "epoch": 5.020473379190814, + "grad_norm": 0.6298744678497314, + "learning_rate": 4.747332502843525e-05, + "loss": 0.1087, + "step": 26860 + }, + { + "epoch": 5.020527541569626, + "grad_norm": 0.9708184003829956, + "learning_rate": 4.747031600739016e-05, + "loss": 0.076, + "step": 26870 + }, + { + "epoch": 5.020581703948437, + "grad_norm": 0.16954819858074188, + "learning_rate": 4.746730698634507e-05, + "loss": 0.0787, + "step": 26880 + }, + { + "epoch": 5.0206358663272495, + "grad_norm": 0.041150256991386414, + "learning_rate": 4.746429796529997e-05, + "loss": 0.1302, + "step": 26890 + }, + { + "epoch": 5.020690028706061, + "grad_norm": 0.22257453203201294, + "learning_rate": 4.746128894425488e-05, + "loss": 0.092, + "step": 26900 + }, + { + "epoch": 5.020744191084872, + "grad_norm": 0.09307315945625305, + "learning_rate": 4.7458279923209787e-05, + "loss": 0.1803, + "step": 26910 + }, + { + "epoch": 5.020798353463684, + "grad_norm": 0.07322167605161667, + "learning_rate": 4.745527090216469e-05, + "loss": 0.0492, + "step": 26920 + }, + { + "epoch": 5.0208525158424955, + "grad_norm": 6.240938663482666, + "learning_rate": 4.74522618811196e-05, + "loss": 0.0777, + "step": 26930 + }, + { + "epoch": 5.020906678221308, + "grad_norm": 2.7097907066345215, + "learning_rate": 
4.7449252860074505e-05, + "loss": 0.0581, + "step": 26940 + }, + { + "epoch": 5.020960840600119, + "grad_norm": 0.03538663685321808, + "learning_rate": 4.744624383902941e-05, + "loss": 0.0321, + "step": 26950 + }, + { + "epoch": 5.021015002978931, + "grad_norm": 0.1606016904115677, + "learning_rate": 4.744323481798432e-05, + "loss": 0.0499, + "step": 26960 + }, + { + "epoch": 5.021069165357742, + "grad_norm": 7.413328647613525, + "learning_rate": 4.7440225796939224e-05, + "loss": 0.117, + "step": 26970 + }, + { + "epoch": 5.021123327736555, + "grad_norm": 0.22307921946048737, + "learning_rate": 4.743721677589413e-05, + "loss": 0.187, + "step": 26980 + }, + { + "epoch": 5.021177490115366, + "grad_norm": 0.07316253334283829, + "learning_rate": 4.7434207754849044e-05, + "loss": 0.0691, + "step": 26990 + }, + { + "epoch": 5.021231652494177, + "grad_norm": 0.28399553894996643, + "learning_rate": 4.743119873380394e-05, + "loss": 0.0397, + "step": 27000 + }, + { + "epoch": 5.021285814872989, + "grad_norm": 0.07985018938779831, + "learning_rate": 4.742818971275885e-05, + "loss": 0.0512, + "step": 27010 + }, + { + "epoch": 5.021339977251801, + "grad_norm": 6.40956449508667, + "learning_rate": 4.742518069171376e-05, + "loss": 0.0502, + "step": 27020 + }, + { + "epoch": 5.021394139630613, + "grad_norm": 0.024382593110203743, + "learning_rate": 4.742217167066867e-05, + "loss": 0.1012, + "step": 27030 + }, + { + "epoch": 5.021448302009424, + "grad_norm": 0.28924670815467834, + "learning_rate": 4.7419162649623575e-05, + "loss": 0.2883, + "step": 27040 + }, + { + "epoch": 5.021502464388236, + "grad_norm": 0.06553584337234497, + "learning_rate": 4.741615362857848e-05, + "loss": 0.0618, + "step": 27050 + }, + { + "epoch": 5.0215566267670475, + "grad_norm": 0.04493043199181557, + "learning_rate": 4.741314460753339e-05, + "loss": 0.0595, + "step": 27060 + }, + { + "epoch": 5.02161078914586, + "grad_norm": 0.01480347290635109, + "learning_rate": 4.7410135586488294e-05, + "loss": 0.0033, + "step": 27070 + }, + { + "epoch": 5.021664951524671, + "grad_norm": 0.10280214250087738, + "learning_rate": 4.74071265654432e-05, + "loss": 0.1484, + "step": 27080 + }, + { + "epoch": 5.021719113903482, + "grad_norm": 0.026624267920851707, + "learning_rate": 4.7404117544398106e-05, + "loss": 0.0678, + "step": 27090 + }, + { + "epoch": 5.021773276282294, + "grad_norm": 13.2225341796875, + "learning_rate": 4.740110852335302e-05, + "loss": 0.1831, + "step": 27100 + }, + { + "epoch": 5.021827438661106, + "grad_norm": 0.18881559371948242, + "learning_rate": 4.739809950230792e-05, + "loss": 0.1286, + "step": 27110 + }, + { + "epoch": 5.021881601039918, + "grad_norm": 0.2695063650608063, + "learning_rate": 4.7395090481262825e-05, + "loss": 0.1113, + "step": 27120 + }, + { + "epoch": 5.021935763418729, + "grad_norm": 0.33242687582969666, + "learning_rate": 4.739208146021774e-05, + "loss": 0.0486, + "step": 27130 + }, + { + "epoch": 5.021989925797541, + "grad_norm": 0.20437389612197876, + "learning_rate": 4.7389072439172644e-05, + "loss": 0.1077, + "step": 27140 + }, + { + "epoch": 5.022044088176353, + "grad_norm": 0.013139230199158192, + "learning_rate": 4.7386063418127544e-05, + "loss": 0.0939, + "step": 27150 + }, + { + "epoch": 5.022098250555165, + "grad_norm": 5.682295322418213, + "learning_rate": 4.738305439708246e-05, + "loss": 0.3105, + "step": 27160 + }, + { + "epoch": 5.022152412933976, + "grad_norm": 0.0033552844543009996, + "learning_rate": 4.738004537603736e-05, + "loss": 0.0069, + "step": 27170 + }, + { + "epoch": 
5.022206575312787, + "grad_norm": 0.531399667263031, + "learning_rate": 4.737703635499227e-05, + "loss": 0.1728, + "step": 27180 + }, + { + "epoch": 5.0222607376915995, + "grad_norm": 14.020809173583984, + "learning_rate": 4.7374027333947176e-05, + "loss": 0.1783, + "step": 27190 + }, + { + "epoch": 5.022314900070411, + "grad_norm": 0.02602389268577099, + "learning_rate": 4.737101831290208e-05, + "loss": 0.1091, + "step": 27200 + }, + { + "epoch": 5.022369062449223, + "grad_norm": 3.1854424476623535, + "learning_rate": 4.736800929185699e-05, + "loss": 0.1148, + "step": 27210 + }, + { + "epoch": 5.022423224828034, + "grad_norm": 0.0419287271797657, + "learning_rate": 4.73650002708119e-05, + "loss": 0.0616, + "step": 27220 + }, + { + "epoch": 5.0224773872068464, + "grad_norm": 0.7649651765823364, + "learning_rate": 4.73619912497668e-05, + "loss": 0.1673, + "step": 27230 + }, + { + "epoch": 5.022531549585658, + "grad_norm": 4.384261131286621, + "learning_rate": 4.735898222872171e-05, + "loss": 0.1298, + "step": 27240 + }, + { + "epoch": 5.02258571196447, + "grad_norm": 3.451000452041626, + "learning_rate": 4.735597320767662e-05, + "loss": 0.1087, + "step": 27250 + }, + { + "epoch": 5.022639874343281, + "grad_norm": 7.1823272705078125, + "learning_rate": 4.735296418663152e-05, + "loss": 0.0678, + "step": 27260 + }, + { + "epoch": 5.0226940367220925, + "grad_norm": 0.19879400730133057, + "learning_rate": 4.7349955165586426e-05, + "loss": 0.0487, + "step": 27270 + }, + { + "epoch": 5.022748199100905, + "grad_norm": 0.1450985223054886, + "learning_rate": 4.734694614454134e-05, + "loss": 0.1572, + "step": 27280 + }, + { + "epoch": 5.022802361479716, + "grad_norm": 11.314007759094238, + "learning_rate": 4.7343937123496245e-05, + "loss": 0.1172, + "step": 27290 + }, + { + "epoch": 5.022856523858528, + "grad_norm": 0.9500318169593811, + "learning_rate": 4.734092810245115e-05, + "loss": 0.087, + "step": 27300 + }, + { + "epoch": 5.022910686237339, + "grad_norm": 4.428118705749512, + "learning_rate": 4.733791908140606e-05, + "loss": 0.139, + "step": 27310 + }, + { + "epoch": 5.022964848616152, + "grad_norm": 0.30673494935035706, + "learning_rate": 4.7334910060360964e-05, + "loss": 0.1613, + "step": 27320 + }, + { + "epoch": 5.023019010994963, + "grad_norm": 0.008107736706733704, + "learning_rate": 4.733190103931587e-05, + "loss": 0.0485, + "step": 27330 + }, + { + "epoch": 5.023073173373775, + "grad_norm": 4.387223243713379, + "learning_rate": 4.732889201827078e-05, + "loss": 0.1252, + "step": 27340 + }, + { + "epoch": 5.023127335752586, + "grad_norm": 0.1252613365650177, + "learning_rate": 4.732588299722568e-05, + "loss": 0.1797, + "step": 27350 + }, + { + "epoch": 5.023181498131398, + "grad_norm": 6.3949785232543945, + "learning_rate": 4.7322873976180596e-05, + "loss": 0.197, + "step": 27360 + }, + { + "epoch": 5.02323566051021, + "grad_norm": 0.3661939203739166, + "learning_rate": 4.73198649551355e-05, + "loss": 0.0746, + "step": 27370 + }, + { + "epoch": 5.023289822889021, + "grad_norm": 0.017780618742108345, + "learning_rate": 4.73168559340904e-05, + "loss": 0.2482, + "step": 27380 + }, + { + "epoch": 5.023343985267833, + "grad_norm": 1.454431414604187, + "learning_rate": 4.7313846913045315e-05, + "loss": 0.0466, + "step": 27390 + }, + { + "epoch": 5.0233981476466445, + "grad_norm": 3.4663474559783936, + "learning_rate": 4.731083789200022e-05, + "loss": 0.172, + "step": 27400 + }, + { + "epoch": 5.023452310025457, + "grad_norm": 1.4900226593017578, + "learning_rate": 4.730782887095512e-05, + 
"loss": 0.0585, + "step": 27410 + }, + { + "epoch": 5.023506472404268, + "grad_norm": 0.023262908682227135, + "learning_rate": 4.7304819849910034e-05, + "loss": 0.039, + "step": 27420 + }, + { + "epoch": 5.023560634783079, + "grad_norm": 0.20138230919837952, + "learning_rate": 4.730181082886494e-05, + "loss": 0.0791, + "step": 27430 + }, + { + "epoch": 5.023614797161891, + "grad_norm": 0.11377330869436264, + "learning_rate": 4.7298801807819846e-05, + "loss": 0.1104, + "step": 27440 + }, + { + "epoch": 5.023668959540703, + "grad_norm": 1.4885822534561157, + "learning_rate": 4.729579278677475e-05, + "loss": 0.259, + "step": 27450 + }, + { + "epoch": 5.023723121919515, + "grad_norm": 18.524150848388672, + "learning_rate": 4.729278376572966e-05, + "loss": 0.0575, + "step": 27460 + }, + { + "epoch": 5.023777284298326, + "grad_norm": 1.1890497207641602, + "learning_rate": 4.7289774744684565e-05, + "loss": 0.022, + "step": 27470 + }, + { + "epoch": 5.023831446677138, + "grad_norm": 1.6389949321746826, + "learning_rate": 4.728676572363948e-05, + "loss": 0.1138, + "step": 27480 + }, + { + "epoch": 5.02388560905595, + "grad_norm": 0.6861257553100586, + "learning_rate": 4.728375670259438e-05, + "loss": 0.0712, + "step": 27490 + }, + { + "epoch": 5.023939771434762, + "grad_norm": 0.01762770488858223, + "learning_rate": 4.7280747681549284e-05, + "loss": 0.0538, + "step": 27500 + }, + { + "epoch": 5.023993933813573, + "grad_norm": 0.2669026553630829, + "learning_rate": 4.72777386605042e-05, + "loss": 0.1096, + "step": 27510 + }, + { + "epoch": 5.024048096192384, + "grad_norm": 0.14625681936740875, + "learning_rate": 4.72747296394591e-05, + "loss": 0.0381, + "step": 27520 + }, + { + "epoch": 5.0241022585711965, + "grad_norm": 0.01016173418611288, + "learning_rate": 4.7271720618414e-05, + "loss": 0.0869, + "step": 27530 + }, + { + "epoch": 5.024156420950008, + "grad_norm": 0.258146733045578, + "learning_rate": 4.7268711597368916e-05, + "loss": 0.1735, + "step": 27540 + }, + { + "epoch": 5.02421058332882, + "grad_norm": 1.83601975440979, + "learning_rate": 4.726570257632382e-05, + "loss": 0.1068, + "step": 27550 + }, + { + "epoch": 5.024264745707631, + "grad_norm": 0.005431958008557558, + "learning_rate": 4.726269355527873e-05, + "loss": 0.0749, + "step": 27560 + }, + { + "epoch": 5.024318908086443, + "grad_norm": 0.07325197756290436, + "learning_rate": 4.7259684534233635e-05, + "loss": 0.0449, + "step": 27570 + }, + { + "epoch": 5.024373070465255, + "grad_norm": 0.10079021006822586, + "learning_rate": 4.725667551318854e-05, + "loss": 0.0575, + "step": 27580 + }, + { + "epoch": 5.024427232844067, + "grad_norm": 4.238770961761475, + "learning_rate": 4.725366649214345e-05, + "loss": 0.085, + "step": 27590 + }, + { + "epoch": 5.024481395222878, + "grad_norm": 2.2579233646392822, + "learning_rate": 4.7250657471098353e-05, + "loss": 0.1192, + "step": 27600 + }, + { + "epoch": 5.0245355576016895, + "grad_norm": 0.436627596616745, + "learning_rate": 4.724764845005326e-05, + "loss": 0.1169, + "step": 27610 + }, + { + "epoch": 5.024589719980502, + "grad_norm": 0.8434627056121826, + "learning_rate": 4.724463942900817e-05, + "loss": 0.0961, + "step": 27620 + }, + { + "epoch": 5.024643882359313, + "grad_norm": 0.5407119393348694, + "learning_rate": 4.724163040796308e-05, + "loss": 0.1209, + "step": 27630 + }, + { + "epoch": 5.024698044738125, + "grad_norm": 0.7032239437103271, + "learning_rate": 4.723862138691798e-05, + "loss": 0.0291, + "step": 27640 + }, + { + "epoch": 5.024752207116936, + "grad_norm": 
4.592583656311035, + "learning_rate": 4.723561236587289e-05, + "loss": 0.1747, + "step": 27650 + }, + { + "epoch": 5.0248063694957485, + "grad_norm": 0.2450110912322998, + "learning_rate": 4.72326033448278e-05, + "loss": 0.1072, + "step": 27660 + }, + { + "epoch": 5.02486053187456, + "grad_norm": 1.7323665618896484, + "learning_rate": 4.7229594323782704e-05, + "loss": 0.0423, + "step": 27670 + }, + { + "epoch": 5.024914694253372, + "grad_norm": 0.00534079410135746, + "learning_rate": 4.722658530273761e-05, + "loss": 0.1297, + "step": 27680 + }, + { + "epoch": 5.024968856632183, + "grad_norm": 0.003143110778182745, + "learning_rate": 4.722357628169252e-05, + "loss": 0.0668, + "step": 27690 + }, + { + "epoch": 5.02500135405947, + "eval_accuracy": 0.8053559764859569, + "eval_loss": 0.7014486193656921, + "eval_runtime": 116.8048, + "eval_samples_per_second": 26.215, + "eval_steps_per_second": 3.279, + "step": 27696 + }, + { + "epoch": 6.000021664951524, + "grad_norm": 0.009593755938112736, + "learning_rate": 4.722056726064742e-05, + "loss": 0.0666, + "step": 27700 + }, + { + "epoch": 6.0000758273303365, + "grad_norm": 0.011405020952224731, + "learning_rate": 4.721755823960233e-05, + "loss": 0.1147, + "step": 27710 + }, + { + "epoch": 6.000129989709148, + "grad_norm": 0.1796988844871521, + "learning_rate": 4.7214549218557235e-05, + "loss": 0.0978, + "step": 27720 + }, + { + "epoch": 6.00018415208796, + "grad_norm": 0.012713583186268806, + "learning_rate": 4.721154019751214e-05, + "loss": 0.0808, + "step": 27730 + }, + { + "epoch": 6.000238314466771, + "grad_norm": 0.6441027522087097, + "learning_rate": 4.7208531176467055e-05, + "loss": 0.0238, + "step": 27740 + }, + { + "epoch": 6.000292476845583, + "grad_norm": 0.24530960619449615, + "learning_rate": 4.7205522155421954e-05, + "loss": 0.0079, + "step": 27750 + }, + { + "epoch": 6.000346639224395, + "grad_norm": 0.0027564875781536102, + "learning_rate": 4.720251313437686e-05, + "loss": 0.138, + "step": 27760 + }, + { + "epoch": 6.000400801603206, + "grad_norm": 0.2671341598033905, + "learning_rate": 4.7199504113331774e-05, + "loss": 0.2903, + "step": 27770 + }, + { + "epoch": 6.000454963982018, + "grad_norm": 0.06109284982085228, + "learning_rate": 4.719649509228668e-05, + "loss": 0.0814, + "step": 27780 + }, + { + "epoch": 6.0005091263608294, + "grad_norm": 0.06835351139307022, + "learning_rate": 4.719348607124158e-05, + "loss": 0.1839, + "step": 27790 + }, + { + "epoch": 6.000563288739642, + "grad_norm": 0.09606810659170151, + "learning_rate": 4.719047705019649e-05, + "loss": 0.0143, + "step": 27800 + }, + { + "epoch": 6.000617451118453, + "grad_norm": 0.8674576878547668, + "learning_rate": 4.71874680291514e-05, + "loss": 0.0133, + "step": 27810 + }, + { + "epoch": 6.000671613497265, + "grad_norm": 0.033472780138254166, + "learning_rate": 4.7184459008106305e-05, + "loss": 0.0048, + "step": 27820 + }, + { + "epoch": 6.000725775876076, + "grad_norm": 0.0886915773153305, + "learning_rate": 4.718144998706121e-05, + "loss": 0.1446, + "step": 27830 + }, + { + "epoch": 6.0007799382548885, + "grad_norm": 0.04116635397076607, + "learning_rate": 4.717844096601612e-05, + "loss": 0.0079, + "step": 27840 + }, + { + "epoch": 6.0008341006337, + "grad_norm": 0.01195868942886591, + "learning_rate": 4.7175431944971024e-05, + "loss": 0.0218, + "step": 27850 + }, + { + "epoch": 6.000888263012511, + "grad_norm": 4.263750076293945, + "learning_rate": 4.717242292392593e-05, + "loss": 0.1849, + "step": 27860 + }, + { + "epoch": 6.000942425391323, + "grad_norm": 
0.008663957007229328, + "learning_rate": 4.7169413902880836e-05, + "loss": 0.0804, + "step": 27870 + }, + { + "epoch": 6.000996587770135, + "grad_norm": 0.05425834283232689, + "learning_rate": 4.716640488183575e-05, + "loss": 0.1201, + "step": 27880 + }, + { + "epoch": 6.001050750148947, + "grad_norm": 0.018609989434480667, + "learning_rate": 4.7163395860790656e-05, + "loss": 0.0749, + "step": 27890 + }, + { + "epoch": 6.001104912527758, + "grad_norm": 6.284837245941162, + "learning_rate": 4.7160386839745555e-05, + "loss": 0.1159, + "step": 27900 + }, + { + "epoch": 6.00115907490657, + "grad_norm": 0.03773236647248268, + "learning_rate": 4.715737781870047e-05, + "loss": 0.0374, + "step": 27910 + }, + { + "epoch": 6.0012132372853815, + "grad_norm": 0.08810388296842575, + "learning_rate": 4.7154368797655375e-05, + "loss": 0.1239, + "step": 27920 + }, + { + "epoch": 6.001267399664194, + "grad_norm": 0.03509538993239403, + "learning_rate": 4.715135977661028e-05, + "loss": 0.0285, + "step": 27930 + }, + { + "epoch": 6.001321562043005, + "grad_norm": 0.026540884748101234, + "learning_rate": 4.714835075556519e-05, + "loss": 0.0067, + "step": 27940 + }, + { + "epoch": 6.001375724421816, + "grad_norm": 0.05340982973575592, + "learning_rate": 4.714534173452009e-05, + "loss": 0.0671, + "step": 27950 + }, + { + "epoch": 6.001429886800628, + "grad_norm": 10.257699012756348, + "learning_rate": 4.7142332713475e-05, + "loss": 0.3031, + "step": 27960 + }, + { + "epoch": 6.00148404917944, + "grad_norm": 0.12752866744995117, + "learning_rate": 4.713932369242991e-05, + "loss": 0.0342, + "step": 27970 + }, + { + "epoch": 6.001538211558252, + "grad_norm": 0.06842093914747238, + "learning_rate": 4.713631467138481e-05, + "loss": 0.0925, + "step": 27980 + }, + { + "epoch": 6.001592373937063, + "grad_norm": 6.7743964195251465, + "learning_rate": 4.713330565033972e-05, + "loss": 0.038, + "step": 27990 + }, + { + "epoch": 6.001646536315875, + "grad_norm": 0.4098590612411499, + "learning_rate": 4.713029662929463e-05, + "loss": 0.0062, + "step": 28000 + }, + { + "epoch": 6.001700698694687, + "grad_norm": 0.06962278485298157, + "learning_rate": 4.712728760824953e-05, + "loss": 0.1151, + "step": 28010 + }, + { + "epoch": 6.001754861073499, + "grad_norm": 0.06788711249828339, + "learning_rate": 4.712427858720444e-05, + "loss": 0.2183, + "step": 28020 + }, + { + "epoch": 6.00180902345231, + "grad_norm": 1.9502683877944946, + "learning_rate": 4.712126956615935e-05, + "loss": 0.0897, + "step": 28030 + }, + { + "epoch": 6.001863185831121, + "grad_norm": 0.046824291348457336, + "learning_rate": 4.7118260545114257e-05, + "loss": 0.0715, + "step": 28040 + }, + { + "epoch": 6.0019173482099335, + "grad_norm": 0.9247515201568604, + "learning_rate": 4.7115251524069156e-05, + "loss": 0.0526, + "step": 28050 + }, + { + "epoch": 6.001971510588745, + "grad_norm": 4.865513801574707, + "learning_rate": 4.711224250302407e-05, + "loss": 0.0448, + "step": 28060 + }, + { + "epoch": 6.002025672967557, + "grad_norm": 0.05693408101797104, + "learning_rate": 4.7109233481978975e-05, + "loss": 0.0984, + "step": 28070 + }, + { + "epoch": 6.002079835346368, + "grad_norm": 0.06922930479049683, + "learning_rate": 4.710622446093388e-05, + "loss": 0.0986, + "step": 28080 + }, + { + "epoch": 6.00213399772518, + "grad_norm": 0.019369961693882942, + "learning_rate": 4.710321543988879e-05, + "loss": 0.0655, + "step": 28090 + }, + { + "epoch": 6.002188160103992, + "grad_norm": 0.0019936927128583193, + "learning_rate": 4.7100206418843694e-05, + "loss": 0.0664, 
+ "step": 28100 + }, + { + "epoch": 6.002242322482804, + "grad_norm": 0.10955397039651871, + "learning_rate": 4.70971973977986e-05, + "loss": 0.0125, + "step": 28110 + }, + { + "epoch": 6.002296484861615, + "grad_norm": 0.1353255957365036, + "learning_rate": 4.7094188376753514e-05, + "loss": 0.1717, + "step": 28120 + }, + { + "epoch": 6.002350647240426, + "grad_norm": 2.358855962753296, + "learning_rate": 4.709117935570841e-05, + "loss": 0.163, + "step": 28130 + }, + { + "epoch": 6.002404809619239, + "grad_norm": 0.2342577874660492, + "learning_rate": 4.7088170334663326e-05, + "loss": 0.0411, + "step": 28140 + }, + { + "epoch": 6.00245897199805, + "grad_norm": 0.11165423691272736, + "learning_rate": 4.708516131361823e-05, + "loss": 0.0817, + "step": 28150 + }, + { + "epoch": 6.002513134376862, + "grad_norm": 0.0026985586155205965, + "learning_rate": 4.708215229257313e-05, + "loss": 0.0338, + "step": 28160 + }, + { + "epoch": 6.002567296755673, + "grad_norm": 9.120376586914062, + "learning_rate": 4.7079143271528045e-05, + "loss": 0.0767, + "step": 28170 + }, + { + "epoch": 6.0026214591344855, + "grad_norm": 0.07890775054693222, + "learning_rate": 4.707613425048295e-05, + "loss": 0.1241, + "step": 28180 + }, + { + "epoch": 6.002675621513297, + "grad_norm": 0.30059802532196045, + "learning_rate": 4.707312522943786e-05, + "loss": 0.1511, + "step": 28190 + }, + { + "epoch": 6.002729783892109, + "grad_norm": 1.9856221675872803, + "learning_rate": 4.7070116208392764e-05, + "loss": 0.1931, + "step": 28200 + }, + { + "epoch": 6.00278394627092, + "grad_norm": 0.11005580425262451, + "learning_rate": 4.706710718734767e-05, + "loss": 0.0374, + "step": 28210 + }, + { + "epoch": 6.0028381086497316, + "grad_norm": 0.3030928671360016, + "learning_rate": 4.7064098166302576e-05, + "loss": 0.0729, + "step": 28220 + }, + { + "epoch": 6.002892271028544, + "grad_norm": 3.541085720062256, + "learning_rate": 4.706108914525749e-05, + "loss": 0.0608, + "step": 28230 + }, + { + "epoch": 6.002946433407355, + "grad_norm": 0.009025284089148045, + "learning_rate": 4.705808012421239e-05, + "loss": 0.058, + "step": 28240 + }, + { + "epoch": 6.003000595786167, + "grad_norm": 0.03495163843035698, + "learning_rate": 4.7055071103167295e-05, + "loss": 0.0056, + "step": 28250 + }, + { + "epoch": 6.0030547581649785, + "grad_norm": 0.05633912608027458, + "learning_rate": 4.705206208212221e-05, + "loss": 0.0012, + "step": 28260 + }, + { + "epoch": 6.003108920543791, + "grad_norm": 0.2999371886253357, + "learning_rate": 4.7049053061077114e-05, + "loss": 0.0612, + "step": 28270 + }, + { + "epoch": 6.003163082922602, + "grad_norm": 0.007837432436645031, + "learning_rate": 4.7046044040032014e-05, + "loss": 0.0031, + "step": 28280 + }, + { + "epoch": 6.003217245301413, + "grad_norm": 0.007028015796095133, + "learning_rate": 4.704303501898693e-05, + "loss": 0.1106, + "step": 28290 + }, + { + "epoch": 6.003271407680225, + "grad_norm": 39.100460052490234, + "learning_rate": 4.704002599794183e-05, + "loss": 0.0667, + "step": 28300 + }, + { + "epoch": 6.003325570059037, + "grad_norm": 0.2582971751689911, + "learning_rate": 4.703701697689673e-05, + "loss": 0.0761, + "step": 28310 + }, + { + "epoch": 6.003379732437849, + "grad_norm": 0.08324125409126282, + "learning_rate": 4.7034007955851646e-05, + "loss": 0.0216, + "step": 28320 + }, + { + "epoch": 6.00343389481666, + "grad_norm": 0.17853522300720215, + "learning_rate": 4.703099893480655e-05, + "loss": 0.0636, + "step": 28330 + }, + { + "epoch": 6.003488057195472, + "grad_norm": 
0.4627106189727783, + "learning_rate": 4.702798991376146e-05, + "loss": 0.172, + "step": 28340 + }, + { + "epoch": 6.003542219574284, + "grad_norm": 0.0028027526568621397, + "learning_rate": 4.7024980892716365e-05, + "loss": 0.1886, + "step": 28350 + }, + { + "epoch": 6.003596381953096, + "grad_norm": 0.054296836256980896, + "learning_rate": 4.702197187167127e-05, + "loss": 0.0221, + "step": 28360 + }, + { + "epoch": 6.003650544331907, + "grad_norm": 0.565324604511261, + "learning_rate": 4.701896285062618e-05, + "loss": 0.0535, + "step": 28370 + }, + { + "epoch": 6.003704706710718, + "grad_norm": 0.15401998162269592, + "learning_rate": 4.701595382958109e-05, + "loss": 0.0283, + "step": 28380 + }, + { + "epoch": 6.0037588690895305, + "grad_norm": 0.002383455168455839, + "learning_rate": 4.701294480853599e-05, + "loss": 0.0241, + "step": 28390 + }, + { + "epoch": 6.003813031468342, + "grad_norm": 7.675652980804443, + "learning_rate": 4.70099357874909e-05, + "loss": 0.194, + "step": 28400 + }, + { + "epoch": 6.003867193847154, + "grad_norm": 0.009206882677972317, + "learning_rate": 4.700692676644581e-05, + "loss": 0.0895, + "step": 28410 + }, + { + "epoch": 6.003921356225965, + "grad_norm": 0.286598801612854, + "learning_rate": 4.7003917745400715e-05, + "loss": 0.0138, + "step": 28420 + }, + { + "epoch": 6.003975518604777, + "grad_norm": 0.0469852089881897, + "learning_rate": 4.700090872435562e-05, + "loss": 0.0127, + "step": 28430 + }, + { + "epoch": 6.004029680983589, + "grad_norm": 0.7227616906166077, + "learning_rate": 4.699789970331053e-05, + "loss": 0.1157, + "step": 28440 + }, + { + "epoch": 6.004083843362401, + "grad_norm": 0.10403361916542053, + "learning_rate": 4.6994890682265434e-05, + "loss": 0.0907, + "step": 28450 + }, + { + "epoch": 6.004138005741212, + "grad_norm": 2.1077396869659424, + "learning_rate": 4.699188166122034e-05, + "loss": 0.0577, + "step": 28460 + }, + { + "epoch": 6.004192168120023, + "grad_norm": 0.09736678004264832, + "learning_rate": 4.698887264017525e-05, + "loss": 0.1573, + "step": 28470 + }, + { + "epoch": 6.004246330498836, + "grad_norm": 7.128440856933594, + "learning_rate": 4.698586361913015e-05, + "loss": 0.1285, + "step": 28480 + }, + { + "epoch": 6.004300492877647, + "grad_norm": 0.13448482751846313, + "learning_rate": 4.6982854598085066e-05, + "loss": 0.2016, + "step": 28490 + }, + { + "epoch": 6.004354655256459, + "grad_norm": 0.5527577996253967, + "learning_rate": 4.6979845577039966e-05, + "loss": 0.1003, + "step": 28500 + }, + { + "epoch": 6.00440881763527, + "grad_norm": 4.456030368804932, + "learning_rate": 4.697683655599487e-05, + "loss": 0.0423, + "step": 28510 + }, + { + "epoch": 6.0044629800140825, + "grad_norm": 0.023760374635457993, + "learning_rate": 4.6973827534949785e-05, + "loss": 0.1997, + "step": 28520 + }, + { + "epoch": 6.004517142392894, + "grad_norm": 1.8514606952667236, + "learning_rate": 4.697081851390469e-05, + "loss": 0.0417, + "step": 28530 + }, + { + "epoch": 6.004571304771706, + "grad_norm": 0.005783025175333023, + "learning_rate": 4.696780949285959e-05, + "loss": 0.1068, + "step": 28540 + }, + { + "epoch": 6.004625467150517, + "grad_norm": 3.7566959857940674, + "learning_rate": 4.6964800471814504e-05, + "loss": 0.1177, + "step": 28550 + }, + { + "epoch": 6.0046796295293285, + "grad_norm": 0.13817548751831055, + "learning_rate": 4.696179145076941e-05, + "loss": 0.0377, + "step": 28560 + }, + { + "epoch": 6.004733791908141, + "grad_norm": 0.035960495471954346, + "learning_rate": 4.6958782429724316e-05, + "loss": 0.0518, + 
"step": 28570 + }, + { + "epoch": 6.004787954286952, + "grad_norm": 0.22458863258361816, + "learning_rate": 4.695577340867922e-05, + "loss": 0.0136, + "step": 28580 + }, + { + "epoch": 6.004842116665764, + "grad_norm": 0.0011057875817641616, + "learning_rate": 4.695276438763413e-05, + "loss": 0.0057, + "step": 28590 + }, + { + "epoch": 6.0048962790445755, + "grad_norm": 0.019661936908960342, + "learning_rate": 4.6949755366589035e-05, + "loss": 0.1438, + "step": 28600 + }, + { + "epoch": 6.004950441423388, + "grad_norm": 0.007720041088759899, + "learning_rate": 4.694674634554394e-05, + "loss": 0.058, + "step": 28610 + }, + { + "epoch": 6.005004603802199, + "grad_norm": 0.021563459187746048, + "learning_rate": 4.694373732449885e-05, + "loss": 0.033, + "step": 28620 + }, + { + "epoch": 6.005058766181011, + "grad_norm": 0.08267473429441452, + "learning_rate": 4.6940728303453754e-05, + "loss": 0.0826, + "step": 28630 + }, + { + "epoch": 6.005112928559822, + "grad_norm": 0.10793288797140121, + "learning_rate": 4.693771928240867e-05, + "loss": 0.133, + "step": 28640 + }, + { + "epoch": 6.005167090938634, + "grad_norm": 0.017502907663583755, + "learning_rate": 4.6934710261363566e-05, + "loss": 0.0631, + "step": 28650 + }, + { + "epoch": 6.005221253317446, + "grad_norm": 0.27187401056289673, + "learning_rate": 4.693170124031848e-05, + "loss": 0.1999, + "step": 28660 + }, + { + "epoch": 6.005275415696257, + "grad_norm": 0.033183805644512177, + "learning_rate": 4.6928692219273386e-05, + "loss": 0.0717, + "step": 28670 + }, + { + "epoch": 6.005329578075069, + "grad_norm": 1.5697919130325317, + "learning_rate": 4.692568319822829e-05, + "loss": 0.07, + "step": 28680 + }, + { + "epoch": 6.005383740453881, + "grad_norm": 0.013155700638890266, + "learning_rate": 4.69226741771832e-05, + "loss": 0.0208, + "step": 28690 + }, + { + "epoch": 6.005437902832693, + "grad_norm": 1.0484308004379272, + "learning_rate": 4.6919665156138105e-05, + "loss": 0.0834, + "step": 28700 + }, + { + "epoch": 6.005492065211504, + "grad_norm": 0.026368271559476852, + "learning_rate": 4.691665613509301e-05, + "loss": 0.1546, + "step": 28710 + }, + { + "epoch": 6.005546227590316, + "grad_norm": 0.004287156276404858, + "learning_rate": 4.6913647114047924e-05, + "loss": 0.093, + "step": 28720 + }, + { + "epoch": 6.0056003899691275, + "grad_norm": 1.2881956100463867, + "learning_rate": 4.6910638093002823e-05, + "loss": 0.1442, + "step": 28730 + }, + { + "epoch": 6.005654552347939, + "grad_norm": 0.6961562633514404, + "learning_rate": 4.690762907195773e-05, + "loss": 0.0453, + "step": 28740 + }, + { + "epoch": 6.005708714726751, + "grad_norm": 2.8525185585021973, + "learning_rate": 4.690462005091264e-05, + "loss": 0.123, + "step": 28750 + }, + { + "epoch": 6.005762877105562, + "grad_norm": 0.14435435831546783, + "learning_rate": 4.690161102986754e-05, + "loss": 0.071, + "step": 28760 + }, + { + "epoch": 6.005817039484374, + "grad_norm": 0.18481141328811646, + "learning_rate": 4.689860200882245e-05, + "loss": 0.107, + "step": 28770 + }, + { + "epoch": 6.005871201863186, + "grad_norm": 0.013959224335849285, + "learning_rate": 4.689559298777736e-05, + "loss": 0.0268, + "step": 28780 + }, + { + "epoch": 6.005925364241998, + "grad_norm": 0.0035628078039735556, + "learning_rate": 4.689258396673227e-05, + "loss": 0.1109, + "step": 28790 + }, + { + "epoch": 6.005979526620809, + "grad_norm": 1.8092424869537354, + "learning_rate": 4.688957494568717e-05, + "loss": 0.0639, + "step": 28800 + }, + { + "epoch": 6.00603368899962, + "grad_norm": 
0.06156101077795029, + "learning_rate": 4.688656592464208e-05, + "loss": 0.0643, + "step": 28810 + }, + { + "epoch": 6.006087851378433, + "grad_norm": 0.14955313503742218, + "learning_rate": 4.688355690359699e-05, + "loss": 0.0864, + "step": 28820 + }, + { + "epoch": 6.006142013757244, + "grad_norm": 0.11598925292491913, + "learning_rate": 4.688054788255189e-05, + "loss": 0.0682, + "step": 28830 + }, + { + "epoch": 6.006196176136056, + "grad_norm": 3.749974012374878, + "learning_rate": 4.68775388615068e-05, + "loss": 0.081, + "step": 28840 + }, + { + "epoch": 6.006250338514867, + "grad_norm": 0.07641590386629105, + "learning_rate": 4.6874529840461706e-05, + "loss": 0.0303, + "step": 28850 + }, + { + "epoch": 6.0063045008936795, + "grad_norm": 0.40287405252456665, + "learning_rate": 4.687152081941661e-05, + "loss": 0.0647, + "step": 28860 + }, + { + "epoch": 6.006358663272491, + "grad_norm": 0.03583931922912598, + "learning_rate": 4.6868511798371525e-05, + "loss": 0.1223, + "step": 28870 + }, + { + "epoch": 6.006412825651303, + "grad_norm": 0.4059659242630005, + "learning_rate": 4.6865502777326424e-05, + "loss": 0.161, + "step": 28880 + }, + { + "epoch": 6.006466988030114, + "grad_norm": 1.4167745113372803, + "learning_rate": 4.686249375628133e-05, + "loss": 0.1087, + "step": 28890 + }, + { + "epoch": 6.0065211504089255, + "grad_norm": 0.10919767618179321, + "learning_rate": 4.6859484735236244e-05, + "loss": 0.0547, + "step": 28900 + }, + { + "epoch": 6.006575312787738, + "grad_norm": 0.33511975407600403, + "learning_rate": 4.685647571419114e-05, + "loss": 0.0142, + "step": 28910 + }, + { + "epoch": 6.006629475166549, + "grad_norm": 0.31434279680252075, + "learning_rate": 4.6853466693146056e-05, + "loss": 0.0895, + "step": 28920 + }, + { + "epoch": 6.006683637545361, + "grad_norm": 0.03759273141622543, + "learning_rate": 4.685045767210096e-05, + "loss": 0.117, + "step": 28930 + }, + { + "epoch": 6.0067377999241724, + "grad_norm": 0.27144190669059753, + "learning_rate": 4.684744865105587e-05, + "loss": 0.0213, + "step": 28940 + }, + { + "epoch": 6.006791962302985, + "grad_norm": 0.040153734385967255, + "learning_rate": 4.6844439630010775e-05, + "loss": 0.0921, + "step": 28950 + }, + { + "epoch": 6.006846124681796, + "grad_norm": 9.79482364654541, + "learning_rate": 4.684143060896568e-05, + "loss": 0.0781, + "step": 28960 + }, + { + "epoch": 6.006900287060608, + "grad_norm": 0.4171457588672638, + "learning_rate": 4.683842158792059e-05, + "loss": 0.0039, + "step": 28970 + }, + { + "epoch": 6.006954449439419, + "grad_norm": 0.0019550842698663473, + "learning_rate": 4.68354125668755e-05, + "loss": 0.1771, + "step": 28980 + }, + { + "epoch": 6.007008611818231, + "grad_norm": 0.004162718076258898, + "learning_rate": 4.68324035458304e-05, + "loss": 0.0795, + "step": 28990 + }, + { + "epoch": 6.007062774197043, + "grad_norm": 4.106736183166504, + "learning_rate": 4.6829394524785306e-05, + "loss": 0.0658, + "step": 29000 + }, + { + "epoch": 6.007116936575854, + "grad_norm": 2.9049391746520996, + "learning_rate": 4.682638550374022e-05, + "loss": 0.11, + "step": 29010 + }, + { + "epoch": 6.007171098954666, + "grad_norm": 2.58010196685791, + "learning_rate": 4.6823376482695126e-05, + "loss": 0.029, + "step": 29020 + }, + { + "epoch": 6.007225261333478, + "grad_norm": 0.0009815533412620425, + "learning_rate": 4.6820367461650025e-05, + "loss": 0.0245, + "step": 29030 + }, + { + "epoch": 6.00727942371229, + "grad_norm": 0.007972851395606995, + "learning_rate": 4.681735844060494e-05, + "loss": 0.2619, + 
"step": 29040 + }, + { + "epoch": 6.007333586091101, + "grad_norm": 0.006909970659762621, + "learning_rate": 4.6814349419559845e-05, + "loss": 0.1104, + "step": 29050 + }, + { + "epoch": 6.007387748469913, + "grad_norm": 0.005275439936667681, + "learning_rate": 4.6811340398514744e-05, + "loss": 0.1028, + "step": 29060 + }, + { + "epoch": 6.0074419108487245, + "grad_norm": 0.274128794670105, + "learning_rate": 4.680833137746966e-05, + "loss": 0.1048, + "step": 29070 + }, + { + "epoch": 6.007496073227536, + "grad_norm": 6.696926593780518, + "learning_rate": 4.680532235642456e-05, + "loss": 0.0976, + "step": 29080 + }, + { + "epoch": 6.007550235606348, + "grad_norm": 0.01583394594490528, + "learning_rate": 4.680231333537947e-05, + "loss": 0.1474, + "step": 29090 + }, + { + "epoch": 6.007604397985159, + "grad_norm": 0.12053176015615463, + "learning_rate": 4.6799304314334376e-05, + "loss": 0.0884, + "step": 29100 + }, + { + "epoch": 6.007658560363971, + "grad_norm": 0.5543510913848877, + "learning_rate": 4.679629529328928e-05, + "loss": 0.1078, + "step": 29110 + }, + { + "epoch": 6.007712722742783, + "grad_norm": 0.45577579736709595, + "learning_rate": 4.679328627224419e-05, + "loss": 0.138, + "step": 29120 + }, + { + "epoch": 6.007766885121595, + "grad_norm": 0.06535866856575012, + "learning_rate": 4.67902772511991e-05, + "loss": 0.0401, + "step": 29130 + }, + { + "epoch": 6.007821047500406, + "grad_norm": 0.009332796558737755, + "learning_rate": 4.6787268230154e-05, + "loss": 0.0617, + "step": 29140 + }, + { + "epoch": 6.007875209879218, + "grad_norm": 0.028400637209415436, + "learning_rate": 4.678425920910891e-05, + "loss": 0.0374, + "step": 29150 + }, + { + "epoch": 6.00792937225803, + "grad_norm": 0.3864336907863617, + "learning_rate": 4.678125018806382e-05, + "loss": 0.1723, + "step": 29160 + }, + { + "epoch": 6.007983534636841, + "grad_norm": 0.011841869913041592, + "learning_rate": 4.677824116701873e-05, + "loss": 0.25, + "step": 29170 + }, + { + "epoch": 6.008037697015653, + "grad_norm": 0.03443474695086479, + "learning_rate": 4.677523214597363e-05, + "loss": 0.1003, + "step": 29180 + }, + { + "epoch": 6.008091859394464, + "grad_norm": 0.17670105397701263, + "learning_rate": 4.677222312492854e-05, + "loss": 0.0349, + "step": 29190 + }, + { + "epoch": 6.0081460217732765, + "grad_norm": 1.2147963047027588, + "learning_rate": 4.6769214103883445e-05, + "loss": 0.0511, + "step": 29200 + }, + { + "epoch": 6.008200184152088, + "grad_norm": 0.3529532253742218, + "learning_rate": 4.676620508283835e-05, + "loss": 0.0096, + "step": 29210 + }, + { + "epoch": 6.0082543465309, + "grad_norm": 4.788204193115234, + "learning_rate": 4.676319606179326e-05, + "loss": 0.1475, + "step": 29220 + }, + { + "epoch": 6.008308508909711, + "grad_norm": 3.4961583614349365, + "learning_rate": 4.6760187040748164e-05, + "loss": 0.1712, + "step": 29230 + }, + { + "epoch": 6.008362671288523, + "grad_norm": 0.04919391870498657, + "learning_rate": 4.675717801970308e-05, + "loss": 0.0757, + "step": 29240 + }, + { + "epoch": 6.008416833667335, + "grad_norm": 0.0015370921464636922, + "learning_rate": 4.675416899865798e-05, + "loss": 0.0483, + "step": 29250 + }, + { + "epoch": 6.008470996046146, + "grad_norm": 0.12899813055992126, + "learning_rate": 4.675115997761288e-05, + "loss": 0.0221, + "step": 29260 + }, + { + "epoch": 6.008525158424958, + "grad_norm": 1.7685977220535278, + "learning_rate": 4.6748150956567796e-05, + "loss": 0.0587, + "step": 29270 + }, + { + "epoch": 6.008579320803769, + "grad_norm": 1.6970117092132568, 
+ "learning_rate": 4.67451419355227e-05, + "loss": 0.18, + "step": 29280 + }, + { + "epoch": 6.008633483182582, + "grad_norm": 1.2862493991851807, + "learning_rate": 4.67421329144776e-05, + "loss": 0.096, + "step": 29290 + }, + { + "epoch": 6.008687645561393, + "grad_norm": 0.008812607266008854, + "learning_rate": 4.6739123893432515e-05, + "loss": 0.0424, + "step": 29300 + }, + { + "epoch": 6.008741807940205, + "grad_norm": 0.08417335897684097, + "learning_rate": 4.673611487238742e-05, + "loss": 0.1035, + "step": 29310 + }, + { + "epoch": 6.008795970319016, + "grad_norm": 0.6939535737037659, + "learning_rate": 4.673310585134233e-05, + "loss": 0.0523, + "step": 29320 + }, + { + "epoch": 6.0088501326978285, + "grad_norm": 0.0860423669219017, + "learning_rate": 4.6730096830297234e-05, + "loss": 0.0864, + "step": 29330 + }, + { + "epoch": 6.00890429507664, + "grad_norm": 0.1001831591129303, + "learning_rate": 4.672708780925214e-05, + "loss": 0.1475, + "step": 29340 + }, + { + "epoch": 6.008958457455451, + "grad_norm": 0.4756297767162323, + "learning_rate": 4.6724078788207046e-05, + "loss": 0.089, + "step": 29350 + }, + { + "epoch": 6.009012619834263, + "grad_norm": 0.3888107240200043, + "learning_rate": 4.672106976716195e-05, + "loss": 0.2314, + "step": 29360 + }, + { + "epoch": 6.0090667822130746, + "grad_norm": 0.06118840351700783, + "learning_rate": 4.671806074611686e-05, + "loss": 0.0189, + "step": 29370 + }, + { + "epoch": 6.009120944591887, + "grad_norm": 0.9852218627929688, + "learning_rate": 4.6715051725071765e-05, + "loss": 0.1072, + "step": 29380 + }, + { + "epoch": 6.009175106970698, + "grad_norm": 0.03023749403655529, + "learning_rate": 4.671204270402668e-05, + "loss": 0.0141, + "step": 29390 + }, + { + "epoch": 6.00922926934951, + "grad_norm": 0.13412417471408844, + "learning_rate": 4.670903368298158e-05, + "loss": 0.1421, + "step": 29400 + }, + { + "epoch": 6.0092834317283215, + "grad_norm": 0.006755267735570669, + "learning_rate": 4.6706024661936484e-05, + "loss": 0.0968, + "step": 29410 + }, + { + "epoch": 6.009337594107133, + "grad_norm": 8.550025939941406, + "learning_rate": 4.67030156408914e-05, + "loss": 0.0271, + "step": 29420 + }, + { + "epoch": 6.009391756485945, + "grad_norm": 0.043586403131484985, + "learning_rate": 4.67000066198463e-05, + "loss": 0.0609, + "step": 29430 + }, + { + "epoch": 6.009445918864756, + "grad_norm": 2.237543821334839, + "learning_rate": 4.669699759880121e-05, + "loss": 0.0595, + "step": 29440 + }, + { + "epoch": 6.009500081243568, + "grad_norm": 5.913501739501953, + "learning_rate": 4.6693988577756116e-05, + "loss": 0.2012, + "step": 29450 + }, + { + "epoch": 6.00955424362238, + "grad_norm": 0.018448011949658394, + "learning_rate": 4.669097955671102e-05, + "loss": 0.1168, + "step": 29460 + }, + { + "epoch": 6.009608406001192, + "grad_norm": 0.0040117064490914345, + "learning_rate": 4.668797053566593e-05, + "loss": 0.0545, + "step": 29470 + }, + { + "epoch": 6.009662568380003, + "grad_norm": 0.11918220669031143, + "learning_rate": 4.6684961514620835e-05, + "loss": 0.0058, + "step": 29480 + }, + { + "epoch": 6.009716730758815, + "grad_norm": 0.22750324010849, + "learning_rate": 4.668195249357574e-05, + "loss": 0.0824, + "step": 29490 + }, + { + "epoch": 6.009770893137627, + "grad_norm": 1.2450004816055298, + "learning_rate": 4.6678943472530654e-05, + "loss": 0.0304, + "step": 29500 + }, + { + "epoch": 6.009825055516438, + "grad_norm": 12.27327823638916, + "learning_rate": 4.6675934451485554e-05, + "loss": 0.0874, + "step": 29510 + }, + { + 
"epoch": 6.00987921789525, + "grad_norm": 0.2714628279209137, + "learning_rate": 4.667292543044046e-05, + "loss": 0.0105, + "step": 29520 + }, + { + "epoch": 6.009933380274061, + "grad_norm": 0.04141714796423912, + "learning_rate": 4.666991640939537e-05, + "loss": 0.1177, + "step": 29530 + }, + { + "epoch": 6.0099875426528735, + "grad_norm": 2.6447343826293945, + "learning_rate": 4.666690738835028e-05, + "loss": 0.1398, + "step": 29540 + }, + { + "epoch": 6.010041705031685, + "grad_norm": 0.33376026153564453, + "learning_rate": 4.666389836730518e-05, + "loss": 0.1752, + "step": 29550 + }, + { + "epoch": 6.010095867410497, + "grad_norm": 0.5636570453643799, + "learning_rate": 4.666088934626009e-05, + "loss": 0.1095, + "step": 29560 + }, + { + "epoch": 6.010150029789308, + "grad_norm": 0.21537652611732483, + "learning_rate": 4.6657880325215e-05, + "loss": 0.0642, + "step": 29570 + }, + { + "epoch": 6.01020419216812, + "grad_norm": 0.0015390143962576985, + "learning_rate": 4.6654871304169904e-05, + "loss": 0.0225, + "step": 29580 + }, + { + "epoch": 6.010258354546932, + "grad_norm": 0.004746619611978531, + "learning_rate": 4.665186228312481e-05, + "loss": 0.2972, + "step": 29590 + }, + { + "epoch": 6.010312516925743, + "grad_norm": 6.687509536743164, + "learning_rate": 4.664885326207972e-05, + "loss": 0.1067, + "step": 29600 + }, + { + "epoch": 6.010366679304555, + "grad_norm": 4.485691547393799, + "learning_rate": 4.664584424103462e-05, + "loss": 0.2155, + "step": 29610 + }, + { + "epoch": 6.010420841683366, + "grad_norm": 1.064910650253296, + "learning_rate": 4.6642835219989536e-05, + "loss": 0.0528, + "step": 29620 + }, + { + "epoch": 6.010475004062179, + "grad_norm": 0.18832547962665558, + "learning_rate": 4.6639826198944436e-05, + "loss": 0.0856, + "step": 29630 + }, + { + "epoch": 6.01052916644099, + "grad_norm": 0.010711967945098877, + "learning_rate": 4.663681717789934e-05, + "loss": 0.2043, + "step": 29640 + }, + { + "epoch": 6.010583328819802, + "grad_norm": 1.7243287563323975, + "learning_rate": 4.6633808156854255e-05, + "loss": 0.0931, + "step": 29650 + }, + { + "epoch": 6.010637491198613, + "grad_norm": 1.9281470775604248, + "learning_rate": 4.6630799135809154e-05, + "loss": 0.1425, + "step": 29660 + }, + { + "epoch": 6.0106916535774255, + "grad_norm": 0.11493345350027084, + "learning_rate": 4.662779011476406e-05, + "loss": 0.1223, + "step": 29670 + }, + { + "epoch": 6.010745815956237, + "grad_norm": 3.6857311725616455, + "learning_rate": 4.6624781093718974e-05, + "loss": 0.1341, + "step": 29680 + }, + { + "epoch": 6.010799978335048, + "grad_norm": 0.3360546827316284, + "learning_rate": 4.662177207267388e-05, + "loss": 0.115, + "step": 29690 + }, + { + "epoch": 6.01085414071386, + "grad_norm": 0.21837294101715088, + "learning_rate": 4.6618763051628786e-05, + "loss": 0.0733, + "step": 29700 + }, + { + "epoch": 6.0109083030926715, + "grad_norm": 3.3510043621063232, + "learning_rate": 4.661575403058369e-05, + "loss": 0.1017, + "step": 29710 + }, + { + "epoch": 6.010962465471484, + "grad_norm": 0.12630458176136017, + "learning_rate": 4.66127450095386e-05, + "loss": 0.0153, + "step": 29720 + }, + { + "epoch": 6.011016627850295, + "grad_norm": 8.91951847076416, + "learning_rate": 4.6609735988493505e-05, + "loss": 0.2072, + "step": 29730 + }, + { + "epoch": 6.011070790229107, + "grad_norm": 0.0024135769344866276, + "learning_rate": 4.660672696744841e-05, + "loss": 0.136, + "step": 29740 + }, + { + "epoch": 6.0111249526079185, + "grad_norm": 0.35308948159217834, + "learning_rate": 
4.660371794640332e-05, + "loss": 0.0281, + "step": 29750 + }, + { + "epoch": 6.011179114986731, + "grad_norm": 0.11393905431032181, + "learning_rate": 4.660070892535823e-05, + "loss": 0.0911, + "step": 29760 + }, + { + "epoch": 6.011233277365542, + "grad_norm": 0.0036989033687859774, + "learning_rate": 4.659769990431314e-05, + "loss": 0.254, + "step": 29770 + }, + { + "epoch": 6.011287439744353, + "grad_norm": 0.5138756632804871, + "learning_rate": 4.6594690883268036e-05, + "loss": 0.0442, + "step": 29780 + }, + { + "epoch": 6.011341602123165, + "grad_norm": 0.29041123390197754, + "learning_rate": 4.659168186222295e-05, + "loss": 0.0732, + "step": 29790 + }, + { + "epoch": 6.011395764501977, + "grad_norm": 0.025729240849614143, + "learning_rate": 4.6588672841177856e-05, + "loss": 0.03, + "step": 29800 + }, + { + "epoch": 6.011449926880789, + "grad_norm": 3.18282413482666, + "learning_rate": 4.6585663820132755e-05, + "loss": 0.0905, + "step": 29810 + }, + { + "epoch": 6.0115040892596, + "grad_norm": 8.868168830871582, + "learning_rate": 4.658265479908767e-05, + "loss": 0.1768, + "step": 29820 + }, + { + "epoch": 6.011558251638412, + "grad_norm": 0.26506659388542175, + "learning_rate": 4.6579645778042575e-05, + "loss": 0.0873, + "step": 29830 + }, + { + "epoch": 6.011612414017224, + "grad_norm": 0.1643570065498352, + "learning_rate": 4.657663675699748e-05, + "loss": 0.1134, + "step": 29840 + }, + { + "epoch": 6.011666576396036, + "grad_norm": 0.07217414677143097, + "learning_rate": 4.657362773595239e-05, + "loss": 0.101, + "step": 29850 + }, + { + "epoch": 6.011720738774847, + "grad_norm": 0.16829605400562286, + "learning_rate": 4.6570618714907293e-05, + "loss": 0.0736, + "step": 29860 + }, + { + "epoch": 6.011774901153658, + "grad_norm": 0.02368907630443573, + "learning_rate": 4.65676096938622e-05, + "loss": 0.0888, + "step": 29870 + }, + { + "epoch": 6.0118290635324705, + "grad_norm": 0.004372761584818363, + "learning_rate": 4.656460067281711e-05, + "loss": 0.1049, + "step": 29880 + }, + { + "epoch": 6.011883225911282, + "grad_norm": 0.1844439059495926, + "learning_rate": 4.656159165177201e-05, + "loss": 0.1127, + "step": 29890 + }, + { + "epoch": 6.011937388290094, + "grad_norm": 0.5458278059959412, + "learning_rate": 4.655858263072692e-05, + "loss": 0.0478, + "step": 29900 + }, + { + "epoch": 6.011991550668905, + "grad_norm": 0.06357844918966293, + "learning_rate": 4.655557360968183e-05, + "loss": 0.0381, + "step": 29910 + }, + { + "epoch": 6.012045713047717, + "grad_norm": 0.016311563551425934, + "learning_rate": 4.655256458863674e-05, + "loss": 0.0289, + "step": 29920 + }, + { + "epoch": 6.012099875426529, + "grad_norm": 0.31565430760383606, + "learning_rate": 4.654955556759164e-05, + "loss": 0.2398, + "step": 29930 + }, + { + "epoch": 6.01215403780534, + "grad_norm": 0.016845520585775375, + "learning_rate": 4.654654654654655e-05, + "loss": 0.064, + "step": 29940 + }, + { + "epoch": 6.012208200184152, + "grad_norm": 0.20123828947544098, + "learning_rate": 4.654353752550146e-05, + "loss": 0.0664, + "step": 29950 + }, + { + "epoch": 6.012262362562963, + "grad_norm": 0.4424799084663391, + "learning_rate": 4.654052850445636e-05, + "loss": 0.0442, + "step": 29960 + }, + { + "epoch": 6.012316524941776, + "grad_norm": 0.8216269016265869, + "learning_rate": 4.653751948341127e-05, + "loss": 0.0502, + "step": 29970 + }, + { + "epoch": 6.012370687320587, + "grad_norm": 2.637273073196411, + "learning_rate": 4.6534510462366176e-05, + "loss": 0.0564, + "step": 29980 + }, + { + "epoch": 
6.012424849699399, + "grad_norm": 0.06087104231119156, + "learning_rate": 4.653150144132108e-05, + "loss": 0.0882, + "step": 29990 + }, + { + "epoch": 6.01247901207821, + "grad_norm": 1.204526662826538, + "learning_rate": 4.652849242027599e-05, + "loss": 0.018, + "step": 30000 + }, + { + "epoch": 6.0125331744570225, + "grad_norm": 0.6195321083068848, + "learning_rate": 4.6525483399230894e-05, + "loss": 0.0913, + "step": 30010 + }, + { + "epoch": 6.012587336835834, + "grad_norm": 0.3072221577167511, + "learning_rate": 4.652247437818581e-05, + "loss": 0.1166, + "step": 30020 + }, + { + "epoch": 6.012641499214645, + "grad_norm": 0.12745103240013123, + "learning_rate": 4.6519465357140714e-05, + "loss": 0.0569, + "step": 30030 + }, + { + "epoch": 6.012695661593457, + "grad_norm": 0.260511189699173, + "learning_rate": 4.651645633609561e-05, + "loss": 0.064, + "step": 30040 + }, + { + "epoch": 6.0127498239722685, + "grad_norm": 0.15199527144432068, + "learning_rate": 4.6513447315050526e-05, + "loss": 0.1066, + "step": 30050 + }, + { + "epoch": 6.012803986351081, + "grad_norm": 12.720627784729004, + "learning_rate": 4.651043829400543e-05, + "loss": 0.1655, + "step": 30060 + }, + { + "epoch": 6.012858148729892, + "grad_norm": 0.0258773360401392, + "learning_rate": 4.650742927296034e-05, + "loss": 0.1157, + "step": 30070 + }, + { + "epoch": 6.012912311108704, + "grad_norm": 0.07973402738571167, + "learning_rate": 4.6504420251915245e-05, + "loss": 0.0207, + "step": 30080 + }, + { + "epoch": 6.0129664734875155, + "grad_norm": 3.8088674545288086, + "learning_rate": 4.650141123087015e-05, + "loss": 0.0971, + "step": 30090 + }, + { + "epoch": 6.013020635866328, + "grad_norm": 0.39367979764938354, + "learning_rate": 4.649840220982506e-05, + "loss": 0.058, + "step": 30100 + }, + { + "epoch": 6.013074798245139, + "grad_norm": 1.5916081666946411, + "learning_rate": 4.6495393188779964e-05, + "loss": 0.0435, + "step": 30110 + }, + { + "epoch": 6.01312896062395, + "grad_norm": 4.777333736419678, + "learning_rate": 4.649238416773487e-05, + "loss": 0.047, + "step": 30120 + }, + { + "epoch": 6.013183123002762, + "grad_norm": 0.025131482630968094, + "learning_rate": 4.6489375146689776e-05, + "loss": 0.0767, + "step": 30130 + }, + { + "epoch": 6.013237285381574, + "grad_norm": 0.5911473631858826, + "learning_rate": 4.648636612564469e-05, + "loss": 0.0802, + "step": 30140 + }, + { + "epoch": 6.013291447760386, + "grad_norm": 0.17746374011039734, + "learning_rate": 4.648335710459959e-05, + "loss": 0.094, + "step": 30150 + }, + { + "epoch": 6.013345610139197, + "grad_norm": 0.016200875863432884, + "learning_rate": 4.6480348083554495e-05, + "loss": 0.0197, + "step": 30160 + }, + { + "epoch": 6.013399772518009, + "grad_norm": 0.11342508345842361, + "learning_rate": 4.647733906250941e-05, + "loss": 0.1947, + "step": 30170 + }, + { + "epoch": 6.013453934896821, + "grad_norm": 3.496622323989868, + "learning_rate": 4.6474330041464315e-05, + "loss": 0.0685, + "step": 30180 + }, + { + "epoch": 6.013508097275633, + "grad_norm": 0.002943359548225999, + "learning_rate": 4.6471321020419214e-05, + "loss": 0.0677, + "step": 30190 + }, + { + "epoch": 6.013562259654444, + "grad_norm": 0.00765044754371047, + "learning_rate": 4.646831199937413e-05, + "loss": 0.0044, + "step": 30200 + }, + { + "epoch": 6.013616422033255, + "grad_norm": 0.3023904263973236, + "learning_rate": 4.6465302978329033e-05, + "loss": 0.0594, + "step": 30210 + }, + { + "epoch": 6.0136705844120675, + "grad_norm": 10.377420425415039, + "learning_rate": 
4.646229395728394e-05, + "loss": 0.1968, + "step": 30220 + }, + { + "epoch": 6.013724746790879, + "grad_norm": 0.21369273960590363, + "learning_rate": 4.6459284936238846e-05, + "loss": 0.2526, + "step": 30230 + }, + { + "epoch": 6.013778909169691, + "grad_norm": 0.08002300560474396, + "learning_rate": 4.645627591519375e-05, + "loss": 0.1907, + "step": 30240 + }, + { + "epoch": 6.013833071548502, + "grad_norm": 0.3392958641052246, + "learning_rate": 4.645326689414866e-05, + "loss": 0.0092, + "step": 30250 + }, + { + "epoch": 6.013887233927314, + "grad_norm": 0.1935449093580246, + "learning_rate": 4.6450257873103565e-05, + "loss": 0.135, + "step": 30260 + }, + { + "epoch": 6.013941396306126, + "grad_norm": 0.16740627586841583, + "learning_rate": 4.644724885205847e-05, + "loss": 0.2204, + "step": 30270 + }, + { + "epoch": 6.013995558684938, + "grad_norm": 7.122953414916992, + "learning_rate": 4.6444239831013384e-05, + "loss": 0.117, + "step": 30280 + }, + { + "epoch": 6.014049721063749, + "grad_norm": 1.9970701932907104, + "learning_rate": 4.644123080996829e-05, + "loss": 0.2926, + "step": 30290 + }, + { + "epoch": 6.01410388344256, + "grad_norm": 0.24399808049201965, + "learning_rate": 4.643822178892319e-05, + "loss": 0.0966, + "step": 30300 + }, + { + "epoch": 6.014158045821373, + "grad_norm": 0.016879644244909286, + "learning_rate": 4.64352127678781e-05, + "loss": 0.1688, + "step": 30310 + }, + { + "epoch": 6.014212208200184, + "grad_norm": 0.06429193168878555, + "learning_rate": 4.643220374683301e-05, + "loss": 0.1455, + "step": 30320 + }, + { + "epoch": 6.014266370578996, + "grad_norm": 0.5526494383811951, + "learning_rate": 4.6429194725787915e-05, + "loss": 0.0283, + "step": 30330 + }, + { + "epoch": 6.014320532957807, + "grad_norm": 0.5316708087921143, + "learning_rate": 4.642618570474282e-05, + "loss": 0.2728, + "step": 30340 + }, + { + "epoch": 6.0143746953366195, + "grad_norm": 0.07466810941696167, + "learning_rate": 4.642317668369773e-05, + "loss": 0.1425, + "step": 30350 + }, + { + "epoch": 6.014428857715431, + "grad_norm": 0.21881096065044403, + "learning_rate": 4.6420167662652634e-05, + "loss": 0.0314, + "step": 30360 + }, + { + "epoch": 6.014483020094243, + "grad_norm": 0.10797882080078125, + "learning_rate": 4.641715864160755e-05, + "loss": 0.0716, + "step": 30370 + }, + { + "epoch": 6.014537182473054, + "grad_norm": 2.394653797149658, + "learning_rate": 4.641414962056245e-05, + "loss": 0.1791, + "step": 30380 + }, + { + "epoch": 6.0145913448518655, + "grad_norm": 2.46513295173645, + "learning_rate": 4.641114059951735e-05, + "loss": 0.0817, + "step": 30390 + }, + { + "epoch": 6.014645507230678, + "grad_norm": 0.35993656516075134, + "learning_rate": 4.6408131578472266e-05, + "loss": 0.0462, + "step": 30400 + }, + { + "epoch": 6.014699669609489, + "grad_norm": 1.0493741035461426, + "learning_rate": 4.6405122557427166e-05, + "loss": 0.1076, + "step": 30410 + }, + { + "epoch": 6.014753831988301, + "grad_norm": 0.1041775569319725, + "learning_rate": 4.640211353638207e-05, + "loss": 0.0778, + "step": 30420 + }, + { + "epoch": 6.014807994367112, + "grad_norm": 0.0032990151084959507, + "learning_rate": 4.6399104515336985e-05, + "loss": 0.1244, + "step": 30430 + }, + { + "epoch": 6.014862156745925, + "grad_norm": 5.540316581726074, + "learning_rate": 4.639609549429189e-05, + "loss": 0.2221, + "step": 30440 + }, + { + "epoch": 6.014916319124736, + "grad_norm": 0.04549994319677353, + "learning_rate": 4.639308647324679e-05, + "loss": 0.0568, + "step": 30450 + }, + { + "epoch": 
6.014970481503548, + "grad_norm": 1.9305237531661987, + "learning_rate": 4.6390077452201704e-05, + "loss": 0.0181, + "step": 30460 + }, + { + "epoch": 6.015024643882359, + "grad_norm": 0.638871967792511, + "learning_rate": 4.638706843115661e-05, + "loss": 0.1283, + "step": 30470 + }, + { + "epoch": 6.015078806261171, + "grad_norm": 3.6559176445007324, + "learning_rate": 4.6384059410111516e-05, + "loss": 0.0843, + "step": 30480 + }, + { + "epoch": 6.015132968639983, + "grad_norm": 2.182827949523926, + "learning_rate": 4.638105038906642e-05, + "loss": 0.039, + "step": 30490 + }, + { + "epoch": 6.015187131018794, + "grad_norm": 0.49808961153030396, + "learning_rate": 4.637804136802133e-05, + "loss": 0.1057, + "step": 30500 + }, + { + "epoch": 6.015241293397606, + "grad_norm": 2.5474202632904053, + "learning_rate": 4.6375032346976235e-05, + "loss": 0.0913, + "step": 30510 + }, + { + "epoch": 6.0152954557764176, + "grad_norm": 0.9818310141563416, + "learning_rate": 4.637202332593115e-05, + "loss": 0.1759, + "step": 30520 + }, + { + "epoch": 6.01534961815523, + "grad_norm": 0.17677149176597595, + "learning_rate": 4.636901430488605e-05, + "loss": 0.0685, + "step": 30530 + }, + { + "epoch": 6.015403780534041, + "grad_norm": 2.319979429244995, + "learning_rate": 4.636600528384096e-05, + "loss": 0.0172, + "step": 30540 + }, + { + "epoch": 6.015457942912852, + "grad_norm": 1.0912867784500122, + "learning_rate": 4.636299626279587e-05, + "loss": 0.0155, + "step": 30550 + }, + { + "epoch": 6.0155121052916645, + "grad_norm": 0.24408356845378876, + "learning_rate": 4.6359987241750767e-05, + "loss": 0.107, + "step": 30560 + }, + { + "epoch": 6.015566267670476, + "grad_norm": 0.0030733784660696983, + "learning_rate": 4.635697822070568e-05, + "loss": 0.0036, + "step": 30570 + }, + { + "epoch": 6.015620430049288, + "grad_norm": 0.014540081843733788, + "learning_rate": 4.6353969199660586e-05, + "loss": 0.1794, + "step": 30580 + }, + { + "epoch": 6.015674592428099, + "grad_norm": 0.04688854143023491, + "learning_rate": 4.635096017861549e-05, + "loss": 0.0694, + "step": 30590 + }, + { + "epoch": 6.015728754806911, + "grad_norm": 0.14093567430973053, + "learning_rate": 4.63479511575704e-05, + "loss": 0.0453, + "step": 30600 + }, + { + "epoch": 6.015782917185723, + "grad_norm": 0.04278940334916115, + "learning_rate": 4.6344942136525305e-05, + "loss": 0.0474, + "step": 30610 + }, + { + "epoch": 6.015837079564535, + "grad_norm": 17.213184356689453, + "learning_rate": 4.634193311548021e-05, + "loss": 0.2245, + "step": 30620 + }, + { + "epoch": 6.015891241943346, + "grad_norm": 2.2962570190429688, + "learning_rate": 4.6338924094435124e-05, + "loss": 0.015, + "step": 30630 + }, + { + "epoch": 6.015945404322157, + "grad_norm": 5.174180030822754, + "learning_rate": 4.6335915073390024e-05, + "loss": 0.0986, + "step": 30640 + }, + { + "epoch": 6.01599956670097, + "grad_norm": 0.5337263345718384, + "learning_rate": 4.633290605234493e-05, + "loss": 0.0688, + "step": 30650 + }, + { + "epoch": 6.016053729079781, + "grad_norm": 0.11595858633518219, + "learning_rate": 4.632989703129984e-05, + "loss": 0.0506, + "step": 30660 + }, + { + "epoch": 6.016107891458593, + "grad_norm": 0.5342949032783508, + "learning_rate": 4.632688801025475e-05, + "loss": 0.1861, + "step": 30670 + }, + { + "epoch": 6.016162053837404, + "grad_norm": 0.12768758833408356, + "learning_rate": 4.632387898920965e-05, + "loss": 0.0169, + "step": 30680 + }, + { + "epoch": 6.0162162162162165, + "grad_norm": 2.9283089637756348, + "learning_rate": 
4.632086996816456e-05, + "loss": 0.1181, + "step": 30690 + }, + { + "epoch": 6.016270378595028, + "grad_norm": 0.2211044281721115, + "learning_rate": 4.631786094711947e-05, + "loss": 0.0654, + "step": 30700 + }, + { + "epoch": 6.01632454097384, + "grad_norm": 1.0227444171905518, + "learning_rate": 4.631485192607437e-05, + "loss": 0.02, + "step": 30710 + }, + { + "epoch": 6.016378703352651, + "grad_norm": 2.037764310836792, + "learning_rate": 4.631184290502928e-05, + "loss": 0.0495, + "step": 30720 + }, + { + "epoch": 6.0164328657314625, + "grad_norm": 0.13079309463500977, + "learning_rate": 4.630883388398419e-05, + "loss": 0.2309, + "step": 30730 + }, + { + "epoch": 6.016487028110275, + "grad_norm": 0.058188196271657944, + "learning_rate": 4.630582486293909e-05, + "loss": 0.0421, + "step": 30740 + }, + { + "epoch": 6.016541190489086, + "grad_norm": 0.0036308723501861095, + "learning_rate": 4.6302815841894e-05, + "loss": 0.006, + "step": 30750 + }, + { + "epoch": 6.016595352867898, + "grad_norm": 0.07516404241323471, + "learning_rate": 4.6299806820848906e-05, + "loss": 0.0504, + "step": 30760 + }, + { + "epoch": 6.016649515246709, + "grad_norm": 0.046860720962285995, + "learning_rate": 4.629679779980381e-05, + "loss": 0.1235, + "step": 30770 + }, + { + "epoch": 6.016703677625522, + "grad_norm": 0.01490858569741249, + "learning_rate": 4.6293788778758725e-05, + "loss": 0.0451, + "step": 30780 + }, + { + "epoch": 6.016757840004333, + "grad_norm": 0.002769336337223649, + "learning_rate": 4.6290779757713624e-05, + "loss": 0.2199, + "step": 30790 + }, + { + "epoch": 6.016812002383145, + "grad_norm": 6.8492231369018555, + "learning_rate": 4.628777073666854e-05, + "loss": 0.1505, + "step": 30800 + }, + { + "epoch": 6.016866164761956, + "grad_norm": 0.01855875365436077, + "learning_rate": 4.6284761715623444e-05, + "loss": 0.1371, + "step": 30810 + }, + { + "epoch": 6.016920327140768, + "grad_norm": 0.29433977603912354, + "learning_rate": 4.628175269457835e-05, + "loss": 0.1891, + "step": 30820 + }, + { + "epoch": 6.01697448951958, + "grad_norm": 0.5996717214584351, + "learning_rate": 4.6278743673533256e-05, + "loss": 0.0507, + "step": 30830 + }, + { + "epoch": 6.017028651898391, + "grad_norm": 0.08268354833126068, + "learning_rate": 4.627573465248816e-05, + "loss": 0.1834, + "step": 30840 + }, + { + "epoch": 6.017082814277203, + "grad_norm": 14.139559745788574, + "learning_rate": 4.627272563144307e-05, + "loss": 0.1613, + "step": 30850 + }, + { + "epoch": 6.0171369766560145, + "grad_norm": 6.769413948059082, + "learning_rate": 4.6269716610397975e-05, + "loss": 0.0519, + "step": 30860 + }, + { + "epoch": 6.017191139034827, + "grad_norm": 0.12519119679927826, + "learning_rate": 4.626670758935288e-05, + "loss": 0.0709, + "step": 30870 + }, + { + "epoch": 6.017245301413638, + "grad_norm": 0.02435903064906597, + "learning_rate": 4.626369856830779e-05, + "loss": 0.0337, + "step": 30880 + }, + { + "epoch": 6.01729946379245, + "grad_norm": 0.3006383776664734, + "learning_rate": 4.62606895472627e-05, + "loss": 0.057, + "step": 30890 + }, + { + "epoch": 6.0173536261712615, + "grad_norm": 0.3278544843196869, + "learning_rate": 4.62576805262176e-05, + "loss": 0.0944, + "step": 30900 + }, + { + "epoch": 6.017407788550073, + "grad_norm": 0.1079806238412857, + "learning_rate": 4.6254671505172506e-05, + "loss": 0.1077, + "step": 30910 + }, + { + "epoch": 6.017461950928885, + "grad_norm": 0.22874587774276733, + "learning_rate": 4.625166248412742e-05, + "loss": 0.0875, + "step": 30920 + }, + { + "epoch": 
6.017516113307696, + "grad_norm": 2.7420222759246826, + "learning_rate": 4.6248653463082326e-05, + "loss": 0.1672, + "step": 30930 + }, + { + "epoch": 6.017570275686508, + "grad_norm": 3.4173965454101562, + "learning_rate": 4.6245644442037225e-05, + "loss": 0.0791, + "step": 30940 + }, + { + "epoch": 6.01762443806532, + "grad_norm": 0.22603145241737366, + "learning_rate": 4.624263542099214e-05, + "loss": 0.1342, + "step": 30950 + }, + { + "epoch": 6.017678600444132, + "grad_norm": 0.15858511626720428, + "learning_rate": 4.6239626399947045e-05, + "loss": 0.198, + "step": 30960 + }, + { + "epoch": 6.017732762822943, + "grad_norm": 0.0882987231016159, + "learning_rate": 4.623661737890195e-05, + "loss": 0.0689, + "step": 30970 + }, + { + "epoch": 6.017786925201755, + "grad_norm": 0.28194043040275574, + "learning_rate": 4.623360835785686e-05, + "loss": 0.0282, + "step": 30980 + }, + { + "epoch": 6.017841087580567, + "grad_norm": 3.5884475708007812, + "learning_rate": 4.6230599336811763e-05, + "loss": 0.2136, + "step": 30990 + }, + { + "epoch": 6.017895249959378, + "grad_norm": 0.30943888425827026, + "learning_rate": 4.622759031576667e-05, + "loss": 0.1087, + "step": 31000 + }, + { + "epoch": 6.01794941233819, + "grad_norm": 0.16755938529968262, + "learning_rate": 4.6224581294721576e-05, + "loss": 0.0286, + "step": 31010 + }, + { + "epoch": 6.018003574717001, + "grad_norm": 0.2805452048778534, + "learning_rate": 4.622157227367648e-05, + "loss": 0.0875, + "step": 31020 + }, + { + "epoch": 6.0180577370958135, + "grad_norm": 0.05600176379084587, + "learning_rate": 4.621856325263139e-05, + "loss": 0.0699, + "step": 31030 + }, + { + "epoch": 6.018111899474625, + "grad_norm": 1.816372036933899, + "learning_rate": 4.62155542315863e-05, + "loss": 0.059, + "step": 31040 + }, + { + "epoch": 6.018166061853437, + "grad_norm": 0.0376855805516243, + "learning_rate": 4.62125452105412e-05, + "loss": 0.0694, + "step": 31050 + }, + { + "epoch": 6.018220224232248, + "grad_norm": 0.1500561386346817, + "learning_rate": 4.6209536189496114e-05, + "loss": 0.0674, + "step": 31060 + }, + { + "epoch": 6.0182743866110595, + "grad_norm": 0.08341585099697113, + "learning_rate": 4.620652716845102e-05, + "loss": 0.03, + "step": 31070 + }, + { + "epoch": 6.018328548989872, + "grad_norm": 0.013186094351112843, + "learning_rate": 4.620351814740593e-05, + "loss": 0.084, + "step": 31080 + }, + { + "epoch": 6.018382711368683, + "grad_norm": 8.626456260681152, + "learning_rate": 4.620050912636083e-05, + "loss": 0.0686, + "step": 31090 + }, + { + "epoch": 6.018436873747495, + "grad_norm": 33.21569061279297, + "learning_rate": 4.619750010531574e-05, + "loss": 0.1116, + "step": 31100 + }, + { + "epoch": 6.018491036126306, + "grad_norm": 0.08168456703424454, + "learning_rate": 4.6194491084270646e-05, + "loss": 0.1504, + "step": 31110 + }, + { + "epoch": 6.018545198505119, + "grad_norm": 0.057755839079618454, + "learning_rate": 4.619148206322555e-05, + "loss": 0.1755, + "step": 31120 + }, + { + "epoch": 6.01859936088393, + "grad_norm": 16.583086013793945, + "learning_rate": 4.618847304218046e-05, + "loss": 0.1092, + "step": 31130 + }, + { + "epoch": 6.018653523262742, + "grad_norm": 2.586824417114258, + "learning_rate": 4.6185464021135364e-05, + "loss": 0.1479, + "step": 31140 + }, + { + "epoch": 6.018707685641553, + "grad_norm": 0.011323721148073673, + "learning_rate": 4.618245500009028e-05, + "loss": 0.0499, + "step": 31150 + }, + { + "epoch": 6.018761848020365, + "grad_norm": 0.4347041845321655, + "learning_rate": 
4.617944597904518e-05, + "loss": 0.1516, + "step": 31160 + }, + { + "epoch": 6.018816010399177, + "grad_norm": 0.5497750639915466, + "learning_rate": 4.617643695800008e-05, + "loss": 0.0244, + "step": 31170 + }, + { + "epoch": 6.018870172777988, + "grad_norm": 0.275813490152359, + "learning_rate": 4.6173427936954996e-05, + "loss": 0.1449, + "step": 31180 + }, + { + "epoch": 6.0189243351568, + "grad_norm": 4.662135601043701, + "learning_rate": 4.61704189159099e-05, + "loss": 0.1755, + "step": 31190 + }, + { + "epoch": 6.0189784975356115, + "grad_norm": 0.2605825960636139, + "learning_rate": 4.61674098948648e-05, + "loss": 0.0474, + "step": 31200 + }, + { + "epoch": 6.019032659914424, + "grad_norm": 0.19514891505241394, + "learning_rate": 4.6164400873819715e-05, + "loss": 0.0435, + "step": 31210 + }, + { + "epoch": 6.019086822293235, + "grad_norm": 0.22020508348941803, + "learning_rate": 4.616139185277462e-05, + "loss": 0.0809, + "step": 31220 + }, + { + "epoch": 6.019140984672047, + "grad_norm": 0.028657102957367897, + "learning_rate": 4.615838283172953e-05, + "loss": 0.0908, + "step": 31230 + }, + { + "epoch": 6.0191951470508585, + "grad_norm": 9.380732536315918, + "learning_rate": 4.6155373810684434e-05, + "loss": 0.1335, + "step": 31240 + }, + { + "epoch": 6.01924930942967, + "grad_norm": 2.8648033142089844, + "learning_rate": 4.615236478963934e-05, + "loss": 0.1049, + "step": 31250 + }, + { + "epoch": 6.019303471808482, + "grad_norm": 0.7640904188156128, + "learning_rate": 4.6149355768594246e-05, + "loss": 0.0092, + "step": 31260 + }, + { + "epoch": 6.019357634187293, + "grad_norm": 0.11460955440998077, + "learning_rate": 4.614634674754916e-05, + "loss": 0.1756, + "step": 31270 + }, + { + "epoch": 6.019411796566105, + "grad_norm": 0.0020333649590611458, + "learning_rate": 4.614333772650406e-05, + "loss": 0.0606, + "step": 31280 + }, + { + "epoch": 6.019465958944917, + "grad_norm": 0.8408780694007874, + "learning_rate": 4.6140328705458965e-05, + "loss": 0.0531, + "step": 31290 + }, + { + "epoch": 6.019520121323729, + "grad_norm": 0.10887809097766876, + "learning_rate": 4.613731968441388e-05, + "loss": 0.0757, + "step": 31300 + }, + { + "epoch": 6.01957428370254, + "grad_norm": 2.998934268951416, + "learning_rate": 4.613431066336878e-05, + "loss": 0.0427, + "step": 31310 + }, + { + "epoch": 6.019628446081352, + "grad_norm": 0.20987245440483093, + "learning_rate": 4.613130164232369e-05, + "loss": 0.008, + "step": 31320 + }, + { + "epoch": 6.019682608460164, + "grad_norm": 0.08909113705158234, + "learning_rate": 4.61282926212786e-05, + "loss": 0.1025, + "step": 31330 + }, + { + "epoch": 6.019736770838975, + "grad_norm": 0.12355227023363113, + "learning_rate": 4.6125283600233503e-05, + "loss": 0.0805, + "step": 31340 + }, + { + "epoch": 6.019790933217787, + "grad_norm": 0.34193098545074463, + "learning_rate": 4.612227457918841e-05, + "loss": 0.0322, + "step": 31350 + }, + { + "epoch": 6.019845095596598, + "grad_norm": 0.3303123414516449, + "learning_rate": 4.6119265558143316e-05, + "loss": 0.0649, + "step": 31360 + }, + { + "epoch": 6.0198992579754105, + "grad_norm": 0.27825286984443665, + "learning_rate": 4.611625653709822e-05, + "loss": 0.0758, + "step": 31370 + }, + { + "epoch": 6.019953420354222, + "grad_norm": 0.046206098049879074, + "learning_rate": 4.611324751605313e-05, + "loss": 0.0941, + "step": 31380 + }, + { + "epoch": 6.020007582733034, + "grad_norm": 3.5605437755584717, + "learning_rate": 4.6110238495008035e-05, + "loss": 0.0648, + "step": 31390 + }, + { + "epoch": 
6.020061745111845, + "grad_norm": 0.005763388238847256, + "learning_rate": 4.610722947396294e-05, + "loss": 0.074, + "step": 31400 + }, + { + "epoch": 6.020115907490657, + "grad_norm": 0.05386185273528099, + "learning_rate": 4.6104220452917854e-05, + "loss": 0.0838, + "step": 31410 + }, + { + "epoch": 6.020170069869469, + "grad_norm": 0.0029704892076551914, + "learning_rate": 4.610121143187276e-05, + "loss": 0.1243, + "step": 31420 + }, + { + "epoch": 6.02022423224828, + "grad_norm": 0.9046436548233032, + "learning_rate": 4.609820241082766e-05, + "loss": 0.0091, + "step": 31430 + }, + { + "epoch": 6.020278394627092, + "grad_norm": 0.20612859725952148, + "learning_rate": 4.609519338978257e-05, + "loss": 0.0065, + "step": 31440 + }, + { + "epoch": 6.020332557005903, + "grad_norm": 0.3110942244529724, + "learning_rate": 4.609218436873748e-05, + "loss": 0.0872, + "step": 31450 + }, + { + "epoch": 6.020386719384716, + "grad_norm": 0.23151811957359314, + "learning_rate": 4.608917534769238e-05, + "loss": 0.0436, + "step": 31460 + }, + { + "epoch": 6.020440881763527, + "grad_norm": 0.3277739882469177, + "learning_rate": 4.608616632664729e-05, + "loss": 0.0662, + "step": 31470 + }, + { + "epoch": 6.020495044142339, + "grad_norm": 0.06398370862007141, + "learning_rate": 4.60831573056022e-05, + "loss": 0.0151, + "step": 31480 + }, + { + "epoch": 6.02054920652115, + "grad_norm": 0.06517576426267624, + "learning_rate": 4.6080148284557104e-05, + "loss": 0.08, + "step": 31490 + }, + { + "epoch": 6.0206033688999625, + "grad_norm": 4.57648229598999, + "learning_rate": 4.607713926351201e-05, + "loss": 0.106, + "step": 31500 + }, + { + "epoch": 6.020657531278774, + "grad_norm": 0.1637297421693802, + "learning_rate": 4.607413024246692e-05, + "loss": 0.0336, + "step": 31510 + }, + { + "epoch": 6.020711693657585, + "grad_norm": 21.789508819580078, + "learning_rate": 4.607112122142182e-05, + "loss": 0.3002, + "step": 31520 + }, + { + "epoch": 6.020765856036397, + "grad_norm": 0.09038389474153519, + "learning_rate": 4.6068112200376736e-05, + "loss": 0.0913, + "step": 31530 + }, + { + "epoch": 6.0208200184152085, + "grad_norm": 0.07913859188556671, + "learning_rate": 4.6065103179331636e-05, + "loss": 0.0366, + "step": 31540 + }, + { + "epoch": 6.020874180794021, + "grad_norm": 0.3373752236366272, + "learning_rate": 4.606209415828654e-05, + "loss": 0.1429, + "step": 31550 + }, + { + "epoch": 6.020928343172832, + "grad_norm": 0.0027481564320623875, + "learning_rate": 4.6059085137241455e-05, + "loss": 0.1422, + "step": 31560 + }, + { + "epoch": 6.020982505551644, + "grad_norm": 0.11116761714220047, + "learning_rate": 4.605607611619636e-05, + "loss": 0.1104, + "step": 31570 + }, + { + "epoch": 6.021036667930455, + "grad_norm": 0.47985073924064636, + "learning_rate": 4.605306709515127e-05, + "loss": 0.1022, + "step": 31580 + }, + { + "epoch": 6.021090830309268, + "grad_norm": 3.529104471206665, + "learning_rate": 4.6050058074106174e-05, + "loss": 0.1283, + "step": 31590 + }, + { + "epoch": 6.021144992688079, + "grad_norm": 0.4420580565929413, + "learning_rate": 4.604704905306108e-05, + "loss": 0.1021, + "step": 31600 + }, + { + "epoch": 6.02119915506689, + "grad_norm": 1.9426378011703491, + "learning_rate": 4.6044040032015986e-05, + "loss": 0.128, + "step": 31610 + }, + { + "epoch": 6.021253317445702, + "grad_norm": 5.948631763458252, + "learning_rate": 4.604103101097089e-05, + "loss": 0.0764, + "step": 31620 + }, + { + "epoch": 6.021307479824514, + "grad_norm": 0.5453802347183228, + "learning_rate": 
4.60380219899258e-05, + "loss": 0.083, + "step": 31630 + }, + { + "epoch": 6.021361642203326, + "grad_norm": 0.2634495198726654, + "learning_rate": 4.6035012968880705e-05, + "loss": 0.0666, + "step": 31640 + }, + { + "epoch": 6.021415804582137, + "grad_norm": 0.0018069491488859057, + "learning_rate": 4.603200394783561e-05, + "loss": 0.1164, + "step": 31650 + }, + { + "epoch": 6.021469966960949, + "grad_norm": 3.046088218688965, + "learning_rate": 4.602899492679052e-05, + "loss": 0.1387, + "step": 31660 + }, + { + "epoch": 6.021524129339761, + "grad_norm": 0.08029884099960327, + "learning_rate": 4.602598590574543e-05, + "loss": 0.163, + "step": 31670 + }, + { + "epoch": 6.021578291718572, + "grad_norm": 0.004724238533526659, + "learning_rate": 4.602297688470034e-05, + "loss": 0.0981, + "step": 31680 + }, + { + "epoch": 6.021632454097384, + "grad_norm": 1.2931115627288818, + "learning_rate": 4.6019967863655237e-05, + "loss": 0.105, + "step": 31690 + }, + { + "epoch": 6.021686616476195, + "grad_norm": 0.8536443114280701, + "learning_rate": 4.601695884261015e-05, + "loss": 0.147, + "step": 31700 + }, + { + "epoch": 6.0217407788550075, + "grad_norm": 0.6701325178146362, + "learning_rate": 4.6013949821565056e-05, + "loss": 0.1054, + "step": 31710 + }, + { + "epoch": 6.021794941233819, + "grad_norm": 0.002993026515468955, + "learning_rate": 4.601094080051996e-05, + "loss": 0.2053, + "step": 31720 + }, + { + "epoch": 6.021849103612631, + "grad_norm": 0.08979784697294235, + "learning_rate": 4.600793177947487e-05, + "loss": 0.0683, + "step": 31730 + }, + { + "epoch": 6.021903265991442, + "grad_norm": 0.28996583819389343, + "learning_rate": 4.6004922758429775e-05, + "loss": 0.1073, + "step": 31740 + }, + { + "epoch": 6.021957428370254, + "grad_norm": 0.3196867108345032, + "learning_rate": 4.600191373738468e-05, + "loss": 0.0401, + "step": 31750 + }, + { + "epoch": 6.022011590749066, + "grad_norm": 0.09964177757501602, + "learning_rate": 4.599890471633959e-05, + "loss": 0.0368, + "step": 31760 + }, + { + "epoch": 6.022065753127877, + "grad_norm": 0.03318154811859131, + "learning_rate": 4.5995895695294494e-05, + "loss": 0.0232, + "step": 31770 + }, + { + "epoch": 6.022119915506689, + "grad_norm": 0.21701273322105408, + "learning_rate": 4.59928866742494e-05, + "loss": 0.1963, + "step": 31780 + }, + { + "epoch": 6.0221740778855, + "grad_norm": 0.7852664589881897, + "learning_rate": 4.598987765320431e-05, + "loss": 0.0281, + "step": 31790 + }, + { + "epoch": 6.022228240264313, + "grad_norm": 0.01032436452805996, + "learning_rate": 4.598686863215921e-05, + "loss": 0.1094, + "step": 31800 + }, + { + "epoch": 6.022282402643124, + "grad_norm": 0.01357176247984171, + "learning_rate": 4.598385961111412e-05, + "loss": 0.071, + "step": 31810 + }, + { + "epoch": 6.022336565021936, + "grad_norm": 0.036140091717243195, + "learning_rate": 4.598085059006903e-05, + "loss": 0.0268, + "step": 31820 + }, + { + "epoch": 6.022390727400747, + "grad_norm": 7.274936199188232, + "learning_rate": 4.597784156902394e-05, + "loss": 0.0964, + "step": 31830 + }, + { + "epoch": 6.0224448897795595, + "grad_norm": 0.0027829103637486696, + "learning_rate": 4.5974832547978844e-05, + "loss": 0.0542, + "step": 31840 + }, + { + "epoch": 6.022499052158371, + "grad_norm": 0.14943571388721466, + "learning_rate": 4.597182352693375e-05, + "loss": 0.0383, + "step": 31850 + }, + { + "epoch": 6.022553214537182, + "grad_norm": 0.0015685504768043756, + "learning_rate": 4.596881450588866e-05, + "loss": 0.0856, + "step": 31860 + }, + { + "epoch": 
6.022607376915994, + "grad_norm": 0.0019016307778656483, + "learning_rate": 4.596580548484356e-05, + "loss": 0.1822, + "step": 31870 + }, + { + "epoch": 6.0226615392948055, + "grad_norm": 1.6715822219848633, + "learning_rate": 4.596279646379847e-05, + "loss": 0.0958, + "step": 31880 + }, + { + "epoch": 6.022715701673618, + "grad_norm": 0.40065863728523254, + "learning_rate": 4.5959787442753376e-05, + "loss": 0.0079, + "step": 31890 + }, + { + "epoch": 6.022769864052429, + "grad_norm": 0.07929910719394684, + "learning_rate": 4.595677842170829e-05, + "loss": 0.0454, + "step": 31900 + }, + { + "epoch": 6.022824026431241, + "grad_norm": 0.0031369177158921957, + "learning_rate": 4.595376940066319e-05, + "loss": 0.0898, + "step": 31910 + }, + { + "epoch": 6.022878188810052, + "grad_norm": 9.163656234741211, + "learning_rate": 4.5950760379618094e-05, + "loss": 0.0597, + "step": 31920 + }, + { + "epoch": 6.022932351188865, + "grad_norm": 0.6264885663986206, + "learning_rate": 4.594775135857301e-05, + "loss": 0.1024, + "step": 31930 + }, + { + "epoch": 6.022986513567676, + "grad_norm": 2.8459396362304688, + "learning_rate": 4.5944742337527914e-05, + "loss": 0.0594, + "step": 31940 + }, + { + "epoch": 6.023040675946487, + "grad_norm": 0.09172773361206055, + "learning_rate": 4.594173331648281e-05, + "loss": 0.0771, + "step": 31950 + }, + { + "epoch": 6.023094838325299, + "grad_norm": 0.028856653720140457, + "learning_rate": 4.5938724295437726e-05, + "loss": 0.114, + "step": 31960 + }, + { + "epoch": 6.023149000704111, + "grad_norm": 0.0779784545302391, + "learning_rate": 4.593571527439263e-05, + "loss": 0.0192, + "step": 31970 + }, + { + "epoch": 6.023203163082923, + "grad_norm": 0.1770598441362381, + "learning_rate": 4.593270625334754e-05, + "loss": 0.0717, + "step": 31980 + }, + { + "epoch": 6.023257325461734, + "grad_norm": 0.2586139738559723, + "learning_rate": 4.5929697232302445e-05, + "loss": 0.0885, + "step": 31990 + }, + { + "epoch": 6.023311487840546, + "grad_norm": 0.3847748041152954, + "learning_rate": 4.592668821125735e-05, + "loss": 0.1248, + "step": 32000 + }, + { + "epoch": 6.0233656502193575, + "grad_norm": 1.1061909198760986, + "learning_rate": 4.592367919021226e-05, + "loss": 0.0991, + "step": 32010 + }, + { + "epoch": 6.02341981259817, + "grad_norm": 0.356291800737381, + "learning_rate": 4.592067016916717e-05, + "loss": 0.0436, + "step": 32020 + }, + { + "epoch": 6.023473974976981, + "grad_norm": 0.0017873855540528893, + "learning_rate": 4.591766114812207e-05, + "loss": 0.128, + "step": 32030 + }, + { + "epoch": 6.023528137355792, + "grad_norm": 5.607778549194336, + "learning_rate": 4.5914652127076977e-05, + "loss": 0.1332, + "step": 32040 + }, + { + "epoch": 6.0235822997346045, + "grad_norm": 0.26808658242225647, + "learning_rate": 4.591164310603189e-05, + "loss": 0.0764, + "step": 32050 + }, + { + "epoch": 6.023636462113416, + "grad_norm": 0.003868049941956997, + "learning_rate": 4.590863408498679e-05, + "loss": 0.0348, + "step": 32060 + }, + { + "epoch": 6.023690624492228, + "grad_norm": 0.15455612540245056, + "learning_rate": 4.5905625063941695e-05, + "loss": 0.1023, + "step": 32070 + }, + { + "epoch": 6.023744786871039, + "grad_norm": 0.18132314085960388, + "learning_rate": 4.590261604289661e-05, + "loss": 0.0182, + "step": 32080 + }, + { + "epoch": 6.023798949249851, + "grad_norm": 0.2621218264102936, + "learning_rate": 4.5899607021851515e-05, + "loss": 0.0487, + "step": 32090 + }, + { + "epoch": 6.023853111628663, + "grad_norm": 2.786668539047241, + "learning_rate": 
4.589659800080642e-05, + "loss": 0.1576, + "step": 32100 + }, + { + "epoch": 6.023907274007475, + "grad_norm": 0.6194859743118286, + "learning_rate": 4.589358897976133e-05, + "loss": 0.0343, + "step": 32110 + }, + { + "epoch": 6.023961436386286, + "grad_norm": 3.298781156539917, + "learning_rate": 4.5890579958716233e-05, + "loss": 0.0964, + "step": 32120 + }, + { + "epoch": 6.024015598765097, + "grad_norm": 2.4816322326660156, + "learning_rate": 4.588757093767114e-05, + "loss": 0.0809, + "step": 32130 + }, + { + "epoch": 6.02406976114391, + "grad_norm": 0.0030806101858615875, + "learning_rate": 4.5884561916626046e-05, + "loss": 0.011, + "step": 32140 + }, + { + "epoch": 6.024123923522721, + "grad_norm": 0.0036398118827492, + "learning_rate": 4.588155289558095e-05, + "loss": 0.1244, + "step": 32150 + }, + { + "epoch": 6.024178085901533, + "grad_norm": 0.28089550137519836, + "learning_rate": 4.5878543874535865e-05, + "loss": 0.1937, + "step": 32160 + }, + { + "epoch": 6.024232248280344, + "grad_norm": 16.676694869995117, + "learning_rate": 4.5875534853490765e-05, + "loss": 0.0846, + "step": 32170 + }, + { + "epoch": 6.0242864106591565, + "grad_norm": 2.753563165664673, + "learning_rate": 4.587252583244567e-05, + "loss": 0.0745, + "step": 32180 + }, + { + "epoch": 6.024340573037968, + "grad_norm": 11.229961395263672, + "learning_rate": 4.5869516811400584e-05, + "loss": 0.0532, + "step": 32190 + }, + { + "epoch": 6.02439473541678, + "grad_norm": 1.1459033489227295, + "learning_rate": 4.586650779035549e-05, + "loss": 0.0199, + "step": 32200 + }, + { + "epoch": 6.024448897795591, + "grad_norm": 0.02149255946278572, + "learning_rate": 4.586349876931039e-05, + "loss": 0.2091, + "step": 32210 + }, + { + "epoch": 6.0245030601744025, + "grad_norm": 1.460176944732666, + "learning_rate": 4.58604897482653e-05, + "loss": 0.1111, + "step": 32220 + }, + { + "epoch": 6.024557222553215, + "grad_norm": 0.21159479022026062, + "learning_rate": 4.585748072722021e-05, + "loss": 0.0426, + "step": 32230 + }, + { + "epoch": 6.024611384932026, + "grad_norm": 0.09960658103227615, + "learning_rate": 4.5854471706175116e-05, + "loss": 0.0478, + "step": 32240 + }, + { + "epoch": 6.024665547310838, + "grad_norm": 0.5686512589454651, + "learning_rate": 4.585146268513002e-05, + "loss": 0.2093, + "step": 32250 + }, + { + "epoch": 6.024719709689649, + "grad_norm": 2.9406204223632812, + "learning_rate": 4.584845366408493e-05, + "loss": 0.1816, + "step": 32260 + }, + { + "epoch": 6.024773872068462, + "grad_norm": 0.022038603201508522, + "learning_rate": 4.5845444643039834e-05, + "loss": 0.1368, + "step": 32270 + }, + { + "epoch": 6.024828034447273, + "grad_norm": 0.060279570519924164, + "learning_rate": 4.584243562199475e-05, + "loss": 0.0338, + "step": 32280 + }, + { + "epoch": 6.024882196826084, + "grad_norm": 0.1694820523262024, + "learning_rate": 4.583942660094965e-05, + "loss": 0.065, + "step": 32290 + }, + { + "epoch": 6.024936359204896, + "grad_norm": 0.18106858432292938, + "learning_rate": 4.583641757990455e-05, + "loss": 0.0607, + "step": 32300 + }, + { + "epoch": 6.024990521583708, + "grad_norm": 0.018989644944667816, + "learning_rate": 4.5833408558859466e-05, + "loss": 0.0653, + "step": 32310 + }, + { + "epoch": 6.02500135405947, + "eval_accuracy": 0.7756368386675375, + "eval_loss": 1.038594126701355, + "eval_runtime": 117.4951, + "eval_samples_per_second": 26.061, + "eval_steps_per_second": 3.26, + "step": 32312 + }, + { + "epoch": 7.00004332990305, + "grad_norm": 0.00679042236879468, + "learning_rate": 
4.5830399537814366e-05, + "loss": 0.2218, + "step": 32320 + }, + { + "epoch": 7.000097492281861, + "grad_norm": 0.06775037944316864, + "learning_rate": 4.582739051676927e-05, + "loss": 0.0675, + "step": 32330 + }, + { + "epoch": 7.000151654660673, + "grad_norm": 0.0951746329665184, + "learning_rate": 4.5824381495724185e-05, + "loss": 0.0341, + "step": 32340 + }, + { + "epoch": 7.000205817039484, + "grad_norm": 0.12862443923950195, + "learning_rate": 4.582137247467909e-05, + "loss": 0.1251, + "step": 32350 + }, + { + "epoch": 7.0002599794182965, + "grad_norm": 0.05928846821188927, + "learning_rate": 4.5818363453634e-05, + "loss": 0.0454, + "step": 32360 + }, + { + "epoch": 7.000314141797108, + "grad_norm": 0.05331055074930191, + "learning_rate": 4.5815354432588904e-05, + "loss": 0.0689, + "step": 32370 + }, + { + "epoch": 7.000368304175919, + "grad_norm": 0.12546378374099731, + "learning_rate": 4.581234541154381e-05, + "loss": 0.0066, + "step": 32380 + }, + { + "epoch": 7.000422466554731, + "grad_norm": 0.025460191071033478, + "learning_rate": 4.5809336390498716e-05, + "loss": 0.1099, + "step": 32390 + }, + { + "epoch": 7.0004766289335425, + "grad_norm": 0.006152594927698374, + "learning_rate": 4.580632736945362e-05, + "loss": 0.1144, + "step": 32400 + }, + { + "epoch": 7.000530791312355, + "grad_norm": 5.900794982910156, + "learning_rate": 4.580331834840853e-05, + "loss": 0.0934, + "step": 32410 + }, + { + "epoch": 7.000584953691166, + "grad_norm": 0.006770148873329163, + "learning_rate": 4.580030932736344e-05, + "loss": 0.0483, + "step": 32420 + }, + { + "epoch": 7.000639116069978, + "grad_norm": 0.006718629039824009, + "learning_rate": 4.579730030631835e-05, + "loss": 0.1442, + "step": 32430 + }, + { + "epoch": 7.000693278448789, + "grad_norm": 0.045082684606313705, + "learning_rate": 4.579429128527325e-05, + "loss": 0.0638, + "step": 32440 + }, + { + "epoch": 7.000747440827602, + "grad_norm": 3.5184359550476074, + "learning_rate": 4.579128226422816e-05, + "loss": 0.1344, + "step": 32450 + }, + { + "epoch": 7.000801603206413, + "grad_norm": 3.4514520168304443, + "learning_rate": 4.578827324318307e-05, + "loss": 0.1768, + "step": 32460 + }, + { + "epoch": 7.000855765585224, + "grad_norm": 1.2483826875686646, + "learning_rate": 4.578526422213797e-05, + "loss": 0.0611, + "step": 32470 + }, + { + "epoch": 7.000909927964036, + "grad_norm": 0.4090964198112488, + "learning_rate": 4.578225520109288e-05, + "loss": 0.0363, + "step": 32480 + }, + { + "epoch": 7.000964090342848, + "grad_norm": 0.7311972975730896, + "learning_rate": 4.5779246180047786e-05, + "loss": 0.04, + "step": 32490 + }, + { + "epoch": 7.00101825272166, + "grad_norm": 0.00408136798068881, + "learning_rate": 4.577623715900269e-05, + "loss": 0.0274, + "step": 32500 + }, + { + "epoch": 7.001072415100471, + "grad_norm": 0.31243476271629333, + "learning_rate": 4.57732281379576e-05, + "loss": 0.0548, + "step": 32510 + }, + { + "epoch": 7.001126577479283, + "grad_norm": 0.20971743762493134, + "learning_rate": 4.5770219116912505e-05, + "loss": 0.1617, + "step": 32520 + }, + { + "epoch": 7.0011807398580945, + "grad_norm": 0.6228681802749634, + "learning_rate": 4.576721009586741e-05, + "loss": 0.1137, + "step": 32530 + }, + { + "epoch": 7.001234902236906, + "grad_norm": 0.09942913800477982, + "learning_rate": 4.5764201074822324e-05, + "loss": 0.0102, + "step": 32540 + }, + { + "epoch": 7.001289064615718, + "grad_norm": 0.06628843396902084, + "learning_rate": 4.5761192053777224e-05, + "loss": 0.0371, + "step": 32550 + }, + { + "epoch": 
7.001343226994529, + "grad_norm": 2.1381008625030518, + "learning_rate": 4.575818303273213e-05, + "loss": 0.0233, + "step": 32560 + }, + { + "epoch": 7.001397389373341, + "grad_norm": 0.12471725791692734, + "learning_rate": 4.575517401168704e-05, + "loss": 0.1058, + "step": 32570 + }, + { + "epoch": 7.001451551752153, + "grad_norm": 0.9870837330818176, + "learning_rate": 4.575216499064195e-05, + "loss": 0.1316, + "step": 32580 + }, + { + "epoch": 7.001505714130965, + "grad_norm": 0.7975395917892456, + "learning_rate": 4.574915596959685e-05, + "loss": 0.0595, + "step": 32590 + }, + { + "epoch": 7.001559876509776, + "grad_norm": 0.00244343769736588, + "learning_rate": 4.574614694855176e-05, + "loss": 0.2148, + "step": 32600 + }, + { + "epoch": 7.001614038888588, + "grad_norm": 14.675838470458984, + "learning_rate": 4.574313792750667e-05, + "loss": 0.1621, + "step": 32610 + }, + { + "epoch": 7.0016682012674, + "grad_norm": 0.12863007187843323, + "learning_rate": 4.5740128906461574e-05, + "loss": 0.0351, + "step": 32620 + }, + { + "epoch": 7.001722363646211, + "grad_norm": 0.04697771742939949, + "learning_rate": 4.573711988541648e-05, + "loss": 0.1013, + "step": 32630 + }, + { + "epoch": 7.001776526025023, + "grad_norm": 0.10162032395601273, + "learning_rate": 4.573411086437139e-05, + "loss": 0.0999, + "step": 32640 + }, + { + "epoch": 7.001830688403834, + "grad_norm": 0.23925858736038208, + "learning_rate": 4.573110184332629e-05, + "loss": 0.1756, + "step": 32650 + }, + { + "epoch": 7.0018848507826466, + "grad_norm": 0.27764227986335754, + "learning_rate": 4.57280928222812e-05, + "loss": 0.0054, + "step": 32660 + }, + { + "epoch": 7.001939013161458, + "grad_norm": 0.0027246216777712107, + "learning_rate": 4.5725083801236106e-05, + "loss": 0.0785, + "step": 32670 + }, + { + "epoch": 7.00199317554027, + "grad_norm": 0.08220656216144562, + "learning_rate": 4.572207478019102e-05, + "loss": 0.0478, + "step": 32680 + }, + { + "epoch": 7.002047337919081, + "grad_norm": 0.048110831528902054, + "learning_rate": 4.5719065759145925e-05, + "loss": 0.0293, + "step": 32690 + }, + { + "epoch": 7.0021015002978935, + "grad_norm": 0.0015462142182514071, + "learning_rate": 4.5716056738100825e-05, + "loss": 0.0086, + "step": 32700 + }, + { + "epoch": 7.002155662676705, + "grad_norm": 23.956300735473633, + "learning_rate": 4.571304771705574e-05, + "loss": 0.0288, + "step": 32710 + }, + { + "epoch": 7.002209825055516, + "grad_norm": 0.053736936300992966, + "learning_rate": 4.5710038696010644e-05, + "loss": 0.0063, + "step": 32720 + }, + { + "epoch": 7.002263987434328, + "grad_norm": 2.6439783573150635, + "learning_rate": 4.570702967496555e-05, + "loss": 0.0248, + "step": 32730 + }, + { + "epoch": 7.0023181498131395, + "grad_norm": 2.9983856678009033, + "learning_rate": 4.5704020653920456e-05, + "loss": 0.0593, + "step": 32740 + }, + { + "epoch": 7.002372312191952, + "grad_norm": 7.759568214416504, + "learning_rate": 4.570101163287536e-05, + "loss": 0.3028, + "step": 32750 + }, + { + "epoch": 7.002426474570763, + "grad_norm": 1.2168779373168945, + "learning_rate": 4.569800261183027e-05, + "loss": 0.1353, + "step": 32760 + }, + { + "epoch": 7.002480636949575, + "grad_norm": 8.29804801940918, + "learning_rate": 4.5694993590785175e-05, + "loss": 0.1082, + "step": 32770 + }, + { + "epoch": 7.002534799328386, + "grad_norm": 0.029151039198040962, + "learning_rate": 4.569198456974008e-05, + "loss": 0.1403, + "step": 32780 + }, + { + "epoch": 7.002588961707199, + "grad_norm": 0.005989924538880587, + "learning_rate": 
4.568897554869499e-05, + "loss": 0.004, + "step": 32790 + }, + { + "epoch": 7.00264312408601, + "grad_norm": 6.829868316650391, + "learning_rate": 4.56859665276499e-05, + "loss": 0.1443, + "step": 32800 + }, + { + "epoch": 7.002697286464821, + "grad_norm": 0.009365064091980457, + "learning_rate": 4.56829575066048e-05, + "loss": 0.0089, + "step": 32810 + }, + { + "epoch": 7.002751448843633, + "grad_norm": 0.1807188093662262, + "learning_rate": 4.5679948485559707e-05, + "loss": 0.1119, + "step": 32820 + }, + { + "epoch": 7.002805611222445, + "grad_norm": 0.005986528936773539, + "learning_rate": 4.567693946451462e-05, + "loss": 0.0509, + "step": 32830 + }, + { + "epoch": 7.002859773601257, + "grad_norm": 7.044196605682373, + "learning_rate": 4.5673930443469526e-05, + "loss": 0.016, + "step": 32840 + }, + { + "epoch": 7.002913935980068, + "grad_norm": 0.14189356565475464, + "learning_rate": 4.5670921422424425e-05, + "loss": 0.1322, + "step": 32850 + }, + { + "epoch": 7.00296809835888, + "grad_norm": 3.917863368988037, + "learning_rate": 4.566791240137934e-05, + "loss": 0.1598, + "step": 32860 + }, + { + "epoch": 7.0030222607376915, + "grad_norm": 1.7306928634643555, + "learning_rate": 4.5664903380334245e-05, + "loss": 0.0667, + "step": 32870 + }, + { + "epoch": 7.003076423116504, + "grad_norm": 0.3776662051677704, + "learning_rate": 4.566189435928915e-05, + "loss": 0.0618, + "step": 32880 + }, + { + "epoch": 7.003130585495315, + "grad_norm": 1.0653982162475586, + "learning_rate": 4.565888533824406e-05, + "loss": 0.049, + "step": 32890 + }, + { + "epoch": 7.003184747874126, + "grad_norm": 0.1704825907945633, + "learning_rate": 4.5655876317198964e-05, + "loss": 0.1423, + "step": 32900 + }, + { + "epoch": 7.003238910252938, + "grad_norm": 2.4755523204803467, + "learning_rate": 4.565286729615387e-05, + "loss": 0.1307, + "step": 32910 + }, + { + "epoch": 7.00329307263175, + "grad_norm": 2.925830841064453, + "learning_rate": 4.5649858275108776e-05, + "loss": 0.0994, + "step": 32920 + }, + { + "epoch": 7.003347235010562, + "grad_norm": 0.374981164932251, + "learning_rate": 4.564684925406368e-05, + "loss": 0.141, + "step": 32930 + }, + { + "epoch": 7.003401397389373, + "grad_norm": 2.0994575023651123, + "learning_rate": 4.5643840233018595e-05, + "loss": 0.0709, + "step": 32940 + }, + { + "epoch": 7.003455559768185, + "grad_norm": 0.4228685796260834, + "learning_rate": 4.56408312119735e-05, + "loss": 0.091, + "step": 32950 + }, + { + "epoch": 7.003509722146997, + "grad_norm": 0.004510400351136923, + "learning_rate": 4.56378221909284e-05, + "loss": 0.0608, + "step": 32960 + }, + { + "epoch": 7.003563884525809, + "grad_norm": 1.306969165802002, + "learning_rate": 4.5634813169883314e-05, + "loss": 0.1837, + "step": 32970 + }, + { + "epoch": 7.00361804690462, + "grad_norm": 0.07240498065948486, + "learning_rate": 4.563180414883822e-05, + "loss": 0.031, + "step": 32980 + }, + { + "epoch": 7.003672209283431, + "grad_norm": 0.0021297279745340347, + "learning_rate": 4.562879512779313e-05, + "loss": 0.023, + "step": 32990 + }, + { + "epoch": 7.0037263716622435, + "grad_norm": 1.6994342803955078, + "learning_rate": 4.562578610674803e-05, + "loss": 0.0445, + "step": 33000 + }, + { + "epoch": 7.003780534041055, + "grad_norm": 0.2039346545934677, + "learning_rate": 4.562277708570294e-05, + "loss": 0.1923, + "step": 33010 + }, + { + "epoch": 7.003834696419867, + "grad_norm": 2.9098963737487793, + "learning_rate": 4.5619768064657846e-05, + "loss": 0.045, + "step": 33020 + }, + { + "epoch": 7.003888858798678, + 
"grad_norm": 2.6241705417633057, + "learning_rate": 4.561675904361276e-05, + "loss": 0.0983, + "step": 33030 + }, + { + "epoch": 7.0039430211774905, + "grad_norm": 1.7651206254959106, + "learning_rate": 4.561375002256766e-05, + "loss": 0.0431, + "step": 33040 + }, + { + "epoch": 7.003997183556302, + "grad_norm": 0.0895363986492157, + "learning_rate": 4.5610741001522564e-05, + "loss": 0.1473, + "step": 33050 + }, + { + "epoch": 7.004051345935113, + "grad_norm": 0.056921616196632385, + "learning_rate": 4.560773198047748e-05, + "loss": 0.0613, + "step": 33060 + }, + { + "epoch": 7.004105508313925, + "grad_norm": 0.15006916224956512, + "learning_rate": 4.560472295943238e-05, + "loss": 0.0213, + "step": 33070 + }, + { + "epoch": 7.0041596706927365, + "grad_norm": 6.356783390045166, + "learning_rate": 4.560171393838728e-05, + "loss": 0.1148, + "step": 33080 + }, + { + "epoch": 7.004213833071549, + "grad_norm": 0.024689510464668274, + "learning_rate": 4.5598704917342196e-05, + "loss": 0.0079, + "step": 33090 + }, + { + "epoch": 7.00426799545036, + "grad_norm": 0.007101150695234537, + "learning_rate": 4.55956958962971e-05, + "loss": 0.0072, + "step": 33100 + }, + { + "epoch": 7.004322157829172, + "grad_norm": 0.04013878107070923, + "learning_rate": 4.5592686875252e-05, + "loss": 0.0943, + "step": 33110 + }, + { + "epoch": 7.004376320207983, + "grad_norm": 0.6465972065925598, + "learning_rate": 4.5589677854206915e-05, + "loss": 0.0846, + "step": 33120 + }, + { + "epoch": 7.004430482586796, + "grad_norm": 0.4937107563018799, + "learning_rate": 4.558666883316182e-05, + "loss": 0.0823, + "step": 33130 + }, + { + "epoch": 7.004484644965607, + "grad_norm": 0.17817167937755585, + "learning_rate": 4.558365981211673e-05, + "loss": 0.109, + "step": 33140 + }, + { + "epoch": 7.004538807344418, + "grad_norm": 0.018106309697031975, + "learning_rate": 4.5580650791071634e-05, + "loss": 0.0103, + "step": 33150 + }, + { + "epoch": 7.00459296972323, + "grad_norm": 0.13110357522964478, + "learning_rate": 4.557764177002654e-05, + "loss": 0.0068, + "step": 33160 + }, + { + "epoch": 7.004647132102042, + "grad_norm": 0.002248329808935523, + "learning_rate": 4.5574632748981447e-05, + "loss": 0.1054, + "step": 33170 + }, + { + "epoch": 7.004701294480854, + "grad_norm": 0.3072637617588043, + "learning_rate": 4.557162372793636e-05, + "loss": 0.0975, + "step": 33180 + }, + { + "epoch": 7.004755456859665, + "grad_norm": 0.16019248962402344, + "learning_rate": 4.556861470689126e-05, + "loss": 0.0163, + "step": 33190 + }, + { + "epoch": 7.004809619238477, + "grad_norm": 0.011996911838650703, + "learning_rate": 4.556560568584617e-05, + "loss": 0.0389, + "step": 33200 + }, + { + "epoch": 7.0048637816172885, + "grad_norm": 0.0022713327780365944, + "learning_rate": 4.556259666480108e-05, + "loss": 0.0558, + "step": 33210 + }, + { + "epoch": 7.004917943996101, + "grad_norm": 0.08932683616876602, + "learning_rate": 4.555958764375598e-05, + "loss": 0.2548, + "step": 33220 + }, + { + "epoch": 7.004972106374912, + "grad_norm": 0.007234825287014246, + "learning_rate": 4.555657862271089e-05, + "loss": 0.0608, + "step": 33230 + }, + { + "epoch": 7.005026268753723, + "grad_norm": 0.4840485155582428, + "learning_rate": 4.55535696016658e-05, + "loss": 0.0645, + "step": 33240 + }, + { + "epoch": 7.005080431132535, + "grad_norm": 0.36613523960113525, + "learning_rate": 4.5550560580620704e-05, + "loss": 0.0686, + "step": 33250 + }, + { + "epoch": 7.005134593511347, + "grad_norm": 0.007132347207516432, + "learning_rate": 4.554755155957561e-05, + 
"loss": 0.1316, + "step": 33260 + }, + { + "epoch": 7.005188755890159, + "grad_norm": 4.472064971923828, + "learning_rate": 4.5544542538530516e-05, + "loss": 0.0191, + "step": 33270 + }, + { + "epoch": 7.00524291826897, + "grad_norm": 0.0035217958502471447, + "learning_rate": 4.554153351748542e-05, + "loss": 0.0099, + "step": 33280 + }, + { + "epoch": 7.005297080647782, + "grad_norm": 0.017295341938734055, + "learning_rate": 4.5538524496440335e-05, + "loss": 0.2762, + "step": 33290 + }, + { + "epoch": 7.005351243026594, + "grad_norm": 0.014533608220517635, + "learning_rate": 4.5535515475395235e-05, + "loss": 0.0805, + "step": 33300 + }, + { + "epoch": 7.005405405405406, + "grad_norm": 11.774585723876953, + "learning_rate": 4.553250645435014e-05, + "loss": 0.1104, + "step": 33310 + }, + { + "epoch": 7.005459567784217, + "grad_norm": 0.46777570247650146, + "learning_rate": 4.5529497433305054e-05, + "loss": 0.1844, + "step": 33320 + }, + { + "epoch": 7.005513730163028, + "grad_norm": 0.12428324669599533, + "learning_rate": 4.552648841225996e-05, + "loss": 0.0219, + "step": 33330 + }, + { + "epoch": 7.0055678925418405, + "grad_norm": 0.25031569600105286, + "learning_rate": 4.552347939121486e-05, + "loss": 0.1236, + "step": 33340 + }, + { + "epoch": 7.005622054920652, + "grad_norm": 0.02166709117591381, + "learning_rate": 4.552047037016977e-05, + "loss": 0.0617, + "step": 33350 + }, + { + "epoch": 7.005676217299464, + "grad_norm": 0.18805988132953644, + "learning_rate": 4.551746134912468e-05, + "loss": 0.0214, + "step": 33360 + }, + { + "epoch": 7.005730379678275, + "grad_norm": 0.007477836683392525, + "learning_rate": 4.551445232807958e-05, + "loss": 0.0043, + "step": 33370 + }, + { + "epoch": 7.0057845420570874, + "grad_norm": 0.14454685151576996, + "learning_rate": 4.551144330703449e-05, + "loss": 0.0147, + "step": 33380 + }, + { + "epoch": 7.005838704435899, + "grad_norm": 0.004736863076686859, + "learning_rate": 4.55084342859894e-05, + "loss": 0.0355, + "step": 33390 + }, + { + "epoch": 7.005892866814711, + "grad_norm": 0.008181101642549038, + "learning_rate": 4.5505425264944304e-05, + "loss": 0.1572, + "step": 33400 + }, + { + "epoch": 7.005947029193522, + "grad_norm": 0.008246039040386677, + "learning_rate": 4.550241624389921e-05, + "loss": 0.1268, + "step": 33410 + }, + { + "epoch": 7.0060011915723335, + "grad_norm": 0.10274672508239746, + "learning_rate": 4.549940722285412e-05, + "loss": 0.0222, + "step": 33420 + }, + { + "epoch": 7.006055353951146, + "grad_norm": 7.054144382476807, + "learning_rate": 4.549639820180902e-05, + "loss": 0.1829, + "step": 33430 + }, + { + "epoch": 7.006109516329957, + "grad_norm": 0.26072385907173157, + "learning_rate": 4.5493389180763936e-05, + "loss": 0.0382, + "step": 33440 + }, + { + "epoch": 7.006163678708769, + "grad_norm": 0.13494211435317993, + "learning_rate": 4.5490380159718836e-05, + "loss": 0.006, + "step": 33450 + }, + { + "epoch": 7.00621784108758, + "grad_norm": 0.22056402266025543, + "learning_rate": 4.548737113867375e-05, + "loss": 0.1566, + "step": 33460 + }, + { + "epoch": 7.006272003466393, + "grad_norm": 0.06399959325790405, + "learning_rate": 4.5484362117628655e-05, + "loss": 0.07, + "step": 33470 + }, + { + "epoch": 7.006326165845204, + "grad_norm": 0.023575574159622192, + "learning_rate": 4.548135309658356e-05, + "loss": 0.0083, + "step": 33480 + }, + { + "epoch": 7.006380328224016, + "grad_norm": 0.34878700971603394, + "learning_rate": 4.547834407553847e-05, + "loss": 0.1147, + "step": 33490 + }, + { + "epoch": 7.006434490602827, + 
"grad_norm": 2.543015480041504, + "learning_rate": 4.5475335054493374e-05, + "loss": 0.041, + "step": 33500 + }, + { + "epoch": 7.006488652981639, + "grad_norm": 0.2472725510597229, + "learning_rate": 4.547232603344828e-05, + "loss": 0.0364, + "step": 33510 + }, + { + "epoch": 7.006542815360451, + "grad_norm": 0.14555685222148895, + "learning_rate": 4.5469317012403186e-05, + "loss": 0.0356, + "step": 33520 + }, + { + "epoch": 7.006596977739262, + "grad_norm": 2.202363967895508, + "learning_rate": 4.546630799135809e-05, + "loss": 0.0646, + "step": 33530 + }, + { + "epoch": 7.006651140118074, + "grad_norm": 0.1614273637533188, + "learning_rate": 4.5463298970313e-05, + "loss": 0.0956, + "step": 33540 + }, + { + "epoch": 7.0067053024968855, + "grad_norm": 0.03610461577773094, + "learning_rate": 4.546028994926791e-05, + "loss": 0.0094, + "step": 33550 + }, + { + "epoch": 7.006759464875698, + "grad_norm": 0.0015151748666539788, + "learning_rate": 4.545728092822281e-05, + "loss": 0.0049, + "step": 33560 + }, + { + "epoch": 7.006813627254509, + "grad_norm": 0.004340590909123421, + "learning_rate": 4.545427190717772e-05, + "loss": 0.0228, + "step": 33570 + }, + { + "epoch": 7.006867789633321, + "grad_norm": 0.0809430405497551, + "learning_rate": 4.545126288613263e-05, + "loss": 0.0342, + "step": 33580 + }, + { + "epoch": 7.006921952012132, + "grad_norm": 0.01368605624884367, + "learning_rate": 4.544825386508754e-05, + "loss": 0.0519, + "step": 33590 + }, + { + "epoch": 7.006976114390944, + "grad_norm": 8.448967933654785, + "learning_rate": 4.544524484404244e-05, + "loss": 0.0808, + "step": 33600 + }, + { + "epoch": 7.007030276769756, + "grad_norm": 0.012177586555480957, + "learning_rate": 4.544223582299735e-05, + "loss": 0.0962, + "step": 33610 + }, + { + "epoch": 7.007084439148567, + "grad_norm": 0.01880405656993389, + "learning_rate": 4.5439226801952256e-05, + "loss": 0.1378, + "step": 33620 + }, + { + "epoch": 7.007138601527379, + "grad_norm": 0.019566817209124565, + "learning_rate": 4.543621778090716e-05, + "loss": 0.0409, + "step": 33630 + }, + { + "epoch": 7.007192763906191, + "grad_norm": 0.006269589997828007, + "learning_rate": 4.543320875986207e-05, + "loss": 0.1474, + "step": 33640 + }, + { + "epoch": 7.007246926285003, + "grad_norm": 0.0824575200676918, + "learning_rate": 4.5430199738816975e-05, + "loss": 0.0303, + "step": 33650 + }, + { + "epoch": 7.007301088663814, + "grad_norm": 0.38083890080451965, + "learning_rate": 4.542719071777188e-05, + "loss": 0.0887, + "step": 33660 + }, + { + "epoch": 7.007355251042625, + "grad_norm": 0.1063384935259819, + "learning_rate": 4.542418169672679e-05, + "loss": 0.2018, + "step": 33670 + }, + { + "epoch": 7.0074094134214375, + "grad_norm": 0.28445321321487427, + "learning_rate": 4.5421172675681694e-05, + "loss": 0.0674, + "step": 33680 + }, + { + "epoch": 7.007463575800249, + "grad_norm": 0.005148506257683039, + "learning_rate": 4.54181636546366e-05, + "loss": 0.052, + "step": 33690 + }, + { + "epoch": 7.007517738179061, + "grad_norm": 4.185569763183594, + "learning_rate": 4.541515463359151e-05, + "loss": 0.1748, + "step": 33700 + }, + { + "epoch": 7.007571900557872, + "grad_norm": 7.416937828063965, + "learning_rate": 4.541214561254641e-05, + "loss": 0.1922, + "step": 33710 + }, + { + "epoch": 7.007626062936684, + "grad_norm": 0.8525494933128357, + "learning_rate": 4.5409136591501326e-05, + "loss": 0.0902, + "step": 33720 + }, + { + "epoch": 7.007680225315496, + "grad_norm": 0.056527502834796906, + "learning_rate": 4.540612757045623e-05, + "loss": 
0.1986, + "step": 33730 + }, + { + "epoch": 7.007734387694308, + "grad_norm": 0.4430144131183624, + "learning_rate": 4.540311854941114e-05, + "loss": 0.0542, + "step": 33740 + }, + { + "epoch": 7.007788550073119, + "grad_norm": 0.12654417753219604, + "learning_rate": 4.5400109528366044e-05, + "loss": 0.1126, + "step": 33750 + }, + { + "epoch": 7.0078427124519305, + "grad_norm": 2.172940969467163, + "learning_rate": 4.539710050732095e-05, + "loss": 0.0721, + "step": 33760 + }, + { + "epoch": 7.007896874830743, + "grad_norm": 0.1274147778749466, + "learning_rate": 4.539409148627586e-05, + "loss": 0.0889, + "step": 33770 + }, + { + "epoch": 7.007951037209554, + "grad_norm": 0.37557390332221985, + "learning_rate": 4.539108246523076e-05, + "loss": 0.1636, + "step": 33780 + }, + { + "epoch": 7.008005199588366, + "grad_norm": 0.5300049781799316, + "learning_rate": 4.538807344418567e-05, + "loss": 0.0312, + "step": 33790 + }, + { + "epoch": 7.008059361967177, + "grad_norm": 0.2536580562591553, + "learning_rate": 4.5385064423140576e-05, + "loss": 0.0064, + "step": 33800 + }, + { + "epoch": 7.0081135243459896, + "grad_norm": 0.008212034590542316, + "learning_rate": 4.538205540209549e-05, + "loss": 0.0962, + "step": 33810 + }, + { + "epoch": 7.008167686724801, + "grad_norm": 0.0635610893368721, + "learning_rate": 4.537904638105039e-05, + "loss": 0.0845, + "step": 33820 + }, + { + "epoch": 7.008221849103613, + "grad_norm": 0.5959897637367249, + "learning_rate": 4.5376037360005295e-05, + "loss": 0.2062, + "step": 33830 + }, + { + "epoch": 7.008276011482424, + "grad_norm": 2.5515859127044678, + "learning_rate": 4.537302833896021e-05, + "loss": 0.1196, + "step": 33840 + }, + { + "epoch": 7.008330173861236, + "grad_norm": 0.01656654290854931, + "learning_rate": 4.5370019317915114e-05, + "loss": 0.0689, + "step": 33850 + }, + { + "epoch": 7.008384336240048, + "grad_norm": 6.851761817932129, + "learning_rate": 4.536701029687001e-05, + "loss": 0.1157, + "step": 33860 + }, + { + "epoch": 7.008438498618859, + "grad_norm": 1.423666000366211, + "learning_rate": 4.5364001275824926e-05, + "loss": 0.0924, + "step": 33870 + }, + { + "epoch": 7.008492660997671, + "grad_norm": 0.21323548257350922, + "learning_rate": 4.536099225477983e-05, + "loss": 0.0283, + "step": 33880 + }, + { + "epoch": 7.0085468233764825, + "grad_norm": 7.236353397369385, + "learning_rate": 4.535798323373474e-05, + "loss": 0.1396, + "step": 33890 + }, + { + "epoch": 7.008600985755295, + "grad_norm": 0.8245626091957092, + "learning_rate": 4.5354974212689645e-05, + "loss": 0.1058, + "step": 33900 + }, + { + "epoch": 7.008655148134106, + "grad_norm": 0.12695001065731049, + "learning_rate": 4.535196519164455e-05, + "loss": 0.0783, + "step": 33910 + }, + { + "epoch": 7.008709310512918, + "grad_norm": 0.17997121810913086, + "learning_rate": 4.534895617059946e-05, + "loss": 0.0307, + "step": 33920 + }, + { + "epoch": 7.008763472891729, + "grad_norm": 1.849968433380127, + "learning_rate": 4.534594714955437e-05, + "loss": 0.1355, + "step": 33930 + }, + { + "epoch": 7.008817635270541, + "grad_norm": 0.02124139294028282, + "learning_rate": 4.534293812850927e-05, + "loss": 0.0859, + "step": 33940 + }, + { + "epoch": 7.008871797649353, + "grad_norm": 0.18076881766319275, + "learning_rate": 4.533992910746418e-05, + "loss": 0.0811, + "step": 33950 + }, + { + "epoch": 7.008925960028164, + "grad_norm": 0.39409661293029785, + "learning_rate": 4.533692008641909e-05, + "loss": 0.0714, + "step": 33960 + }, + { + "epoch": 7.008980122406976, + "grad_norm": 
0.002041416708379984, + "learning_rate": 4.533391106537399e-05, + "loss": 0.0078, + "step": 33970 + }, + { + "epoch": 7.009034284785788, + "grad_norm": 0.07872708886861801, + "learning_rate": 4.53309020443289e-05, + "loss": 0.0084, + "step": 33980 + }, + { + "epoch": 7.0090884471646, + "grad_norm": 0.1980978101491928, + "learning_rate": 4.532789302328381e-05, + "loss": 0.04, + "step": 33990 + }, + { + "epoch": 7.009142609543411, + "grad_norm": 0.006639828439801931, + "learning_rate": 4.5324884002238715e-05, + "loss": 0.0077, + "step": 34000 + }, + { + "epoch": 7.009196771922223, + "grad_norm": 0.08406348526477814, + "learning_rate": 4.532187498119362e-05, + "loss": 0.2469, + "step": 34010 + }, + { + "epoch": 7.0092509343010345, + "grad_norm": 0.7924312949180603, + "learning_rate": 4.531886596014853e-05, + "loss": 0.0092, + "step": 34020 + }, + { + "epoch": 7.009305096679846, + "grad_norm": 1.0251858234405518, + "learning_rate": 4.5315856939103434e-05, + "loss": 0.0972, + "step": 34030 + }, + { + "epoch": 7.009359259058658, + "grad_norm": 0.6704382300376892, + "learning_rate": 4.531284791805834e-05, + "loss": 0.0886, + "step": 34040 + }, + { + "epoch": 7.009413421437469, + "grad_norm": 12.159014701843262, + "learning_rate": 4.5309838897013246e-05, + "loss": 0.098, + "step": 34050 + }, + { + "epoch": 7.009467583816281, + "grad_norm": 0.3859002888202667, + "learning_rate": 4.530682987596815e-05, + "loss": 0.17, + "step": 34060 + }, + { + "epoch": 7.009521746195093, + "grad_norm": 0.004568261094391346, + "learning_rate": 4.5303820854923065e-05, + "loss": 0.04, + "step": 34070 + }, + { + "epoch": 7.009575908573905, + "grad_norm": 0.6232761740684509, + "learning_rate": 4.530081183387797e-05, + "loss": 0.0322, + "step": 34080 + }, + { + "epoch": 7.009630070952716, + "grad_norm": 16.11067008972168, + "learning_rate": 4.529780281283287e-05, + "loss": 0.1377, + "step": 34090 + }, + { + "epoch": 7.009684233331528, + "grad_norm": 0.17957964539527893, + "learning_rate": 4.5294793791787784e-05, + "loss": 0.047, + "step": 34100 + }, + { + "epoch": 7.00973839571034, + "grad_norm": 0.007485478185117245, + "learning_rate": 4.529178477074269e-05, + "loss": 0.0499, + "step": 34110 + }, + { + "epoch": 7.009792558089151, + "grad_norm": 0.2642413377761841, + "learning_rate": 4.528877574969759e-05, + "loss": 0.08, + "step": 34120 + }, + { + "epoch": 7.009846720467963, + "grad_norm": 0.2442207932472229, + "learning_rate": 4.52857667286525e-05, + "loss": 0.0558, + "step": 34130 + }, + { + "epoch": 7.009900882846774, + "grad_norm": 0.5806466341018677, + "learning_rate": 4.528275770760741e-05, + "loss": 0.0202, + "step": 34140 + }, + { + "epoch": 7.0099550452255865, + "grad_norm": 0.08168760687112808, + "learning_rate": 4.5279748686562316e-05, + "loss": 0.0706, + "step": 34150 + }, + { + "epoch": 7.010009207604398, + "grad_norm": 14.138127326965332, + "learning_rate": 4.527673966551722e-05, + "loss": 0.201, + "step": 34160 + }, + { + "epoch": 7.01006336998321, + "grad_norm": 0.13253070414066315, + "learning_rate": 4.527373064447213e-05, + "loss": 0.1643, + "step": 34170 + }, + { + "epoch": 7.010117532362021, + "grad_norm": 0.3820914328098297, + "learning_rate": 4.5270721623427034e-05, + "loss": 0.0926, + "step": 34180 + }, + { + "epoch": 7.0101716947408335, + "grad_norm": 0.033337969332933426, + "learning_rate": 4.526771260238195e-05, + "loss": 0.1839, + "step": 34190 + }, + { + "epoch": 7.010225857119645, + "grad_norm": 0.012209362350404263, + "learning_rate": 4.526470358133685e-05, + "loss": 0.0896, + "step": 34200 
+ }, + { + "epoch": 7.010280019498456, + "grad_norm": 0.9131116271018982, + "learning_rate": 4.526169456029175e-05, + "loss": 0.1097, + "step": 34210 + }, + { + "epoch": 7.010334181877268, + "grad_norm": 0.055453963577747345, + "learning_rate": 4.5258685539246666e-05, + "loss": 0.1125, + "step": 34220 + }, + { + "epoch": 7.0103883442560795, + "grad_norm": 0.3052462339401245, + "learning_rate": 4.525567651820157e-05, + "loss": 0.0847, + "step": 34230 + }, + { + "epoch": 7.010442506634892, + "grad_norm": 1.1979002952575684, + "learning_rate": 4.525266749715648e-05, + "loss": 0.0165, + "step": 34240 + }, + { + "epoch": 7.010496669013703, + "grad_norm": 0.20440441370010376, + "learning_rate": 4.5249658476111385e-05, + "loss": 0.1082, + "step": 34250 + }, + { + "epoch": 7.010550831392515, + "grad_norm": 0.019945405423641205, + "learning_rate": 4.524664945506629e-05, + "loss": 0.0777, + "step": 34260 + }, + { + "epoch": 7.010604993771326, + "grad_norm": 2.7121965885162354, + "learning_rate": 4.52436404340212e-05, + "loss": 0.1119, + "step": 34270 + }, + { + "epoch": 7.010659156150138, + "grad_norm": 0.00810435600578785, + "learning_rate": 4.5240631412976104e-05, + "loss": 0.1447, + "step": 34280 + }, + { + "epoch": 7.01071331852895, + "grad_norm": 0.6571696400642395, + "learning_rate": 4.523762239193101e-05, + "loss": 0.1112, + "step": 34290 + }, + { + "epoch": 7.010767480907761, + "grad_norm": 0.009607692249119282, + "learning_rate": 4.5234613370885917e-05, + "loss": 0.044, + "step": 34300 + }, + { + "epoch": 7.010821643286573, + "grad_norm": 0.012837120331823826, + "learning_rate": 4.523160434984082e-05, + "loss": 0.0039, + "step": 34310 + }, + { + "epoch": 7.010875805665385, + "grad_norm": 3.523524045944214, + "learning_rate": 4.522859532879573e-05, + "loss": 0.1589, + "step": 34320 + }, + { + "epoch": 7.010929968044197, + "grad_norm": 1.4732778072357178, + "learning_rate": 4.522558630775064e-05, + "loss": 0.0659, + "step": 34330 + }, + { + "epoch": 7.010984130423008, + "grad_norm": 0.24215245246887207, + "learning_rate": 4.522257728670555e-05, + "loss": 0.1247, + "step": 34340 + }, + { + "epoch": 7.01103829280182, + "grad_norm": 0.00990304071456194, + "learning_rate": 4.521956826566045e-05, + "loss": 0.0099, + "step": 34350 + }, + { + "epoch": 7.0110924551806315, + "grad_norm": 0.11928728222846985, + "learning_rate": 4.521655924461536e-05, + "loss": 0.1026, + "step": 34360 + }, + { + "epoch": 7.011146617559443, + "grad_norm": 0.9191178679466248, + "learning_rate": 4.521355022357027e-05, + "loss": 0.0633, + "step": 34370 + }, + { + "epoch": 7.011200779938255, + "grad_norm": 0.03312951326370239, + "learning_rate": 4.5210541202525174e-05, + "loss": 0.0282, + "step": 34380 + }, + { + "epoch": 7.011254942317066, + "grad_norm": 0.017991552129387856, + "learning_rate": 4.520753218148008e-05, + "loss": 0.0802, + "step": 34390 + }, + { + "epoch": 7.011309104695878, + "grad_norm": 0.07593835145235062, + "learning_rate": 4.5204523160434986e-05, + "loss": 0.172, + "step": 34400 + }, + { + "epoch": 7.01136326707469, + "grad_norm": 0.3016771674156189, + "learning_rate": 4.520151413938989e-05, + "loss": 0.0626, + "step": 34410 + }, + { + "epoch": 7.011417429453502, + "grad_norm": 2.9913086891174316, + "learning_rate": 4.51985051183448e-05, + "loss": 0.1567, + "step": 34420 + }, + { + "epoch": 7.011471591832313, + "grad_norm": 0.1044284850358963, + "learning_rate": 4.5195496097299705e-05, + "loss": 0.0978, + "step": 34430 + }, + { + "epoch": 7.011525754211125, + "grad_norm": 0.017873186618089676, + 
"learning_rate": 4.519248707625461e-05, + "loss": 0.0159, + "step": 34440 + }, + { + "epoch": 7.011579916589937, + "grad_norm": 0.10838720202445984, + "learning_rate": 4.5189478055209524e-05, + "loss": 0.0749, + "step": 34450 + }, + { + "epoch": 7.011634078968748, + "grad_norm": 0.011084917932748795, + "learning_rate": 4.5186469034164424e-05, + "loss": 0.0576, + "step": 34460 + }, + { + "epoch": 7.01168824134756, + "grad_norm": 3.8919739723205566, + "learning_rate": 4.518346001311933e-05, + "loss": 0.17, + "step": 34470 + }, + { + "epoch": 7.011742403726371, + "grad_norm": 0.3858637809753418, + "learning_rate": 4.518045099207424e-05, + "loss": 0.2813, + "step": 34480 + }, + { + "epoch": 7.0117965661051835, + "grad_norm": 0.01016867533326149, + "learning_rate": 4.517744197102915e-05, + "loss": 0.2026, + "step": 34490 + }, + { + "epoch": 7.011850728483995, + "grad_norm": 0.03841691464185715, + "learning_rate": 4.5174432949984056e-05, + "loss": 0.0537, + "step": 34500 + }, + { + "epoch": 7.011904890862807, + "grad_norm": 0.47870683670043945, + "learning_rate": 4.517142392893896e-05, + "loss": 0.0122, + "step": 34510 + }, + { + "epoch": 7.011959053241618, + "grad_norm": 0.9177903532981873, + "learning_rate": 4.516841490789387e-05, + "loss": 0.1255, + "step": 34520 + }, + { + "epoch": 7.0120132156204305, + "grad_norm": 0.7727459073066711, + "learning_rate": 4.5165405886848774e-05, + "loss": 0.0103, + "step": 34530 + }, + { + "epoch": 7.012067377999242, + "grad_norm": 0.061044786125421524, + "learning_rate": 4.516239686580368e-05, + "loss": 0.1626, + "step": 34540 + }, + { + "epoch": 7.012121540378053, + "grad_norm": 13.687017440795898, + "learning_rate": 4.515938784475859e-05, + "loss": 0.1561, + "step": 34550 + }, + { + "epoch": 7.012175702756865, + "grad_norm": 1.2881423234939575, + "learning_rate": 4.515637882371349e-05, + "loss": 0.0742, + "step": 34560 + }, + { + "epoch": 7.0122298651356765, + "grad_norm": 0.02968868426978588, + "learning_rate": 4.51533698026684e-05, + "loss": 0.0606, + "step": 34570 + }, + { + "epoch": 7.012284027514489, + "grad_norm": 0.07591604441404343, + "learning_rate": 4.5150360781623306e-05, + "loss": 0.1294, + "step": 34580 + }, + { + "epoch": 7.0123381898933, + "grad_norm": 0.09138476103544235, + "learning_rate": 4.514735176057822e-05, + "loss": 0.0284, + "step": 34590 + }, + { + "epoch": 7.012392352272112, + "grad_norm": 0.5626019835472107, + "learning_rate": 4.5144342739533125e-05, + "loss": 0.078, + "step": 34600 + }, + { + "epoch": 7.012446514650923, + "grad_norm": 1.1955920457839966, + "learning_rate": 4.5141333718488025e-05, + "loss": 0.0752, + "step": 34610 + }, + { + "epoch": 7.012500677029736, + "grad_norm": 2.7993690967559814, + "learning_rate": 4.513832469744294e-05, + "loss": 0.1505, + "step": 34620 + }, + { + "epoch": 7.012554839408547, + "grad_norm": 0.8638518452644348, + "learning_rate": 4.5135315676397844e-05, + "loss": 0.0613, + "step": 34630 + }, + { + "epoch": 7.012609001787358, + "grad_norm": 0.016672300174832344, + "learning_rate": 4.513230665535275e-05, + "loss": 0.0414, + "step": 34640 + }, + { + "epoch": 7.01266316416617, + "grad_norm": 5.44243860244751, + "learning_rate": 4.5129297634307656e-05, + "loss": 0.1318, + "step": 34650 + }, + { + "epoch": 7.012717326544982, + "grad_norm": 0.5387796759605408, + "learning_rate": 4.512628861326256e-05, + "loss": 0.0438, + "step": 34660 + }, + { + "epoch": 7.012771488923794, + "grad_norm": 0.19342981278896332, + "learning_rate": 4.512327959221747e-05, + "loss": 0.0312, + "step": 34670 + }, + { + 
"epoch": 7.012825651302605, + "grad_norm": 5.513591766357422, + "learning_rate": 4.512027057117238e-05, + "loss": 0.0371, + "step": 34680 + }, + { + "epoch": 7.012879813681417, + "grad_norm": 0.12986256182193756, + "learning_rate": 4.511726155012728e-05, + "loss": 0.0453, + "step": 34690 + }, + { + "epoch": 7.0129339760602285, + "grad_norm": 1.4699238538742065, + "learning_rate": 4.511425252908219e-05, + "loss": 0.1139, + "step": 34700 + }, + { + "epoch": 7.012988138439041, + "grad_norm": 2.5216972827911377, + "learning_rate": 4.51112435080371e-05, + "loss": 0.0153, + "step": 34710 + }, + { + "epoch": 7.013042300817852, + "grad_norm": 0.07281087338924408, + "learning_rate": 4.5108234486992e-05, + "loss": 0.0052, + "step": 34720 + }, + { + "epoch": 7.013096463196663, + "grad_norm": 0.024023165926337242, + "learning_rate": 4.510522546594691e-05, + "loss": 0.01, + "step": 34730 + }, + { + "epoch": 7.013150625575475, + "grad_norm": 0.09721668064594269, + "learning_rate": 4.510221644490182e-05, + "loss": 0.3444, + "step": 34740 + }, + { + "epoch": 7.013204787954287, + "grad_norm": 6.040936470031738, + "learning_rate": 4.5099207423856726e-05, + "loss": 0.196, + "step": 34750 + }, + { + "epoch": 7.013258950333099, + "grad_norm": 0.21144932508468628, + "learning_rate": 4.509619840281163e-05, + "loss": 0.042, + "step": 34760 + }, + { + "epoch": 7.01331311271191, + "grad_norm": 0.40663060545921326, + "learning_rate": 4.509318938176654e-05, + "loss": 0.0111, + "step": 34770 + }, + { + "epoch": 7.013367275090722, + "grad_norm": 0.0430501252412796, + "learning_rate": 4.5090180360721445e-05, + "loss": 0.1607, + "step": 34780 + }, + { + "epoch": 7.013421437469534, + "grad_norm": 0.018232379108667374, + "learning_rate": 4.508717133967635e-05, + "loss": 0.0764, + "step": 34790 + }, + { + "epoch": 7.013475599848345, + "grad_norm": 0.031489260494709015, + "learning_rate": 4.508416231863126e-05, + "loss": 0.0784, + "step": 34800 + }, + { + "epoch": 7.013529762227157, + "grad_norm": 0.34304261207580566, + "learning_rate": 4.5081153297586164e-05, + "loss": 0.0641, + "step": 34810 + }, + { + "epoch": 7.013583924605968, + "grad_norm": 0.02737201377749443, + "learning_rate": 4.507814427654107e-05, + "loss": 0.0158, + "step": 34820 + }, + { + "epoch": 7.0136380869847805, + "grad_norm": 0.5438295602798462, + "learning_rate": 4.507513525549598e-05, + "loss": 0.0313, + "step": 34830 + }, + { + "epoch": 7.013692249363592, + "grad_norm": 0.1389273852109909, + "learning_rate": 4.507212623445088e-05, + "loss": 0.0507, + "step": 34840 + }, + { + "epoch": 7.013746411742404, + "grad_norm": 0.004798214882612228, + "learning_rate": 4.5069117213405796e-05, + "loss": 0.0362, + "step": 34850 + }, + { + "epoch": 7.013800574121215, + "grad_norm": 0.003969857003539801, + "learning_rate": 4.50661081923607e-05, + "loss": 0.0077, + "step": 34860 + }, + { + "epoch": 7.013854736500027, + "grad_norm": 6.444830417633057, + "learning_rate": 4.50630991713156e-05, + "loss": 0.0923, + "step": 34870 + }, + { + "epoch": 7.013908898878839, + "grad_norm": 0.2063504308462143, + "learning_rate": 4.5060090150270514e-05, + "loss": 0.0456, + "step": 34880 + }, + { + "epoch": 7.01396306125765, + "grad_norm": 0.06644925475120544, + "learning_rate": 4.505708112922542e-05, + "loss": 0.0698, + "step": 34890 + }, + { + "epoch": 7.014017223636462, + "grad_norm": 0.0018473215168341994, + "learning_rate": 4.505407210818033e-05, + "loss": 0.0618, + "step": 34900 + }, + { + "epoch": 7.0140713860152735, + "grad_norm": 0.05321958661079407, + "learning_rate": 
4.505106308713523e-05, + "loss": 0.0412, + "step": 34910 + }, + { + "epoch": 7.014125548394086, + "grad_norm": 22.789405822753906, + "learning_rate": 4.504805406609014e-05, + "loss": 0.1033, + "step": 34920 + }, + { + "epoch": 7.014179710772897, + "grad_norm": 1.5473663806915283, + "learning_rate": 4.5045045045045046e-05, + "loss": 0.0159, + "step": 34930 + }, + { + "epoch": 7.014233873151709, + "grad_norm": 0.0729600340127945, + "learning_rate": 4.504203602399996e-05, + "loss": 0.0687, + "step": 34940 + }, + { + "epoch": 7.01428803553052, + "grad_norm": 0.014819654636085033, + "learning_rate": 4.503902700295486e-05, + "loss": 0.1166, + "step": 34950 + }, + { + "epoch": 7.0143421979093326, + "grad_norm": 0.2208363562822342, + "learning_rate": 4.5036017981909765e-05, + "loss": 0.0801, + "step": 34960 + }, + { + "epoch": 7.014396360288144, + "grad_norm": 1.6478379964828491, + "learning_rate": 4.503300896086468e-05, + "loss": 0.0538, + "step": 34970 + }, + { + "epoch": 7.014450522666955, + "grad_norm": 0.17765839397907257, + "learning_rate": 4.5029999939819584e-05, + "loss": 0.107, + "step": 34980 + }, + { + "epoch": 7.014504685045767, + "grad_norm": 0.40910667181015015, + "learning_rate": 4.502699091877448e-05, + "loss": 0.097, + "step": 34990 + }, + { + "epoch": 7.014558847424579, + "grad_norm": 1.9491711854934692, + "learning_rate": 4.5023981897729396e-05, + "loss": 0.0427, + "step": 35000 + }, + { + "epoch": 7.014613009803391, + "grad_norm": 0.7696524858474731, + "learning_rate": 4.50209728766843e-05, + "loss": 0.164, + "step": 35010 + }, + { + "epoch": 7.014667172182202, + "grad_norm": 0.06141582503914833, + "learning_rate": 4.501796385563921e-05, + "loss": 0.1932, + "step": 35020 + }, + { + "epoch": 7.014721334561014, + "grad_norm": 1.6822487115859985, + "learning_rate": 4.5014954834594115e-05, + "loss": 0.1233, + "step": 35030 + }, + { + "epoch": 7.0147754969398255, + "grad_norm": 0.5714677572250366, + "learning_rate": 4.501194581354902e-05, + "loss": 0.0694, + "step": 35040 + }, + { + "epoch": 7.014829659318638, + "grad_norm": 2.85331654548645, + "learning_rate": 4.500893679250393e-05, + "loss": 0.204, + "step": 35050 + }, + { + "epoch": 7.014883821697449, + "grad_norm": 0.8032820224761963, + "learning_rate": 4.5005927771458834e-05, + "loss": 0.0596, + "step": 35060 + }, + { + "epoch": 7.01493798407626, + "grad_norm": 1.2294763326644897, + "learning_rate": 4.500291875041374e-05, + "loss": 0.0889, + "step": 35070 + }, + { + "epoch": 7.014992146455072, + "grad_norm": 0.012830978259444237, + "learning_rate": 4.499990972936865e-05, + "loss": 0.041, + "step": 35080 + }, + { + "epoch": 7.015046308833884, + "grad_norm": 0.0048205675557255745, + "learning_rate": 4.499690070832356e-05, + "loss": 0.0446, + "step": 35090 + }, + { + "epoch": 7.015100471212696, + "grad_norm": 0.07666574418544769, + "learning_rate": 4.499389168727846e-05, + "loss": 0.0036, + "step": 35100 + }, + { + "epoch": 7.015154633591507, + "grad_norm": 0.108399897813797, + "learning_rate": 4.499088266623337e-05, + "loss": 0.0348, + "step": 35110 + }, + { + "epoch": 7.015208795970319, + "grad_norm": 0.0033718044869601727, + "learning_rate": 4.498787364518828e-05, + "loss": 0.0394, + "step": 35120 + }, + { + "epoch": 7.015262958349131, + "grad_norm": 0.11253232508897781, + "learning_rate": 4.4984864624143185e-05, + "loss": 0.051, + "step": 35130 + }, + { + "epoch": 7.015317120727943, + "grad_norm": 0.8300681114196777, + "learning_rate": 4.498185560309809e-05, + "loss": 0.2473, + "step": 35140 + }, + { + "epoch": 
7.015371283106754, + "grad_norm": 2.8564741611480713, + "learning_rate": 4.4978846582053e-05, + "loss": 0.1259, + "step": 35150 + }, + { + "epoch": 7.015425445485565, + "grad_norm": 0.023209962993860245, + "learning_rate": 4.4975837561007904e-05, + "loss": 0.0469, + "step": 35160 + }, + { + "epoch": 7.0154796078643775, + "grad_norm": 0.3476739823818207, + "learning_rate": 4.497282853996281e-05, + "loss": 0.0063, + "step": 35170 + }, + { + "epoch": 7.015533770243189, + "grad_norm": 0.34118402004241943, + "learning_rate": 4.4969819518917716e-05, + "loss": 0.1322, + "step": 35180 + }, + { + "epoch": 7.015587932622001, + "grad_norm": 0.024681799113750458, + "learning_rate": 4.496681049787262e-05, + "loss": 0.104, + "step": 35190 + }, + { + "epoch": 7.015642095000812, + "grad_norm": 4.190957546234131, + "learning_rate": 4.4963801476827535e-05, + "loss": 0.0745, + "step": 35200 + }, + { + "epoch": 7.015696257379624, + "grad_norm": 0.05113965645432472, + "learning_rate": 4.4960792455782435e-05, + "loss": 0.1042, + "step": 35210 + }, + { + "epoch": 7.015750419758436, + "grad_norm": 0.8025801181793213, + "learning_rate": 4.495778343473734e-05, + "loss": 0.0946, + "step": 35220 + }, + { + "epoch": 7.015804582137248, + "grad_norm": 2.6580698490142822, + "learning_rate": 4.4954774413692254e-05, + "loss": 0.0523, + "step": 35230 + }, + { + "epoch": 7.015858744516059, + "grad_norm": 0.040175892412662506, + "learning_rate": 4.495176539264716e-05, + "loss": 0.0867, + "step": 35240 + }, + { + "epoch": 7.0159129068948705, + "grad_norm": 0.6562447547912598, + "learning_rate": 4.494875637160206e-05, + "loss": 0.0289, + "step": 35250 + }, + { + "epoch": 7.015967069273683, + "grad_norm": 5.0700860023498535, + "learning_rate": 4.494574735055697e-05, + "loss": 0.1354, + "step": 35260 + }, + { + "epoch": 7.016021231652494, + "grad_norm": 0.16008806228637695, + "learning_rate": 4.494273832951188e-05, + "loss": 0.2578, + "step": 35270 + }, + { + "epoch": 7.016075394031306, + "grad_norm": 27.90900421142578, + "learning_rate": 4.4939729308466786e-05, + "loss": 0.0559, + "step": 35280 + }, + { + "epoch": 7.016129556410117, + "grad_norm": 0.13736604154109955, + "learning_rate": 4.493672028742169e-05, + "loss": 0.0893, + "step": 35290 + }, + { + "epoch": 7.0161837187889295, + "grad_norm": 0.016703128814697266, + "learning_rate": 4.49337112663766e-05, + "loss": 0.1673, + "step": 35300 + }, + { + "epoch": 7.016237881167741, + "grad_norm": 0.851484477519989, + "learning_rate": 4.4930702245331505e-05, + "loss": 0.0643, + "step": 35310 + }, + { + "epoch": 7.016292043546553, + "grad_norm": 0.003748267190530896, + "learning_rate": 4.492769322428641e-05, + "loss": 0.0951, + "step": 35320 + }, + { + "epoch": 7.016346205925364, + "grad_norm": 0.10281394422054291, + "learning_rate": 4.492468420324132e-05, + "loss": 0.0664, + "step": 35330 + }, + { + "epoch": 7.016400368304176, + "grad_norm": 0.011406267993152142, + "learning_rate": 4.492167518219623e-05, + "loss": 0.1248, + "step": 35340 + }, + { + "epoch": 7.016454530682988, + "grad_norm": 0.027072133496403694, + "learning_rate": 4.4918666161151136e-05, + "loss": 0.0538, + "step": 35350 + }, + { + "epoch": 7.016508693061799, + "grad_norm": 2.6154215335845947, + "learning_rate": 4.4915657140106036e-05, + "loss": 0.0659, + "step": 35360 + }, + { + "epoch": 7.016562855440611, + "grad_norm": 0.16302962601184845, + "learning_rate": 4.491264811906095e-05, + "loss": 0.1142, + "step": 35370 + }, + { + "epoch": 7.0166170178194225, + "grad_norm": 0.01804296486079693, + "learning_rate": 
4.4909639098015855e-05, + "loss": 0.1309, + "step": 35380 + }, + { + "epoch": 7.016671180198235, + "grad_norm": 0.21002119779586792, + "learning_rate": 4.490663007697076e-05, + "loss": 0.153, + "step": 35390 + }, + { + "epoch": 7.016725342577046, + "grad_norm": 0.03532659634947777, + "learning_rate": 4.490362105592567e-05, + "loss": 0.0562, + "step": 35400 + }, + { + "epoch": 7.016779504955857, + "grad_norm": 2.2452361583709717, + "learning_rate": 4.4900612034880574e-05, + "loss": 0.0545, + "step": 35410 + }, + { + "epoch": 7.016833667334669, + "grad_norm": 2.7062346935272217, + "learning_rate": 4.489760301383548e-05, + "loss": 0.2294, + "step": 35420 + }, + { + "epoch": 7.016887829713481, + "grad_norm": 2.1848673820495605, + "learning_rate": 4.489459399279039e-05, + "loss": 0.2119, + "step": 35430 + }, + { + "epoch": 7.016941992092293, + "grad_norm": 0.3465251326560974, + "learning_rate": 4.489158497174529e-05, + "loss": 0.0673, + "step": 35440 + }, + { + "epoch": 7.016996154471104, + "grad_norm": 0.49092385172843933, + "learning_rate": 4.48885759507002e-05, + "loss": 0.0142, + "step": 35450 + }, + { + "epoch": 7.017050316849916, + "grad_norm": 0.045992154628038406, + "learning_rate": 4.488556692965511e-05, + "loss": 0.0493, + "step": 35460 + }, + { + "epoch": 7.017104479228728, + "grad_norm": 0.07535011321306229, + "learning_rate": 4.488255790861001e-05, + "loss": 0.0799, + "step": 35470 + }, + { + "epoch": 7.01715864160754, + "grad_norm": 1.4288395643234253, + "learning_rate": 4.487954888756492e-05, + "loss": 0.1448, + "step": 35480 + }, + { + "epoch": 7.017212803986351, + "grad_norm": 0.025081688538193703, + "learning_rate": 4.487653986651983e-05, + "loss": 0.0591, + "step": 35490 + }, + { + "epoch": 7.017266966365162, + "grad_norm": 1.4446815252304077, + "learning_rate": 4.487353084547474e-05, + "loss": 0.1466, + "step": 35500 + }, + { + "epoch": 7.0173211287439745, + "grad_norm": 0.6861498355865479, + "learning_rate": 4.487052182442964e-05, + "loss": 0.0391, + "step": 35510 + }, + { + "epoch": 7.017375291122786, + "grad_norm": 0.43032240867614746, + "learning_rate": 4.486751280338455e-05, + "loss": 0.0605, + "step": 35520 + }, + { + "epoch": 7.017429453501598, + "grad_norm": 3.333970308303833, + "learning_rate": 4.4864503782339456e-05, + "loss": 0.109, + "step": 35530 + }, + { + "epoch": 7.017483615880409, + "grad_norm": 0.022954817861318588, + "learning_rate": 4.486149476129436e-05, + "loss": 0.0608, + "step": 35540 + }, + { + "epoch": 7.017537778259221, + "grad_norm": 0.004127213265746832, + "learning_rate": 4.485848574024927e-05, + "loss": 0.0488, + "step": 35550 + }, + { + "epoch": 7.017591940638033, + "grad_norm": 0.6049934029579163, + "learning_rate": 4.4855476719204175e-05, + "loss": 0.0879, + "step": 35560 + }, + { + "epoch": 7.017646103016845, + "grad_norm": 6.136470794677734, + "learning_rate": 4.485246769815908e-05, + "loss": 0.2949, + "step": 35570 + }, + { + "epoch": 7.017700265395656, + "grad_norm": 0.14600770175457, + "learning_rate": 4.4849458677113994e-05, + "loss": 0.1455, + "step": 35580 + }, + { + "epoch": 7.0177544277744675, + "grad_norm": 0.00909191183745861, + "learning_rate": 4.4846449656068894e-05, + "loss": 0.117, + "step": 35590 + }, + { + "epoch": 7.01780859015328, + "grad_norm": 0.009913668036460876, + "learning_rate": 4.484344063502381e-05, + "loss": 0.0149, + "step": 35600 + }, + { + "epoch": 7.017862752532091, + "grad_norm": 0.08192995190620422, + "learning_rate": 4.484043161397871e-05, + "loss": 0.007, + "step": 35610 + }, + { + "epoch": 
7.017916914910903, + "grad_norm": 0.31688928604125977, + "learning_rate": 4.483742259293361e-05, + "loss": 0.0267, + "step": 35620 + }, + { + "epoch": 7.017971077289714, + "grad_norm": 0.30731338262557983, + "learning_rate": 4.4834413571888526e-05, + "loss": 0.0299, + "step": 35630 + }, + { + "epoch": 7.0180252396685265, + "grad_norm": 0.15788564085960388, + "learning_rate": 4.483140455084343e-05, + "loss": 0.0037, + "step": 35640 + }, + { + "epoch": 7.018079402047338, + "grad_norm": 0.024721220135688782, + "learning_rate": 4.482839552979834e-05, + "loss": 0.0195, + "step": 35650 + }, + { + "epoch": 7.01813356442615, + "grad_norm": 5.135998725891113, + "learning_rate": 4.4825386508753244e-05, + "loss": 0.054, + "step": 35660 + }, + { + "epoch": 7.018187726804961, + "grad_norm": 0.02359355241060257, + "learning_rate": 4.482237748770815e-05, + "loss": 0.3776, + "step": 35670 + }, + { + "epoch": 7.018241889183773, + "grad_norm": 0.17440472543239594, + "learning_rate": 4.481936846666306e-05, + "loss": 0.0551, + "step": 35680 + }, + { + "epoch": 7.018296051562585, + "grad_norm": 0.6988747119903564, + "learning_rate": 4.481635944561797e-05, + "loss": 0.0888, + "step": 35690 + }, + { + "epoch": 7.018350213941396, + "grad_norm": 0.19871456921100616, + "learning_rate": 4.481335042457287e-05, + "loss": 0.0435, + "step": 35700 + }, + { + "epoch": 7.018404376320208, + "grad_norm": 0.8909059762954712, + "learning_rate": 4.4810341403527776e-05, + "loss": 0.1142, + "step": 35710 + }, + { + "epoch": 7.0184585386990195, + "grad_norm": 0.14265531301498413, + "learning_rate": 4.480733238248269e-05, + "loss": 0.1548, + "step": 35720 + }, + { + "epoch": 7.018512701077832, + "grad_norm": 0.014120636507868767, + "learning_rate": 4.4804323361437595e-05, + "loss": 0.0684, + "step": 35730 + }, + { + "epoch": 7.018566863456643, + "grad_norm": 1.8849942684173584, + "learning_rate": 4.4801314340392495e-05, + "loss": 0.0415, + "step": 35740 + }, + { + "epoch": 7.018621025835455, + "grad_norm": 0.009903286583721638, + "learning_rate": 4.479830531934741e-05, + "loss": 0.0913, + "step": 35750 + }, + { + "epoch": 7.018675188214266, + "grad_norm": 0.2509879469871521, + "learning_rate": 4.4795296298302314e-05, + "loss": 0.0722, + "step": 35760 + }, + { + "epoch": 7.018729350593078, + "grad_norm": 0.004166616592556238, + "learning_rate": 4.4792287277257213e-05, + "loss": 0.2559, + "step": 35770 + }, + { + "epoch": 7.01878351297189, + "grad_norm": 0.14406222105026245, + "learning_rate": 4.4789278256212127e-05, + "loss": 0.0873, + "step": 35780 + }, + { + "epoch": 7.018837675350701, + "grad_norm": 0.007657983340322971, + "learning_rate": 4.478626923516703e-05, + "loss": 0.008, + "step": 35790 + }, + { + "epoch": 7.018891837729513, + "grad_norm": 0.008547556586563587, + "learning_rate": 4.478326021412194e-05, + "loss": 0.038, + "step": 35800 + }, + { + "epoch": 7.018946000108325, + "grad_norm": 0.022472413256764412, + "learning_rate": 4.4780251193076845e-05, + "loss": 0.0124, + "step": 35810 + }, + { + "epoch": 7.019000162487137, + "grad_norm": 0.10550523549318314, + "learning_rate": 4.477724217203175e-05, + "loss": 0.1788, + "step": 35820 + }, + { + "epoch": 7.019054324865948, + "grad_norm": 0.8499479293823242, + "learning_rate": 4.477423315098666e-05, + "loss": 0.0411, + "step": 35830 + }, + { + "epoch": 7.01910848724476, + "grad_norm": 2.623711109161377, + "learning_rate": 4.477122412994157e-05, + "loss": 0.0551, + "step": 35840 + }, + { + "epoch": 7.0191626496235715, + "grad_norm": 0.2002072036266327, + "learning_rate": 
4.476821510889647e-05, + "loss": 0.0339, + "step": 35850 + }, + { + "epoch": 7.019216812002383, + "grad_norm": 0.6044508218765259, + "learning_rate": 4.4765206087851383e-05, + "loss": 0.0575, + "step": 35860 + }, + { + "epoch": 7.019270974381195, + "grad_norm": 0.1461334526538849, + "learning_rate": 4.476219706680629e-05, + "loss": 0.1253, + "step": 35870 + }, + { + "epoch": 7.019325136760006, + "grad_norm": 4.755772590637207, + "learning_rate": 4.4759188045761196e-05, + "loss": 0.0565, + "step": 35880 + }, + { + "epoch": 7.019379299138818, + "grad_norm": 0.16999021172523499, + "learning_rate": 4.47561790247161e-05, + "loss": 0.1026, + "step": 35890 + }, + { + "epoch": 7.01943346151763, + "grad_norm": 0.002533446066081524, + "learning_rate": 4.475317000367101e-05, + "loss": 0.0658, + "step": 35900 + }, + { + "epoch": 7.019487623896442, + "grad_norm": 0.03882667049765587, + "learning_rate": 4.4750160982625915e-05, + "loss": 0.0893, + "step": 35910 + }, + { + "epoch": 7.019541786275253, + "grad_norm": 0.19063609838485718, + "learning_rate": 4.474715196158082e-05, + "loss": 0.0871, + "step": 35920 + }, + { + "epoch": 7.019595948654064, + "grad_norm": 0.1655806452035904, + "learning_rate": 4.474414294053573e-05, + "loss": 0.1079, + "step": 35930 + }, + { + "epoch": 7.019650111032877, + "grad_norm": 6.734665393829346, + "learning_rate": 4.4741133919490634e-05, + "loss": 0.109, + "step": 35940 + }, + { + "epoch": 7.019704273411688, + "grad_norm": 0.008205975405871868, + "learning_rate": 4.473812489844555e-05, + "loss": 0.0583, + "step": 35950 + }, + { + "epoch": 7.0197584357905, + "grad_norm": 0.01014983095228672, + "learning_rate": 4.4735115877400446e-05, + "loss": 0.0067, + "step": 35960 + }, + { + "epoch": 7.019812598169311, + "grad_norm": 5.373883247375488, + "learning_rate": 4.473210685635535e-05, + "loss": 0.0819, + "step": 35970 + }, + { + "epoch": 7.0198667605481235, + "grad_norm": 0.07003141939640045, + "learning_rate": 4.4729097835310266e-05, + "loss": 0.0751, + "step": 35980 + }, + { + "epoch": 7.019920922926935, + "grad_norm": 0.004204805474728346, + "learning_rate": 4.472608881426517e-05, + "loss": 0.0786, + "step": 35990 + }, + { + "epoch": 7.019975085305747, + "grad_norm": 4.8697614669799805, + "learning_rate": 4.472307979322007e-05, + "loss": 0.1533, + "step": 36000 + }, + { + "epoch": 7.020029247684558, + "grad_norm": 0.02599012479186058, + "learning_rate": 4.4720070772174984e-05, + "loss": 0.079, + "step": 36010 + }, + { + "epoch": 7.0200834100633696, + "grad_norm": 0.024459799751639366, + "learning_rate": 4.471706175112989e-05, + "loss": 0.2171, + "step": 36020 + }, + { + "epoch": 7.020137572442182, + "grad_norm": 0.04818723723292351, + "learning_rate": 4.47140527300848e-05, + "loss": 0.0228, + "step": 36030 + }, + { + "epoch": 7.020191734820993, + "grad_norm": 0.08096017688512802, + "learning_rate": 4.47110437090397e-05, + "loss": 0.0333, + "step": 36040 + }, + { + "epoch": 7.020245897199805, + "grad_norm": 0.09995739907026291, + "learning_rate": 4.470803468799461e-05, + "loss": 0.0738, + "step": 36050 + }, + { + "epoch": 7.0203000595786165, + "grad_norm": 0.01863723248243332, + "learning_rate": 4.4705025666949516e-05, + "loss": 0.0107, + "step": 36060 + }, + { + "epoch": 7.020354221957429, + "grad_norm": 0.008407536894083023, + "learning_rate": 4.470201664590442e-05, + "loss": 0.0319, + "step": 36070 + }, + { + "epoch": 7.02040838433624, + "grad_norm": 5.6258625984191895, + "learning_rate": 4.469900762485933e-05, + "loss": 0.1846, + "step": 36080 + }, + { + "epoch": 
7.020462546715052, + "grad_norm": 0.030112702399492264, + "learning_rate": 4.4695998603814235e-05, + "loss": 0.156, + "step": 36090 + }, + { + "epoch": 7.020516709093863, + "grad_norm": 0.02462916076183319, + "learning_rate": 4.469298958276915e-05, + "loss": 0.0384, + "step": 36100 + }, + { + "epoch": 7.020570871472675, + "grad_norm": 1.798354983329773, + "learning_rate": 4.468998056172405e-05, + "loss": 0.0256, + "step": 36110 + }, + { + "epoch": 7.020625033851487, + "grad_norm": 4.444777488708496, + "learning_rate": 4.468697154067896e-05, + "loss": 0.1875, + "step": 36120 + }, + { + "epoch": 7.020679196230298, + "grad_norm": 1.2475333213806152, + "learning_rate": 4.4683962519633866e-05, + "loss": 0.1244, + "step": 36130 + }, + { + "epoch": 7.02073335860911, + "grad_norm": 0.745653510093689, + "learning_rate": 4.468095349858877e-05, + "loss": 0.0969, + "step": 36140 + }, + { + "epoch": 7.020787520987922, + "grad_norm": 34.33847427368164, + "learning_rate": 4.467794447754368e-05, + "loss": 0.0512, + "step": 36150 + }, + { + "epoch": 7.020841683366734, + "grad_norm": 0.026069622486829758, + "learning_rate": 4.4674935456498585e-05, + "loss": 0.0136, + "step": 36160 + }, + { + "epoch": 7.020895845745545, + "grad_norm": 0.3503160774707794, + "learning_rate": 4.467192643545349e-05, + "loss": 0.0658, + "step": 36170 + }, + { + "epoch": 7.020950008124357, + "grad_norm": 0.35645654797554016, + "learning_rate": 4.46689174144084e-05, + "loss": 0.0676, + "step": 36180 + }, + { + "epoch": 7.0210041705031685, + "grad_norm": 0.07985389232635498, + "learning_rate": 4.4665908393363304e-05, + "loss": 0.0976, + "step": 36190 + }, + { + "epoch": 7.02105833288198, + "grad_norm": 0.009018891490995884, + "learning_rate": 4.466289937231821e-05, + "loss": 0.0754, + "step": 36200 + }, + { + "epoch": 7.021112495260792, + "grad_norm": 0.15764780342578888, + "learning_rate": 4.4659890351273123e-05, + "loss": 0.053, + "step": 36210 + }, + { + "epoch": 7.021166657639603, + "grad_norm": 0.6300204992294312, + "learning_rate": 4.465688133022802e-05, + "loss": 0.0347, + "step": 36220 + }, + { + "epoch": 7.021220820018415, + "grad_norm": 0.05313396453857422, + "learning_rate": 4.465387230918293e-05, + "loss": 0.078, + "step": 36230 + }, + { + "epoch": 7.021274982397227, + "grad_norm": 0.07026387006044388, + "learning_rate": 4.465086328813784e-05, + "loss": 0.0847, + "step": 36240 + }, + { + "epoch": 7.021329144776039, + "grad_norm": 0.11618059873580933, + "learning_rate": 4.464785426709275e-05, + "loss": 0.1036, + "step": 36250 + }, + { + "epoch": 7.02138330715485, + "grad_norm": 0.15900975465774536, + "learning_rate": 4.464484524604765e-05, + "loss": 0.1158, + "step": 36260 + }, + { + "epoch": 7.021437469533662, + "grad_norm": 0.8849380016326904, + "learning_rate": 4.464183622500256e-05, + "loss": 0.0436, + "step": 36270 + }, + { + "epoch": 7.021491631912474, + "grad_norm": 0.11483481526374817, + "learning_rate": 4.463882720395747e-05, + "loss": 0.0657, + "step": 36280 + }, + { + "epoch": 7.021545794291285, + "grad_norm": 0.011335954070091248, + "learning_rate": 4.4635818182912374e-05, + "loss": 0.0207, + "step": 36290 + }, + { + "epoch": 7.021599956670097, + "grad_norm": 0.03566396236419678, + "learning_rate": 4.463280916186728e-05, + "loss": 0.2135, + "step": 36300 + }, + { + "epoch": 7.021654119048908, + "grad_norm": 0.023607484996318817, + "learning_rate": 4.4629800140822186e-05, + "loss": 0.0257, + "step": 36310 + }, + { + "epoch": 7.0217082814277205, + "grad_norm": 0.01767519861459732, + "learning_rate": 
4.462679111977709e-05, + "loss": 0.127, + "step": 36320 + }, + { + "epoch": 7.021762443806532, + "grad_norm": 0.42236119508743286, + "learning_rate": 4.4623782098732006e-05, + "loss": 0.2298, + "step": 36330 + }, + { + "epoch": 7.021816606185344, + "grad_norm": 0.2613135576248169, + "learning_rate": 4.4620773077686905e-05, + "loss": 0.1055, + "step": 36340 + }, + { + "epoch": 7.021870768564155, + "grad_norm": 0.44989702105522156, + "learning_rate": 4.461776405664181e-05, + "loss": 0.0666, + "step": 36350 + }, + { + "epoch": 7.021924930942967, + "grad_norm": 0.08489754796028137, + "learning_rate": 4.4614755035596724e-05, + "loss": 0.0313, + "step": 36360 + }, + { + "epoch": 7.021979093321779, + "grad_norm": 0.28010451793670654, + "learning_rate": 4.4611746014551624e-05, + "loss": 0.1359, + "step": 36370 + }, + { + "epoch": 7.02203325570059, + "grad_norm": 0.008933937177062035, + "learning_rate": 4.460873699350654e-05, + "loss": 0.0444, + "step": 36380 + }, + { + "epoch": 7.022087418079402, + "grad_norm": 0.0258648544549942, + "learning_rate": 4.460572797246144e-05, + "loss": 0.0775, + "step": 36390 + }, + { + "epoch": 7.0221415804582135, + "grad_norm": 0.25959739089012146, + "learning_rate": 4.460271895141635e-05, + "loss": 0.1277, + "step": 36400 + }, + { + "epoch": 7.022195742837026, + "grad_norm": 0.2643592357635498, + "learning_rate": 4.4599709930371256e-05, + "loss": 0.0433, + "step": 36410 + }, + { + "epoch": 7.022249905215837, + "grad_norm": 0.4376610517501831, + "learning_rate": 4.459670090932616e-05, + "loss": 0.1847, + "step": 36420 + }, + { + "epoch": 7.022304067594649, + "grad_norm": 0.012051482684910297, + "learning_rate": 4.459369188828107e-05, + "loss": 0.0807, + "step": 36430 + }, + { + "epoch": 7.02235822997346, + "grad_norm": 0.0085378959774971, + "learning_rate": 4.4590682867235975e-05, + "loss": 0.0351, + "step": 36440 + }, + { + "epoch": 7.0224123923522725, + "grad_norm": 0.012722736224532127, + "learning_rate": 4.458767384619088e-05, + "loss": 0.1122, + "step": 36450 + }, + { + "epoch": 7.022466554731084, + "grad_norm": 0.21566632390022278, + "learning_rate": 4.458466482514579e-05, + "loss": 0.131, + "step": 36460 + }, + { + "epoch": 7.022520717109895, + "grad_norm": 0.010806105099618435, + "learning_rate": 4.45816558041007e-05, + "loss": 0.0089, + "step": 36470 + }, + { + "epoch": 7.022574879488707, + "grad_norm": 0.07770703732967377, + "learning_rate": 4.4578646783055606e-05, + "loss": 0.0499, + "step": 36480 + }, + { + "epoch": 7.022629041867519, + "grad_norm": 0.1414186805486679, + "learning_rate": 4.4575637762010506e-05, + "loss": 0.0287, + "step": 36490 + }, + { + "epoch": 7.022683204246331, + "grad_norm": 0.15986359119415283, + "learning_rate": 4.457262874096542e-05, + "loss": 0.1208, + "step": 36500 + }, + { + "epoch": 7.022737366625142, + "grad_norm": 0.2918829917907715, + "learning_rate": 4.4569619719920325e-05, + "loss": 0.0268, + "step": 36510 + }, + { + "epoch": 7.022791529003954, + "grad_norm": 0.490293025970459, + "learning_rate": 4.4566610698875225e-05, + "loss": 0.0684, + "step": 36520 + }, + { + "epoch": 7.0228456913827655, + "grad_norm": 0.04124172031879425, + "learning_rate": 4.456360167783014e-05, + "loss": 0.0058, + "step": 36530 + }, + { + "epoch": 7.022899853761577, + "grad_norm": 0.0020175527315586805, + "learning_rate": 4.4560592656785044e-05, + "loss": 0.0047, + "step": 36540 + }, + { + "epoch": 7.022954016140389, + "grad_norm": 2.3902249336242676, + "learning_rate": 4.455758363573995e-05, + "loss": 0.084, + "step": 36550 + }, + { + "epoch": 
7.0230081785192, + "grad_norm": 0.0062264809384942055, + "learning_rate": 4.4554574614694857e-05, + "loss": 0.2295, + "step": 36560 + }, + { + "epoch": 7.023062340898012, + "grad_norm": 3.5042829513549805, + "learning_rate": 4.455156559364976e-05, + "loss": 0.0815, + "step": 36570 + }, + { + "epoch": 7.023116503276824, + "grad_norm": 0.13689438998699188, + "learning_rate": 4.454855657260467e-05, + "loss": 0.0754, + "step": 36580 + }, + { + "epoch": 7.023170665655636, + "grad_norm": 2.387827157974243, + "learning_rate": 4.454554755155958e-05, + "loss": 0.1052, + "step": 36590 + }, + { + "epoch": 7.023224828034447, + "grad_norm": 1.3116718530654907, + "learning_rate": 4.454253853051448e-05, + "loss": 0.0443, + "step": 36600 + }, + { + "epoch": 7.023278990413259, + "grad_norm": 0.8198032975196838, + "learning_rate": 4.453952950946939e-05, + "loss": 0.0879, + "step": 36610 + }, + { + "epoch": 7.023333152792071, + "grad_norm": 1.6349326372146606, + "learning_rate": 4.45365204884243e-05, + "loss": 0.0902, + "step": 36620 + }, + { + "epoch": 7.023387315170882, + "grad_norm": 0.1721566766500473, + "learning_rate": 4.453351146737921e-05, + "loss": 0.0473, + "step": 36630 + }, + { + "epoch": 7.023441477549694, + "grad_norm": 7.131582736968994, + "learning_rate": 4.4530502446334114e-05, + "loss": 0.2448, + "step": 36640 + }, + { + "epoch": 7.023495639928505, + "grad_norm": 6.483015060424805, + "learning_rate": 4.452749342528902e-05, + "loss": 0.1526, + "step": 36650 + }, + { + "epoch": 7.0235498023073175, + "grad_norm": 0.07126662135124207, + "learning_rate": 4.4524484404243926e-05, + "loss": 0.0251, + "step": 36660 + }, + { + "epoch": 7.023603964686129, + "grad_norm": 0.05914284288883209, + "learning_rate": 4.452147538319883e-05, + "loss": 0.0985, + "step": 36670 + }, + { + "epoch": 7.023658127064941, + "grad_norm": 0.03604617714881897, + "learning_rate": 4.451846636215374e-05, + "loss": 0.0276, + "step": 36680 + }, + { + "epoch": 7.023712289443752, + "grad_norm": 0.12433405965566635, + "learning_rate": 4.4515457341108645e-05, + "loss": 0.077, + "step": 36690 + }, + { + "epoch": 7.023766451822564, + "grad_norm": 0.039611078798770905, + "learning_rate": 4.451244832006355e-05, + "loss": 0.0424, + "step": 36700 + }, + { + "epoch": 7.023820614201376, + "grad_norm": 0.026655158028006554, + "learning_rate": 4.450943929901846e-05, + "loss": 0.0159, + "step": 36710 + }, + { + "epoch": 7.023874776580187, + "grad_norm": 2.0458459854125977, + "learning_rate": 4.4506430277973364e-05, + "loss": 0.206, + "step": 36720 + }, + { + "epoch": 7.023928938958999, + "grad_norm": 4.023892402648926, + "learning_rate": 4.450342125692828e-05, + "loss": 0.1757, + "step": 36730 + }, + { + "epoch": 7.0239831013378105, + "grad_norm": 2.5780322551727295, + "learning_rate": 4.450041223588318e-05, + "loss": 0.0977, + "step": 36740 + }, + { + "epoch": 7.024037263716623, + "grad_norm": 6.088955879211426, + "learning_rate": 4.449740321483808e-05, + "loss": 0.0385, + "step": 36750 + }, + { + "epoch": 7.024091426095434, + "grad_norm": 0.0019974280148744583, + "learning_rate": 4.4494394193792996e-05, + "loss": 0.0247, + "step": 36760 + }, + { + "epoch": 7.024145588474246, + "grad_norm": 1.671327829360962, + "learning_rate": 4.44913851727479e-05, + "loss": 0.0742, + "step": 36770 + }, + { + "epoch": 7.024199750853057, + "grad_norm": 1.793485403060913, + "learning_rate": 4.448837615170281e-05, + "loss": 0.1299, + "step": 36780 + }, + { + "epoch": 7.0242539132318695, + "grad_norm": 0.11691910773515701, + "learning_rate": 
4.4485367130657714e-05, + "loss": 0.0765, + "step": 36790 + }, + { + "epoch": 7.024308075610681, + "grad_norm": 0.21259558200836182, + "learning_rate": 4.448235810961262e-05, + "loss": 0.0734, + "step": 36800 + }, + { + "epoch": 7.024362237989492, + "grad_norm": 0.08238408714532852, + "learning_rate": 4.447934908856753e-05, + "loss": 0.1141, + "step": 36810 + }, + { + "epoch": 7.024416400368304, + "grad_norm": 0.021824968978762627, + "learning_rate": 4.447634006752243e-05, + "loss": 0.1116, + "step": 36820 + }, + { + "epoch": 7.024470562747116, + "grad_norm": 0.08599107712507248, + "learning_rate": 4.447333104647734e-05, + "loss": 0.0944, + "step": 36830 + }, + { + "epoch": 7.024524725125928, + "grad_norm": 0.011927173472940922, + "learning_rate": 4.4470322025432246e-05, + "loss": 0.054, + "step": 36840 + }, + { + "epoch": 7.024578887504739, + "grad_norm": 0.020254245027899742, + "learning_rate": 4.446731300438716e-05, + "loss": 0.2002, + "step": 36850 + }, + { + "epoch": 7.024633049883551, + "grad_norm": 0.19481606781482697, + "learning_rate": 4.446430398334206e-05, + "loss": 0.0558, + "step": 36860 + }, + { + "epoch": 7.0246872122623625, + "grad_norm": 0.1261090189218521, + "learning_rate": 4.4461294962296965e-05, + "loss": 0.1035, + "step": 36870 + }, + { + "epoch": 7.024741374641175, + "grad_norm": 0.25936102867126465, + "learning_rate": 4.445828594125188e-05, + "loss": 0.1627, + "step": 36880 + }, + { + "epoch": 7.024795537019986, + "grad_norm": 2.114532709121704, + "learning_rate": 4.4455276920206784e-05, + "loss": 0.1505, + "step": 36890 + }, + { + "epoch": 7.024849699398797, + "grad_norm": 2.030726432800293, + "learning_rate": 4.445226789916169e-05, + "loss": 0.0673, + "step": 36900 + }, + { + "epoch": 7.024903861777609, + "grad_norm": 0.4204549789428711, + "learning_rate": 4.4449258878116597e-05, + "loss": 0.0915, + "step": 36910 + }, + { + "epoch": 7.024958024156421, + "grad_norm": 0.004277660045772791, + "learning_rate": 4.44462498570715e-05, + "loss": 0.0079, + "step": 36920 + }, + { + "epoch": 7.02500135405947, + "eval_accuracy": 0.8282168517308949, + "eval_loss": 0.5399090051651001, + "eval_runtime": 118.0007, + "eval_samples_per_second": 25.949, + "eval_steps_per_second": 3.246, + "step": 36928 + }, + { + "epoch": 8.000010832475763, + "grad_norm": 0.04850982874631882, + "learning_rate": 4.444324083602641e-05, + "loss": 0.0809, + "step": 36930 + }, + { + "epoch": 8.000064994854574, + "grad_norm": 1.4267892837524414, + "learning_rate": 4.4440231814981315e-05, + "loss": 0.0988, + "step": 36940 + }, + { + "epoch": 8.000119157233385, + "grad_norm": 0.07350174337625504, + "learning_rate": 4.443722279393622e-05, + "loss": 0.0642, + "step": 36950 + }, + { + "epoch": 8.000173319612198, + "grad_norm": 0.013330180197954178, + "learning_rate": 4.443421377289113e-05, + "loss": 0.0453, + "step": 36960 + }, + { + "epoch": 8.00022748199101, + "grad_norm": 0.041414786130189896, + "learning_rate": 4.4431204751846034e-05, + "loss": 0.0506, + "step": 36970 + }, + { + "epoch": 8.00028164436982, + "grad_norm": 0.03149350360035896, + "learning_rate": 4.442819573080094e-05, + "loss": 0.132, + "step": 36980 + }, + { + "epoch": 8.000335806748632, + "grad_norm": 1.1956636905670166, + "learning_rate": 4.4425186709755854e-05, + "loss": 0.0156, + "step": 36990 + }, + { + "epoch": 8.000389969127443, + "grad_norm": 0.002600623993203044, + "learning_rate": 4.442217768871076e-05, + "loss": 0.0514, + "step": 37000 + }, + { + "epoch": 8.000444131506256, + "grad_norm": 2.465955972671509, + "learning_rate": 
4.441916866766566e-05, + "loss": 0.1456, + "step": 37010 + }, + { + "epoch": 8.000498293885068, + "grad_norm": 1.258445143699646, + "learning_rate": 4.441615964662057e-05, + "loss": 0.0921, + "step": 37020 + }, + { + "epoch": 8.000552456263879, + "grad_norm": 0.2483518123626709, + "learning_rate": 4.441315062557548e-05, + "loss": 0.0428, + "step": 37030 + }, + { + "epoch": 8.00060661864269, + "grad_norm": 0.2646375894546509, + "learning_rate": 4.4410141604530385e-05, + "loss": 0.0403, + "step": 37040 + }, + { + "epoch": 8.000660781021502, + "grad_norm": 0.06456687301397324, + "learning_rate": 4.440713258348529e-05, + "loss": 0.011, + "step": 37050 + }, + { + "epoch": 8.000714943400315, + "grad_norm": 0.13266125321388245, + "learning_rate": 4.44041235624402e-05, + "loss": 0.011, + "step": 37060 + }, + { + "epoch": 8.000769105779126, + "grad_norm": 0.0072052511386573315, + "learning_rate": 4.4401114541395104e-05, + "loss": 0.1526, + "step": 37070 + }, + { + "epoch": 8.000823268157937, + "grad_norm": 0.038316767662763596, + "learning_rate": 4.439810552035002e-05, + "loss": 0.0085, + "step": 37080 + }, + { + "epoch": 8.000877430536748, + "grad_norm": 3.142069101333618, + "learning_rate": 4.4395096499304916e-05, + "loss": 0.1591, + "step": 37090 + }, + { + "epoch": 8.000931592915562, + "grad_norm": 0.0026537450030446053, + "learning_rate": 4.439208747825982e-05, + "loss": 0.0216, + "step": 37100 + }, + { + "epoch": 8.000985755294373, + "grad_norm": 0.0047958018258214, + "learning_rate": 4.4389078457214736e-05, + "loss": 0.0352, + "step": 37110 + }, + { + "epoch": 8.001039917673184, + "grad_norm": 1.7729341983795166, + "learning_rate": 4.4386069436169635e-05, + "loss": 0.0804, + "step": 37120 + }, + { + "epoch": 8.001094080051995, + "grad_norm": 6.540862560272217, + "learning_rate": 4.438306041512454e-05, + "loss": 0.0786, + "step": 37130 + }, + { + "epoch": 8.001148242430807, + "grad_norm": 0.001337112276814878, + "learning_rate": 4.4380051394079454e-05, + "loss": 0.0854, + "step": 37140 + }, + { + "epoch": 8.00120240480962, + "grad_norm": 0.024147044867277145, + "learning_rate": 4.437704237303436e-05, + "loss": 0.0019, + "step": 37150 + }, + { + "epoch": 8.001256567188431, + "grad_norm": 0.1338217854499817, + "learning_rate": 4.437403335198927e-05, + "loss": 0.1073, + "step": 37160 + }, + { + "epoch": 8.001310729567242, + "grad_norm": 0.33396321535110474, + "learning_rate": 4.437102433094417e-05, + "loss": 0.0675, + "step": 37170 + }, + { + "epoch": 8.001364891946054, + "grad_norm": 0.019361587241292, + "learning_rate": 4.436801530989908e-05, + "loss": 0.084, + "step": 37180 + }, + { + "epoch": 8.001419054324867, + "grad_norm": 0.036468494683504105, + "learning_rate": 4.4365006288853986e-05, + "loss": 0.0789, + "step": 37190 + }, + { + "epoch": 8.001473216703678, + "grad_norm": 0.028236739337444305, + "learning_rate": 4.436199726780889e-05, + "loss": 0.0692, + "step": 37200 + }, + { + "epoch": 8.00152737908249, + "grad_norm": 0.00201867101714015, + "learning_rate": 4.43589882467638e-05, + "loss": 0.1173, + "step": 37210 + }, + { + "epoch": 8.0015815414613, + "grad_norm": 0.001861838041804731, + "learning_rate": 4.4355979225718705e-05, + "loss": 0.0856, + "step": 37220 + }, + { + "epoch": 8.001635703840112, + "grad_norm": 1.5043041706085205, + "learning_rate": 4.435297020467362e-05, + "loss": 0.0701, + "step": 37230 + }, + { + "epoch": 8.001689866218925, + "grad_norm": 0.048147231340408325, + "learning_rate": 4.434996118362852e-05, + "loss": 0.0994, + "step": 37240 + }, + { + "epoch": 
8.001744028597736, + "grad_norm": 0.011258725076913834, + "learning_rate": 4.434695216258343e-05, + "loss": 0.0398, + "step": 37250 + }, + { + "epoch": 8.001798190976547, + "grad_norm": 0.01856677606701851, + "learning_rate": 4.4343943141538336e-05, + "loss": 0.0276, + "step": 37260 + }, + { + "epoch": 8.001852353355359, + "grad_norm": 1.5018681287765503, + "learning_rate": 4.4340934120493236e-05, + "loss": 0.0462, + "step": 37270 + }, + { + "epoch": 8.001906515734172, + "grad_norm": 0.01324375905096531, + "learning_rate": 4.433792509944815e-05, + "loss": 0.0378, + "step": 37280 + }, + { + "epoch": 8.001960678112983, + "grad_norm": 0.01914854533970356, + "learning_rate": 4.4334916078403055e-05, + "loss": 0.0206, + "step": 37290 + }, + { + "epoch": 8.002014840491794, + "grad_norm": 0.31911417841911316, + "learning_rate": 4.433190705735796e-05, + "loss": 0.1693, + "step": 37300 + }, + { + "epoch": 8.002069002870606, + "grad_norm": 1.0423380136489868, + "learning_rate": 4.432889803631287e-05, + "loss": 0.0409, + "step": 37310 + }, + { + "epoch": 8.002123165249417, + "grad_norm": 0.7067127227783203, + "learning_rate": 4.4325889015267774e-05, + "loss": 0.0178, + "step": 37320 + }, + { + "epoch": 8.00217732762823, + "grad_norm": 0.0021479891147464514, + "learning_rate": 4.432287999422268e-05, + "loss": 0.0115, + "step": 37330 + }, + { + "epoch": 8.002231490007041, + "grad_norm": 0.0015344356652349234, + "learning_rate": 4.4319870973177593e-05, + "loss": 0.0945, + "step": 37340 + }, + { + "epoch": 8.002285652385853, + "grad_norm": 0.02577766589820385, + "learning_rate": 4.431686195213249e-05, + "loss": 0.0662, + "step": 37350 + }, + { + "epoch": 8.002339814764664, + "grad_norm": 1.1397830247879028, + "learning_rate": 4.43138529310874e-05, + "loss": 0.142, + "step": 37360 + }, + { + "epoch": 8.002393977143477, + "grad_norm": 0.016057191416621208, + "learning_rate": 4.431084391004231e-05, + "loss": 0.0639, + "step": 37370 + }, + { + "epoch": 8.002448139522288, + "grad_norm": 0.010017810389399529, + "learning_rate": 4.430783488899722e-05, + "loss": 0.0395, + "step": 37380 + }, + { + "epoch": 8.0025023019011, + "grad_norm": 0.13217882812023163, + "learning_rate": 4.430482586795212e-05, + "loss": 0.1349, + "step": 37390 + }, + { + "epoch": 8.00255646427991, + "grad_norm": 0.06529145687818527, + "learning_rate": 4.430181684690703e-05, + "loss": 0.1628, + "step": 37400 + }, + { + "epoch": 8.002610626658722, + "grad_norm": 0.4489005506038666, + "learning_rate": 4.429880782586194e-05, + "loss": 0.0332, + "step": 37410 + }, + { + "epoch": 8.002664789037535, + "grad_norm": 0.003787704510614276, + "learning_rate": 4.4295798804816844e-05, + "loss": 0.0126, + "step": 37420 + }, + { + "epoch": 8.002718951416346, + "grad_norm": 1.090599775314331, + "learning_rate": 4.429278978377175e-05, + "loss": 0.0722, + "step": 37430 + }, + { + "epoch": 8.002773113795158, + "grad_norm": 0.019512377679347992, + "learning_rate": 4.4289780762726656e-05, + "loss": 0.2038, + "step": 37440 + }, + { + "epoch": 8.002827276173969, + "grad_norm": 0.12597967684268951, + "learning_rate": 4.428677174168156e-05, + "loss": 0.0671, + "step": 37450 + }, + { + "epoch": 8.002881438552782, + "grad_norm": 0.5987630486488342, + "learning_rate": 4.428376272063647e-05, + "loss": 0.1954, + "step": 37460 + }, + { + "epoch": 8.002935600931593, + "grad_norm": 0.349906325340271, + "learning_rate": 4.4280753699591375e-05, + "loss": 0.0697, + "step": 37470 + }, + { + "epoch": 8.002989763310405, + "grad_norm": 0.13232901692390442, + "learning_rate": 
4.427774467854628e-05, + "loss": 0.0865, + "step": 37480 + }, + { + "epoch": 8.003043925689216, + "grad_norm": 0.9831815361976624, + "learning_rate": 4.4274735657501194e-05, + "loss": 0.0524, + "step": 37490 + }, + { + "epoch": 8.003098088068027, + "grad_norm": 0.3848903775215149, + "learning_rate": 4.4271726636456094e-05, + "loss": 0.1404, + "step": 37500 + }, + { + "epoch": 8.00315225044684, + "grad_norm": 0.30493462085723877, + "learning_rate": 4.426871761541101e-05, + "loss": 0.1325, + "step": 37510 + }, + { + "epoch": 8.003206412825651, + "grad_norm": 0.01669497601687908, + "learning_rate": 4.426570859436591e-05, + "loss": 0.0773, + "step": 37520 + }, + { + "epoch": 8.003260575204463, + "grad_norm": 0.35464775562286377, + "learning_rate": 4.426269957332082e-05, + "loss": 0.088, + "step": 37530 + }, + { + "epoch": 8.003314737583274, + "grad_norm": 0.12020428478717804, + "learning_rate": 4.4259690552275726e-05, + "loss": 0.0554, + "step": 37540 + }, + { + "epoch": 8.003368899962087, + "grad_norm": 0.10656308382749557, + "learning_rate": 4.425668153123063e-05, + "loss": 0.0539, + "step": 37550 + }, + { + "epoch": 8.003423062340898, + "grad_norm": 0.0827292874455452, + "learning_rate": 4.425367251018554e-05, + "loss": 0.0438, + "step": 37560 + }, + { + "epoch": 8.00347722471971, + "grad_norm": 0.1498609036207199, + "learning_rate": 4.4250663489140445e-05, + "loss": 0.0809, + "step": 37570 + }, + { + "epoch": 8.003531387098521, + "grad_norm": 1.5259854793548584, + "learning_rate": 4.424765446809535e-05, + "loss": 0.0842, + "step": 37580 + }, + { + "epoch": 8.003585549477332, + "grad_norm": 0.008022211492061615, + "learning_rate": 4.424464544705026e-05, + "loss": 0.2007, + "step": 37590 + }, + { + "epoch": 8.003639711856145, + "grad_norm": 0.011866573244333267, + "learning_rate": 4.424163642600517e-05, + "loss": 0.1055, + "step": 37600 + }, + { + "epoch": 8.003693874234957, + "grad_norm": 0.014983022585511208, + "learning_rate": 4.423862740496007e-05, + "loss": 0.0987, + "step": 37610 + }, + { + "epoch": 8.003748036613768, + "grad_norm": 0.00818494614213705, + "learning_rate": 4.4235618383914976e-05, + "loss": 0.0532, + "step": 37620 + }, + { + "epoch": 8.00380219899258, + "grad_norm": 4.7076005935668945, + "learning_rate": 4.423260936286989e-05, + "loss": 0.1934, + "step": 37630 + }, + { + "epoch": 8.003856361371392, + "grad_norm": 0.5417094826698303, + "learning_rate": 4.4229600341824795e-05, + "loss": 0.1045, + "step": 37640 + }, + { + "epoch": 8.003910523750204, + "grad_norm": 0.09418344497680664, + "learning_rate": 4.4226591320779695e-05, + "loss": 0.0189, + "step": 37650 + }, + { + "epoch": 8.003964686129015, + "grad_norm": 1.6607396602630615, + "learning_rate": 4.422358229973461e-05, + "loss": 0.0991, + "step": 37660 + }, + { + "epoch": 8.004018848507826, + "grad_norm": 0.23035433888435364, + "learning_rate": 4.4220573278689514e-05, + "loss": 0.0832, + "step": 37670 + }, + { + "epoch": 8.004073010886637, + "grad_norm": 0.0166104007512331, + "learning_rate": 4.421756425764442e-05, + "loss": 0.0356, + "step": 37680 + }, + { + "epoch": 8.00412717326545, + "grad_norm": 0.04495040327310562, + "learning_rate": 4.421455523659933e-05, + "loss": 0.006, + "step": 37690 + }, + { + "epoch": 8.004181335644262, + "grad_norm": 0.1028398796916008, + "learning_rate": 4.421154621555423e-05, + "loss": 0.0422, + "step": 37700 + }, + { + "epoch": 8.004235498023073, + "grad_norm": 0.004160991404205561, + "learning_rate": 4.420853719450914e-05, + "loss": 0.0909, + "step": 37710 + }, + { + "epoch": 
8.004289660401884, + "grad_norm": 0.004535439424216747, + "learning_rate": 4.4205528173464045e-05, + "loss": 0.0063, + "step": 37720 + }, + { + "epoch": 8.004343822780697, + "grad_norm": 0.43639302253723145, + "learning_rate": 4.420251915241895e-05, + "loss": 0.1433, + "step": 37730 + }, + { + "epoch": 8.004397985159509, + "grad_norm": 0.23405911028385162, + "learning_rate": 4.419951013137386e-05, + "loss": 0.0784, + "step": 37740 + }, + { + "epoch": 8.00445214753832, + "grad_norm": 0.01876671239733696, + "learning_rate": 4.419650111032877e-05, + "loss": 0.0471, + "step": 37750 + }, + { + "epoch": 8.004506309917131, + "grad_norm": 0.04317681863903999, + "learning_rate": 4.419349208928367e-05, + "loss": 0.0067, + "step": 37760 + }, + { + "epoch": 8.004560472295942, + "grad_norm": 0.05796779692173004, + "learning_rate": 4.4190483068238584e-05, + "loss": 0.0048, + "step": 37770 + }, + { + "epoch": 8.004614634674756, + "grad_norm": 2.4632866382598877, + "learning_rate": 4.418747404719349e-05, + "loss": 0.1238, + "step": 37780 + }, + { + "epoch": 8.004668797053567, + "grad_norm": 0.0558471605181694, + "learning_rate": 4.4184465026148396e-05, + "loss": 0.0421, + "step": 37790 + }, + { + "epoch": 8.004722959432378, + "grad_norm": 0.3640095889568329, + "learning_rate": 4.41814560051033e-05, + "loss": 0.0502, + "step": 37800 + }, + { + "epoch": 8.00477712181119, + "grad_norm": 3.007662057876587, + "learning_rate": 4.417844698405821e-05, + "loss": 0.0889, + "step": 37810 + }, + { + "epoch": 8.004831284190002, + "grad_norm": 0.3525395393371582, + "learning_rate": 4.4175437963013115e-05, + "loss": 0.0047, + "step": 37820 + }, + { + "epoch": 8.004885446568814, + "grad_norm": 5.321776390075684, + "learning_rate": 4.417242894196803e-05, + "loss": 0.0896, + "step": 37830 + }, + { + "epoch": 8.004939608947625, + "grad_norm": 7.415426731109619, + "learning_rate": 4.416941992092293e-05, + "loss": 0.2386, + "step": 37840 + }, + { + "epoch": 8.004993771326436, + "grad_norm": 0.09558402746915817, + "learning_rate": 4.4166410899877834e-05, + "loss": 0.0801, + "step": 37850 + }, + { + "epoch": 8.005047933705248, + "grad_norm": 0.23179005086421967, + "learning_rate": 4.416340187883275e-05, + "loss": 0.1669, + "step": 37860 + }, + { + "epoch": 8.00510209608406, + "grad_norm": 0.09737393260002136, + "learning_rate": 4.4160392857787646e-05, + "loss": 0.0111, + "step": 37870 + }, + { + "epoch": 8.005156258462872, + "grad_norm": 9.412588119506836, + "learning_rate": 4.415738383674255e-05, + "loss": 0.1192, + "step": 37880 + }, + { + "epoch": 8.005210420841683, + "grad_norm": 0.061741214245557785, + "learning_rate": 4.4154374815697466e-05, + "loss": 0.0581, + "step": 37890 + }, + { + "epoch": 8.005264583220495, + "grad_norm": 0.1595422625541687, + "learning_rate": 4.415136579465237e-05, + "loss": 0.0204, + "step": 37900 + }, + { + "epoch": 8.005318745599308, + "grad_norm": 0.0024963561445474625, + "learning_rate": 4.414835677360727e-05, + "loss": 0.1379, + "step": 37910 + }, + { + "epoch": 8.005372907978119, + "grad_norm": 0.0791555643081665, + "learning_rate": 4.4145347752562184e-05, + "loss": 0.0475, + "step": 37920 + }, + { + "epoch": 8.00542707035693, + "grad_norm": 7.430164337158203, + "learning_rate": 4.414233873151709e-05, + "loss": 0.0428, + "step": 37930 + }, + { + "epoch": 8.005481232735741, + "grad_norm": 0.09682008624076843, + "learning_rate": 4.4139329710472e-05, + "loss": 0.0625, + "step": 37940 + }, + { + "epoch": 8.005535395114553, + "grad_norm": 0.0657925233244896, + "learning_rate": 
4.41363206894269e-05, + "loss": 0.0949, + "step": 37950 + }, + { + "epoch": 8.005589557493366, + "grad_norm": 2.317744016647339, + "learning_rate": 4.413331166838181e-05, + "loss": 0.1875, + "step": 37960 + }, + { + "epoch": 8.005643719872177, + "grad_norm": 0.6557689905166626, + "learning_rate": 4.4130302647336716e-05, + "loss": 0.0842, + "step": 37970 + }, + { + "epoch": 8.005697882250988, + "grad_norm": 5.12103271484375, + "learning_rate": 4.412729362629163e-05, + "loss": 0.1512, + "step": 37980 + }, + { + "epoch": 8.0057520446298, + "grad_norm": 3.5357508659362793, + "learning_rate": 4.412428460524653e-05, + "loss": 0.0721, + "step": 37990 + }, + { + "epoch": 8.005806207008613, + "grad_norm": 0.08052486926317215, + "learning_rate": 4.4121275584201435e-05, + "loss": 0.0398, + "step": 38000 + }, + { + "epoch": 8.005860369387424, + "grad_norm": 0.029255907982587814, + "learning_rate": 4.411826656315635e-05, + "loss": 0.005, + "step": 38010 + }, + { + "epoch": 8.005914531766235, + "grad_norm": 1.5842009782791138, + "learning_rate": 4.411525754211125e-05, + "loss": 0.0292, + "step": 38020 + }, + { + "epoch": 8.005968694145047, + "grad_norm": 0.005485893227159977, + "learning_rate": 4.411224852106616e-05, + "loss": 0.2061, + "step": 38030 + }, + { + "epoch": 8.006022856523858, + "grad_norm": 0.3574559986591339, + "learning_rate": 4.4109239500021067e-05, + "loss": 0.0496, + "step": 38040 + }, + { + "epoch": 8.00607701890267, + "grad_norm": 1.3252838850021362, + "learning_rate": 4.410623047897597e-05, + "loss": 0.1668, + "step": 38050 + }, + { + "epoch": 8.006131181281482, + "grad_norm": 0.19675251841545105, + "learning_rate": 4.410322145793088e-05, + "loss": 0.041, + "step": 38060 + }, + { + "epoch": 8.006185343660293, + "grad_norm": 2.1192054748535156, + "learning_rate": 4.4100212436885785e-05, + "loss": 0.1059, + "step": 38070 + }, + { + "epoch": 8.006239506039105, + "grad_norm": 0.009976370260119438, + "learning_rate": 4.409720341584069e-05, + "loss": 0.1196, + "step": 38080 + }, + { + "epoch": 8.006293668417918, + "grad_norm": 1.052158236503601, + "learning_rate": 4.4094194394795605e-05, + "loss": 0.0691, + "step": 38090 + }, + { + "epoch": 8.006347830796729, + "grad_norm": 0.0077215139754116535, + "learning_rate": 4.4091185373750504e-05, + "loss": 0.0703, + "step": 38100 + }, + { + "epoch": 8.00640199317554, + "grad_norm": 0.14499029517173767, + "learning_rate": 4.408817635270541e-05, + "loss": 0.0153, + "step": 38110 + }, + { + "epoch": 8.006456155554352, + "grad_norm": 2.906113386154175, + "learning_rate": 4.4085167331660324e-05, + "loss": 0.1275, + "step": 38120 + }, + { + "epoch": 8.006510317933163, + "grad_norm": 0.7645202875137329, + "learning_rate": 4.408215831061523e-05, + "loss": 0.1028, + "step": 38130 + }, + { + "epoch": 8.006564480311976, + "grad_norm": 1.3049203157424927, + "learning_rate": 4.407914928957013e-05, + "loss": 0.0699, + "step": 38140 + }, + { + "epoch": 8.006618642690787, + "grad_norm": 1.0605143308639526, + "learning_rate": 4.407614026852504e-05, + "loss": 0.0384, + "step": 38150 + }, + { + "epoch": 8.006672805069599, + "grad_norm": 2.3919265270233154, + "learning_rate": 4.407313124747995e-05, + "loss": 0.1123, + "step": 38160 + }, + { + "epoch": 8.00672696744841, + "grad_norm": 0.12119687348604202, + "learning_rate": 4.407012222643485e-05, + "loss": 0.1477, + "step": 38170 + }, + { + "epoch": 8.006781129827223, + "grad_norm": 0.48789554834365845, + "learning_rate": 4.406711320538976e-05, + "loss": 0.1335, + "step": 38180 + }, + { + "epoch": 8.006835292206034, 
+ "grad_norm": 0.0064859651029109955, + "learning_rate": 4.406410418434467e-05, + "loss": 0.0718, + "step": 38190 + }, + { + "epoch": 8.006889454584845, + "grad_norm": 2.8408987522125244, + "learning_rate": 4.4061095163299574e-05, + "loss": 0.0476, + "step": 38200 + }, + { + "epoch": 8.006943616963657, + "grad_norm": 0.05597547069191933, + "learning_rate": 4.405808614225448e-05, + "loss": 0.0476, + "step": 38210 + }, + { + "epoch": 8.006997779342468, + "grad_norm": 0.004641436040401459, + "learning_rate": 4.4055077121209386e-05, + "loss": 0.0597, + "step": 38220 + }, + { + "epoch": 8.007051941721281, + "grad_norm": 0.0062985606491565704, + "learning_rate": 4.405206810016429e-05, + "loss": 0.0876, + "step": 38230 + }, + { + "epoch": 8.007106104100092, + "grad_norm": 1.6065359115600586, + "learning_rate": 4.4049059079119206e-05, + "loss": 0.145, + "step": 38240 + }, + { + "epoch": 8.007160266478904, + "grad_norm": 0.6079443693161011, + "learning_rate": 4.4046050058074105e-05, + "loss": 0.0216, + "step": 38250 + }, + { + "epoch": 8.007214428857715, + "grad_norm": 35.61128234863281, + "learning_rate": 4.404304103702901e-05, + "loss": 0.1795, + "step": 38260 + }, + { + "epoch": 8.007268591236526, + "grad_norm": 4.0660176277160645, + "learning_rate": 4.4040032015983924e-05, + "loss": 0.0976, + "step": 38270 + }, + { + "epoch": 8.00732275361534, + "grad_norm": 0.2155364453792572, + "learning_rate": 4.403702299493883e-05, + "loss": 0.1746, + "step": 38280 + }, + { + "epoch": 8.00737691599415, + "grad_norm": 4.738574028015137, + "learning_rate": 4.403401397389374e-05, + "loss": 0.0568, + "step": 38290 + }, + { + "epoch": 8.007431078372962, + "grad_norm": 5.222655773162842, + "learning_rate": 4.403100495284864e-05, + "loss": 0.0733, + "step": 38300 + }, + { + "epoch": 8.007485240751773, + "grad_norm": 0.3222435712814331, + "learning_rate": 4.402799593180355e-05, + "loss": 0.0596, + "step": 38310 + }, + { + "epoch": 8.007539403130586, + "grad_norm": 7.762257099151611, + "learning_rate": 4.4024986910758456e-05, + "loss": 0.0673, + "step": 38320 + }, + { + "epoch": 8.007593565509397, + "grad_norm": 0.012757175602018833, + "learning_rate": 4.402197788971336e-05, + "loss": 0.1158, + "step": 38330 + }, + { + "epoch": 8.007647727888209, + "grad_norm": 0.13700872659683228, + "learning_rate": 4.401896886866827e-05, + "loss": 0.0627, + "step": 38340 + }, + { + "epoch": 8.00770189026702, + "grad_norm": 0.13204248249530792, + "learning_rate": 4.401595984762318e-05, + "loss": 0.0798, + "step": 38350 + }, + { + "epoch": 8.007756052645831, + "grad_norm": 0.0862596333026886, + "learning_rate": 4.401295082657808e-05, + "loss": 0.0135, + "step": 38360 + }, + { + "epoch": 8.007810215024644, + "grad_norm": 4.436659812927246, + "learning_rate": 4.400994180553299e-05, + "loss": 0.0122, + "step": 38370 + }, + { + "epoch": 8.007864377403456, + "grad_norm": 9.616476058959961, + "learning_rate": 4.40069327844879e-05, + "loss": 0.1594, + "step": 38380 + }, + { + "epoch": 8.007918539782267, + "grad_norm": 0.042891256511211395, + "learning_rate": 4.4003923763442806e-05, + "loss": 0.0287, + "step": 38390 + }, + { + "epoch": 8.007972702161078, + "grad_norm": 0.0360109806060791, + "learning_rate": 4.4000914742397706e-05, + "loss": 0.1545, + "step": 38400 + }, + { + "epoch": 8.008026864539891, + "grad_norm": 4.149970054626465, + "learning_rate": 4.399790572135262e-05, + "loss": 0.162, + "step": 38410 + }, + { + "epoch": 8.008081026918703, + "grad_norm": 0.28139781951904297, + "learning_rate": 4.3994896700307525e-05, + "loss": 
0.1636, + "step": 38420 + }, + { + "epoch": 8.008135189297514, + "grad_norm": 0.13832464814186096, + "learning_rate": 4.399188767926243e-05, + "loss": 0.054, + "step": 38430 + }, + { + "epoch": 8.008189351676325, + "grad_norm": 3.622462034225464, + "learning_rate": 4.398887865821734e-05, + "loss": 0.039, + "step": 38440 + }, + { + "epoch": 8.008243514055136, + "grad_norm": 0.042174309492111206, + "learning_rate": 4.3985869637172244e-05, + "loss": 0.0504, + "step": 38450 + }, + { + "epoch": 8.00829767643395, + "grad_norm": 0.0070195975713431835, + "learning_rate": 4.398286061612715e-05, + "loss": 0.0531, + "step": 38460 + }, + { + "epoch": 8.00835183881276, + "grad_norm": 4.870485305786133, + "learning_rate": 4.397985159508206e-05, + "loss": 0.0439, + "step": 38470 + }, + { + "epoch": 8.008406001191572, + "grad_norm": 2.6314780712127686, + "learning_rate": 4.397684257403696e-05, + "loss": 0.277, + "step": 38480 + }, + { + "epoch": 8.008460163570383, + "grad_norm": 0.11301341652870178, + "learning_rate": 4.397383355299187e-05, + "loss": 0.0894, + "step": 38490 + }, + { + "epoch": 8.008514325949196, + "grad_norm": 0.133673757314682, + "learning_rate": 4.397082453194678e-05, + "loss": 0.0431, + "step": 38500 + }, + { + "epoch": 8.008568488328008, + "grad_norm": 0.13585884869098663, + "learning_rate": 4.396781551090168e-05, + "loss": 0.0499, + "step": 38510 + }, + { + "epoch": 8.008622650706819, + "grad_norm": 0.011949697509407997, + "learning_rate": 4.396480648985659e-05, + "loss": 0.0775, + "step": 38520 + }, + { + "epoch": 8.00867681308563, + "grad_norm": 0.011972933076322079, + "learning_rate": 4.39617974688115e-05, + "loss": 0.0186, + "step": 38530 + }, + { + "epoch": 8.008730975464442, + "grad_norm": 3.683312177658081, + "learning_rate": 4.395878844776641e-05, + "loss": 0.0601, + "step": 38540 + }, + { + "epoch": 8.008785137843255, + "grad_norm": 3.630373239517212, + "learning_rate": 4.3955779426721314e-05, + "loss": 0.1104, + "step": 38550 + }, + { + "epoch": 8.008839300222066, + "grad_norm": 0.22985820472240448, + "learning_rate": 4.395277040567622e-05, + "loss": 0.0127, + "step": 38560 + }, + { + "epoch": 8.008893462600877, + "grad_norm": 1.1060197353363037, + "learning_rate": 4.3949761384631126e-05, + "loss": 0.0055, + "step": 38570 + }, + { + "epoch": 8.008947624979688, + "grad_norm": 0.001851408276706934, + "learning_rate": 4.394675236358603e-05, + "loss": 0.0195, + "step": 38580 + }, + { + "epoch": 8.009001787358502, + "grad_norm": 0.0019189391750842333, + "learning_rate": 4.394374334254094e-05, + "loss": 0.0691, + "step": 38590 + }, + { + "epoch": 8.009055949737313, + "grad_norm": 0.0018849492771551013, + "learning_rate": 4.3940734321495845e-05, + "loss": 0.0307, + "step": 38600 + }, + { + "epoch": 8.009110112116124, + "grad_norm": 0.5952389240264893, + "learning_rate": 4.393772530045076e-05, + "loss": 0.0381, + "step": 38610 + }, + { + "epoch": 8.009164274494935, + "grad_norm": 0.05429895222187042, + "learning_rate": 4.393471627940566e-05, + "loss": 0.0238, + "step": 38620 + }, + { + "epoch": 8.009218436873747, + "grad_norm": 0.14824241399765015, + "learning_rate": 4.3931707258360564e-05, + "loss": 0.0672, + "step": 38630 + }, + { + "epoch": 8.00927259925256, + "grad_norm": 0.0033697939943522215, + "learning_rate": 4.392869823731548e-05, + "loss": 0.0741, + "step": 38640 + }, + { + "epoch": 8.009326761631371, + "grad_norm": 0.08592988550662994, + "learning_rate": 4.392568921627038e-05, + "loss": 0.0753, + "step": 38650 + }, + { + "epoch": 8.009380924010182, + "grad_norm": 
0.532172679901123, + "learning_rate": 4.392268019522528e-05, + "loss": 0.0495, + "step": 38660 + }, + { + "epoch": 8.009435086388994, + "grad_norm": 0.19143342971801758, + "learning_rate": 4.3919671174180196e-05, + "loss": 0.0248, + "step": 38670 + }, + { + "epoch": 8.009489248767807, + "grad_norm": 0.0014576594112440944, + "learning_rate": 4.39166621531351e-05, + "loss": 0.0147, + "step": 38680 + }, + { + "epoch": 8.009543411146618, + "grad_norm": 0.0023614903911948204, + "learning_rate": 4.391365313209001e-05, + "loss": 0.1944, + "step": 38690 + }, + { + "epoch": 8.00959757352543, + "grad_norm": 7.135039329528809, + "learning_rate": 4.3910644111044915e-05, + "loss": 0.1245, + "step": 38700 + }, + { + "epoch": 8.00965173590424, + "grad_norm": 0.054718874394893646, + "learning_rate": 4.390763508999982e-05, + "loss": 0.0786, + "step": 38710 + }, + { + "epoch": 8.009705898283052, + "grad_norm": 0.003983341157436371, + "learning_rate": 4.390462606895473e-05, + "loss": 0.1016, + "step": 38720 + }, + { + "epoch": 8.009760060661865, + "grad_norm": 3.214242458343506, + "learning_rate": 4.390161704790964e-05, + "loss": 0.1186, + "step": 38730 + }, + { + "epoch": 8.009814223040676, + "grad_norm": 0.015645625069737434, + "learning_rate": 4.389860802686454e-05, + "loss": 0.0119, + "step": 38740 + }, + { + "epoch": 8.009868385419487, + "grad_norm": 5.348331928253174, + "learning_rate": 4.3895599005819446e-05, + "loss": 0.1545, + "step": 38750 + }, + { + "epoch": 8.009922547798299, + "grad_norm": 0.14564047753810883, + "learning_rate": 4.389258998477436e-05, + "loss": 0.0708, + "step": 38760 + }, + { + "epoch": 8.009976710177112, + "grad_norm": 0.011461378075182438, + "learning_rate": 4.388958096372926e-05, + "loss": 0.0717, + "step": 38770 + }, + { + "epoch": 8.010030872555923, + "grad_norm": 0.3718123733997345, + "learning_rate": 4.388657194268417e-05, + "loss": 0.1189, + "step": 38780 + }, + { + "epoch": 8.010085034934734, + "grad_norm": 0.005202536471188068, + "learning_rate": 4.388356292163908e-05, + "loss": 0.0316, + "step": 38790 + }, + { + "epoch": 8.010139197313546, + "grad_norm": 3.0863821506500244, + "learning_rate": 4.3880553900593984e-05, + "loss": 0.1361, + "step": 38800 + }, + { + "epoch": 8.010193359692357, + "grad_norm": 0.07224061340093613, + "learning_rate": 4.387754487954889e-05, + "loss": 0.108, + "step": 38810 + }, + { + "epoch": 8.01024752207117, + "grad_norm": 0.02428065799176693, + "learning_rate": 4.38745358585038e-05, + "loss": 0.0738, + "step": 38820 + }, + { + "epoch": 8.010301684449981, + "grad_norm": 0.00612298771739006, + "learning_rate": 4.38715268374587e-05, + "loss": 0.0521, + "step": 38830 + }, + { + "epoch": 8.010355846828793, + "grad_norm": 0.022077316418290138, + "learning_rate": 4.386851781641361e-05, + "loss": 0.006, + "step": 38840 + }, + { + "epoch": 8.010410009207604, + "grad_norm": 1.0757769346237183, + "learning_rate": 4.3865508795368515e-05, + "loss": 0.1368, + "step": 38850 + }, + { + "epoch": 8.010464171586417, + "grad_norm": 8.084765434265137, + "learning_rate": 4.386249977432342e-05, + "loss": 0.0365, + "step": 38860 + }, + { + "epoch": 8.010518333965228, + "grad_norm": 9.408330917358398, + "learning_rate": 4.3859490753278335e-05, + "loss": 0.2638, + "step": 38870 + }, + { + "epoch": 8.01057249634404, + "grad_norm": 0.11650069802999496, + "learning_rate": 4.385648173223324e-05, + "loss": 0.0179, + "step": 38880 + }, + { + "epoch": 8.01062665872285, + "grad_norm": 2.560123920440674, + "learning_rate": 4.385347271118814e-05, + "loss": 0.1063, + "step": 
38890 + }, + { + "epoch": 8.010680821101662, + "grad_norm": 0.23806676268577576, + "learning_rate": 4.3850463690143054e-05, + "loss": 0.098, + "step": 38900 + }, + { + "epoch": 8.010734983480475, + "grad_norm": 0.0027824926655739546, + "learning_rate": 4.384745466909796e-05, + "loss": 0.01, + "step": 38910 + }, + { + "epoch": 8.010789145859286, + "grad_norm": 1.1414241790771484, + "learning_rate": 4.384444564805286e-05, + "loss": 0.0607, + "step": 38920 + }, + { + "epoch": 8.010843308238098, + "grad_norm": 0.5033242702484131, + "learning_rate": 4.384143662700777e-05, + "loss": 0.1823, + "step": 38930 + }, + { + "epoch": 8.010897470616909, + "grad_norm": 0.07207047194242477, + "learning_rate": 4.383842760596268e-05, + "loss": 0.0236, + "step": 38940 + }, + { + "epoch": 8.010951632995722, + "grad_norm": 10.127189636230469, + "learning_rate": 4.3835418584917585e-05, + "loss": 0.0204, + "step": 38950 + }, + { + "epoch": 8.011005795374533, + "grad_norm": 0.017235811799764633, + "learning_rate": 4.383240956387249e-05, + "loss": 0.2302, + "step": 38960 + }, + { + "epoch": 8.011059957753345, + "grad_norm": 0.009371617808938026, + "learning_rate": 4.38294005428274e-05, + "loss": 0.0025, + "step": 38970 + }, + { + "epoch": 8.011114120132156, + "grad_norm": 0.9076035618782043, + "learning_rate": 4.3826391521782304e-05, + "loss": 0.1042, + "step": 38980 + }, + { + "epoch": 8.011168282510967, + "grad_norm": 0.033450327813625336, + "learning_rate": 4.382338250073722e-05, + "loss": 0.1938, + "step": 38990 + }, + { + "epoch": 8.01122244488978, + "grad_norm": 0.01420174352824688, + "learning_rate": 4.3820373479692116e-05, + "loss": 0.0281, + "step": 39000 + }, + { + "epoch": 8.011276607268591, + "grad_norm": 0.03828561678528786, + "learning_rate": 4.381736445864702e-05, + "loss": 0.1164, + "step": 39010 + }, + { + "epoch": 8.011330769647403, + "grad_norm": 0.08245515823364258, + "learning_rate": 4.3814355437601936e-05, + "loss": 0.0834, + "step": 39020 + }, + { + "epoch": 8.011384932026214, + "grad_norm": 0.21649757027626038, + "learning_rate": 4.381134641655684e-05, + "loss": 0.0525, + "step": 39030 + }, + { + "epoch": 8.011439094405027, + "grad_norm": 0.025067325681447983, + "learning_rate": 4.380833739551175e-05, + "loss": 0.0414, + "step": 39040 + }, + { + "epoch": 8.011493256783838, + "grad_norm": 2.9465177059173584, + "learning_rate": 4.3805328374466655e-05, + "loss": 0.1239, + "step": 39050 + }, + { + "epoch": 8.01154741916265, + "grad_norm": 23.1069393157959, + "learning_rate": 4.380231935342156e-05, + "loss": 0.1307, + "step": 39060 + }, + { + "epoch": 8.011601581541461, + "grad_norm": 0.08631832897663116, + "learning_rate": 4.379931033237647e-05, + "loss": 0.0201, + "step": 39070 + }, + { + "epoch": 8.011655743920272, + "grad_norm": 0.05926472693681717, + "learning_rate": 4.379630131133137e-05, + "loss": 0.01, + "step": 39080 + }, + { + "epoch": 8.011709906299085, + "grad_norm": 0.01112393755465746, + "learning_rate": 4.379329229028628e-05, + "loss": 0.1696, + "step": 39090 + }, + { + "epoch": 8.011764068677897, + "grad_norm": 0.4014519155025482, + "learning_rate": 4.3790283269241186e-05, + "loss": 0.1115, + "step": 39100 + }, + { + "epoch": 8.011818231056708, + "grad_norm": 0.04998895153403282, + "learning_rate": 4.378727424819609e-05, + "loss": 0.0321, + "step": 39110 + }, + { + "epoch": 8.01187239343552, + "grad_norm": 0.1850614994764328, + "learning_rate": 4.3784265227151e-05, + "loss": 0.213, + "step": 39120 + }, + { + "epoch": 8.011926555814332, + "grad_norm": 0.00814170204102993, + 
"learning_rate": 4.378125620610591e-05, + "loss": 0.1258, + "step": 39130 + }, + { + "epoch": 8.011980718193144, + "grad_norm": 0.5697231292724609, + "learning_rate": 4.377824718506082e-05, + "loss": 0.1413, + "step": 39140 + }, + { + "epoch": 8.012034880571955, + "grad_norm": 0.7389355301856995, + "learning_rate": 4.377523816401572e-05, + "loss": 0.041, + "step": 39150 + }, + { + "epoch": 8.012089042950766, + "grad_norm": 3.8938684463500977, + "learning_rate": 4.377222914297063e-05, + "loss": 0.1475, + "step": 39160 + }, + { + "epoch": 8.012143205329577, + "grad_norm": 0.005391324870288372, + "learning_rate": 4.3769220121925537e-05, + "loss": 0.1804, + "step": 39170 + }, + { + "epoch": 8.01219736770839, + "grad_norm": 2.466498374938965, + "learning_rate": 4.376621110088044e-05, + "loss": 0.0647, + "step": 39180 + }, + { + "epoch": 8.012251530087202, + "grad_norm": 2.532042980194092, + "learning_rate": 4.376320207983535e-05, + "loss": 0.0444, + "step": 39190 + }, + { + "epoch": 8.012305692466013, + "grad_norm": 0.7188820242881775, + "learning_rate": 4.3760193058790255e-05, + "loss": 0.1211, + "step": 39200 + }, + { + "epoch": 8.012359854844824, + "grad_norm": 0.24683408439159393, + "learning_rate": 4.375718403774516e-05, + "loss": 0.0653, + "step": 39210 + }, + { + "epoch": 8.012414017223637, + "grad_norm": 13.770630836486816, + "learning_rate": 4.375417501670007e-05, + "loss": 0.1551, + "step": 39220 + }, + { + "epoch": 8.012468179602449, + "grad_norm": 0.3995636999607086, + "learning_rate": 4.3751165995654974e-05, + "loss": 0.0633, + "step": 39230 + }, + { + "epoch": 8.01252234198126, + "grad_norm": 0.005408347584307194, + "learning_rate": 4.374815697460988e-05, + "loss": 0.1453, + "step": 39240 + }, + { + "epoch": 8.012576504360071, + "grad_norm": 0.5601893663406372, + "learning_rate": 4.3745147953564794e-05, + "loss": 0.0283, + "step": 39250 + }, + { + "epoch": 8.012630666738882, + "grad_norm": 3.264902353286743, + "learning_rate": 4.374213893251969e-05, + "loss": 0.1173, + "step": 39260 + }, + { + "epoch": 8.012684829117696, + "grad_norm": 0.14918678998947144, + "learning_rate": 4.37391299114746e-05, + "loss": 0.0304, + "step": 39270 + }, + { + "epoch": 8.012738991496507, + "grad_norm": 0.051447417587041855, + "learning_rate": 4.373612089042951e-05, + "loss": 0.0066, + "step": 39280 + }, + { + "epoch": 8.012793153875318, + "grad_norm": 0.42999741435050964, + "learning_rate": 4.373311186938442e-05, + "loss": 0.0487, + "step": 39290 + }, + { + "epoch": 8.01284731625413, + "grad_norm": 5.801003456115723, + "learning_rate": 4.3730102848339325e-05, + "loss": 0.1888, + "step": 39300 + }, + { + "epoch": 8.012901478632942, + "grad_norm": 0.138032928109169, + "learning_rate": 4.372709382729423e-05, + "loss": 0.3289, + "step": 39310 + }, + { + "epoch": 8.012955641011754, + "grad_norm": 0.6326272487640381, + "learning_rate": 4.372408480624914e-05, + "loss": 0.146, + "step": 39320 + }, + { + "epoch": 8.013009803390565, + "grad_norm": 0.5218244791030884, + "learning_rate": 4.3721075785204044e-05, + "loss": 0.0703, + "step": 39330 + }, + { + "epoch": 8.013063965769376, + "grad_norm": 0.013275806792080402, + "learning_rate": 4.371806676415895e-05, + "loss": 0.0747, + "step": 39340 + }, + { + "epoch": 8.013118128148188, + "grad_norm": 1.3636990785598755, + "learning_rate": 4.3715057743113856e-05, + "loss": 0.1451, + "step": 39350 + }, + { + "epoch": 8.013172290527, + "grad_norm": 0.039150383323431015, + "learning_rate": 4.371204872206876e-05, + "loss": 0.1233, + "step": 39360 + }, + { + "epoch": 
8.013226452905812, + "grad_norm": 0.016786934807896614, + "learning_rate": 4.370903970102367e-05, + "loss": 0.062, + "step": 39370 + }, + { + "epoch": 8.013280615284623, + "grad_norm": 0.3427814543247223, + "learning_rate": 4.3706030679978575e-05, + "loss": 0.0468, + "step": 39380 + }, + { + "epoch": 8.013334777663434, + "grad_norm": 0.2053910791873932, + "learning_rate": 4.370302165893349e-05, + "loss": 0.1335, + "step": 39390 + }, + { + "epoch": 8.013388940042246, + "grad_norm": 0.046104975044727325, + "learning_rate": 4.3700012637888394e-05, + "loss": 0.0746, + "step": 39400 + }, + { + "epoch": 8.013443102421059, + "grad_norm": 1.0965081453323364, + "learning_rate": 4.3697003616843294e-05, + "loss": 0.08, + "step": 39410 + }, + { + "epoch": 8.01349726479987, + "grad_norm": 0.4431711733341217, + "learning_rate": 4.369399459579821e-05, + "loss": 0.0863, + "step": 39420 + }, + { + "epoch": 8.013551427178681, + "grad_norm": 0.09716270118951797, + "learning_rate": 4.369098557475311e-05, + "loss": 0.0537, + "step": 39430 + }, + { + "epoch": 8.013605589557493, + "grad_norm": 0.003726586000993848, + "learning_rate": 4.368797655370802e-05, + "loss": 0.0227, + "step": 39440 + }, + { + "epoch": 8.013659751936306, + "grad_norm": 0.05469702184200287, + "learning_rate": 4.3684967532662926e-05, + "loss": 0.0247, + "step": 39450 + }, + { + "epoch": 8.013713914315117, + "grad_norm": 0.05723747983574867, + "learning_rate": 4.368195851161783e-05, + "loss": 0.0069, + "step": 39460 + }, + { + "epoch": 8.013768076693928, + "grad_norm": 0.059243347495794296, + "learning_rate": 4.367894949057274e-05, + "loss": 0.0834, + "step": 39470 + }, + { + "epoch": 8.01382223907274, + "grad_norm": 0.07334452122449875, + "learning_rate": 4.367594046952765e-05, + "loss": 0.0131, + "step": 39480 + }, + { + "epoch": 8.013876401451551, + "grad_norm": 1.7384699583053589, + "learning_rate": 4.367293144848255e-05, + "loss": 0.2798, + "step": 39490 + }, + { + "epoch": 8.013930563830364, + "grad_norm": 0.04476495459675789, + "learning_rate": 4.366992242743746e-05, + "loss": 0.0911, + "step": 39500 + }, + { + "epoch": 8.013984726209175, + "grad_norm": 0.058890171349048615, + "learning_rate": 4.366691340639237e-05, + "loss": 0.0087, + "step": 39510 + }, + { + "epoch": 8.014038888587987, + "grad_norm": 0.6730952262878418, + "learning_rate": 4.366390438534727e-05, + "loss": 0.1396, + "step": 39520 + }, + { + "epoch": 8.014093050966798, + "grad_norm": 1.4933414459228516, + "learning_rate": 4.3660895364302176e-05, + "loss": 0.059, + "step": 39530 + }, + { + "epoch": 8.01414721334561, + "grad_norm": 0.9346166253089905, + "learning_rate": 4.365788634325709e-05, + "loss": 0.0446, + "step": 39540 + }, + { + "epoch": 8.014201375724422, + "grad_norm": 1.412948489189148, + "learning_rate": 4.3654877322211995e-05, + "loss": 0.0254, + "step": 39550 + }, + { + "epoch": 8.014255538103233, + "grad_norm": 0.36259469389915466, + "learning_rate": 4.36518683011669e-05, + "loss": 0.2315, + "step": 39560 + }, + { + "epoch": 8.014309700482045, + "grad_norm": 0.07414063066244125, + "learning_rate": 4.364885928012181e-05, + "loss": 0.0648, + "step": 39570 + }, + { + "epoch": 8.014363862860856, + "grad_norm": 0.7938470244407654, + "learning_rate": 4.3645850259076714e-05, + "loss": 0.0064, + "step": 39580 + }, + { + "epoch": 8.014418025239669, + "grad_norm": 0.5422947406768799, + "learning_rate": 4.364284123803162e-05, + "loss": 0.2024, + "step": 39590 + }, + { + "epoch": 8.01447218761848, + "grad_norm": 0.3392452895641327, + "learning_rate": 
4.363983221698653e-05, + "loss": 0.131, + "step": 39600 + }, + { + "epoch": 8.014526349997292, + "grad_norm": 4.0431976318359375, + "learning_rate": 4.363682319594143e-05, + "loss": 0.1023, + "step": 39610 + }, + { + "epoch": 8.014580512376103, + "grad_norm": 0.013764070346951485, + "learning_rate": 4.363381417489634e-05, + "loss": 0.0497, + "step": 39620 + }, + { + "epoch": 8.014634674754916, + "grad_norm": 0.32918402552604675, + "learning_rate": 4.363080515385125e-05, + "loss": 0.0344, + "step": 39630 + }, + { + "epoch": 8.014688837133727, + "grad_norm": 0.11205610632896423, + "learning_rate": 4.362779613280615e-05, + "loss": 0.0797, + "step": 39640 + }, + { + "epoch": 8.014742999512539, + "grad_norm": 2.2812366485595703, + "learning_rate": 4.3624787111761065e-05, + "loss": 0.1187, + "step": 39650 + }, + { + "epoch": 8.01479716189135, + "grad_norm": 0.5691778063774109, + "learning_rate": 4.362177809071597e-05, + "loss": 0.0854, + "step": 39660 + }, + { + "epoch": 8.014851324270161, + "grad_norm": 0.17879721522331238, + "learning_rate": 4.361876906967087e-05, + "loss": 0.0487, + "step": 39670 + }, + { + "epoch": 8.014905486648974, + "grad_norm": 0.05315123870968819, + "learning_rate": 4.3615760048625784e-05, + "loss": 0.1024, + "step": 39680 + }, + { + "epoch": 8.014959649027785, + "grad_norm": 0.0917225033044815, + "learning_rate": 4.361275102758069e-05, + "loss": 0.0432, + "step": 39690 + }, + { + "epoch": 8.015013811406597, + "grad_norm": 4.5963454246521, + "learning_rate": 4.3609742006535596e-05, + "loss": 0.0909, + "step": 39700 + }, + { + "epoch": 8.015067973785408, + "grad_norm": 0.5510092377662659, + "learning_rate": 4.36067329854905e-05, + "loss": 0.0835, + "step": 39710 + }, + { + "epoch": 8.015122136164221, + "grad_norm": 0.011229979805648327, + "learning_rate": 4.360372396444541e-05, + "loss": 0.0988, + "step": 39720 + }, + { + "epoch": 8.015176298543032, + "grad_norm": 4.845649719238281, + "learning_rate": 4.3600714943400315e-05, + "loss": 0.0478, + "step": 39730 + }, + { + "epoch": 8.015230460921844, + "grad_norm": 0.8231130242347717, + "learning_rate": 4.359770592235523e-05, + "loss": 0.1163, + "step": 39740 + }, + { + "epoch": 8.015284623300655, + "grad_norm": 0.3352149724960327, + "learning_rate": 4.359469690131013e-05, + "loss": 0.057, + "step": 39750 + }, + { + "epoch": 8.015338785679466, + "grad_norm": 6.1606831550598145, + "learning_rate": 4.3591687880265034e-05, + "loss": 0.1551, + "step": 39760 + }, + { + "epoch": 8.01539294805828, + "grad_norm": 0.23890367150306702, + "learning_rate": 4.358867885921995e-05, + "loss": 0.0309, + "step": 39770 + }, + { + "epoch": 8.01544711043709, + "grad_norm": 0.018449710682034492, + "learning_rate": 4.358566983817485e-05, + "loss": 0.0446, + "step": 39780 + }, + { + "epoch": 8.015501272815902, + "grad_norm": 0.003577952040359378, + "learning_rate": 4.358266081712975e-05, + "loss": 0.0487, + "step": 39790 + }, + { + "epoch": 8.015555435194713, + "grad_norm": 0.38007423281669617, + "learning_rate": 4.3579651796084666e-05, + "loss": 0.0075, + "step": 39800 + }, + { + "epoch": 8.015609597573526, + "grad_norm": 0.0525737889111042, + "learning_rate": 4.357664277503957e-05, + "loss": 0.0043, + "step": 39810 + }, + { + "epoch": 8.015663759952337, + "grad_norm": 0.0022126648109406233, + "learning_rate": 4.357363375399448e-05, + "loss": 0.1315, + "step": 39820 + }, + { + "epoch": 8.015717922331149, + "grad_norm": 7.585901737213135, + "learning_rate": 4.3570624732949385e-05, + "loss": 0.1112, + "step": 39830 + }, + { + "epoch": 
8.01577208470996, + "grad_norm": 0.11565829813480377, + "learning_rate": 4.356761571190429e-05, + "loss": 0.0053, + "step": 39840 + }, + { + "epoch": 8.015826247088771, + "grad_norm": 0.14834633469581604, + "learning_rate": 4.35646066908592e-05, + "loss": 0.1172, + "step": 39850 + }, + { + "epoch": 8.015880409467584, + "grad_norm": 0.009864152409136295, + "learning_rate": 4.3561597669814103e-05, + "loss": 0.1137, + "step": 39860 + }, + { + "epoch": 8.015934571846396, + "grad_norm": 3.9272878170013428, + "learning_rate": 4.355858864876901e-05, + "loss": 0.072, + "step": 39870 + }, + { + "epoch": 8.015988734225207, + "grad_norm": 5.62746524810791, + "learning_rate": 4.3555579627723916e-05, + "loss": 0.2295, + "step": 39880 + }, + { + "epoch": 8.016042896604018, + "grad_norm": 0.03718021512031555, + "learning_rate": 4.355257060667883e-05, + "loss": 0.0963, + "step": 39890 + }, + { + "epoch": 8.016097058982831, + "grad_norm": 0.5148112177848816, + "learning_rate": 4.354956158563373e-05, + "loss": 0.0278, + "step": 39900 + }, + { + "epoch": 8.016151221361643, + "grad_norm": 0.5902028679847717, + "learning_rate": 4.354655256458864e-05, + "loss": 0.0081, + "step": 39910 + }, + { + "epoch": 8.016205383740454, + "grad_norm": 0.15858730673789978, + "learning_rate": 4.354354354354355e-05, + "loss": 0.103, + "step": 39920 + }, + { + "epoch": 8.016259546119265, + "grad_norm": 0.0924556702375412, + "learning_rate": 4.3540534522498454e-05, + "loss": 0.1507, + "step": 39930 + }, + { + "epoch": 8.016313708498076, + "grad_norm": 0.4052984118461609, + "learning_rate": 4.353752550145336e-05, + "loss": 0.0541, + "step": 39940 + }, + { + "epoch": 8.01636787087689, + "grad_norm": 14.156655311584473, + "learning_rate": 4.353451648040827e-05, + "loss": 0.1295, + "step": 39950 + }, + { + "epoch": 8.0164220332557, + "grad_norm": 0.3944474756717682, + "learning_rate": 4.353150745936317e-05, + "loss": 0.0803, + "step": 39960 + }, + { + "epoch": 8.016476195634512, + "grad_norm": 0.32068026065826416, + "learning_rate": 4.352849843831808e-05, + "loss": 0.1394, + "step": 39970 + }, + { + "epoch": 8.016530358013323, + "grad_norm": 18.559656143188477, + "learning_rate": 4.3525489417272985e-05, + "loss": 0.0341, + "step": 39980 + }, + { + "epoch": 8.016584520392136, + "grad_norm": 0.1627844125032425, + "learning_rate": 4.352248039622789e-05, + "loss": 0.1379, + "step": 39990 + }, + { + "epoch": 8.016638682770948, + "grad_norm": 0.08638592809438705, + "learning_rate": 4.3519471375182805e-05, + "loss": 0.0538, + "step": 40000 + }, + { + "epoch": 8.016692845149759, + "grad_norm": 3.1207127571105957, + "learning_rate": 4.3516462354137704e-05, + "loss": 0.2172, + "step": 40010 + }, + { + "epoch": 8.01674700752857, + "grad_norm": 0.007440246641635895, + "learning_rate": 4.351345333309261e-05, + "loss": 0.0722, + "step": 40020 + }, + { + "epoch": 8.016801169907382, + "grad_norm": 0.5716203451156616, + "learning_rate": 4.3510444312047524e-05, + "loss": 0.164, + "step": 40030 + }, + { + "epoch": 8.016855332286195, + "grad_norm": 0.03182128444314003, + "learning_rate": 4.350743529100243e-05, + "loss": 0.1356, + "step": 40040 + }, + { + "epoch": 8.016909494665006, + "grad_norm": 0.013592341914772987, + "learning_rate": 4.350442626995733e-05, + "loss": 0.0774, + "step": 40050 + }, + { + "epoch": 8.016963657043817, + "grad_norm": 0.009610289707779884, + "learning_rate": 4.350141724891224e-05, + "loss": 0.0405, + "step": 40060 + }, + { + "epoch": 8.017017819422628, + "grad_norm": 0.32365667819976807, + "learning_rate": 
4.349840822786715e-05, + "loss": 0.1285, + "step": 40070 + }, + { + "epoch": 8.017071981801442, + "grad_norm": 0.0032580101396888494, + "learning_rate": 4.3495399206822055e-05, + "loss": 0.0724, + "step": 40080 + }, + { + "epoch": 8.017126144180253, + "grad_norm": 11.742524147033691, + "learning_rate": 4.349239018577696e-05, + "loss": 0.2107, + "step": 40090 + }, + { + "epoch": 8.017180306559064, + "grad_norm": 0.12869003415107727, + "learning_rate": 4.348938116473187e-05, + "loss": 0.1488, + "step": 40100 + }, + { + "epoch": 8.017234468937875, + "grad_norm": 1.928428053855896, + "learning_rate": 4.3486372143686774e-05, + "loss": 0.0485, + "step": 40110 + }, + { + "epoch": 8.017288631316687, + "grad_norm": 0.010857662186026573, + "learning_rate": 4.348336312264168e-05, + "loss": 0.042, + "step": 40120 + }, + { + "epoch": 8.0173427936955, + "grad_norm": 0.004160828422755003, + "learning_rate": 4.3480354101596586e-05, + "loss": 0.0141, + "step": 40130 + }, + { + "epoch": 8.017396956074311, + "grad_norm": 2.6629374027252197, + "learning_rate": 4.347734508055149e-05, + "loss": 0.0471, + "step": 40140 + }, + { + "epoch": 8.017451118453122, + "grad_norm": 0.30779626965522766, + "learning_rate": 4.3474336059506406e-05, + "loss": 0.1072, + "step": 40150 + }, + { + "epoch": 8.017505280831934, + "grad_norm": 0.7656024098396301, + "learning_rate": 4.3471327038461305e-05, + "loss": 0.0671, + "step": 40160 + }, + { + "epoch": 8.017559443210747, + "grad_norm": 0.30822286009788513, + "learning_rate": 4.346831801741622e-05, + "loss": 0.0172, + "step": 40170 + }, + { + "epoch": 8.017613605589558, + "grad_norm": 0.007099241483956575, + "learning_rate": 4.3465308996371125e-05, + "loss": 0.0927, + "step": 40180 + }, + { + "epoch": 8.01766776796837, + "grad_norm": 0.06590892374515533, + "learning_rate": 4.346229997532603e-05, + "loss": 0.0533, + "step": 40190 + }, + { + "epoch": 8.01772193034718, + "grad_norm": 0.2672586739063263, + "learning_rate": 4.345929095428094e-05, + "loss": 0.0301, + "step": 40200 + }, + { + "epoch": 8.017776092725992, + "grad_norm": 0.531023383140564, + "learning_rate": 4.345628193323584e-05, + "loss": 0.0303, + "step": 40210 + }, + { + "epoch": 8.017830255104805, + "grad_norm": 0.0021409010514616966, + "learning_rate": 4.345327291219075e-05, + "loss": 0.0571, + "step": 40220 + }, + { + "epoch": 8.017884417483616, + "grad_norm": 0.06048325449228287, + "learning_rate": 4.345026389114566e-05, + "loss": 0.0359, + "step": 40230 + }, + { + "epoch": 8.017938579862427, + "grad_norm": 10.621649742126465, + "learning_rate": 4.344725487010056e-05, + "loss": 0.1005, + "step": 40240 + }, + { + "epoch": 8.017992742241239, + "grad_norm": 0.0075335013680160046, + "learning_rate": 4.344424584905547e-05, + "loss": 0.0483, + "step": 40250 + }, + { + "epoch": 8.018046904620052, + "grad_norm": 0.003059134352952242, + "learning_rate": 4.344123682801038e-05, + "loss": 0.0713, + "step": 40260 + }, + { + "epoch": 8.018101066998863, + "grad_norm": 0.02717541716992855, + "learning_rate": 4.343822780696528e-05, + "loss": 0.0773, + "step": 40270 + }, + { + "epoch": 8.018155229377674, + "grad_norm": 0.3656712472438812, + "learning_rate": 4.343521878592019e-05, + "loss": 0.2189, + "step": 40280 + }, + { + "epoch": 8.018209391756486, + "grad_norm": 0.014247135259211063, + "learning_rate": 4.34322097648751e-05, + "loss": 0.0811, + "step": 40290 + }, + { + "epoch": 8.018263554135297, + "grad_norm": 0.005474724341183901, + "learning_rate": 4.3429200743830007e-05, + "loss": 0.0914, + "step": 40300 + }, + { + "epoch": 
8.01831771651411, + "grad_norm": 0.008012469857931137, + "learning_rate": 4.3426191722784906e-05, + "loss": 0.0248, + "step": 40310 + }, + { + "epoch": 8.018371878892921, + "grad_norm": 0.061205603182315826, + "learning_rate": 4.342318270173982e-05, + "loss": 0.0261, + "step": 40320 + }, + { + "epoch": 8.018426041271733, + "grad_norm": 0.44356536865234375, + "learning_rate": 4.3420173680694725e-05, + "loss": 0.0297, + "step": 40330 + }, + { + "epoch": 8.018480203650544, + "grad_norm": 0.04731140285730362, + "learning_rate": 4.341716465964963e-05, + "loss": 0.0053, + "step": 40340 + }, + { + "epoch": 8.018534366029357, + "grad_norm": 0.004474318586289883, + "learning_rate": 4.341415563860454e-05, + "loss": 0.1017, + "step": 40350 + }, + { + "epoch": 8.018588528408168, + "grad_norm": 0.024641623720526695, + "learning_rate": 4.3411146617559444e-05, + "loss": 0.0153, + "step": 40360 + }, + { + "epoch": 8.01864269078698, + "grad_norm": 0.0052095153369009495, + "learning_rate": 4.340813759651435e-05, + "loss": 0.063, + "step": 40370 + }, + { + "epoch": 8.01869685316579, + "grad_norm": 3.075343370437622, + "learning_rate": 4.3405128575469264e-05, + "loss": 0.0721, + "step": 40380 + }, + { + "epoch": 8.018751015544602, + "grad_norm": 1.1996424198150635, + "learning_rate": 4.340211955442416e-05, + "loss": 0.0036, + "step": 40390 + }, + { + "epoch": 8.018805177923415, + "grad_norm": 0.002529129618778825, + "learning_rate": 4.339911053337907e-05, + "loss": 0.0174, + "step": 40400 + }, + { + "epoch": 8.018859340302226, + "grad_norm": 0.08856039494276047, + "learning_rate": 4.339610151233398e-05, + "loss": 0.1418, + "step": 40410 + }, + { + "epoch": 8.018913502681038, + "grad_norm": 0.026086464524269104, + "learning_rate": 4.339309249128888e-05, + "loss": 0.1871, + "step": 40420 + }, + { + "epoch": 8.018967665059849, + "grad_norm": 0.7853572368621826, + "learning_rate": 4.3390083470243795e-05, + "loss": 0.0856, + "step": 40430 + }, + { + "epoch": 8.019021827438662, + "grad_norm": 3.098222255706787, + "learning_rate": 4.33870744491987e-05, + "loss": 0.1074, + "step": 40440 + }, + { + "epoch": 8.019075989817473, + "grad_norm": 7.556326389312744, + "learning_rate": 4.338406542815361e-05, + "loss": 0.1406, + "step": 40450 + }, + { + "epoch": 8.019130152196285, + "grad_norm": 0.1827799379825592, + "learning_rate": 4.3381056407108514e-05, + "loss": 0.0267, + "step": 40460 + }, + { + "epoch": 8.019184314575096, + "grad_norm": 0.006565921939909458, + "learning_rate": 4.337804738606342e-05, + "loss": 0.0236, + "step": 40470 + }, + { + "epoch": 8.019238476953907, + "grad_norm": 0.16038550436496735, + "learning_rate": 4.3375038365018326e-05, + "loss": 0.0864, + "step": 40480 + }, + { + "epoch": 8.01929263933272, + "grad_norm": 0.04961730167269707, + "learning_rate": 4.337202934397324e-05, + "loss": 0.0914, + "step": 40490 + }, + { + "epoch": 8.019346801711531, + "grad_norm": 0.010386179201304913, + "learning_rate": 4.336902032292814e-05, + "loss": 0.157, + "step": 40500 + }, + { + "epoch": 8.019400964090343, + "grad_norm": 0.011299324221909046, + "learning_rate": 4.3366011301883045e-05, + "loss": 0.0121, + "step": 40510 + }, + { + "epoch": 8.019455126469154, + "grad_norm": 0.119045190513134, + "learning_rate": 4.336300228083796e-05, + "loss": 0.0241, + "step": 40520 + }, + { + "epoch": 8.019509288847965, + "grad_norm": 0.29805508255958557, + "learning_rate": 4.3359993259792864e-05, + "loss": 0.0647, + "step": 40530 + }, + { + "epoch": 8.019563451226778, + "grad_norm": 10.563821792602539, + "learning_rate": 
4.3356984238747764e-05, + "loss": 0.2911, + "step": 40540 + }, + { + "epoch": 8.01961761360559, + "grad_norm": 0.09730944037437439, + "learning_rate": 4.335397521770268e-05, + "loss": 0.0292, + "step": 40550 + }, + { + "epoch": 8.019671775984401, + "grad_norm": 0.014142968691885471, + "learning_rate": 4.335096619665758e-05, + "loss": 0.0246, + "step": 40560 + }, + { + "epoch": 8.019725938363212, + "grad_norm": 0.00891964416950941, + "learning_rate": 4.334795717561248e-05, + "loss": 0.0621, + "step": 40570 + }, + { + "epoch": 8.019780100742025, + "grad_norm": 0.1676149219274521, + "learning_rate": 4.3344948154567396e-05, + "loss": 0.0128, + "step": 40580 + }, + { + "epoch": 8.019834263120837, + "grad_norm": 0.006604698020964861, + "learning_rate": 4.33419391335223e-05, + "loss": 0.0644, + "step": 40590 + }, + { + "epoch": 8.019888425499648, + "grad_norm": 0.06495010107755661, + "learning_rate": 4.333893011247721e-05, + "loss": 0.0655, + "step": 40600 + }, + { + "epoch": 8.01994258787846, + "grad_norm": 5.640097618103027, + "learning_rate": 4.3335921091432115e-05, + "loss": 0.1712, + "step": 40610 + }, + { + "epoch": 8.01999675025727, + "grad_norm": 0.03453445807099342, + "learning_rate": 4.333291207038702e-05, + "loss": 0.0441, + "step": 40620 + }, + { + "epoch": 8.020050912636083, + "grad_norm": 0.045967038720846176, + "learning_rate": 4.332990304934193e-05, + "loss": 0.021, + "step": 40630 + }, + { + "epoch": 8.020105075014895, + "grad_norm": 0.06517753005027771, + "learning_rate": 4.332689402829684e-05, + "loss": 0.0061, + "step": 40640 + }, + { + "epoch": 8.020159237393706, + "grad_norm": 0.11574622243642807, + "learning_rate": 4.332388500725174e-05, + "loss": 0.0064, + "step": 40650 + }, + { + "epoch": 8.020213399772517, + "grad_norm": 4.811962604522705, + "learning_rate": 4.3320875986206646e-05, + "loss": 0.1156, + "step": 40660 + }, + { + "epoch": 8.02026756215133, + "grad_norm": 5.857922077178955, + "learning_rate": 4.331786696516156e-05, + "loss": 0.2193, + "step": 40670 + }, + { + "epoch": 8.020321724530142, + "grad_norm": 0.8164772987365723, + "learning_rate": 4.3314857944116465e-05, + "loss": 0.009, + "step": 40680 + }, + { + "epoch": 8.020375886908953, + "grad_norm": 0.008400370366871357, + "learning_rate": 4.331184892307137e-05, + "loss": 0.0568, + "step": 40690 + }, + { + "epoch": 8.020430049287764, + "grad_norm": 0.03626660257577896, + "learning_rate": 4.330883990202628e-05, + "loss": 0.1446, + "step": 40700 + }, + { + "epoch": 8.020484211666576, + "grad_norm": 2.7695398330688477, + "learning_rate": 4.3305830880981184e-05, + "loss": 0.0924, + "step": 40710 + }, + { + "epoch": 8.020538374045389, + "grad_norm": 0.47765183448791504, + "learning_rate": 4.330282185993609e-05, + "loss": 0.0163, + "step": 40720 + }, + { + "epoch": 8.0205925364242, + "grad_norm": 0.01173343788832426, + "learning_rate": 4.3299812838891e-05, + "loss": 0.1075, + "step": 40730 + }, + { + "epoch": 8.020646698803011, + "grad_norm": 0.07057952135801315, + "learning_rate": 4.32968038178459e-05, + "loss": 0.1622, + "step": 40740 + }, + { + "epoch": 8.020700861181822, + "grad_norm": 0.0804971307516098, + "learning_rate": 4.3293794796800816e-05, + "loss": 0.0056, + "step": 40750 + }, + { + "epoch": 8.020755023560636, + "grad_norm": 0.01394677720963955, + "learning_rate": 4.3290785775755716e-05, + "loss": 0.2205, + "step": 40760 + }, + { + "epoch": 8.020809185939447, + "grad_norm": 15.939698219299316, + "learning_rate": 4.328777675471062e-05, + "loss": 0.1974, + "step": 40770 + }, + { + "epoch": 
8.020863348318258, + "grad_norm": 19.206876754760742, + "learning_rate": 4.3284767733665535e-05, + "loss": 0.0834, + "step": 40780 + }, + { + "epoch": 8.02091751069707, + "grad_norm": 0.1269359141588211, + "learning_rate": 4.328175871262044e-05, + "loss": 0.1139, + "step": 40790 + }, + { + "epoch": 8.02097167307588, + "grad_norm": 0.13381916284561157, + "learning_rate": 4.327874969157534e-05, + "loss": 0.0795, + "step": 40800 + }, + { + "epoch": 8.021025835454694, + "grad_norm": 2.7867870330810547, + "learning_rate": 4.3275740670530254e-05, + "loss": 0.1211, + "step": 40810 + }, + { + "epoch": 8.021079997833505, + "grad_norm": 0.17735596001148224, + "learning_rate": 4.327273164948516e-05, + "loss": 0.044, + "step": 40820 + }, + { + "epoch": 8.021134160212316, + "grad_norm": 2.4097907543182373, + "learning_rate": 4.3269722628440066e-05, + "loss": 0.1028, + "step": 40830 + }, + { + "epoch": 8.021188322591128, + "grad_norm": 0.12901988625526428, + "learning_rate": 4.326671360739497e-05, + "loss": 0.2567, + "step": 40840 + }, + { + "epoch": 8.02124248496994, + "grad_norm": 0.3164571225643158, + "learning_rate": 4.326370458634988e-05, + "loss": 0.0434, + "step": 40850 + }, + { + "epoch": 8.021296647348752, + "grad_norm": 3.5008015632629395, + "learning_rate": 4.3260695565304785e-05, + "loss": 0.0186, + "step": 40860 + }, + { + "epoch": 8.021350809727563, + "grad_norm": 0.07274112850427628, + "learning_rate": 4.325768654425969e-05, + "loss": 0.0258, + "step": 40870 + }, + { + "epoch": 8.021404972106374, + "grad_norm": 0.005957332439720631, + "learning_rate": 4.32546775232146e-05, + "loss": 0.0329, + "step": 40880 + }, + { + "epoch": 8.021459134485186, + "grad_norm": 0.036318715661764145, + "learning_rate": 4.3251668502169504e-05, + "loss": 0.1102, + "step": 40890 + }, + { + "epoch": 8.021513296863999, + "grad_norm": 0.054098356515169144, + "learning_rate": 4.324865948112442e-05, + "loss": 0.003, + "step": 40900 + }, + { + "epoch": 8.02156745924281, + "grad_norm": 0.05165136232972145, + "learning_rate": 4.3245650460079316e-05, + "loss": 0.0906, + "step": 40910 + }, + { + "epoch": 8.021621621621621, + "grad_norm": 0.04375959560275078, + "learning_rate": 4.324264143903422e-05, + "loss": 0.0959, + "step": 40920 + }, + { + "epoch": 8.021675784000433, + "grad_norm": 0.3584974706172943, + "learning_rate": 4.3239632417989136e-05, + "loss": 0.0752, + "step": 40930 + }, + { + "epoch": 8.021729946379246, + "grad_norm": 0.021873436868190765, + "learning_rate": 4.323662339694404e-05, + "loss": 0.0833, + "step": 40940 + }, + { + "epoch": 8.021784108758057, + "grad_norm": 0.010825134813785553, + "learning_rate": 4.323361437589895e-05, + "loss": 0.0583, + "step": 40950 + }, + { + "epoch": 8.021838271136868, + "grad_norm": 4.3005781173706055, + "learning_rate": 4.3230605354853855e-05, + "loss": 0.2232, + "step": 40960 + }, + { + "epoch": 8.02189243351568, + "grad_norm": 0.07841454446315765, + "learning_rate": 4.322759633380876e-05, + "loss": 0.0196, + "step": 40970 + }, + { + "epoch": 8.02194659589449, + "grad_norm": 0.14545898139476776, + "learning_rate": 4.322458731276367e-05, + "loss": 0.0204, + "step": 40980 + }, + { + "epoch": 8.022000758273304, + "grad_norm": 0.06474294513463974, + "learning_rate": 4.3221578291718573e-05, + "loss": 0.0968, + "step": 40990 + }, + { + "epoch": 8.022054920652115, + "grad_norm": 0.056207798421382904, + "learning_rate": 4.321856927067348e-05, + "loss": 0.0478, + "step": 41000 + }, + { + "epoch": 8.022109083030927, + "grad_norm": 0.022268934175372124, + "learning_rate": 
4.321556024962839e-05, + "loss": 0.1339, + "step": 41010 + }, + { + "epoch": 8.022163245409738, + "grad_norm": 0.03948488458991051, + "learning_rate": 4.321255122858329e-05, + "loss": 0.0441, + "step": 41020 + }, + { + "epoch": 8.02221740778855, + "grad_norm": 0.050234898924827576, + "learning_rate": 4.32095422075382e-05, + "loss": 0.0321, + "step": 41030 + }, + { + "epoch": 8.022271570167362, + "grad_norm": 0.009325326420366764, + "learning_rate": 4.320653318649311e-05, + "loss": 0.131, + "step": 41040 + }, + { + "epoch": 8.022325732546173, + "grad_norm": 0.007368412800133228, + "learning_rate": 4.320352416544802e-05, + "loss": 0.0351, + "step": 41050 + }, + { + "epoch": 8.022379894924985, + "grad_norm": 0.03375996649265289, + "learning_rate": 4.320051514440292e-05, + "loss": 0.1731, + "step": 41060 + }, + { + "epoch": 8.022434057303796, + "grad_norm": 0.46204882860183716, + "learning_rate": 4.319750612335783e-05, + "loss": 0.0214, + "step": 41070 + }, + { + "epoch": 8.022488219682609, + "grad_norm": 0.07260201126337051, + "learning_rate": 4.319449710231274e-05, + "loss": 0.0764, + "step": 41080 + }, + { + "epoch": 8.02254238206142, + "grad_norm": 0.10104091465473175, + "learning_rate": 4.319148808126764e-05, + "loss": 0.0451, + "step": 41090 + }, + { + "epoch": 8.022596544440232, + "grad_norm": 12.496249198913574, + "learning_rate": 4.318847906022255e-05, + "loss": 0.0373, + "step": 41100 + }, + { + "epoch": 8.022650706819043, + "grad_norm": 5.445650100708008, + "learning_rate": 4.3185470039177455e-05, + "loss": 0.1812, + "step": 41110 + }, + { + "epoch": 8.022704869197856, + "grad_norm": 0.019213195890188217, + "learning_rate": 4.318246101813236e-05, + "loss": 0.0071, + "step": 41120 + }, + { + "epoch": 8.022759031576667, + "grad_norm": 0.22215527296066284, + "learning_rate": 4.3179451997087275e-05, + "loss": 0.0075, + "step": 41130 + }, + { + "epoch": 8.022813193955479, + "grad_norm": 0.005039687268435955, + "learning_rate": 4.3176442976042174e-05, + "loss": 0.1479, + "step": 41140 + }, + { + "epoch": 8.02286735633429, + "grad_norm": 0.15754130482673645, + "learning_rate": 4.317343395499708e-05, + "loss": 0.0719, + "step": 41150 + }, + { + "epoch": 8.022921518713101, + "grad_norm": 17.050159454345703, + "learning_rate": 4.3170424933951994e-05, + "loss": 0.0714, + "step": 41160 + }, + { + "epoch": 8.022975681091914, + "grad_norm": 0.059798091650009155, + "learning_rate": 4.316741591290689e-05, + "loss": 0.0611, + "step": 41170 + }, + { + "epoch": 8.023029843470725, + "grad_norm": 0.13118822872638702, + "learning_rate": 4.31644068918618e-05, + "loss": 0.053, + "step": 41180 + }, + { + "epoch": 8.023084005849537, + "grad_norm": 0.019875600934028625, + "learning_rate": 4.316139787081671e-05, + "loss": 0.0729, + "step": 41190 + }, + { + "epoch": 8.023138168228348, + "grad_norm": 0.08809970319271088, + "learning_rate": 4.315838884977162e-05, + "loss": 0.1135, + "step": 41200 + }, + { + "epoch": 8.023192330607161, + "grad_norm": 0.11126542836427689, + "learning_rate": 4.3155379828726525e-05, + "loss": 0.1088, + "step": 41210 + }, + { + "epoch": 8.023246492985972, + "grad_norm": 0.1447569727897644, + "learning_rate": 4.315237080768143e-05, + "loss": 0.0832, + "step": 41220 + }, + { + "epoch": 8.023300655364784, + "grad_norm": 1.727094054222107, + "learning_rate": 4.314936178663634e-05, + "loss": 0.1412, + "step": 41230 + }, + { + "epoch": 8.023354817743595, + "grad_norm": 0.04207412898540497, + "learning_rate": 4.3146352765591244e-05, + "loss": 0.0419, + "step": 41240 + }, + { + "epoch": 
8.023408980122406, + "grad_norm": 0.08664881438016891, + "learning_rate": 4.314334374454615e-05, + "loss": 0.1088, + "step": 41250 + }, + { + "epoch": 8.02346314250122, + "grad_norm": 6.224583625793457, + "learning_rate": 4.3140334723501056e-05, + "loss": 0.0223, + "step": 41260 + }, + { + "epoch": 8.02351730488003, + "grad_norm": 0.6837025880813599, + "learning_rate": 4.313732570245597e-05, + "loss": 0.19, + "step": 41270 + }, + { + "epoch": 8.023571467258842, + "grad_norm": 0.5179277658462524, + "learning_rate": 4.3134316681410876e-05, + "loss": 0.0957, + "step": 41280 + }, + { + "epoch": 8.023625629637653, + "grad_norm": 2.3300058841705322, + "learning_rate": 4.3131307660365775e-05, + "loss": 0.122, + "step": 41290 + }, + { + "epoch": 8.023679792016466, + "grad_norm": 0.12422972172498703, + "learning_rate": 4.312829863932069e-05, + "loss": 0.1792, + "step": 41300 + }, + { + "epoch": 8.023733954395277, + "grad_norm": 0.45089438557624817, + "learning_rate": 4.3125289618275595e-05, + "loss": 0.0467, + "step": 41310 + }, + { + "epoch": 8.023788116774089, + "grad_norm": 1.2710422277450562, + "learning_rate": 4.3122280597230494e-05, + "loss": 0.0463, + "step": 41320 + }, + { + "epoch": 8.0238422791529, + "grad_norm": 0.0716317817568779, + "learning_rate": 4.311927157618541e-05, + "loss": 0.0787, + "step": 41330 + }, + { + "epoch": 8.023896441531711, + "grad_norm": 0.0034725540317595005, + "learning_rate": 4.311626255514031e-05, + "loss": 0.0384, + "step": 41340 + }, + { + "epoch": 8.023950603910524, + "grad_norm": 0.21956148743629456, + "learning_rate": 4.311325353409522e-05, + "loss": 0.043, + "step": 41350 + }, + { + "epoch": 8.024004766289336, + "grad_norm": 0.35053732991218567, + "learning_rate": 4.3110244513050126e-05, + "loss": 0.06, + "step": 41360 + }, + { + "epoch": 8.024058928668147, + "grad_norm": 0.1360701620578766, + "learning_rate": 4.310723549200503e-05, + "loss": 0.0747, + "step": 41370 + }, + { + "epoch": 8.024113091046958, + "grad_norm": 0.4423236548900604, + "learning_rate": 4.310422647095994e-05, + "loss": 0.1398, + "step": 41380 + }, + { + "epoch": 8.024167253425771, + "grad_norm": 4.223097801208496, + "learning_rate": 4.310121744991485e-05, + "loss": 0.0389, + "step": 41390 + }, + { + "epoch": 8.024221415804583, + "grad_norm": 0.0032296935096383095, + "learning_rate": 4.309820842886975e-05, + "loss": 0.1235, + "step": 41400 + }, + { + "epoch": 8.024275578183394, + "grad_norm": 0.006954623386263847, + "learning_rate": 4.309519940782466e-05, + "loss": 0.1121, + "step": 41410 + }, + { + "epoch": 8.024329740562205, + "grad_norm": 4.553284168243408, + "learning_rate": 4.309219038677957e-05, + "loss": 0.1611, + "step": 41420 + }, + { + "epoch": 8.024383902941016, + "grad_norm": 0.03261779993772507, + "learning_rate": 4.3089181365734477e-05, + "loss": 0.201, + "step": 41430 + }, + { + "epoch": 8.02443806531983, + "grad_norm": 0.005640103481709957, + "learning_rate": 4.3086172344689376e-05, + "loss": 0.0497, + "step": 41440 + }, + { + "epoch": 8.02449222769864, + "grad_norm": 4.995294570922852, + "learning_rate": 4.308316332364429e-05, + "loss": 0.1892, + "step": 41450 + }, + { + "epoch": 8.024546390077452, + "grad_norm": 0.010837333276867867, + "learning_rate": 4.3080154302599195e-05, + "loss": 0.0789, + "step": 41460 + }, + { + "epoch": 8.024600552456263, + "grad_norm": 2.029491424560547, + "learning_rate": 4.30771452815541e-05, + "loss": 0.0315, + "step": 41470 + }, + { + "epoch": 8.024654714835076, + "grad_norm": 1.6953102350234985, + "learning_rate": 4.307413626050901e-05, 
+ "loss": 0.0645, + "step": 41480 + }, + { + "epoch": 8.024708877213888, + "grad_norm": 2.4386041164398193, + "learning_rate": 4.3071127239463914e-05, + "loss": 0.1372, + "step": 41490 + }, + { + "epoch": 8.024763039592699, + "grad_norm": 8.402539253234863, + "learning_rate": 4.306811821841882e-05, + "loss": 0.079, + "step": 41500 + }, + { + "epoch": 8.02481720197151, + "grad_norm": 0.6339068412780762, + "learning_rate": 4.306510919737373e-05, + "loss": 0.0947, + "step": 41510 + }, + { + "epoch": 8.024871364350322, + "grad_norm": 0.8308120965957642, + "learning_rate": 4.306210017632863e-05, + "loss": 0.0701, + "step": 41520 + }, + { + "epoch": 8.024925526729135, + "grad_norm": 0.4893210232257843, + "learning_rate": 4.3059091155283546e-05, + "loss": 0.0499, + "step": 41530 + }, + { + "epoch": 8.024979689107946, + "grad_norm": 0.9239968657493591, + "learning_rate": 4.305608213423845e-05, + "loss": 0.0377, + "step": 41540 + }, + { + "epoch": 8.025001354059471, + "eval_accuracy": 0.8324624428478119, + "eval_loss": 0.5703830122947693, + "eval_runtime": 116.6365, + "eval_samples_per_second": 26.253, + "eval_steps_per_second": 3.284, + "step": 41544 + }, + { + "epoch": 9.000032497427288, + "grad_norm": 6.142567157745361, + "learning_rate": 4.305307311319335e-05, + "loss": 0.0785, + "step": 41550 + }, + { + "epoch": 9.0000866598061, + "grad_norm": 0.1128431037068367, + "learning_rate": 4.3050064092148265e-05, + "loss": 0.0644, + "step": 41560 + }, + { + "epoch": 9.00014082218491, + "grad_norm": 0.29613569378852844, + "learning_rate": 4.304705507110317e-05, + "loss": 0.0833, + "step": 41570 + }, + { + "epoch": 9.000194984563722, + "grad_norm": 0.019570359960198402, + "learning_rate": 4.304404605005808e-05, + "loss": 0.0446, + "step": 41580 + }, + { + "epoch": 9.000249146942533, + "grad_norm": 0.03344811126589775, + "learning_rate": 4.3041037029012984e-05, + "loss": 0.1097, + "step": 41590 + }, + { + "epoch": 9.000303309321346, + "grad_norm": 0.9105361700057983, + "learning_rate": 4.303802800796789e-05, + "loss": 0.0475, + "step": 41600 + }, + { + "epoch": 9.000357471700157, + "grad_norm": 0.6163584589958191, + "learning_rate": 4.3035018986922796e-05, + "loss": 0.008, + "step": 41610 + }, + { + "epoch": 9.000411634078969, + "grad_norm": 0.3951288163661957, + "learning_rate": 4.30320099658777e-05, + "loss": 0.1102, + "step": 41620 + }, + { + "epoch": 9.00046579645778, + "grad_norm": 0.004681719001382589, + "learning_rate": 4.302900094483261e-05, + "loss": 0.0958, + "step": 41630 + }, + { + "epoch": 9.000519958836593, + "grad_norm": 0.04312437027692795, + "learning_rate": 4.3025991923787515e-05, + "loss": 0.0036, + "step": 41640 + }, + { + "epoch": 9.000574121215404, + "grad_norm": 0.005121935624629259, + "learning_rate": 4.302298290274243e-05, + "loss": 0.0462, + "step": 41650 + }, + { + "epoch": 9.000628283594216, + "grad_norm": 0.03603469952940941, + "learning_rate": 4.301997388169733e-05, + "loss": 0.102, + "step": 41660 + }, + { + "epoch": 9.000682445973027, + "grad_norm": 0.1408948451280594, + "learning_rate": 4.3016964860652234e-05, + "loss": 0.0585, + "step": 41670 + }, + { + "epoch": 9.000736608351838, + "grad_norm": 3.09546160697937, + "learning_rate": 4.301395583960715e-05, + "loss": 0.0492, + "step": 41680 + }, + { + "epoch": 9.000790770730651, + "grad_norm": 0.428017795085907, + "learning_rate": 4.301094681856205e-05, + "loss": 0.0353, + "step": 41690 + }, + { + "epoch": 9.000844933109462, + "grad_norm": 0.333632230758667, + "learning_rate": 4.300793779751695e-05, + "loss": 0.061, + 
"step": 41700 + }, + { + "epoch": 9.000899095488274, + "grad_norm": 16.087642669677734, + "learning_rate": 4.3004928776471866e-05, + "loss": 0.105, + "step": 41710 + }, + { + "epoch": 9.000953257867085, + "grad_norm": 4.54427433013916, + "learning_rate": 4.300191975542677e-05, + "loss": 0.1692, + "step": 41720 + }, + { + "epoch": 9.001007420245898, + "grad_norm": 0.057187363505363464, + "learning_rate": 4.299891073438168e-05, + "loss": 0.1421, + "step": 41730 + }, + { + "epoch": 9.00106158262471, + "grad_norm": 0.392899751663208, + "learning_rate": 4.2995901713336585e-05, + "loss": 0.0876, + "step": 41740 + }, + { + "epoch": 9.00111574500352, + "grad_norm": 0.016992326825857162, + "learning_rate": 4.299289269229149e-05, + "loss": 0.0372, + "step": 41750 + }, + { + "epoch": 9.001169907382332, + "grad_norm": 0.06107435002923012, + "learning_rate": 4.29898836712464e-05, + "loss": 0.0571, + "step": 41760 + }, + { + "epoch": 9.001224069761143, + "grad_norm": 0.11072302609682083, + "learning_rate": 4.2986874650201304e-05, + "loss": 0.1108, + "step": 41770 + }, + { + "epoch": 9.001278232139956, + "grad_norm": 0.12192647159099579, + "learning_rate": 4.298386562915621e-05, + "loss": 0.0232, + "step": 41780 + }, + { + "epoch": 9.001332394518768, + "grad_norm": 0.1537124365568161, + "learning_rate": 4.298085660811112e-05, + "loss": 0.058, + "step": 41790 + }, + { + "epoch": 9.001386556897579, + "grad_norm": 0.005915668793022633, + "learning_rate": 4.297784758706603e-05, + "loss": 0.0466, + "step": 41800 + }, + { + "epoch": 9.00144071927639, + "grad_norm": 1.4935382604599, + "learning_rate": 4.297483856602093e-05, + "loss": 0.131, + "step": 41810 + }, + { + "epoch": 9.001494881655203, + "grad_norm": 0.3819156885147095, + "learning_rate": 4.297182954497584e-05, + "loss": 0.0596, + "step": 41820 + }, + { + "epoch": 9.001549044034014, + "grad_norm": 0.038537491112947464, + "learning_rate": 4.296882052393075e-05, + "loss": 0.0449, + "step": 41830 + }, + { + "epoch": 9.001603206412826, + "grad_norm": 3.444202423095703, + "learning_rate": 4.2965811502885654e-05, + "loss": 0.0876, + "step": 41840 + }, + { + "epoch": 9.001657368791637, + "grad_norm": 0.39064234495162964, + "learning_rate": 4.296280248184056e-05, + "loss": 0.0136, + "step": 41850 + }, + { + "epoch": 9.001711531170448, + "grad_norm": 0.03594626113772392, + "learning_rate": 4.295979346079547e-05, + "loss": 0.0692, + "step": 41860 + }, + { + "epoch": 9.001765693549261, + "grad_norm": 0.004824209026992321, + "learning_rate": 4.295678443975037e-05, + "loss": 0.1162, + "step": 41870 + }, + { + "epoch": 9.001819855928073, + "grad_norm": 0.9010862112045288, + "learning_rate": 4.2953775418705286e-05, + "loss": 0.1021, + "step": 41880 + }, + { + "epoch": 9.001874018306884, + "grad_norm": 1.1265349388122559, + "learning_rate": 4.2950766397660186e-05, + "loss": 0.0686, + "step": 41890 + }, + { + "epoch": 9.001928180685695, + "grad_norm": 0.005664234049618244, + "learning_rate": 4.294775737661509e-05, + "loss": 0.0386, + "step": 41900 + }, + { + "epoch": 9.001982343064507, + "grad_norm": 2.2013864517211914, + "learning_rate": 4.2944748355570005e-05, + "loss": 0.0267, + "step": 41910 + }, + { + "epoch": 9.00203650544332, + "grad_norm": 0.01195481326431036, + "learning_rate": 4.2941739334524904e-05, + "loss": 0.0186, + "step": 41920 + }, + { + "epoch": 9.00209066782213, + "grad_norm": 0.034756213426589966, + "learning_rate": 4.293873031347981e-05, + "loss": 0.045, + "step": 41930 + }, + { + "epoch": 9.002144830200942, + "grad_norm": 1.260573148727417, + 
"learning_rate": 4.2935721292434724e-05, + "loss": 0.0727, + "step": 41940 + }, + { + "epoch": 9.002198992579753, + "grad_norm": 0.14897190034389496, + "learning_rate": 4.293271227138963e-05, + "loss": 0.0247, + "step": 41950 + }, + { + "epoch": 9.002253154958566, + "grad_norm": 0.0034989556297659874, + "learning_rate": 4.2929703250344536e-05, + "loss": 0.09, + "step": 41960 + }, + { + "epoch": 9.002307317337378, + "grad_norm": 3.966078042984009, + "learning_rate": 4.292669422929944e-05, + "loss": 0.1412, + "step": 41970 + }, + { + "epoch": 9.002361479716189, + "grad_norm": 5.718930721282959, + "learning_rate": 4.292368520825435e-05, + "loss": 0.2256, + "step": 41980 + }, + { + "epoch": 9.002415642095, + "grad_norm": 0.018360983580350876, + "learning_rate": 4.2920676187209255e-05, + "loss": 0.1039, + "step": 41990 + }, + { + "epoch": 9.002469804473812, + "grad_norm": 0.056573521345853806, + "learning_rate": 4.291766716616416e-05, + "loss": 0.1111, + "step": 42000 + }, + { + "epoch": 9.002523966852625, + "grad_norm": 0.08263130486011505, + "learning_rate": 4.291465814511907e-05, + "loss": 0.021, + "step": 42010 + }, + { + "epoch": 9.002578129231436, + "grad_norm": 15.164730072021484, + "learning_rate": 4.2911649124073974e-05, + "loss": 0.1014, + "step": 42020 + }, + { + "epoch": 9.002632291610247, + "grad_norm": 0.2879883944988251, + "learning_rate": 4.290864010302889e-05, + "loss": 0.0483, + "step": 42030 + }, + { + "epoch": 9.002686453989059, + "grad_norm": 0.004615205340087414, + "learning_rate": 4.2905631081983786e-05, + "loss": 0.0438, + "step": 42040 + }, + { + "epoch": 9.002740616367872, + "grad_norm": 0.00396726094186306, + "learning_rate": 4.29026220609387e-05, + "loss": 0.1153, + "step": 42050 + }, + { + "epoch": 9.002794778746683, + "grad_norm": 0.04269927367568016, + "learning_rate": 4.2899613039893606e-05, + "loss": 0.0851, + "step": 42060 + }, + { + "epoch": 9.002848941125494, + "grad_norm": 0.0041802749037742615, + "learning_rate": 4.2896604018848505e-05, + "loss": 0.0764, + "step": 42070 + }, + { + "epoch": 9.002903103504305, + "grad_norm": 0.48491567373275757, + "learning_rate": 4.289359499780342e-05, + "loss": 0.0038, + "step": 42080 + }, + { + "epoch": 9.002957265883117, + "grad_norm": 0.0756312683224678, + "learning_rate": 4.2890585976758325e-05, + "loss": 0.0829, + "step": 42090 + }, + { + "epoch": 9.00301142826193, + "grad_norm": 2.449056625366211, + "learning_rate": 4.288757695571323e-05, + "loss": 0.0858, + "step": 42100 + }, + { + "epoch": 9.003065590640741, + "grad_norm": 0.05732494592666626, + "learning_rate": 4.288456793466814e-05, + "loss": 0.0745, + "step": 42110 + }, + { + "epoch": 9.003119753019552, + "grad_norm": 0.004556973464787006, + "learning_rate": 4.2881558913623043e-05, + "loss": 0.0721, + "step": 42120 + }, + { + "epoch": 9.003173915398364, + "grad_norm": 0.13676634430885315, + "learning_rate": 4.287854989257795e-05, + "loss": 0.039, + "step": 42130 + }, + { + "epoch": 9.003228077777177, + "grad_norm": 0.036252379417419434, + "learning_rate": 4.287554087153286e-05, + "loss": 0.0632, + "step": 42140 + }, + { + "epoch": 9.003282240155988, + "grad_norm": 0.2831238806247711, + "learning_rate": 4.287253185048776e-05, + "loss": 0.0561, + "step": 42150 + }, + { + "epoch": 9.0033364025348, + "grad_norm": 3.3474221229553223, + "learning_rate": 4.286952282944267e-05, + "loss": 0.0223, + "step": 42160 + }, + { + "epoch": 9.00339056491361, + "grad_norm": 0.0218286644667387, + "learning_rate": 4.286651380839758e-05, + "loss": 0.0108, + "step": 42170 + }, + { + 
"epoch": 9.003444727292422, + "grad_norm": 0.01826193556189537, + "learning_rate": 4.286350478735249e-05, + "loss": 0.1079, + "step": 42180 + }, + { + "epoch": 9.003498889671235, + "grad_norm": 0.9937471151351929, + "learning_rate": 4.286049576630739e-05, + "loss": 0.2177, + "step": 42190 + }, + { + "epoch": 9.003553052050046, + "grad_norm": 0.10716183483600616, + "learning_rate": 4.28574867452623e-05, + "loss": 0.0141, + "step": 42200 + }, + { + "epoch": 9.003607214428857, + "grad_norm": 1.5688458681106567, + "learning_rate": 4.285447772421721e-05, + "loss": 0.0631, + "step": 42210 + }, + { + "epoch": 9.003661376807669, + "grad_norm": 0.02246861718595028, + "learning_rate": 4.285146870317211e-05, + "loss": 0.0512, + "step": 42220 + }, + { + "epoch": 9.003715539186482, + "grad_norm": 0.007512201555073261, + "learning_rate": 4.284845968212702e-05, + "loss": 0.0801, + "step": 42230 + }, + { + "epoch": 9.003769701565293, + "grad_norm": 0.298935204744339, + "learning_rate": 4.2845450661081926e-05, + "loss": 0.2077, + "step": 42240 + }, + { + "epoch": 9.003823863944104, + "grad_norm": 0.16681255400180817, + "learning_rate": 4.284244164003683e-05, + "loss": 0.044, + "step": 42250 + }, + { + "epoch": 9.003878026322916, + "grad_norm": 0.011046506464481354, + "learning_rate": 4.283943261899174e-05, + "loss": 0.0818, + "step": 42260 + }, + { + "epoch": 9.003932188701727, + "grad_norm": 0.060819875448942184, + "learning_rate": 4.2836423597946644e-05, + "loss": 0.0155, + "step": 42270 + }, + { + "epoch": 9.00398635108054, + "grad_norm": 2.6181910037994385, + "learning_rate": 4.283341457690155e-05, + "loss": 0.1387, + "step": 42280 + }, + { + "epoch": 9.004040513459351, + "grad_norm": 0.00302243884652853, + "learning_rate": 4.2830405555856464e-05, + "loss": 0.0944, + "step": 42290 + }, + { + "epoch": 9.004094675838163, + "grad_norm": 0.06178848445415497, + "learning_rate": 4.282739653481136e-05, + "loss": 0.0622, + "step": 42300 + }, + { + "epoch": 9.004148838216974, + "grad_norm": 0.005295417737215757, + "learning_rate": 4.2824387513766276e-05, + "loss": 0.0382, + "step": 42310 + }, + { + "epoch": 9.004203000595787, + "grad_norm": 0.8212676048278809, + "learning_rate": 4.282137849272118e-05, + "loss": 0.0371, + "step": 42320 + }, + { + "epoch": 9.004257162974598, + "grad_norm": 0.13002373278141022, + "learning_rate": 4.281836947167609e-05, + "loss": 0.0246, + "step": 42330 + }, + { + "epoch": 9.00431132535341, + "grad_norm": 0.016745448112487793, + "learning_rate": 4.2815360450630995e-05, + "loss": 0.0569, + "step": 42340 + }, + { + "epoch": 9.00436548773222, + "grad_norm": 0.07227767258882523, + "learning_rate": 4.28123514295859e-05, + "loss": 0.1723, + "step": 42350 + }, + { + "epoch": 9.004419650111032, + "grad_norm": 0.02563035488128662, + "learning_rate": 4.280934240854081e-05, + "loss": 0.215, + "step": 42360 + }, + { + "epoch": 9.004473812489845, + "grad_norm": 0.041942741721868515, + "learning_rate": 4.2806333387495714e-05, + "loss": 0.0584, + "step": 42370 + }, + { + "epoch": 9.004527974868656, + "grad_norm": 0.0067081390880048275, + "learning_rate": 4.280332436645062e-05, + "loss": 0.056, + "step": 42380 + }, + { + "epoch": 9.004582137247468, + "grad_norm": 0.31662610173225403, + "learning_rate": 4.2800315345405526e-05, + "loss": 0.0744, + "step": 42390 + }, + { + "epoch": 9.004636299626279, + "grad_norm": 3.5886924266815186, + "learning_rate": 4.279730632436044e-05, + "loss": 0.1028, + "step": 42400 + }, + { + "epoch": 9.004690462005092, + "grad_norm": 0.12772579491138458, + 
"learning_rate": 4.279429730331534e-05, + "loss": 0.0115, + "step": 42410 + }, + { + "epoch": 9.004744624383903, + "grad_norm": 0.006379902828484774, + "learning_rate": 4.2791288282270245e-05, + "loss": 0.0409, + "step": 42420 + }, + { + "epoch": 9.004798786762715, + "grad_norm": 3.876110076904297, + "learning_rate": 4.278827926122516e-05, + "loss": 0.0999, + "step": 42430 + }, + { + "epoch": 9.004852949141526, + "grad_norm": 0.04519415646791458, + "learning_rate": 4.2785270240180065e-05, + "loss": 0.0182, + "step": 42440 + }, + { + "epoch": 9.004907111520337, + "grad_norm": 0.002244760049507022, + "learning_rate": 4.2782261219134964e-05, + "loss": 0.0121, + "step": 42450 + }, + { + "epoch": 9.00496127389915, + "grad_norm": 0.006557360757142305, + "learning_rate": 4.277925219808988e-05, + "loss": 0.1026, + "step": 42460 + }, + { + "epoch": 9.005015436277962, + "grad_norm": 0.044961534440517426, + "learning_rate": 4.277624317704478e-05, + "loss": 0.0016, + "step": 42470 + }, + { + "epoch": 9.005069598656773, + "grad_norm": 0.2837388217449188, + "learning_rate": 4.277323415599969e-05, + "loss": 0.0823, + "step": 42480 + }, + { + "epoch": 9.005123761035584, + "grad_norm": 0.0018040926661342382, + "learning_rate": 4.2770225134954596e-05, + "loss": 0.0017, + "step": 42490 + }, + { + "epoch": 9.005177923414397, + "grad_norm": 0.0021211986895650625, + "learning_rate": 4.27672161139095e-05, + "loss": 0.0183, + "step": 42500 + }, + { + "epoch": 9.005232085793208, + "grad_norm": 0.4618780016899109, + "learning_rate": 4.276420709286441e-05, + "loss": 0.266, + "step": 42510 + }, + { + "epoch": 9.00528624817202, + "grad_norm": 2.606851577758789, + "learning_rate": 4.2761198071819315e-05, + "loss": 0.0324, + "step": 42520 + }, + { + "epoch": 9.005340410550831, + "grad_norm": 7.905721664428711, + "learning_rate": 4.275818905077422e-05, + "loss": 0.1537, + "step": 42530 + }, + { + "epoch": 9.005394572929642, + "grad_norm": 2.6049423217773438, + "learning_rate": 4.275518002972913e-05, + "loss": 0.0259, + "step": 42540 + }, + { + "epoch": 9.005448735308455, + "grad_norm": 0.06162184476852417, + "learning_rate": 4.275217100868404e-05, + "loss": 0.1426, + "step": 42550 + }, + { + "epoch": 9.005502897687267, + "grad_norm": 0.037246111780405045, + "learning_rate": 4.274916198763894e-05, + "loss": 0.0117, + "step": 42560 + }, + { + "epoch": 9.005557060066078, + "grad_norm": 0.022916628047823906, + "learning_rate": 4.274615296659385e-05, + "loss": 0.0766, + "step": 42570 + }, + { + "epoch": 9.00561122244489, + "grad_norm": 0.02946312353014946, + "learning_rate": 4.274314394554876e-05, + "loss": 0.07, + "step": 42580 + }, + { + "epoch": 9.005665384823702, + "grad_norm": 0.021657738834619522, + "learning_rate": 4.2740134924503665e-05, + "loss": 0.1066, + "step": 42590 + }, + { + "epoch": 9.005719547202514, + "grad_norm": 0.9814549684524536, + "learning_rate": 4.273712590345857e-05, + "loss": 0.0605, + "step": 42600 + }, + { + "epoch": 9.005773709581325, + "grad_norm": 0.7900279760360718, + "learning_rate": 4.273411688241348e-05, + "loss": 0.0939, + "step": 42610 + }, + { + "epoch": 9.005827871960136, + "grad_norm": 0.06456425786018372, + "learning_rate": 4.2731107861368384e-05, + "loss": 0.0036, + "step": 42620 + }, + { + "epoch": 9.005882034338947, + "grad_norm": 0.01162367407232523, + "learning_rate": 4.27280988403233e-05, + "loss": 0.1406, + "step": 42630 + }, + { + "epoch": 9.00593619671776, + "grad_norm": 5.264919281005859, + "learning_rate": 4.27250898192782e-05, + "loss": 0.1276, + "step": 42640 + }, + { + 
"epoch": 9.005990359096572, + "grad_norm": 0.02391819655895233, + "learning_rate": 4.27220807982331e-05, + "loss": 0.0784, + "step": 42650 + }, + { + "epoch": 9.006044521475383, + "grad_norm": 0.10482876747846603, + "learning_rate": 4.2719071777188016e-05, + "loss": 0.0291, + "step": 42660 + }, + { + "epoch": 9.006098683854194, + "grad_norm": 0.7629581093788147, + "learning_rate": 4.2716062756142916e-05, + "loss": 0.0537, + "step": 42670 + }, + { + "epoch": 9.006152846233007, + "grad_norm": 0.7675333023071289, + "learning_rate": 4.271305373509782e-05, + "loss": 0.11, + "step": 42680 + }, + { + "epoch": 9.006207008611819, + "grad_norm": 0.1164848655462265, + "learning_rate": 4.2710044714052735e-05, + "loss": 0.1394, + "step": 42690 + }, + { + "epoch": 9.00626117099063, + "grad_norm": 0.08100416511297226, + "learning_rate": 4.270703569300764e-05, + "loss": 0.054, + "step": 42700 + }, + { + "epoch": 9.006315333369441, + "grad_norm": 0.010553435422480106, + "learning_rate": 4.270402667196254e-05, + "loss": 0.0056, + "step": 42710 + }, + { + "epoch": 9.006369495748253, + "grad_norm": 0.004983165767043829, + "learning_rate": 4.2701017650917454e-05, + "loss": 0.0802, + "step": 42720 + }, + { + "epoch": 9.006423658127066, + "grad_norm": 18.765371322631836, + "learning_rate": 4.269800862987236e-05, + "loss": 0.0732, + "step": 42730 + }, + { + "epoch": 9.006477820505877, + "grad_norm": 0.09618460386991501, + "learning_rate": 4.2694999608827266e-05, + "loss": 0.1037, + "step": 42740 + }, + { + "epoch": 9.006531982884688, + "grad_norm": 0.04492199420928955, + "learning_rate": 4.269199058778217e-05, + "loss": 0.0276, + "step": 42750 + }, + { + "epoch": 9.0065861452635, + "grad_norm": 0.7839969992637634, + "learning_rate": 4.268898156673708e-05, + "loss": 0.0939, + "step": 42760 + }, + { + "epoch": 9.006640307642312, + "grad_norm": 0.8772493600845337, + "learning_rate": 4.2685972545691985e-05, + "loss": 0.1926, + "step": 42770 + }, + { + "epoch": 9.006694470021124, + "grad_norm": 0.05115017667412758, + "learning_rate": 4.26829635246469e-05, + "loss": 0.0101, + "step": 42780 + }, + { + "epoch": 9.006748632399935, + "grad_norm": 0.03567923977971077, + "learning_rate": 4.26799545036018e-05, + "loss": 0.1206, + "step": 42790 + }, + { + "epoch": 9.006802794778746, + "grad_norm": 0.03379151225090027, + "learning_rate": 4.2676945482556704e-05, + "loss": 0.0563, + "step": 42800 + }, + { + "epoch": 9.006856957157558, + "grad_norm": 0.033577047288417816, + "learning_rate": 4.267393646151162e-05, + "loss": 0.1418, + "step": 42810 + }, + { + "epoch": 9.00691111953637, + "grad_norm": 11.362493515014648, + "learning_rate": 4.2670927440466517e-05, + "loss": 0.1506, + "step": 42820 + }, + { + "epoch": 9.006965281915182, + "grad_norm": 0.0357264019548893, + "learning_rate": 4.266791841942143e-05, + "loss": 0.044, + "step": 42830 + }, + { + "epoch": 9.007019444293993, + "grad_norm": 2.108243465423584, + "learning_rate": 4.2664909398376336e-05, + "loss": 0.0615, + "step": 42840 + }, + { + "epoch": 9.007073606672805, + "grad_norm": 0.20911255478858948, + "learning_rate": 4.266190037733124e-05, + "loss": 0.0714, + "step": 42850 + }, + { + "epoch": 9.007127769051618, + "grad_norm": 0.0795915499329567, + "learning_rate": 4.265889135628615e-05, + "loss": 0.0102, + "step": 42860 + }, + { + "epoch": 9.007181931430429, + "grad_norm": 0.22700297832489014, + "learning_rate": 4.2655882335241055e-05, + "loss": 0.0916, + "step": 42870 + }, + { + "epoch": 9.00723609380924, + "grad_norm": 0.122267946600914, + "learning_rate": 
4.265287331419596e-05, + "loss": 0.0584, + "step": 42880 + }, + { + "epoch": 9.007290256188051, + "grad_norm": 6.640659809112549, + "learning_rate": 4.2649864293150874e-05, + "loss": 0.1531, + "step": 42890 + }, + { + "epoch": 9.007344418566863, + "grad_norm": 12.717330932617188, + "learning_rate": 4.2646855272105774e-05, + "loss": 0.0788, + "step": 42900 + }, + { + "epoch": 9.007398580945676, + "grad_norm": 0.2792956829071045, + "learning_rate": 4.264384625106068e-05, + "loss": 0.1566, + "step": 42910 + }, + { + "epoch": 9.007452743324487, + "grad_norm": 0.44896551966667175, + "learning_rate": 4.264083723001559e-05, + "loss": 0.0396, + "step": 42920 + }, + { + "epoch": 9.007506905703298, + "grad_norm": 0.015411474741995335, + "learning_rate": 4.26378282089705e-05, + "loss": 0.0052, + "step": 42930 + }, + { + "epoch": 9.00756106808211, + "grad_norm": 0.013018003664910793, + "learning_rate": 4.26348191879254e-05, + "loss": 0.0089, + "step": 42940 + }, + { + "epoch": 9.007615230460923, + "grad_norm": 0.12610280513763428, + "learning_rate": 4.263181016688031e-05, + "loss": 0.077, + "step": 42950 + }, + { + "epoch": 9.007669392839734, + "grad_norm": 2.947533130645752, + "learning_rate": 4.262880114583522e-05, + "loss": 0.2088, + "step": 42960 + }, + { + "epoch": 9.007723555218545, + "grad_norm": 0.05879289656877518, + "learning_rate": 4.262579212479012e-05, + "loss": 0.1255, + "step": 42970 + }, + { + "epoch": 9.007777717597357, + "grad_norm": 0.044879548251628876, + "learning_rate": 4.262278310374503e-05, + "loss": 0.22, + "step": 42980 + }, + { + "epoch": 9.007831879976168, + "grad_norm": 0.06363508850336075, + "learning_rate": 4.261977408269994e-05, + "loss": 0.053, + "step": 42990 + }, + { + "epoch": 9.007886042354981, + "grad_norm": 4.607552528381348, + "learning_rate": 4.261676506165484e-05, + "loss": 0.173, + "step": 43000 + }, + { + "epoch": 9.007940204733792, + "grad_norm": 0.015234051272273064, + "learning_rate": 4.261375604060975e-05, + "loss": 0.0856, + "step": 43010 + }, + { + "epoch": 9.007994367112603, + "grad_norm": 4.89670991897583, + "learning_rate": 4.2610747019564656e-05, + "loss": 0.2265, + "step": 43020 + }, + { + "epoch": 9.008048529491415, + "grad_norm": 3.2527337074279785, + "learning_rate": 4.260773799851956e-05, + "loss": 0.077, + "step": 43030 + }, + { + "epoch": 9.008102691870226, + "grad_norm": 0.19953791797161102, + "learning_rate": 4.2604728977474475e-05, + "loss": 0.0821, + "step": 43040 + }, + { + "epoch": 9.00815685424904, + "grad_norm": 0.5959811210632324, + "learning_rate": 4.2601719956429374e-05, + "loss": 0.0935, + "step": 43050 + }, + { + "epoch": 9.00821101662785, + "grad_norm": 0.13908055424690247, + "learning_rate": 4.259871093538428e-05, + "loss": 0.0156, + "step": 43060 + }, + { + "epoch": 9.008265179006662, + "grad_norm": 0.22849762439727783, + "learning_rate": 4.2595701914339194e-05, + "loss": 0.0741, + "step": 43070 + }, + { + "epoch": 9.008319341385473, + "grad_norm": 1.204566240310669, + "learning_rate": 4.25926928932941e-05, + "loss": 0.1266, + "step": 43080 + }, + { + "epoch": 9.008373503764286, + "grad_norm": 0.0107711311429739, + "learning_rate": 4.2589683872249006e-05, + "loss": 0.0554, + "step": 43090 + }, + { + "epoch": 9.008427666143097, + "grad_norm": 0.1398591846227646, + "learning_rate": 4.258667485120391e-05, + "loss": 0.1323, + "step": 43100 + }, + { + "epoch": 9.008481828521909, + "grad_norm": 0.03322839364409447, + "learning_rate": 4.258366583015882e-05, + "loss": 0.1556, + "step": 43110 + }, + { + "epoch": 9.00853599090072, + 
"grad_norm": 1.9588408470153809, + "learning_rate": 4.2580656809113725e-05, + "loss": 0.1093, + "step": 43120 + }, + { + "epoch": 9.008590153279531, + "grad_norm": 1.1661689281463623, + "learning_rate": 4.257764778806863e-05, + "loss": 0.0592, + "step": 43130 + }, + { + "epoch": 9.008644315658344, + "grad_norm": 0.02498439885675907, + "learning_rate": 4.257463876702354e-05, + "loss": 0.0664, + "step": 43140 + }, + { + "epoch": 9.008698478037156, + "grad_norm": 11.338614463806152, + "learning_rate": 4.257162974597845e-05, + "loss": 0.2524, + "step": 43150 + }, + { + "epoch": 9.008752640415967, + "grad_norm": 0.04371616244316101, + "learning_rate": 4.256862072493335e-05, + "loss": 0.0153, + "step": 43160 + }, + { + "epoch": 9.008806802794778, + "grad_norm": 0.11613127589225769, + "learning_rate": 4.2565611703888256e-05, + "loss": 0.0918, + "step": 43170 + }, + { + "epoch": 9.008860965173591, + "grad_norm": 0.2955622673034668, + "learning_rate": 4.256260268284317e-05, + "loss": 0.0236, + "step": 43180 + }, + { + "epoch": 9.008915127552402, + "grad_norm": 0.6140323877334595, + "learning_rate": 4.2559593661798076e-05, + "loss": 0.0503, + "step": 43190 + }, + { + "epoch": 9.008969289931214, + "grad_norm": 0.006156423129141331, + "learning_rate": 4.2556584640752975e-05, + "loss": 0.0602, + "step": 43200 + }, + { + "epoch": 9.009023452310025, + "grad_norm": 9.601374626159668, + "learning_rate": 4.255357561970789e-05, + "loss": 0.0106, + "step": 43210 + }, + { + "epoch": 9.009077614688836, + "grad_norm": 0.015180746093392372, + "learning_rate": 4.2550566598662795e-05, + "loss": 0.0413, + "step": 43220 + }, + { + "epoch": 9.00913177706765, + "grad_norm": 0.01842254027724266, + "learning_rate": 4.25475575776177e-05, + "loss": 0.0431, + "step": 43230 + }, + { + "epoch": 9.00918593944646, + "grad_norm": 0.0684308260679245, + "learning_rate": 4.254454855657261e-05, + "loss": 0.0575, + "step": 43240 + }, + { + "epoch": 9.009240101825272, + "grad_norm": 0.0678224116563797, + "learning_rate": 4.2541539535527513e-05, + "loss": 0.0516, + "step": 43250 + }, + { + "epoch": 9.009294264204083, + "grad_norm": 0.00866731908172369, + "learning_rate": 4.253853051448242e-05, + "loss": 0.0036, + "step": 43260 + }, + { + "epoch": 9.009348426582896, + "grad_norm": 0.013296184130012989, + "learning_rate": 4.2535521493437326e-05, + "loss": 0.1129, + "step": 43270 + }, + { + "epoch": 9.009402588961708, + "grad_norm": 0.037865351885557175, + "learning_rate": 4.253251247239223e-05, + "loss": 0.0504, + "step": 43280 + }, + { + "epoch": 9.009456751340519, + "grad_norm": 0.0850520133972168, + "learning_rate": 4.252950345134714e-05, + "loss": 0.0832, + "step": 43290 + }, + { + "epoch": 9.00951091371933, + "grad_norm": 0.18459945917129517, + "learning_rate": 4.252649443030205e-05, + "loss": 0.0108, + "step": 43300 + }, + { + "epoch": 9.009565076098141, + "grad_norm": 0.05866061896085739, + "learning_rate": 4.252348540925695e-05, + "loss": 0.0923, + "step": 43310 + }, + { + "epoch": 9.009619238476954, + "grad_norm": 9.33717155456543, + "learning_rate": 4.252047638821186e-05, + "loss": 0.0696, + "step": 43320 + }, + { + "epoch": 9.009673400855766, + "grad_norm": 0.023671872913837433, + "learning_rate": 4.251746736716677e-05, + "loss": 0.1533, + "step": 43330 + }, + { + "epoch": 9.009727563234577, + "grad_norm": 0.12895365059375763, + "learning_rate": 4.251445834612168e-05, + "loss": 0.0845, + "step": 43340 + }, + { + "epoch": 9.009781725613388, + "grad_norm": 0.05179506167769432, + "learning_rate": 4.251144932507658e-05, + "loss": 
0.0987, + "step": 43350 + }, + { + "epoch": 9.009835887992201, + "grad_norm": 0.008485982194542885, + "learning_rate": 4.250844030403149e-05, + "loss": 0.011, + "step": 43360 + }, + { + "epoch": 9.009890050371013, + "grad_norm": 7.762475490570068, + "learning_rate": 4.2505431282986396e-05, + "loss": 0.0422, + "step": 43370 + }, + { + "epoch": 9.009944212749824, + "grad_norm": 0.01618473045527935, + "learning_rate": 4.25024222619413e-05, + "loss": 0.0048, + "step": 43380 + }, + { + "epoch": 9.009998375128635, + "grad_norm": 0.010477167554199696, + "learning_rate": 4.249941324089621e-05, + "loss": 0.0352, + "step": 43390 + }, + { + "epoch": 9.010052537507447, + "grad_norm": 3.1319937705993652, + "learning_rate": 4.2496404219851114e-05, + "loss": 0.2158, + "step": 43400 + }, + { + "epoch": 9.01010669988626, + "grad_norm": 2.9853122234344482, + "learning_rate": 4.249339519880603e-05, + "loss": 0.191, + "step": 43410 + }, + { + "epoch": 9.01016086226507, + "grad_norm": 0.43229007720947266, + "learning_rate": 4.249038617776093e-05, + "loss": 0.0749, + "step": 43420 + }, + { + "epoch": 9.010215024643882, + "grad_norm": 0.048307474702596664, + "learning_rate": 4.248737715671583e-05, + "loss": 0.0198, + "step": 43430 + }, + { + "epoch": 9.010269187022693, + "grad_norm": 0.10584838688373566, + "learning_rate": 4.2484368135670746e-05, + "loss": 0.0308, + "step": 43440 + }, + { + "epoch": 9.010323349401506, + "grad_norm": 0.11071710288524628, + "learning_rate": 4.248135911462565e-05, + "loss": 0.0374, + "step": 43450 + }, + { + "epoch": 9.010377511780318, + "grad_norm": 0.022024765610694885, + "learning_rate": 4.247835009358055e-05, + "loss": 0.0335, + "step": 43460 + }, + { + "epoch": 9.010431674159129, + "grad_norm": 0.004923023283481598, + "learning_rate": 4.2475341072535465e-05, + "loss": 0.0523, + "step": 43470 + }, + { + "epoch": 9.01048583653794, + "grad_norm": 0.002891167998313904, + "learning_rate": 4.247233205149037e-05, + "loss": 0.0441, + "step": 43480 + }, + { + "epoch": 9.010539998916752, + "grad_norm": 0.010816848836839199, + "learning_rate": 4.246932303044528e-05, + "loss": 0.0252, + "step": 43490 + }, + { + "epoch": 9.010594161295565, + "grad_norm": 1.0253852605819702, + "learning_rate": 4.2466314009400184e-05, + "loss": 0.031, + "step": 43500 + }, + { + "epoch": 9.010648323674376, + "grad_norm": 0.6910658478736877, + "learning_rate": 4.246330498835509e-05, + "loss": 0.0288, + "step": 43510 + }, + { + "epoch": 9.010702486053187, + "grad_norm": 0.22735072672367096, + "learning_rate": 4.2460295967309996e-05, + "loss": 0.0445, + "step": 43520 + }, + { + "epoch": 9.010756648431999, + "grad_norm": 0.0020537173841148615, + "learning_rate": 4.24572869462649e-05, + "loss": 0.0671, + "step": 43530 + }, + { + "epoch": 9.010810810810812, + "grad_norm": 0.02236105687916279, + "learning_rate": 4.245427792521981e-05, + "loss": 0.0772, + "step": 43540 + }, + { + "epoch": 9.010864973189623, + "grad_norm": 0.01016053557395935, + "learning_rate": 4.2451268904174715e-05, + "loss": 0.047, + "step": 43550 + }, + { + "epoch": 9.010919135568434, + "grad_norm": 0.07877204567193985, + "learning_rate": 4.244825988312963e-05, + "loss": 0.0636, + "step": 43560 + }, + { + "epoch": 9.010973297947245, + "grad_norm": 0.020506711676716805, + "learning_rate": 4.244525086208453e-05, + "loss": 0.0028, + "step": 43570 + }, + { + "epoch": 9.011027460326057, + "grad_norm": 0.07351285219192505, + "learning_rate": 4.2442241841039434e-05, + "loss": 0.0766, + "step": 43580 + }, + { + "epoch": 9.01108162270487, + "grad_norm": 
0.0957598090171814, + "learning_rate": 4.243923281999435e-05, + "loss": 0.0068, + "step": 43590 + }, + { + "epoch": 9.011135785083681, + "grad_norm": 0.08081795275211334, + "learning_rate": 4.2436223798949253e-05, + "loss": 0.0044, + "step": 43600 + }, + { + "epoch": 9.011189947462492, + "grad_norm": 0.9509225487709045, + "learning_rate": 4.243321477790416e-05, + "loss": 0.0334, + "step": 43610 + }, + { + "epoch": 9.011244109841304, + "grad_norm": 0.023174505680799484, + "learning_rate": 4.2430205756859066e-05, + "loss": 0.139, + "step": 43620 + }, + { + "epoch": 9.011298272220117, + "grad_norm": 0.025355501100420952, + "learning_rate": 4.242719673581397e-05, + "loss": 0.0237, + "step": 43630 + }, + { + "epoch": 9.011352434598928, + "grad_norm": 0.34098559617996216, + "learning_rate": 4.242418771476888e-05, + "loss": 0.1217, + "step": 43640 + }, + { + "epoch": 9.01140659697774, + "grad_norm": 0.10516395419836044, + "learning_rate": 4.2421178693723785e-05, + "loss": 0.0815, + "step": 43650 + }, + { + "epoch": 9.01146075935655, + "grad_norm": 0.11717895418405533, + "learning_rate": 4.241816967267869e-05, + "loss": 0.0107, + "step": 43660 + }, + { + "epoch": 9.011514921735362, + "grad_norm": 0.5800274610519409, + "learning_rate": 4.2415160651633604e-05, + "loss": 0.0127, + "step": 43670 + }, + { + "epoch": 9.011569084114175, + "grad_norm": 0.0020150435157120228, + "learning_rate": 4.2412151630588504e-05, + "loss": 0.2853, + "step": 43680 + }, + { + "epoch": 9.011623246492986, + "grad_norm": 2.239057779312134, + "learning_rate": 4.240914260954341e-05, + "loss": 0.3035, + "step": 43690 + }, + { + "epoch": 9.011677408871797, + "grad_norm": 0.21920910477638245, + "learning_rate": 4.240613358849832e-05, + "loss": 0.0132, + "step": 43700 + }, + { + "epoch": 9.011731571250609, + "grad_norm": 0.3625440001487732, + "learning_rate": 4.240312456745323e-05, + "loss": 0.0659, + "step": 43710 + }, + { + "epoch": 9.011785733629422, + "grad_norm": 0.2378896176815033, + "learning_rate": 4.240011554640813e-05, + "loss": 0.1021, + "step": 43720 + }, + { + "epoch": 9.011839896008233, + "grad_norm": 0.0030108289793133736, + "learning_rate": 4.239710652536304e-05, + "loss": 0.1563, + "step": 43730 + }, + { + "epoch": 9.011894058387044, + "grad_norm": 0.013805222697556019, + "learning_rate": 4.239409750431795e-05, + "loss": 0.0112, + "step": 43740 + }, + { + "epoch": 9.011948220765856, + "grad_norm": 0.16323207318782806, + "learning_rate": 4.2391088483272854e-05, + "loss": 0.0142, + "step": 43750 + }, + { + "epoch": 9.012002383144667, + "grad_norm": 0.037347257137298584, + "learning_rate": 4.238807946222776e-05, + "loss": 0.1978, + "step": 43760 + }, + { + "epoch": 9.01205654552348, + "grad_norm": 0.3470088541507721, + "learning_rate": 4.238507044118267e-05, + "loss": 0.1279, + "step": 43770 + }, + { + "epoch": 9.012110707902291, + "grad_norm": 2.244900941848755, + "learning_rate": 4.238206142013757e-05, + "loss": 0.2017, + "step": 43780 + }, + { + "epoch": 9.012164870281103, + "grad_norm": 2.088348627090454, + "learning_rate": 4.2379052399092486e-05, + "loss": 0.0994, + "step": 43790 + }, + { + "epoch": 9.012219032659914, + "grad_norm": 0.29196006059646606, + "learning_rate": 4.2376043378047386e-05, + "loss": 0.0766, + "step": 43800 + }, + { + "epoch": 9.012273195038727, + "grad_norm": 0.19891239702701569, + "learning_rate": 4.237303435700229e-05, + "loss": 0.0177, + "step": 43810 + }, + { + "epoch": 9.012327357417538, + "grad_norm": 0.1432153880596161, + "learning_rate": 4.2370025335957205e-05, + "loss": 0.0213, 
+ "step": 43820 + }, + { + "epoch": 9.01238151979635, + "grad_norm": 1.156017780303955, + "learning_rate": 4.2367016314912104e-05, + "loss": 0.0686, + "step": 43830 + }, + { + "epoch": 9.01243568217516, + "grad_norm": 0.09708039462566376, + "learning_rate": 4.236400729386701e-05, + "loss": 0.1354, + "step": 43840 + }, + { + "epoch": 9.012489844553972, + "grad_norm": 0.0023786311503499746, + "learning_rate": 4.2360998272821924e-05, + "loss": 0.0958, + "step": 43850 + }, + { + "epoch": 9.012544006932785, + "grad_norm": 7.462850570678711, + "learning_rate": 4.235798925177683e-05, + "loss": 0.1215, + "step": 43860 + }, + { + "epoch": 9.012598169311596, + "grad_norm": 0.030483826994895935, + "learning_rate": 4.2354980230731736e-05, + "loss": 0.0632, + "step": 43870 + }, + { + "epoch": 9.012652331690408, + "grad_norm": 0.11799218505620956, + "learning_rate": 4.235197120968664e-05, + "loss": 0.007, + "step": 43880 + }, + { + "epoch": 9.012706494069219, + "grad_norm": 0.008360722102224827, + "learning_rate": 4.234896218864155e-05, + "loss": 0.1691, + "step": 43890 + }, + { + "epoch": 9.012760656448032, + "grad_norm": 0.1798018366098404, + "learning_rate": 4.2345953167596455e-05, + "loss": 0.0086, + "step": 43900 + }, + { + "epoch": 9.012814818826843, + "grad_norm": 3.112602949142456, + "learning_rate": 4.234294414655136e-05, + "loss": 0.0623, + "step": 43910 + }, + { + "epoch": 9.012868981205655, + "grad_norm": 2.9580037593841553, + "learning_rate": 4.233993512550627e-05, + "loss": 0.0591, + "step": 43920 + }, + { + "epoch": 9.012923143584466, + "grad_norm": 0.08157049864530563, + "learning_rate": 4.233692610446118e-05, + "loss": 0.0305, + "step": 43930 + }, + { + "epoch": 9.012977305963277, + "grad_norm": 0.4966966211795807, + "learning_rate": 4.233391708341609e-05, + "loss": 0.0944, + "step": 43940 + }, + { + "epoch": 9.01303146834209, + "grad_norm": 0.011324924416840076, + "learning_rate": 4.2330908062370987e-05, + "loss": 0.2227, + "step": 43950 + }, + { + "epoch": 9.013085630720902, + "grad_norm": 0.2152666300535202, + "learning_rate": 4.23278990413259e-05, + "loss": 0.1472, + "step": 43960 + }, + { + "epoch": 9.013139793099713, + "grad_norm": 0.10469091683626175, + "learning_rate": 4.2324890020280806e-05, + "loss": 0.0839, + "step": 43970 + }, + { + "epoch": 9.013193955478524, + "grad_norm": 0.9099310040473938, + "learning_rate": 4.2321880999235705e-05, + "loss": 0.0572, + "step": 43980 + }, + { + "epoch": 9.013248117857337, + "grad_norm": 0.23647861182689667, + "learning_rate": 4.231887197819062e-05, + "loss": 0.0818, + "step": 43990 + }, + { + "epoch": 9.013302280236148, + "grad_norm": 0.7146568894386292, + "learning_rate": 4.2315862957145525e-05, + "loss": 0.0606, + "step": 44000 + }, + { + "epoch": 9.01335644261496, + "grad_norm": 0.29570847749710083, + "learning_rate": 4.231285393610043e-05, + "loss": 0.1024, + "step": 44010 + }, + { + "epoch": 9.013410604993771, + "grad_norm": 0.2519051432609558, + "learning_rate": 4.230984491505534e-05, + "loss": 0.0764, + "step": 44020 + }, + { + "epoch": 9.013464767372582, + "grad_norm": 0.4061441421508789, + "learning_rate": 4.2306835894010244e-05, + "loss": 0.075, + "step": 44030 + }, + { + "epoch": 9.013518929751395, + "grad_norm": 2.7316479682922363, + "learning_rate": 4.230382687296515e-05, + "loss": 0.067, + "step": 44040 + }, + { + "epoch": 9.013573092130207, + "grad_norm": 0.11399662494659424, + "learning_rate": 4.230081785192006e-05, + "loss": 0.0536, + "step": 44050 + }, + { + "epoch": 9.013627254509018, + "grad_norm": 0.02218184620141983, 
+ "learning_rate": 4.229780883087496e-05, + "loss": 0.0166, + "step": 44060 + }, + { + "epoch": 9.01368141688783, + "grad_norm": 0.0023344780784100294, + "learning_rate": 4.229479980982987e-05, + "loss": 0.0392, + "step": 44070 + }, + { + "epoch": 9.013735579266642, + "grad_norm": 0.16915467381477356, + "learning_rate": 4.229179078878478e-05, + "loss": 0.2175, + "step": 44080 + }, + { + "epoch": 9.013789741645454, + "grad_norm": 0.6015234589576721, + "learning_rate": 4.228878176773969e-05, + "loss": 0.1092, + "step": 44090 + }, + { + "epoch": 9.013843904024265, + "grad_norm": 0.23061209917068481, + "learning_rate": 4.228577274669459e-05, + "loss": 0.0406, + "step": 44100 + }, + { + "epoch": 9.013898066403076, + "grad_norm": 0.012731814756989479, + "learning_rate": 4.22827637256495e-05, + "loss": 0.091, + "step": 44110 + }, + { + "epoch": 9.013952228781887, + "grad_norm": 0.4400084316730499, + "learning_rate": 4.227975470460441e-05, + "loss": 0.0569, + "step": 44120 + }, + { + "epoch": 9.0140063911607, + "grad_norm": 1.141852617263794, + "learning_rate": 4.227674568355931e-05, + "loss": 0.1021, + "step": 44130 + }, + { + "epoch": 9.014060553539512, + "grad_norm": 3.1433448791503906, + "learning_rate": 4.227373666251422e-05, + "loss": 0.1249, + "step": 44140 + }, + { + "epoch": 9.014114715918323, + "grad_norm": 0.8171924948692322, + "learning_rate": 4.2270727641469126e-05, + "loss": 0.01, + "step": 44150 + }, + { + "epoch": 9.014168878297134, + "grad_norm": 0.042113423347473145, + "learning_rate": 4.226771862042403e-05, + "loss": 0.0049, + "step": 44160 + }, + { + "epoch": 9.014223040675947, + "grad_norm": 0.2386716604232788, + "learning_rate": 4.226470959937894e-05, + "loss": 0.0821, + "step": 44170 + }, + { + "epoch": 9.014277203054759, + "grad_norm": 0.07074329257011414, + "learning_rate": 4.2261700578333844e-05, + "loss": 0.0877, + "step": 44180 + }, + { + "epoch": 9.01433136543357, + "grad_norm": 0.005162681918591261, + "learning_rate": 4.225869155728876e-05, + "loss": 0.0023, + "step": 44190 + }, + { + "epoch": 9.014385527812381, + "grad_norm": 0.04319370165467262, + "learning_rate": 4.2255682536243664e-05, + "loss": 0.1642, + "step": 44200 + }, + { + "epoch": 9.014439690191193, + "grad_norm": 0.501288115978241, + "learning_rate": 4.225267351519856e-05, + "loss": 0.1029, + "step": 44210 + }, + { + "epoch": 9.014493852570006, + "grad_norm": 0.29333624243736267, + "learning_rate": 4.2249664494153476e-05, + "loss": 0.1325, + "step": 44220 + }, + { + "epoch": 9.014548014948817, + "grad_norm": 0.011408328078687191, + "learning_rate": 4.224665547310838e-05, + "loss": 0.0195, + "step": 44230 + }, + { + "epoch": 9.014602177327628, + "grad_norm": 0.017955616116523743, + "learning_rate": 4.224364645206329e-05, + "loss": 0.024, + "step": 44240 + }, + { + "epoch": 9.01465633970644, + "grad_norm": 0.008211955428123474, + "learning_rate": 4.2240637431018195e-05, + "loss": 0.0187, + "step": 44250 + }, + { + "epoch": 9.01471050208525, + "grad_norm": 0.11518213152885437, + "learning_rate": 4.22376284099731e-05, + "loss": 0.0039, + "step": 44260 + }, + { + "epoch": 9.014764664464064, + "grad_norm": 8.445616722106934, + "learning_rate": 4.223461938892801e-05, + "loss": 0.0986, + "step": 44270 + }, + { + "epoch": 9.014818826842875, + "grad_norm": 0.005991533398628235, + "learning_rate": 4.2231610367882914e-05, + "loss": 0.0154, + "step": 44280 + }, + { + "epoch": 9.014872989221686, + "grad_norm": 0.24165597558021545, + "learning_rate": 4.222860134683782e-05, + "loss": 0.0429, + "step": 44290 + }, + { + 
"epoch": 9.014927151600498, + "grad_norm": 0.0026478322688490152, + "learning_rate": 4.2225592325792727e-05, + "loss": 0.039, + "step": 44300 + }, + { + "epoch": 9.01498131397931, + "grad_norm": 0.01670331321656704, + "learning_rate": 4.222258330474764e-05, + "loss": 0.1426, + "step": 44310 + }, + { + "epoch": 9.015035476358122, + "grad_norm": 1.110182285308838, + "learning_rate": 4.221957428370254e-05, + "loss": 0.0312, + "step": 44320 + }, + { + "epoch": 9.015089638736933, + "grad_norm": 0.31384867429733276, + "learning_rate": 4.2216565262657445e-05, + "loss": 0.0448, + "step": 44330 + }, + { + "epoch": 9.015143801115745, + "grad_norm": 0.006151373963803053, + "learning_rate": 4.221355624161236e-05, + "loss": 0.0619, + "step": 44340 + }, + { + "epoch": 9.015197963494556, + "grad_norm": 0.002999667078256607, + "learning_rate": 4.2210547220567265e-05, + "loss": 0.0807, + "step": 44350 + }, + { + "epoch": 9.015252125873369, + "grad_norm": 0.23850493133068085, + "learning_rate": 4.2207538199522164e-05, + "loss": 0.0436, + "step": 44360 + }, + { + "epoch": 9.01530628825218, + "grad_norm": 0.0022863908670842648, + "learning_rate": 4.220452917847708e-05, + "loss": 0.0296, + "step": 44370 + }, + { + "epoch": 9.015360450630991, + "grad_norm": 0.0650414526462555, + "learning_rate": 4.2201520157431983e-05, + "loss": 0.0724, + "step": 44380 + }, + { + "epoch": 9.015414613009803, + "grad_norm": 0.021632423624396324, + "learning_rate": 4.219851113638689e-05, + "loss": 0.0919, + "step": 44390 + }, + { + "epoch": 9.015468775388616, + "grad_norm": 8.457453727722168, + "learning_rate": 4.2195502115341796e-05, + "loss": 0.2515, + "step": 44400 + }, + { + "epoch": 9.015522937767427, + "grad_norm": 1.462029218673706, + "learning_rate": 4.21924930942967e-05, + "loss": 0.1522, + "step": 44410 + }, + { + "epoch": 9.015577100146238, + "grad_norm": 0.7816680073738098, + "learning_rate": 4.218948407325161e-05, + "loss": 0.1277, + "step": 44420 + }, + { + "epoch": 9.01563126252505, + "grad_norm": 0.4175686538219452, + "learning_rate": 4.2186475052206515e-05, + "loss": 0.0412, + "step": 44430 + }, + { + "epoch": 9.015685424903861, + "grad_norm": 0.006310398690402508, + "learning_rate": 4.218346603116142e-05, + "loss": 0.0855, + "step": 44440 + }, + { + "epoch": 9.015739587282674, + "grad_norm": 0.6435827016830444, + "learning_rate": 4.2180457010116334e-05, + "loss": 0.0083, + "step": 44450 + }, + { + "epoch": 9.015793749661485, + "grad_norm": 0.010358522646129131, + "learning_rate": 4.217744798907124e-05, + "loss": 0.0782, + "step": 44460 + }, + { + "epoch": 9.015847912040297, + "grad_norm": 2.6599490642547607, + "learning_rate": 4.217443896802614e-05, + "loss": 0.2078, + "step": 44470 + }, + { + "epoch": 9.015902074419108, + "grad_norm": 3.9171831607818604, + "learning_rate": 4.217142994698105e-05, + "loss": 0.1197, + "step": 44480 + }, + { + "epoch": 9.015956236797921, + "grad_norm": 0.10439641773700714, + "learning_rate": 4.216842092593596e-05, + "loss": 0.0813, + "step": 44490 + }, + { + "epoch": 9.016010399176732, + "grad_norm": 1.5715274810791016, + "learning_rate": 4.2165411904890866e-05, + "loss": 0.0464, + "step": 44500 + }, + { + "epoch": 9.016064561555543, + "grad_norm": 0.39207199215888977, + "learning_rate": 4.216240288384577e-05, + "loss": 0.0387, + "step": 44510 + }, + { + "epoch": 9.016118723934355, + "grad_norm": 0.17807698249816895, + "learning_rate": 4.215939386280068e-05, + "loss": 0.158, + "step": 44520 + }, + { + "epoch": 9.016172886313166, + "grad_norm": 0.00344737502746284, + "learning_rate": 
4.2156384841755584e-05, + "loss": 0.053, + "step": 44530 + }, + { + "epoch": 9.016227048691979, + "grad_norm": 0.07284900546073914, + "learning_rate": 4.21533758207105e-05, + "loss": 0.1766, + "step": 44540 + }, + { + "epoch": 9.01628121107079, + "grad_norm": 0.00783957913517952, + "learning_rate": 4.21503667996654e-05, + "loss": 0.08, + "step": 44550 + }, + { + "epoch": 9.016335373449602, + "grad_norm": 0.43318068981170654, + "learning_rate": 4.21473577786203e-05, + "loss": 0.1142, + "step": 44560 + }, + { + "epoch": 9.016389535828413, + "grad_norm": 0.25647398829460144, + "learning_rate": 4.2144348757575216e-05, + "loss": 0.0064, + "step": 44570 + }, + { + "epoch": 9.016443698207226, + "grad_norm": 0.08534606546163559, + "learning_rate": 4.2141339736530116e-05, + "loss": 0.1585, + "step": 44580 + }, + { + "epoch": 9.016497860586037, + "grad_norm": 0.15912999212741852, + "learning_rate": 4.213833071548502e-05, + "loss": 0.0479, + "step": 44590 + }, + { + "epoch": 9.016552022964849, + "grad_norm": 0.16495627164840698, + "learning_rate": 4.2135321694439935e-05, + "loss": 0.0768, + "step": 44600 + }, + { + "epoch": 9.01660618534366, + "grad_norm": 3.666260242462158, + "learning_rate": 4.213231267339484e-05, + "loss": 0.1105, + "step": 44610 + }, + { + "epoch": 9.016660347722471, + "grad_norm": 0.3220755159854889, + "learning_rate": 4.212930365234974e-05, + "loss": 0.1111, + "step": 44620 + }, + { + "epoch": 9.016714510101284, + "grad_norm": 4.313347339630127, + "learning_rate": 4.2126294631304654e-05, + "loss": 0.0654, + "step": 44630 + }, + { + "epoch": 9.016768672480096, + "grad_norm": 2.740447521209717, + "learning_rate": 4.212328561025956e-05, + "loss": 0.1141, + "step": 44640 + }, + { + "epoch": 9.016822834858907, + "grad_norm": 2.0061116218566895, + "learning_rate": 4.2120276589214466e-05, + "loss": 0.0697, + "step": 44650 + }, + { + "epoch": 9.016876997237718, + "grad_norm": 0.21828210353851318, + "learning_rate": 4.211726756816937e-05, + "loss": 0.0794, + "step": 44660 + }, + { + "epoch": 9.016931159616531, + "grad_norm": 0.41423434019088745, + "learning_rate": 4.211425854712428e-05, + "loss": 0.0905, + "step": 44670 + }, + { + "epoch": 9.016985321995342, + "grad_norm": 0.0028967757243663073, + "learning_rate": 4.2111249526079185e-05, + "loss": 0.0415, + "step": 44680 + }, + { + "epoch": 9.017039484374154, + "grad_norm": 0.027394067496061325, + "learning_rate": 4.21082405050341e-05, + "loss": 0.0578, + "step": 44690 + }, + { + "epoch": 9.017093646752965, + "grad_norm": 0.07096397876739502, + "learning_rate": 4.2105231483989e-05, + "loss": 0.056, + "step": 44700 + }, + { + "epoch": 9.017147809131776, + "grad_norm": 0.004148768726736307, + "learning_rate": 4.210222246294391e-05, + "loss": 0.0168, + "step": 44710 + }, + { + "epoch": 9.01720197151059, + "grad_norm": 0.06667190045118332, + "learning_rate": 4.209921344189882e-05, + "loss": 0.0038, + "step": 44720 + }, + { + "epoch": 9.0172561338894, + "grad_norm": 3.4629578590393066, + "learning_rate": 4.209620442085372e-05, + "loss": 0.2012, + "step": 44730 + }, + { + "epoch": 9.017310296268212, + "grad_norm": 0.07793296873569489, + "learning_rate": 4.209319539980863e-05, + "loss": 0.0656, + "step": 44740 + }, + { + "epoch": 9.017364458647023, + "grad_norm": 0.3410947620868683, + "learning_rate": 4.2090186378763536e-05, + "loss": 0.0941, + "step": 44750 + }, + { + "epoch": 9.017418621025836, + "grad_norm": 0.002959773875772953, + "learning_rate": 4.208717735771844e-05, + "loss": 0.0362, + "step": 44760 + }, + { + "epoch": 
9.017472783404648, + "grad_norm": 0.09504730999469757, + "learning_rate": 4.208416833667335e-05, + "loss": 0.0058, + "step": 44770 + }, + { + "epoch": 9.017526945783459, + "grad_norm": 0.3839538097381592, + "learning_rate": 4.2081159315628255e-05, + "loss": 0.0533, + "step": 44780 + }, + { + "epoch": 9.01758110816227, + "grad_norm": 0.00430306326597929, + "learning_rate": 4.207815029458316e-05, + "loss": 0.008, + "step": 44790 + }, + { + "epoch": 9.017635270541081, + "grad_norm": 0.09351455420255661, + "learning_rate": 4.2075141273538074e-05, + "loss": 0.0865, + "step": 44800 + }, + { + "epoch": 9.017689432919894, + "grad_norm": 2.02541446685791, + "learning_rate": 4.2072132252492974e-05, + "loss": 0.0237, + "step": 44810 + }, + { + "epoch": 9.017743595298706, + "grad_norm": 0.7811800241470337, + "learning_rate": 4.206912323144788e-05, + "loss": 0.0226, + "step": 44820 + }, + { + "epoch": 9.017797757677517, + "grad_norm": 5.85521936416626, + "learning_rate": 4.206611421040279e-05, + "loss": 0.0397, + "step": 44830 + }, + { + "epoch": 9.017851920056328, + "grad_norm": 0.011581184342503548, + "learning_rate": 4.20631051893577e-05, + "loss": 0.0565, + "step": 44840 + }, + { + "epoch": 9.017906082435141, + "grad_norm": 0.0030562905594706535, + "learning_rate": 4.20600961683126e-05, + "loss": 0.001, + "step": 44850 + }, + { + "epoch": 9.017960244813953, + "grad_norm": 1.9461114406585693, + "learning_rate": 4.205708714726751e-05, + "loss": 0.0488, + "step": 44860 + }, + { + "epoch": 9.018014407192764, + "grad_norm": 3.9925029277801514, + "learning_rate": 4.205407812622242e-05, + "loss": 0.1114, + "step": 44870 + }, + { + "epoch": 9.018068569571575, + "grad_norm": 2.4897541999816895, + "learning_rate": 4.205106910517732e-05, + "loss": 0.0668, + "step": 44880 + }, + { + "epoch": 9.018122731950386, + "grad_norm": 2.823920488357544, + "learning_rate": 4.204806008413223e-05, + "loss": 0.0555, + "step": 44890 + }, + { + "epoch": 9.0181768943292, + "grad_norm": 3.520576000213623, + "learning_rate": 4.204505106308714e-05, + "loss": 0.1872, + "step": 44900 + }, + { + "epoch": 9.01823105670801, + "grad_norm": 0.06473370641469955, + "learning_rate": 4.204204204204204e-05, + "loss": 0.2457, + "step": 44910 + }, + { + "epoch": 9.018285219086822, + "grad_norm": 2.083404064178467, + "learning_rate": 4.203903302099695e-05, + "loss": 0.0669, + "step": 44920 + }, + { + "epoch": 9.018339381465633, + "grad_norm": 1.432895541191101, + "learning_rate": 4.2036023999951856e-05, + "loss": 0.0486, + "step": 44930 + }, + { + "epoch": 9.018393543844446, + "grad_norm": 0.007193074561655521, + "learning_rate": 4.203301497890676e-05, + "loss": 0.0384, + "step": 44940 + }, + { + "epoch": 9.018447706223258, + "grad_norm": 0.09356924146413803, + "learning_rate": 4.2030005957861675e-05, + "loss": 0.0436, + "step": 44950 + }, + { + "epoch": 9.018501868602069, + "grad_norm": 0.055775612592697144, + "learning_rate": 4.2026996936816575e-05, + "loss": 0.0212, + "step": 44960 + }, + { + "epoch": 9.01855603098088, + "grad_norm": 0.17160676419734955, + "learning_rate": 4.202398791577149e-05, + "loss": 0.054, + "step": 44970 + }, + { + "epoch": 9.018610193359692, + "grad_norm": 0.276752769947052, + "learning_rate": 4.2020978894726394e-05, + "loss": 0.138, + "step": 44980 + }, + { + "epoch": 9.018664355738505, + "grad_norm": 0.0065020304173231125, + "learning_rate": 4.20179698736813e-05, + "loss": 0.0747, + "step": 44990 + }, + { + "epoch": 9.018718518117316, + "grad_norm": 0.3693990707397461, + "learning_rate": 4.2014960852636206e-05, + 
"loss": 0.0496, + "step": 45000 + }, + { + "epoch": 9.018772680496127, + "grad_norm": 0.7657061219215393, + "learning_rate": 4.201195183159111e-05, + "loss": 0.0805, + "step": 45010 + }, + { + "epoch": 9.018826842874939, + "grad_norm": 0.0023350270930677652, + "learning_rate": 4.200894281054602e-05, + "loss": 0.0378, + "step": 45020 + }, + { + "epoch": 9.018881005253752, + "grad_norm": 2.6039557456970215, + "learning_rate": 4.2005933789500925e-05, + "loss": 0.1041, + "step": 45030 + }, + { + "epoch": 9.018935167632563, + "grad_norm": 0.5503116250038147, + "learning_rate": 4.200292476845583e-05, + "loss": 0.091, + "step": 45040 + }, + { + "epoch": 9.018989330011374, + "grad_norm": 0.06171811744570732, + "learning_rate": 4.199991574741074e-05, + "loss": 0.1475, + "step": 45050 + }, + { + "epoch": 9.019043492390185, + "grad_norm": 0.0925004780292511, + "learning_rate": 4.199690672636565e-05, + "loss": 0.1119, + "step": 45060 + }, + { + "epoch": 9.019097654768997, + "grad_norm": 0.2582755982875824, + "learning_rate": 4.199389770532055e-05, + "loss": 0.116, + "step": 45070 + }, + { + "epoch": 9.01915181714781, + "grad_norm": 0.00893132109194994, + "learning_rate": 4.1990888684275457e-05, + "loss": 0.179, + "step": 45080 + }, + { + "epoch": 9.019205979526621, + "grad_norm": 0.0066315471194684505, + "learning_rate": 4.198787966323037e-05, + "loss": 0.0623, + "step": 45090 + }, + { + "epoch": 9.019260141905432, + "grad_norm": 0.10339641571044922, + "learning_rate": 4.1984870642185276e-05, + "loss": 0.0965, + "step": 45100 + }, + { + "epoch": 9.019314304284244, + "grad_norm": 0.26186057925224304, + "learning_rate": 4.1981861621140175e-05, + "loss": 0.0477, + "step": 45110 + }, + { + "epoch": 9.019368466663057, + "grad_norm": 0.06688690930604935, + "learning_rate": 4.197885260009509e-05, + "loss": 0.0372, + "step": 45120 + }, + { + "epoch": 9.019422629041868, + "grad_norm": 0.5534613728523254, + "learning_rate": 4.1975843579049995e-05, + "loss": 0.0547, + "step": 45130 + }, + { + "epoch": 9.01947679142068, + "grad_norm": 0.01327453926205635, + "learning_rate": 4.19728345580049e-05, + "loss": 0.3916, + "step": 45140 + }, + { + "epoch": 9.01953095379949, + "grad_norm": 0.6782059669494629, + "learning_rate": 4.196982553695981e-05, + "loss": 0.0467, + "step": 45150 + }, + { + "epoch": 9.019585116178302, + "grad_norm": 0.185947984457016, + "learning_rate": 4.1966816515914714e-05, + "loss": 0.0648, + "step": 45160 + }, + { + "epoch": 9.019639278557115, + "grad_norm": 1.8955562114715576, + "learning_rate": 4.196380749486962e-05, + "loss": 0.0971, + "step": 45170 + }, + { + "epoch": 9.019693440935926, + "grad_norm": 0.28374558687210083, + "learning_rate": 4.1960798473824526e-05, + "loss": 0.0101, + "step": 45180 + }, + { + "epoch": 9.019747603314737, + "grad_norm": 0.20175181329250336, + "learning_rate": 4.195778945277943e-05, + "loss": 0.013, + "step": 45190 + }, + { + "epoch": 9.019801765693549, + "grad_norm": 0.011368007399141788, + "learning_rate": 4.195478043173434e-05, + "loss": 0.0278, + "step": 45200 + }, + { + "epoch": 9.019855928072362, + "grad_norm": 17.608863830566406, + "learning_rate": 4.195177141068925e-05, + "loss": 0.0717, + "step": 45210 + }, + { + "epoch": 9.019910090451173, + "grad_norm": 0.074602872133255, + "learning_rate": 4.194876238964415e-05, + "loss": 0.1047, + "step": 45220 + }, + { + "epoch": 9.019964252829984, + "grad_norm": 0.044798366725444794, + "learning_rate": 4.1945753368599064e-05, + "loss": 0.0017, + "step": 45230 + }, + { + "epoch": 9.020018415208796, + "grad_norm": 
0.04338404908776283, + "learning_rate": 4.194274434755397e-05, + "loss": 0.1912, + "step": 45240 + }, + { + "epoch": 9.020072577587607, + "grad_norm": 14.215986251831055, + "learning_rate": 4.193973532650888e-05, + "loss": 0.0186, + "step": 45250 + }, + { + "epoch": 9.02012673996642, + "grad_norm": 0.027689563110470772, + "learning_rate": 4.193672630546378e-05, + "loss": 0.0575, + "step": 45260 + }, + { + "epoch": 9.020180902345231, + "grad_norm": 0.0654521957039833, + "learning_rate": 4.193371728441869e-05, + "loss": 0.0555, + "step": 45270 + }, + { + "epoch": 9.020235064724043, + "grad_norm": 0.1769631803035736, + "learning_rate": 4.1930708263373596e-05, + "loss": 0.0325, + "step": 45280 + }, + { + "epoch": 9.020289227102854, + "grad_norm": 32.86360168457031, + "learning_rate": 4.192769924232851e-05, + "loss": 0.1623, + "step": 45290 + }, + { + "epoch": 9.020343389481667, + "grad_norm": 0.026971153914928436, + "learning_rate": 4.192469022128341e-05, + "loss": 0.1322, + "step": 45300 + }, + { + "epoch": 9.020397551860478, + "grad_norm": 0.16417276859283447, + "learning_rate": 4.1921681200238314e-05, + "loss": 0.0839, + "step": 45310 + }, + { + "epoch": 9.02045171423929, + "grad_norm": 0.44798702001571655, + "learning_rate": 4.191867217919323e-05, + "loss": 0.0719, + "step": 45320 + }, + { + "epoch": 9.0205058766181, + "grad_norm": 0.3106614053249359, + "learning_rate": 4.191566315814813e-05, + "loss": 0.1359, + "step": 45330 + }, + { + "epoch": 9.020560038996912, + "grad_norm": 1.9086768627166748, + "learning_rate": 4.191265413710303e-05, + "loss": 0.0535, + "step": 45340 + }, + { + "epoch": 9.020614201375725, + "grad_norm": 0.012867999263107777, + "learning_rate": 4.1909645116057946e-05, + "loss": 0.0238, + "step": 45350 + }, + { + "epoch": 9.020668363754536, + "grad_norm": 0.40156081318855286, + "learning_rate": 4.190663609501285e-05, + "loss": 0.1281, + "step": 45360 + }, + { + "epoch": 9.020722526133348, + "grad_norm": 10.981378555297852, + "learning_rate": 4.190362707396775e-05, + "loss": 0.0236, + "step": 45370 + }, + { + "epoch": 9.020776688512159, + "grad_norm": 0.21046926081180573, + "learning_rate": 4.1900618052922665e-05, + "loss": 0.0459, + "step": 45380 + }, + { + "epoch": 9.02083085089097, + "grad_norm": 4.049026966094971, + "learning_rate": 4.189760903187757e-05, + "loss": 0.0978, + "step": 45390 + }, + { + "epoch": 9.020885013269783, + "grad_norm": 0.008867279626429081, + "learning_rate": 4.189460001083248e-05, + "loss": 0.0073, + "step": 45400 + }, + { + "epoch": 9.020939175648595, + "grad_norm": 0.01387204322963953, + "learning_rate": 4.1891590989787384e-05, + "loss": 0.0217, + "step": 45410 + }, + { + "epoch": 9.020993338027406, + "grad_norm": 0.0035519415978342295, + "learning_rate": 4.188858196874229e-05, + "loss": 0.0115, + "step": 45420 + }, + { + "epoch": 9.021047500406217, + "grad_norm": 1.7385801076889038, + "learning_rate": 4.1885572947697197e-05, + "loss": 0.0258, + "step": 45430 + }, + { + "epoch": 9.02110166278503, + "grad_norm": 0.024460673332214355, + "learning_rate": 4.188256392665211e-05, + "loss": 0.2802, + "step": 45440 + }, + { + "epoch": 9.021155825163842, + "grad_norm": 0.04832746461033821, + "learning_rate": 4.187955490560701e-05, + "loss": 0.0774, + "step": 45450 + }, + { + "epoch": 9.021209987542653, + "grad_norm": 1.418431282043457, + "learning_rate": 4.1876545884561915e-05, + "loss": 0.0958, + "step": 45460 + }, + { + "epoch": 9.021264149921464, + "grad_norm": 0.004770437255501747, + "learning_rate": 4.187353686351683e-05, + "loss": 0.0796, + 
"step": 45470 + }, + { + "epoch": 9.021318312300275, + "grad_norm": 3.9303224086761475, + "learning_rate": 4.187052784247173e-05, + "loss": 0.1476, + "step": 45480 + }, + { + "epoch": 9.021372474679088, + "grad_norm": 0.008096043951809406, + "learning_rate": 4.186751882142664e-05, + "loss": 0.0475, + "step": 45490 + }, + { + "epoch": 9.0214266370579, + "grad_norm": 0.06075001507997513, + "learning_rate": 4.186450980038155e-05, + "loss": 0.0775, + "step": 45500 + }, + { + "epoch": 9.021480799436711, + "grad_norm": 0.046262048184871674, + "learning_rate": 4.1861500779336453e-05, + "loss": 0.0759, + "step": 45510 + }, + { + "epoch": 9.021534961815522, + "grad_norm": 0.024527791887521744, + "learning_rate": 4.185849175829136e-05, + "loss": 0.009, + "step": 45520 + }, + { + "epoch": 9.021589124194335, + "grad_norm": 0.17371642589569092, + "learning_rate": 4.1855482737246266e-05, + "loss": 0.0585, + "step": 45530 + }, + { + "epoch": 9.021643286573147, + "grad_norm": 3.3692750930786133, + "learning_rate": 4.185247371620117e-05, + "loss": 0.0548, + "step": 45540 + }, + { + "epoch": 9.021697448951958, + "grad_norm": 0.00202572182752192, + "learning_rate": 4.1849464695156085e-05, + "loss": 0.1059, + "step": 45550 + }, + { + "epoch": 9.02175161133077, + "grad_norm": 7.932041645050049, + "learning_rate": 4.1846455674110985e-05, + "loss": 0.045, + "step": 45560 + }, + { + "epoch": 9.02180577370958, + "grad_norm": 0.7383502721786499, + "learning_rate": 4.184344665306589e-05, + "loss": 0.1749, + "step": 45570 + }, + { + "epoch": 9.021859936088394, + "grad_norm": 1.1924591064453125, + "learning_rate": 4.1840437632020804e-05, + "loss": 0.0237, + "step": 45580 + }, + { + "epoch": 9.021914098467205, + "grad_norm": 0.05876355245709419, + "learning_rate": 4.183742861097571e-05, + "loss": 0.0563, + "step": 45590 + }, + { + "epoch": 9.021968260846016, + "grad_norm": 0.07612966001033783, + "learning_rate": 4.183441958993061e-05, + "loss": 0.188, + "step": 45600 + }, + { + "epoch": 9.022022423224827, + "grad_norm": 0.2125472128391266, + "learning_rate": 4.183141056888552e-05, + "loss": 0.0923, + "step": 45610 + }, + { + "epoch": 9.02207658560364, + "grad_norm": 0.6417738199234009, + "learning_rate": 4.182840154784043e-05, + "loss": 0.0061, + "step": 45620 + }, + { + "epoch": 9.022130747982452, + "grad_norm": 0.057299163192510605, + "learning_rate": 4.182539252679533e-05, + "loss": 0.1023, + "step": 45630 + }, + { + "epoch": 9.022184910361263, + "grad_norm": 0.05433434620499611, + "learning_rate": 4.182238350575024e-05, + "loss": 0.0808, + "step": 45640 + }, + { + "epoch": 9.022239072740074, + "grad_norm": 0.0019106044201180339, + "learning_rate": 4.181937448470515e-05, + "loss": 0.0316, + "step": 45650 + }, + { + "epoch": 9.022293235118886, + "grad_norm": 1.740944266319275, + "learning_rate": 4.1816365463660054e-05, + "loss": 0.0528, + "step": 45660 + }, + { + "epoch": 9.022347397497699, + "grad_norm": 0.06994800269603729, + "learning_rate": 4.181335644261496e-05, + "loss": 0.1759, + "step": 45670 + }, + { + "epoch": 9.02240155987651, + "grad_norm": 0.2353423535823822, + "learning_rate": 4.181034742156987e-05, + "loss": 0.0984, + "step": 45680 + }, + { + "epoch": 9.022455722255321, + "grad_norm": 0.21773725748062134, + "learning_rate": 4.180733840052477e-05, + "loss": 0.0635, + "step": 45690 + }, + { + "epoch": 9.022509884634133, + "grad_norm": 0.06856875866651535, + "learning_rate": 4.1804329379479686e-05, + "loss": 0.0646, + "step": 45700 + }, + { + "epoch": 9.022564047012946, + "grad_norm": 
0.018339408561587334, + "learning_rate": 4.1801320358434586e-05, + "loss": 0.0556, + "step": 45710 + }, + { + "epoch": 9.022618209391757, + "grad_norm": 15.182758331298828, + "learning_rate": 4.179831133738949e-05, + "loss": 0.1278, + "step": 45720 + }, + { + "epoch": 9.022672371770568, + "grad_norm": 2.7782793045043945, + "learning_rate": 4.1795302316344405e-05, + "loss": 0.2087, + "step": 45730 + }, + { + "epoch": 9.02272653414938, + "grad_norm": 0.28209078311920166, + "learning_rate": 4.179229329529931e-05, + "loss": 0.0196, + "step": 45740 + }, + { + "epoch": 9.02278069652819, + "grad_norm": 0.09985426068305969, + "learning_rate": 4.178928427425422e-05, + "loss": 0.0794, + "step": 45750 + }, + { + "epoch": 9.022834858907004, + "grad_norm": 0.03610681742429733, + "learning_rate": 4.1786275253209124e-05, + "loss": 0.0149, + "step": 45760 + }, + { + "epoch": 9.022889021285815, + "grad_norm": 0.6390969157218933, + "learning_rate": 4.178326623216403e-05, + "loss": 0.0628, + "step": 45770 + }, + { + "epoch": 9.022943183664626, + "grad_norm": 0.009074310772120953, + "learning_rate": 4.1780257211118936e-05, + "loss": 0.1013, + "step": 45780 + }, + { + "epoch": 9.022997346043438, + "grad_norm": 0.0033175640273839235, + "learning_rate": 4.177724819007384e-05, + "loss": 0.2201, + "step": 45790 + }, + { + "epoch": 9.02305150842225, + "grad_norm": 0.09734603017568588, + "learning_rate": 4.177423916902875e-05, + "loss": 0.0044, + "step": 45800 + }, + { + "epoch": 9.023105670801062, + "grad_norm": 0.13871020078659058, + "learning_rate": 4.177123014798366e-05, + "loss": 0.1717, + "step": 45810 + }, + { + "epoch": 9.023159833179873, + "grad_norm": 0.18684351444244385, + "learning_rate": 4.176822112693856e-05, + "loss": 0.0428, + "step": 45820 + }, + { + "epoch": 9.023213995558685, + "grad_norm": 0.20687495172023773, + "learning_rate": 4.176521210589347e-05, + "loss": 0.076, + "step": 45830 + }, + { + "epoch": 9.023268157937496, + "grad_norm": 1.3451428413391113, + "learning_rate": 4.176220308484838e-05, + "loss": 0.0853, + "step": 45840 + }, + { + "epoch": 9.023322320316309, + "grad_norm": 0.05395851656794548, + "learning_rate": 4.175919406380329e-05, + "loss": 0.1028, + "step": 45850 + }, + { + "epoch": 9.02337648269512, + "grad_norm": 0.03326946869492531, + "learning_rate": 4.175618504275819e-05, + "loss": 0.0545, + "step": 45860 + }, + { + "epoch": 9.023430645073931, + "grad_norm": 0.046243153512477875, + "learning_rate": 4.17531760217131e-05, + "loss": 0.0099, + "step": 45870 + }, + { + "epoch": 9.023484807452743, + "grad_norm": 2.0938618183135986, + "learning_rate": 4.1750167000668006e-05, + "loss": 0.1714, + "step": 45880 + }, + { + "epoch": 9.023538969831556, + "grad_norm": 0.1674491912126541, + "learning_rate": 4.174715797962291e-05, + "loss": 0.0082, + "step": 45890 + }, + { + "epoch": 9.023593132210367, + "grad_norm": 0.04448113590478897, + "learning_rate": 4.174414895857782e-05, + "loss": 0.0032, + "step": 45900 + }, + { + "epoch": 9.023647294589178, + "grad_norm": 0.0036178873851895332, + "learning_rate": 4.1741139937532725e-05, + "loss": 0.0842, + "step": 45910 + }, + { + "epoch": 9.02370145696799, + "grad_norm": 2.9735679626464844, + "learning_rate": 4.173813091648763e-05, + "loss": 0.0978, + "step": 45920 + }, + { + "epoch": 9.023755619346801, + "grad_norm": 0.1723943054676056, + "learning_rate": 4.173512189544254e-05, + "loss": 0.1346, + "step": 45930 + }, + { + "epoch": 9.023809781725614, + "grad_norm": 2.375720500946045, + "learning_rate": 4.1732112874397444e-05, + "loss": 0.0819, + 
"step": 45940 + }, + { + "epoch": 9.023863944104425, + "grad_norm": 0.3492448031902313, + "learning_rate": 4.172910385335235e-05, + "loss": 0.0452, + "step": 45950 + }, + { + "epoch": 9.023918106483237, + "grad_norm": 0.5296070575714111, + "learning_rate": 4.172609483230726e-05, + "loss": 0.0674, + "step": 45960 + }, + { + "epoch": 9.023972268862048, + "grad_norm": 8.97480583190918, + "learning_rate": 4.172308581126216e-05, + "loss": 0.0314, + "step": 45970 + }, + { + "epoch": 9.024026431240861, + "grad_norm": 3.809053421020508, + "learning_rate": 4.172007679021707e-05, + "loss": 0.0244, + "step": 45980 + }, + { + "epoch": 9.024080593619672, + "grad_norm": 2.325242280960083, + "learning_rate": 4.171706776917198e-05, + "loss": 0.0908, + "step": 45990 + }, + { + "epoch": 9.024134755998483, + "grad_norm": 0.10274330526590347, + "learning_rate": 4.171405874812689e-05, + "loss": 0.1665, + "step": 46000 + }, + { + "epoch": 9.024188918377295, + "grad_norm": 0.002623864682391286, + "learning_rate": 4.1711049727081794e-05, + "loss": 0.0445, + "step": 46010 + }, + { + "epoch": 9.024243080756106, + "grad_norm": 1.9202775955200195, + "learning_rate": 4.17080407060367e-05, + "loss": 0.0712, + "step": 46020 + }, + { + "epoch": 9.024297243134919, + "grad_norm": 2.1837780475616455, + "learning_rate": 4.170503168499161e-05, + "loss": 0.0506, + "step": 46030 + }, + { + "epoch": 9.02435140551373, + "grad_norm": 0.015172282233834267, + "learning_rate": 4.170202266394651e-05, + "loss": 0.139, + "step": 46040 + }, + { + "epoch": 9.024405567892542, + "grad_norm": 1.7912805080413818, + "learning_rate": 4.169901364290142e-05, + "loss": 0.0872, + "step": 46050 + }, + { + "epoch": 9.024459730271353, + "grad_norm": 0.014671320095658302, + "learning_rate": 4.1696004621856326e-05, + "loss": 0.0427, + "step": 46060 + }, + { + "epoch": 9.024513892650166, + "grad_norm": 0.04298178851604462, + "learning_rate": 4.169299560081124e-05, + "loss": 0.0615, + "step": 46070 + }, + { + "epoch": 9.024568055028977, + "grad_norm": 21.223676681518555, + "learning_rate": 4.168998657976614e-05, + "loss": 0.1196, + "step": 46080 + }, + { + "epoch": 9.024622217407789, + "grad_norm": 0.9639994502067566, + "learning_rate": 4.1686977558721045e-05, + "loss": 0.0864, + "step": 46090 + }, + { + "epoch": 9.0246763797866, + "grad_norm": 0.040864720940589905, + "learning_rate": 4.168396853767596e-05, + "loss": 0.1035, + "step": 46100 + }, + { + "epoch": 9.024730542165411, + "grad_norm": 4.197370529174805, + "learning_rate": 4.1680959516630864e-05, + "loss": 0.0692, + "step": 46110 + }, + { + "epoch": 9.024784704544224, + "grad_norm": 0.4275880455970764, + "learning_rate": 4.167795049558576e-05, + "loss": 0.0869, + "step": 46120 + }, + { + "epoch": 9.024838866923035, + "grad_norm": 1.1459044218063354, + "learning_rate": 4.1674941474540676e-05, + "loss": 0.0234, + "step": 46130 + }, + { + "epoch": 9.024893029301847, + "grad_norm": 0.19243477284908295, + "learning_rate": 4.167193245349558e-05, + "loss": 0.0879, + "step": 46140 + }, + { + "epoch": 9.024947191680658, + "grad_norm": 7.515225410461426, + "learning_rate": 4.166892343245049e-05, + "loss": 0.1082, + "step": 46150 + }, + { + "epoch": 9.025001354059471, + "grad_norm": 0.10181895643472672, + "learning_rate": 4.1665914411405395e-05, + "loss": 0.1408, + "step": 46160 + }, + { + "epoch": 9.025001354059471, + "eval_accuracy": 0.8595689092096669, + "eval_loss": 0.4445280134677887, + "eval_runtime": 117.3956, + "eval_samples_per_second": 26.083, + "eval_steps_per_second": 3.262, + "step": 46160 + }, 
+ { + "epoch": 10.000054162378811, + "grad_norm": 0.3088856041431427, + "learning_rate": 4.16629053903603e-05, + "loss": 0.0488, + "step": 46170 + }, + { + "epoch": 10.000108324757623, + "grad_norm": 4.369933128356934, + "learning_rate": 4.165989636931521e-05, + "loss": 0.0856, + "step": 46180 + }, + { + "epoch": 10.000162487136436, + "grad_norm": 4.653788089752197, + "learning_rate": 4.165688734827012e-05, + "loss": 0.1479, + "step": 46190 + }, + { + "epoch": 10.000216649515247, + "grad_norm": 1.6188867092132568, + "learning_rate": 4.165387832722502e-05, + "loss": 0.1202, + "step": 46200 + }, + { + "epoch": 10.000270811894058, + "grad_norm": 0.0070851584896445274, + "learning_rate": 4.1650869306179927e-05, + "loss": 0.0601, + "step": 46210 + }, + { + "epoch": 10.00032497427287, + "grad_norm": 0.0032045042607933283, + "learning_rate": 4.164786028513484e-05, + "loss": 0.0518, + "step": 46220 + }, + { + "epoch": 10.000379136651683, + "grad_norm": 1.2230044603347778, + "learning_rate": 4.164485126408974e-05, + "loss": 0.1031, + "step": 46230 + }, + { + "epoch": 10.000433299030494, + "grad_norm": 0.08647383749485016, + "learning_rate": 4.1641842243044645e-05, + "loss": 0.0794, + "step": 46240 + }, + { + "epoch": 10.000487461409305, + "grad_norm": 0.0025711546186357737, + "learning_rate": 4.163883322199956e-05, + "loss": 0.0309, + "step": 46250 + }, + { + "epoch": 10.000541623788116, + "grad_norm": 0.12036444246768951, + "learning_rate": 4.1635824200954465e-05, + "loss": 0.0214, + "step": 46260 + }, + { + "epoch": 10.000595786166928, + "grad_norm": 1.1504838466644287, + "learning_rate": 4.163281517990937e-05, + "loss": 0.0453, + "step": 46270 + }, + { + "epoch": 10.00064994854574, + "grad_norm": 0.007628018036484718, + "learning_rate": 4.162980615886428e-05, + "loss": 0.0527, + "step": 46280 + }, + { + "epoch": 10.000704110924552, + "grad_norm": 0.7161661982536316, + "learning_rate": 4.1626797137819184e-05, + "loss": 0.0422, + "step": 46290 + }, + { + "epoch": 10.000758273303363, + "grad_norm": 0.001859643729403615, + "learning_rate": 4.162378811677409e-05, + "loss": 0.0006, + "step": 46300 + }, + { + "epoch": 10.000812435682175, + "grad_norm": 0.02406167984008789, + "learning_rate": 4.1620779095728996e-05, + "loss": 0.0646, + "step": 46310 + }, + { + "epoch": 10.000866598060988, + "grad_norm": 0.011237502098083496, + "learning_rate": 4.16177700746839e-05, + "loss": 0.1886, + "step": 46320 + }, + { + "epoch": 10.000920760439799, + "grad_norm": 0.5089653730392456, + "learning_rate": 4.1614761053638815e-05, + "loss": 0.1084, + "step": 46330 + }, + { + "epoch": 10.00097492281861, + "grad_norm": 13.338438034057617, + "learning_rate": 4.161175203259372e-05, + "loss": 0.2284, + "step": 46340 + }, + { + "epoch": 10.001029085197422, + "grad_norm": 0.010558702051639557, + "learning_rate": 4.160874301154862e-05, + "loss": 0.1325, + "step": 46350 + }, + { + "epoch": 10.001083247576233, + "grad_norm": 0.4616471230983734, + "learning_rate": 4.1605733990503534e-05, + "loss": 0.0295, + "step": 46360 + }, + { + "epoch": 10.001137409955046, + "grad_norm": 0.015241553075611591, + "learning_rate": 4.160272496945844e-05, + "loss": 0.1427, + "step": 46370 + }, + { + "epoch": 10.001191572333857, + "grad_norm": 0.42902815341949463, + "learning_rate": 4.159971594841334e-05, + "loss": 0.0631, + "step": 46380 + }, + { + "epoch": 10.001245734712668, + "grad_norm": 0.005741671193391085, + "learning_rate": 4.159670692736825e-05, + "loss": 0.0311, + "step": 46390 + }, + { + "epoch": 10.00129989709148, + "grad_norm": 
2.250682830810547, + "learning_rate": 4.159369790632316e-05, + "loss": 0.1351, + "step": 46400 + }, + { + "epoch": 10.001354059470293, + "grad_norm": 2.139967679977417, + "learning_rate": 4.1590688885278066e-05, + "loss": 0.0769, + "step": 46410 + }, + { + "epoch": 10.001408221849104, + "grad_norm": 0.00679750507697463, + "learning_rate": 4.158767986423297e-05, + "loss": 0.0405, + "step": 46420 + }, + { + "epoch": 10.001462384227915, + "grad_norm": 4.4871978759765625, + "learning_rate": 4.158467084318788e-05, + "loss": 0.056, + "step": 46430 + }, + { + "epoch": 10.001516546606727, + "grad_norm": 0.1897314488887787, + "learning_rate": 4.1581661822142784e-05, + "loss": 0.0476, + "step": 46440 + }, + { + "epoch": 10.001570708985538, + "grad_norm": 1.6074851751327515, + "learning_rate": 4.15786528010977e-05, + "loss": 0.0512, + "step": 46450 + }, + { + "epoch": 10.001624871364351, + "grad_norm": 0.17008179426193237, + "learning_rate": 4.15756437800526e-05, + "loss": 0.0828, + "step": 46460 + }, + { + "epoch": 10.001679033743162, + "grad_norm": 0.05451333895325661, + "learning_rate": 4.15726347590075e-05, + "loss": 0.0617, + "step": 46470 + }, + { + "epoch": 10.001733196121974, + "grad_norm": 2.2995567321777344, + "learning_rate": 4.1569625737962416e-05, + "loss": 0.0339, + "step": 46480 + }, + { + "epoch": 10.001787358500785, + "grad_norm": 20.81239891052246, + "learning_rate": 4.156661671691732e-05, + "loss": 0.1295, + "step": 46490 + }, + { + "epoch": 10.001841520879598, + "grad_norm": 0.014771847054362297, + "learning_rate": 4.156360769587222e-05, + "loss": 0.1178, + "step": 46500 + }, + { + "epoch": 10.00189568325841, + "grad_norm": 0.929358720779419, + "learning_rate": 4.1560598674827135e-05, + "loss": 0.0707, + "step": 46510 + }, + { + "epoch": 10.00194984563722, + "grad_norm": 1.2055931091308594, + "learning_rate": 4.155758965378204e-05, + "loss": 0.065, + "step": 46520 + }, + { + "epoch": 10.002004008016032, + "grad_norm": 0.0038896072655916214, + "learning_rate": 4.155458063273695e-05, + "loss": 0.087, + "step": 46530 + }, + { + "epoch": 10.002058170394843, + "grad_norm": 0.44013527035713196, + "learning_rate": 4.1551571611691854e-05, + "loss": 0.0638, + "step": 46540 + }, + { + "epoch": 10.002112332773656, + "grad_norm": 0.0032842960208654404, + "learning_rate": 4.154856259064676e-05, + "loss": 0.0339, + "step": 46550 + }, + { + "epoch": 10.002166495152467, + "grad_norm": 0.28263619542121887, + "learning_rate": 4.1545553569601667e-05, + "loss": 0.0974, + "step": 46560 + }, + { + "epoch": 10.002220657531279, + "grad_norm": 0.6045931577682495, + "learning_rate": 4.154254454855657e-05, + "loss": 0.0811, + "step": 46570 + }, + { + "epoch": 10.00227481991009, + "grad_norm": 0.024029046297073364, + "learning_rate": 4.153953552751148e-05, + "loss": 0.2244, + "step": 46580 + }, + { + "epoch": 10.002328982288903, + "grad_norm": 0.0047600227408111095, + "learning_rate": 4.153652650646639e-05, + "loss": 0.1148, + "step": 46590 + }, + { + "epoch": 10.002383144667714, + "grad_norm": 0.07841075211763382, + "learning_rate": 4.15335174854213e-05, + "loss": 0.0679, + "step": 46600 + }, + { + "epoch": 10.002437307046526, + "grad_norm": 0.01293067168444395, + "learning_rate": 4.15305084643762e-05, + "loss": 0.1302, + "step": 46610 + }, + { + "epoch": 10.002491469425337, + "grad_norm": 0.7013433575630188, + "learning_rate": 4.152749944333111e-05, + "loss": 0.0832, + "step": 46620 + }, + { + "epoch": 10.002545631804148, + "grad_norm": 0.1872427612543106, + "learning_rate": 4.152449042228602e-05, + 
"loss": 0.1461, + "step": 46630 + }, + { + "epoch": 10.002599794182961, + "grad_norm": 0.07569238543510437, + "learning_rate": 4.1521481401240924e-05, + "loss": 0.0773, + "step": 46640 + }, + { + "epoch": 10.002653956561772, + "grad_norm": 0.03931332379579544, + "learning_rate": 4.151847238019583e-05, + "loss": 0.0103, + "step": 46650 + }, + { + "epoch": 10.002708118940584, + "grad_norm": 0.1797812432050705, + "learning_rate": 4.1515463359150736e-05, + "loss": 0.0517, + "step": 46660 + }, + { + "epoch": 10.002762281319395, + "grad_norm": 0.10860159993171692, + "learning_rate": 4.151245433810564e-05, + "loss": 0.0239, + "step": 46670 + }, + { + "epoch": 10.002816443698208, + "grad_norm": 0.003396095708012581, + "learning_rate": 4.150944531706055e-05, + "loss": 0.069, + "step": 46680 + }, + { + "epoch": 10.00287060607702, + "grad_norm": 0.12982773780822754, + "learning_rate": 4.1506436296015455e-05, + "loss": 0.0578, + "step": 46690 + }, + { + "epoch": 10.00292476845583, + "grad_norm": 0.005200880113989115, + "learning_rate": 4.150342727497036e-05, + "loss": 0.1718, + "step": 46700 + }, + { + "epoch": 10.002978930834642, + "grad_norm": 0.14430442452430725, + "learning_rate": 4.1500418253925274e-05, + "loss": 0.0109, + "step": 46710 + }, + { + "epoch": 10.003033093213453, + "grad_norm": 0.10002659261226654, + "learning_rate": 4.1497409232880174e-05, + "loss": 0.0073, + "step": 46720 + }, + { + "epoch": 10.003087255592266, + "grad_norm": 0.03257603570818901, + "learning_rate": 4.149440021183508e-05, + "loss": 0.0362, + "step": 46730 + }, + { + "epoch": 10.003141417971078, + "grad_norm": 4.859095573425293, + "learning_rate": 4.149139119078999e-05, + "loss": 0.067, + "step": 46740 + }, + { + "epoch": 10.003195580349889, + "grad_norm": 2.7773661613464355, + "learning_rate": 4.14883821697449e-05, + "loss": 0.1578, + "step": 46750 + }, + { + "epoch": 10.0032497427287, + "grad_norm": 0.008655260317027569, + "learning_rate": 4.14853731486998e-05, + "loss": 0.0271, + "step": 46760 + }, + { + "epoch": 10.003303905107511, + "grad_norm": 0.17177823185920715, + "learning_rate": 4.148236412765471e-05, + "loss": 0.033, + "step": 46770 + }, + { + "epoch": 10.003358067486325, + "grad_norm": 0.0144849494099617, + "learning_rate": 4.147935510660962e-05, + "loss": 0.0646, + "step": 46780 + }, + { + "epoch": 10.003412229865136, + "grad_norm": 0.005287506151944399, + "learning_rate": 4.1476346085564524e-05, + "loss": 0.0051, + "step": 46790 + }, + { + "epoch": 10.003466392243947, + "grad_norm": 0.09445137530565262, + "learning_rate": 4.147333706451943e-05, + "loss": 0.0612, + "step": 46800 + }, + { + "epoch": 10.003520554622758, + "grad_norm": 0.045746125280857086, + "learning_rate": 4.147032804347434e-05, + "loss": 0.1175, + "step": 46810 + }, + { + "epoch": 10.003574717001571, + "grad_norm": 2.41782546043396, + "learning_rate": 4.146731902242924e-05, + "loss": 0.0824, + "step": 46820 + }, + { + "epoch": 10.003628879380383, + "grad_norm": 0.008014969527721405, + "learning_rate": 4.146431000138415e-05, + "loss": 0.107, + "step": 46830 + }, + { + "epoch": 10.003683041759194, + "grad_norm": 0.028291305527091026, + "learning_rate": 4.1461300980339056e-05, + "loss": 0.0868, + "step": 46840 + }, + { + "epoch": 10.003737204138005, + "grad_norm": 4.1450514793396, + "learning_rate": 4.145829195929397e-05, + "loss": 0.113, + "step": 46850 + }, + { + "epoch": 10.003791366516817, + "grad_norm": 4.712275505065918, + "learning_rate": 4.1455282938248875e-05, + "loss": 0.0566, + "step": 46860 + }, + { + "epoch": 
10.00384552889563, + "grad_norm": 0.040915846824645996, + "learning_rate": 4.1452273917203775e-05, + "loss": 0.0346, + "step": 46870 + }, + { + "epoch": 10.003899691274441, + "grad_norm": 0.04792734235525131, + "learning_rate": 4.144926489615869e-05, + "loss": 0.0784, + "step": 46880 + }, + { + "epoch": 10.003953853653252, + "grad_norm": 10.354192733764648, + "learning_rate": 4.1446255875113594e-05, + "loss": 0.1026, + "step": 46890 + }, + { + "epoch": 10.004008016032063, + "grad_norm": 0.03921916335821152, + "learning_rate": 4.14432468540685e-05, + "loss": 0.121, + "step": 46900 + }, + { + "epoch": 10.004062178410877, + "grad_norm": 0.5402933359146118, + "learning_rate": 4.1440237833023406e-05, + "loss": 0.0978, + "step": 46910 + }, + { + "epoch": 10.004116340789688, + "grad_norm": 0.015287953428924084, + "learning_rate": 4.143722881197831e-05, + "loss": 0.1367, + "step": 46920 + }, + { + "epoch": 10.004170503168499, + "grad_norm": 2.5192368030548096, + "learning_rate": 4.143421979093322e-05, + "loss": 0.0775, + "step": 46930 + }, + { + "epoch": 10.00422466554731, + "grad_norm": 0.12506558001041412, + "learning_rate": 4.143121076988813e-05, + "loss": 0.0358, + "step": 46940 + }, + { + "epoch": 10.004278827926122, + "grad_norm": 0.592345118522644, + "learning_rate": 4.142820174884303e-05, + "loss": 0.0891, + "step": 46950 + }, + { + "epoch": 10.004332990304935, + "grad_norm": 11.133843421936035, + "learning_rate": 4.142519272779794e-05, + "loss": 0.1165, + "step": 46960 + }, + { + "epoch": 10.004387152683746, + "grad_norm": 0.00858582928776741, + "learning_rate": 4.142218370675285e-05, + "loss": 0.1001, + "step": 46970 + }, + { + "epoch": 10.004441315062557, + "grad_norm": 0.1669682115316391, + "learning_rate": 4.141917468570775e-05, + "loss": 0.0884, + "step": 46980 + }, + { + "epoch": 10.004495477441369, + "grad_norm": 0.31451237201690674, + "learning_rate": 4.141616566466266e-05, + "loss": 0.0327, + "step": 46990 + }, + { + "epoch": 10.004549639820182, + "grad_norm": 0.2044588178396225, + "learning_rate": 4.141315664361757e-05, + "loss": 0.1009, + "step": 47000 + }, + { + "epoch": 10.004603802198993, + "grad_norm": 4.758805751800537, + "learning_rate": 4.1410147622572476e-05, + "loss": 0.0868, + "step": 47010 + }, + { + "epoch": 10.004657964577804, + "grad_norm": 0.1751990169286728, + "learning_rate": 4.1407138601527376e-05, + "loss": 0.0554, + "step": 47020 + }, + { + "epoch": 10.004712126956615, + "grad_norm": 0.10876196622848511, + "learning_rate": 4.140412958048229e-05, + "loss": 0.0396, + "step": 47030 + }, + { + "epoch": 10.004766289335427, + "grad_norm": 0.1185992881655693, + "learning_rate": 4.1401120559437195e-05, + "loss": 0.0805, + "step": 47040 + }, + { + "epoch": 10.00482045171424, + "grad_norm": 0.13707442581653595, + "learning_rate": 4.13981115383921e-05, + "loss": 0.0608, + "step": 47050 + }, + { + "epoch": 10.004874614093051, + "grad_norm": 0.005118443164974451, + "learning_rate": 4.139510251734701e-05, + "loss": 0.0433, + "step": 47060 + }, + { + "epoch": 10.004928776471862, + "grad_norm": 0.42351797223091125, + "learning_rate": 4.1392093496301914e-05, + "loss": 0.071, + "step": 47070 + }, + { + "epoch": 10.004982938850674, + "grad_norm": 0.5030804872512817, + "learning_rate": 4.138908447525682e-05, + "loss": 0.0051, + "step": 47080 + }, + { + "epoch": 10.005037101229487, + "grad_norm": 0.003377952380105853, + "learning_rate": 4.138607545421173e-05, + "loss": 0.1281, + "step": 47090 + }, + { + "epoch": 10.005091263608298, + "grad_norm": 1.1157587766647339, + 
"learning_rate": 4.138306643316663e-05, + "loss": 0.1794, + "step": 47100 + }, + { + "epoch": 10.00514542598711, + "grad_norm": 0.003712922101840377, + "learning_rate": 4.1380057412121546e-05, + "loss": 0.1437, + "step": 47110 + }, + { + "epoch": 10.00519958836592, + "grad_norm": 0.027989983558654785, + "learning_rate": 4.137704839107645e-05, + "loss": 0.0256, + "step": 47120 + }, + { + "epoch": 10.005253750744732, + "grad_norm": 0.005998301785439253, + "learning_rate": 4.137403937003135e-05, + "loss": 0.015, + "step": 47130 + }, + { + "epoch": 10.005307913123545, + "grad_norm": 0.015487827360630035, + "learning_rate": 4.1371030348986264e-05, + "loss": 0.0149, + "step": 47140 + }, + { + "epoch": 10.005362075502356, + "grad_norm": 14.699426651000977, + "learning_rate": 4.136802132794117e-05, + "loss": 0.0867, + "step": 47150 + }, + { + "epoch": 10.005416237881168, + "grad_norm": 0.00492129335179925, + "learning_rate": 4.136501230689608e-05, + "loss": 0.0766, + "step": 47160 + }, + { + "epoch": 10.005470400259979, + "grad_norm": 1.878713846206665, + "learning_rate": 4.136200328585098e-05, + "loss": 0.1038, + "step": 47170 + }, + { + "epoch": 10.005524562638792, + "grad_norm": 0.10561545193195343, + "learning_rate": 4.135899426480589e-05, + "loss": 0.1344, + "step": 47180 + }, + { + "epoch": 10.005578725017603, + "grad_norm": 0.05832336097955704, + "learning_rate": 4.1355985243760796e-05, + "loss": 0.0962, + "step": 47190 + }, + { + "epoch": 10.005632887396414, + "grad_norm": 0.12997345626354218, + "learning_rate": 4.135297622271571e-05, + "loss": 0.0476, + "step": 47200 + }, + { + "epoch": 10.005687049775226, + "grad_norm": 1.4639161825180054, + "learning_rate": 4.134996720167061e-05, + "loss": 0.0949, + "step": 47210 + }, + { + "epoch": 10.005741212154037, + "grad_norm": 0.718487024307251, + "learning_rate": 4.1346958180625515e-05, + "loss": 0.0425, + "step": 47220 + }, + { + "epoch": 10.00579537453285, + "grad_norm": 0.2944226861000061, + "learning_rate": 4.134394915958043e-05, + "loss": 0.0578, + "step": 47230 + }, + { + "epoch": 10.005849536911661, + "grad_norm": 0.008194748312234879, + "learning_rate": 4.1340940138535334e-05, + "loss": 0.0366, + "step": 47240 + }, + { + "epoch": 10.005903699290473, + "grad_norm": 0.3493488132953644, + "learning_rate": 4.133793111749023e-05, + "loss": 0.0736, + "step": 47250 + }, + { + "epoch": 10.005957861669284, + "grad_norm": 0.4079335927963257, + "learning_rate": 4.1334922096445146e-05, + "loss": 0.049, + "step": 47260 + }, + { + "epoch": 10.006012024048097, + "grad_norm": 0.07257712632417679, + "learning_rate": 4.133191307540005e-05, + "loss": 0.0322, + "step": 47270 + }, + { + "epoch": 10.006066186426908, + "grad_norm": 0.19089506566524506, + "learning_rate": 4.132890405435495e-05, + "loss": 0.0443, + "step": 47280 + }, + { + "epoch": 10.00612034880572, + "grad_norm": 0.0031222347170114517, + "learning_rate": 4.1325895033309865e-05, + "loss": 0.1596, + "step": 47290 + }, + { + "epoch": 10.00617451118453, + "grad_norm": 0.025609800592064857, + "learning_rate": 4.132288601226477e-05, + "loss": 0.0245, + "step": 47300 + }, + { + "epoch": 10.006228673563342, + "grad_norm": 0.6568167805671692, + "learning_rate": 4.131987699121968e-05, + "loss": 0.0786, + "step": 47310 + }, + { + "epoch": 10.006282835942155, + "grad_norm": 8.900922775268555, + "learning_rate": 4.1316867970174584e-05, + "loss": 0.0976, + "step": 47320 + }, + { + "epoch": 10.006336998320966, + "grad_norm": 2.428727865219116, + "learning_rate": 4.131385894912949e-05, + "loss": 0.0564, + 
"step": 47330 + }, + { + "epoch": 10.006391160699778, + "grad_norm": 0.09793950617313385, + "learning_rate": 4.13108499280844e-05, + "loss": 0.0685, + "step": 47340 + }, + { + "epoch": 10.006445323078589, + "grad_norm": 0.0030914489179849625, + "learning_rate": 4.130784090703931e-05, + "loss": 0.0052, + "step": 47350 + }, + { + "epoch": 10.006499485457402, + "grad_norm": 0.003071101615205407, + "learning_rate": 4.130483188599421e-05, + "loss": 0.0389, + "step": 47360 + }, + { + "epoch": 10.006553647836213, + "grad_norm": 0.713768482208252, + "learning_rate": 4.130182286494912e-05, + "loss": 0.0666, + "step": 47370 + }, + { + "epoch": 10.006607810215025, + "grad_norm": 0.00275592552497983, + "learning_rate": 4.129881384390403e-05, + "loss": 0.0703, + "step": 47380 + }, + { + "epoch": 10.006661972593836, + "grad_norm": 0.002965867519378662, + "learning_rate": 4.1295804822858935e-05, + "loss": 0.1926, + "step": 47390 + }, + { + "epoch": 10.006716134972647, + "grad_norm": 0.051105450838804245, + "learning_rate": 4.129279580181384e-05, + "loss": 0.0702, + "step": 47400 + }, + { + "epoch": 10.00677029735146, + "grad_norm": 0.02283572405576706, + "learning_rate": 4.128978678076875e-05, + "loss": 0.1074, + "step": 47410 + }, + { + "epoch": 10.006824459730272, + "grad_norm": 0.08675501495599747, + "learning_rate": 4.1286777759723654e-05, + "loss": 0.0883, + "step": 47420 + }, + { + "epoch": 10.006878622109083, + "grad_norm": 0.6907634735107422, + "learning_rate": 4.128376873867856e-05, + "loss": 0.0657, + "step": 47430 + }, + { + "epoch": 10.006932784487894, + "grad_norm": 0.1298518180847168, + "learning_rate": 4.1280759717633466e-05, + "loss": 0.0402, + "step": 47440 + }, + { + "epoch": 10.006986946866707, + "grad_norm": 0.09166573733091354, + "learning_rate": 4.127775069658837e-05, + "loss": 0.009, + "step": 47450 + }, + { + "epoch": 10.007041109245518, + "grad_norm": 0.002431046450510621, + "learning_rate": 4.1274741675543285e-05, + "loss": 0.0082, + "step": 47460 + }, + { + "epoch": 10.00709527162433, + "grad_norm": 0.08042485266923904, + "learning_rate": 4.1271732654498185e-05, + "loss": 0.0656, + "step": 47470 + }, + { + "epoch": 10.007149434003141, + "grad_norm": 0.4120802879333496, + "learning_rate": 4.126872363345309e-05, + "loss": 0.0025, + "step": 47480 + }, + { + "epoch": 10.007203596381952, + "grad_norm": 0.04571639746427536, + "learning_rate": 4.1265714612408004e-05, + "loss": 0.0141, + "step": 47490 + }, + { + "epoch": 10.007257758760765, + "grad_norm": 0.0021177229937165976, + "learning_rate": 4.126270559136291e-05, + "loss": 0.0023, + "step": 47500 + }, + { + "epoch": 10.007311921139577, + "grad_norm": 4.407766342163086, + "learning_rate": 4.125969657031781e-05, + "loss": 0.0649, + "step": 47510 + }, + { + "epoch": 10.007366083518388, + "grad_norm": 0.03705684095621109, + "learning_rate": 4.125668754927272e-05, + "loss": 0.027, + "step": 47520 + }, + { + "epoch": 10.0074202458972, + "grad_norm": 0.0386119969189167, + "learning_rate": 4.125367852822763e-05, + "loss": 0.03, + "step": 47530 + }, + { + "epoch": 10.007474408276012, + "grad_norm": 3.111164093017578, + "learning_rate": 4.1250669507182536e-05, + "loss": 0.1072, + "step": 47540 + }, + { + "epoch": 10.007528570654824, + "grad_norm": 4.924633026123047, + "learning_rate": 4.124766048613744e-05, + "loss": 0.1186, + "step": 47550 + }, + { + "epoch": 10.007582733033635, + "grad_norm": 0.13164371252059937, + "learning_rate": 4.124465146509235e-05, + "loss": 0.0789, + "step": 47560 + }, + { + "epoch": 10.007636895412446, + 
"grad_norm": 0.016471046954393387, + "learning_rate": 4.1241642444047254e-05, + "loss": 0.0243, + "step": 47570 + }, + { + "epoch": 10.007691057791257, + "grad_norm": 4.402589797973633, + "learning_rate": 4.123863342300216e-05, + "loss": 0.0231, + "step": 47580 + }, + { + "epoch": 10.00774522017007, + "grad_norm": 0.005079049151390791, + "learning_rate": 4.123562440195707e-05, + "loss": 0.3184, + "step": 47590 + }, + { + "epoch": 10.007799382548882, + "grad_norm": 0.04008164256811142, + "learning_rate": 4.123261538091197e-05, + "loss": 0.0933, + "step": 47600 + }, + { + "epoch": 10.007853544927693, + "grad_norm": 0.019114185124635696, + "learning_rate": 4.1229606359866886e-05, + "loss": 0.0616, + "step": 47610 + }, + { + "epoch": 10.007907707306504, + "grad_norm": 0.02831815369427204, + "learning_rate": 4.1226597338821786e-05, + "loss": 0.1359, + "step": 47620 + }, + { + "epoch": 10.007961869685317, + "grad_norm": 0.020835841074585915, + "learning_rate": 4.12235883177767e-05, + "loss": 0.0423, + "step": 47630 + }, + { + "epoch": 10.008016032064129, + "grad_norm": 1.279065489768982, + "learning_rate": 4.1220579296731605e-05, + "loss": 0.0111, + "step": 47640 + }, + { + "epoch": 10.00807019444294, + "grad_norm": 0.011790637858211994, + "learning_rate": 4.121757027568651e-05, + "loss": 0.0871, + "step": 47650 + }, + { + "epoch": 10.008124356821751, + "grad_norm": 0.12474899739027023, + "learning_rate": 4.121456125464142e-05, + "loss": 0.231, + "step": 47660 + }, + { + "epoch": 10.008178519200563, + "grad_norm": 0.8418771624565125, + "learning_rate": 4.1211552233596324e-05, + "loss": 0.1153, + "step": 47670 + }, + { + "epoch": 10.008232681579376, + "grad_norm": 0.037485018372535706, + "learning_rate": 4.120854321255123e-05, + "loss": 0.0888, + "step": 47680 + }, + { + "epoch": 10.008286843958187, + "grad_norm": 0.7422879338264465, + "learning_rate": 4.1205534191506137e-05, + "loss": 0.02, + "step": 47690 + }, + { + "epoch": 10.008341006336998, + "grad_norm": 0.7451586127281189, + "learning_rate": 4.120252517046104e-05, + "loss": 0.0924, + "step": 47700 + }, + { + "epoch": 10.00839516871581, + "grad_norm": 0.00986911728978157, + "learning_rate": 4.119951614941595e-05, + "loss": 0.0173, + "step": 47710 + }, + { + "epoch": 10.008449331094623, + "grad_norm": 0.028011981397867203, + "learning_rate": 4.119650712837086e-05, + "loss": 0.0488, + "step": 47720 + }, + { + "epoch": 10.008503493473434, + "grad_norm": 0.6681774258613586, + "learning_rate": 4.119349810732576e-05, + "loss": 0.0671, + "step": 47730 + }, + { + "epoch": 10.008557655852245, + "grad_norm": 7.874396800994873, + "learning_rate": 4.119048908628067e-05, + "loss": 0.0436, + "step": 47740 + }, + { + "epoch": 10.008611818231056, + "grad_norm": 0.012150156311690807, + "learning_rate": 4.118748006523558e-05, + "loss": 0.0108, + "step": 47750 + }, + { + "epoch": 10.008665980609868, + "grad_norm": 15.449970245361328, + "learning_rate": 4.118447104419049e-05, + "loss": 0.0441, + "step": 47760 + }, + { + "epoch": 10.00872014298868, + "grad_norm": 0.013254573568701744, + "learning_rate": 4.118146202314539e-05, + "loss": 0.0836, + "step": 47770 + }, + { + "epoch": 10.008774305367492, + "grad_norm": 0.8547317385673523, + "learning_rate": 4.11784530021003e-05, + "loss": 0.0477, + "step": 47780 + }, + { + "epoch": 10.008828467746303, + "grad_norm": 0.009478513151407242, + "learning_rate": 4.1175443981055206e-05, + "loss": 0.1268, + "step": 47790 + }, + { + "epoch": 10.008882630125115, + "grad_norm": 1.538615107536316, + "learning_rate": 
4.117243496001011e-05, + "loss": 0.0472, + "step": 47800 + }, + { + "epoch": 10.008936792503928, + "grad_norm": 0.1153559461236, + "learning_rate": 4.116942593896502e-05, + "loss": 0.0546, + "step": 47810 + }, + { + "epoch": 10.008990954882739, + "grad_norm": 14.136785507202148, + "learning_rate": 4.1166416917919925e-05, + "loss": 0.1292, + "step": 47820 + }, + { + "epoch": 10.00904511726155, + "grad_norm": 0.007181279361248016, + "learning_rate": 4.116340789687483e-05, + "loss": 0.0226, + "step": 47830 + }, + { + "epoch": 10.009099279640362, + "grad_norm": 0.02237999066710472, + "learning_rate": 4.1160398875829744e-05, + "loss": 0.1524, + "step": 47840 + }, + { + "epoch": 10.009153442019173, + "grad_norm": 0.00329033425077796, + "learning_rate": 4.1157389854784644e-05, + "loss": 0.1597, + "step": 47850 + }, + { + "epoch": 10.009207604397986, + "grad_norm": 2.672830820083618, + "learning_rate": 4.115438083373955e-05, + "loss": 0.0501, + "step": 47860 + }, + { + "epoch": 10.009261766776797, + "grad_norm": 0.003138426225632429, + "learning_rate": 4.115137181269446e-05, + "loss": 0.0597, + "step": 47870 + }, + { + "epoch": 10.009315929155608, + "grad_norm": 0.0075126648880541325, + "learning_rate": 4.114836279164936e-05, + "loss": 0.0578, + "step": 47880 + }, + { + "epoch": 10.00937009153442, + "grad_norm": 0.00571124441921711, + "learning_rate": 4.1145353770604276e-05, + "loss": 0.0266, + "step": 47890 + }, + { + "epoch": 10.009424253913231, + "grad_norm": 0.2535170912742615, + "learning_rate": 4.114234474955918e-05, + "loss": 0.0138, + "step": 47900 + }, + { + "epoch": 10.009478416292044, + "grad_norm": 0.006227531004697084, + "learning_rate": 4.113933572851409e-05, + "loss": 0.0905, + "step": 47910 + }, + { + "epoch": 10.009532578670855, + "grad_norm": 2.489999294281006, + "learning_rate": 4.1136326707468994e-05, + "loss": 0.0382, + "step": 47920 + }, + { + "epoch": 10.009586741049667, + "grad_norm": 0.2655254006385803, + "learning_rate": 4.11333176864239e-05, + "loss": 0.0026, + "step": 47930 + }, + { + "epoch": 10.009640903428478, + "grad_norm": 0.08546648919582367, + "learning_rate": 4.113030866537881e-05, + "loss": 0.0696, + "step": 47940 + }, + { + "epoch": 10.009695065807291, + "grad_norm": 0.006537806708365679, + "learning_rate": 4.112729964433371e-05, + "loss": 0.0407, + "step": 47950 + }, + { + "epoch": 10.009749228186102, + "grad_norm": 0.0018659632187336683, + "learning_rate": 4.112429062328862e-05, + "loss": 0.0031, + "step": 47960 + }, + { + "epoch": 10.009803390564914, + "grad_norm": 0.7849332690238953, + "learning_rate": 4.1121281602243526e-05, + "loss": 0.1938, + "step": 47970 + }, + { + "epoch": 10.009857552943725, + "grad_norm": 8.393298149108887, + "learning_rate": 4.111827258119844e-05, + "loss": 0.0968, + "step": 47980 + }, + { + "epoch": 10.009911715322536, + "grad_norm": 0.010425099171698093, + "learning_rate": 4.1115263560153345e-05, + "loss": 0.0331, + "step": 47990 + }, + { + "epoch": 10.00996587770135, + "grad_norm": 0.003400743706151843, + "learning_rate": 4.1112254539108245e-05, + "loss": 0.1061, + "step": 48000 + }, + { + "epoch": 10.01002004008016, + "grad_norm": 4.2831902503967285, + "learning_rate": 4.110924551806316e-05, + "loss": 0.0384, + "step": 48010 + }, + { + "epoch": 10.010074202458972, + "grad_norm": 0.3408351242542267, + "learning_rate": 4.1106236497018064e-05, + "loss": 0.0397, + "step": 48020 + }, + { + "epoch": 10.010128364837783, + "grad_norm": 0.004741212353110313, + "learning_rate": 4.1103227475972963e-05, + "loss": 0.0473, + "step": 48030 + 
}, + { + "epoch": 10.010182527216596, + "grad_norm": 0.0019979248754680157, + "learning_rate": 4.1100218454927877e-05, + "loss": 0.0063, + "step": 48040 + }, + { + "epoch": 10.010236689595407, + "grad_norm": 0.0452045202255249, + "learning_rate": 4.109720943388278e-05, + "loss": 0.0333, + "step": 48050 + }, + { + "epoch": 10.010290851974219, + "grad_norm": 0.07846657186746597, + "learning_rate": 4.109420041283769e-05, + "loss": 0.1027, + "step": 48060 + }, + { + "epoch": 10.01034501435303, + "grad_norm": 0.370198518037796, + "learning_rate": 4.1091191391792595e-05, + "loss": 0.0423, + "step": 48070 + }, + { + "epoch": 10.010399176731841, + "grad_norm": 0.0022148683201521635, + "learning_rate": 4.10881823707475e-05, + "loss": 0.0463, + "step": 48080 + }, + { + "epoch": 10.010453339110654, + "grad_norm": 1.3583322763442993, + "learning_rate": 4.108517334970241e-05, + "loss": 0.0163, + "step": 48090 + }, + { + "epoch": 10.010507501489466, + "grad_norm": 0.005449244286864996, + "learning_rate": 4.108216432865732e-05, + "loss": 0.0495, + "step": 48100 + }, + { + "epoch": 10.010561663868277, + "grad_norm": 0.07514464110136032, + "learning_rate": 4.107915530761222e-05, + "loss": 0.0027, + "step": 48110 + }, + { + "epoch": 10.010615826247088, + "grad_norm": 0.011580980382859707, + "learning_rate": 4.107614628656713e-05, + "loss": 0.012, + "step": 48120 + }, + { + "epoch": 10.010669988625901, + "grad_norm": 0.36812838912010193, + "learning_rate": 4.107313726552204e-05, + "loss": 0.081, + "step": 48130 + }, + { + "epoch": 10.010724151004712, + "grad_norm": 0.006240178365260363, + "learning_rate": 4.1070128244476946e-05, + "loss": 0.0468, + "step": 48140 + }, + { + "epoch": 10.010778313383524, + "grad_norm": 1.953701376914978, + "learning_rate": 4.106711922343185e-05, + "loss": 0.0889, + "step": 48150 + }, + { + "epoch": 10.010832475762335, + "grad_norm": 0.06692714244127274, + "learning_rate": 4.106411020238676e-05, + "loss": 0.0017, + "step": 48160 + }, + { + "epoch": 10.010886638141146, + "grad_norm": 0.0017073155613616109, + "learning_rate": 4.1061101181341665e-05, + "loss": 0.0056, + "step": 48170 + }, + { + "epoch": 10.01094080051996, + "grad_norm": 0.03478126600384712, + "learning_rate": 4.105809216029657e-05, + "loss": 0.0036, + "step": 48180 + }, + { + "epoch": 10.01099496289877, + "grad_norm": 6.151431083679199, + "learning_rate": 4.105508313925148e-05, + "loss": 0.166, + "step": 48190 + }, + { + "epoch": 10.011049125277582, + "grad_norm": 0.23111751675605774, + "learning_rate": 4.1052074118206384e-05, + "loss": 0.0525, + "step": 48200 + }, + { + "epoch": 10.011103287656393, + "grad_norm": 0.12234135717153549, + "learning_rate": 4.104906509716129e-05, + "loss": 0.0355, + "step": 48210 + }, + { + "epoch": 10.011157450035206, + "grad_norm": 0.1011851504445076, + "learning_rate": 4.1046056076116196e-05, + "loss": 0.0266, + "step": 48220 + }, + { + "epoch": 10.011211612414018, + "grad_norm": 0.0017525260336697102, + "learning_rate": 4.10430470550711e-05, + "loss": 0.0688, + "step": 48230 + }, + { + "epoch": 10.011265774792829, + "grad_norm": 0.015712086111307144, + "learning_rate": 4.1040038034026016e-05, + "loss": 0.1863, + "step": 48240 + }, + { + "epoch": 10.01131993717164, + "grad_norm": 0.02686525136232376, + "learning_rate": 4.103702901298092e-05, + "loss": 0.0938, + "step": 48250 + }, + { + "epoch": 10.011374099550451, + "grad_norm": 1.9527095556259155, + "learning_rate": 4.103401999193582e-05, + "loss": 0.0665, + "step": 48260 + }, + { + "epoch": 10.011428261929264, + "grad_norm": 
0.20769540965557098, + "learning_rate": 4.1031010970890734e-05, + "loss": 0.0048, + "step": 48270 + }, + { + "epoch": 10.011482424308076, + "grad_norm": 4.397121429443359, + "learning_rate": 4.102800194984564e-05, + "loss": 0.139, + "step": 48280 + }, + { + "epoch": 10.011536586686887, + "grad_norm": 0.04033320024609566, + "learning_rate": 4.102499292880055e-05, + "loss": 0.1005, + "step": 48290 + }, + { + "epoch": 10.011590749065698, + "grad_norm": 0.505366325378418, + "learning_rate": 4.102198390775545e-05, + "loss": 0.0486, + "step": 48300 + }, + { + "epoch": 10.011644911444511, + "grad_norm": 2.0338141918182373, + "learning_rate": 4.101897488671036e-05, + "loss": 0.0939, + "step": 48310 + }, + { + "epoch": 10.011699073823323, + "grad_norm": 1.2511330842971802, + "learning_rate": 4.1015965865665266e-05, + "loss": 0.0801, + "step": 48320 + }, + { + "epoch": 10.011753236202134, + "grad_norm": 0.5355191230773926, + "learning_rate": 4.101295684462017e-05, + "loss": 0.091, + "step": 48330 + }, + { + "epoch": 10.011807398580945, + "grad_norm": 2.845705270767212, + "learning_rate": 4.100994782357508e-05, + "loss": 0.0396, + "step": 48340 + }, + { + "epoch": 10.011861560959757, + "grad_norm": 0.1476081907749176, + "learning_rate": 4.1006938802529985e-05, + "loss": 0.0176, + "step": 48350 + }, + { + "epoch": 10.01191572333857, + "grad_norm": 0.39185264706611633, + "learning_rate": 4.10039297814849e-05, + "loss": 0.0806, + "step": 48360 + }, + { + "epoch": 10.011969885717381, + "grad_norm": 0.01865699142217636, + "learning_rate": 4.10009207604398e-05, + "loss": 0.0542, + "step": 48370 + }, + { + "epoch": 10.012024048096192, + "grad_norm": 0.04971884936094284, + "learning_rate": 4.0997911739394703e-05, + "loss": 0.0052, + "step": 48380 + }, + { + "epoch": 10.012078210475003, + "grad_norm": 0.5177294611930847, + "learning_rate": 4.0994902718349616e-05, + "loss": 0.1112, + "step": 48390 + }, + { + "epoch": 10.012132372853817, + "grad_norm": 0.001934137544594705, + "learning_rate": 4.099189369730452e-05, + "loss": 0.0526, + "step": 48400 + }, + { + "epoch": 10.012186535232628, + "grad_norm": 0.002617242280393839, + "learning_rate": 4.098888467625943e-05, + "loss": 0.0631, + "step": 48410 + }, + { + "epoch": 10.012240697611439, + "grad_norm": 2.649746894836426, + "learning_rate": 4.0985875655214335e-05, + "loss": 0.0631, + "step": 48420 + }, + { + "epoch": 10.01229485999025, + "grad_norm": 0.0749155804514885, + "learning_rate": 4.098286663416924e-05, + "loss": 0.062, + "step": 48430 + }, + { + "epoch": 10.012349022369062, + "grad_norm": 0.0020219110883772373, + "learning_rate": 4.097985761312415e-05, + "loss": 0.014, + "step": 48440 + }, + { + "epoch": 10.012403184747875, + "grad_norm": 0.5860905647277832, + "learning_rate": 4.0976848592079054e-05, + "loss": 0.236, + "step": 48450 + }, + { + "epoch": 10.012457347126686, + "grad_norm": 0.6089552640914917, + "learning_rate": 4.097383957103396e-05, + "loss": 0.1361, + "step": 48460 + }, + { + "epoch": 10.012511509505497, + "grad_norm": 0.22109974920749664, + "learning_rate": 4.0970830549988873e-05, + "loss": 0.0677, + "step": 48470 + }, + { + "epoch": 10.012565671884309, + "grad_norm": 5.287498474121094, + "learning_rate": 4.096782152894377e-05, + "loss": 0.1319, + "step": 48480 + }, + { + "epoch": 10.012619834263122, + "grad_norm": 0.005033887457102537, + "learning_rate": 4.096481250789868e-05, + "loss": 0.0655, + "step": 48490 + }, + { + "epoch": 10.012673996641933, + "grad_norm": 1.4838556051254272, + "learning_rate": 4.096180348685359e-05, + "loss": 
0.0874, + "step": 48500 + }, + { + "epoch": 10.012728159020744, + "grad_norm": 0.005158976186066866, + "learning_rate": 4.09587944658085e-05, + "loss": 0.114, + "step": 48510 + }, + { + "epoch": 10.012782321399555, + "grad_norm": 0.014102841727435589, + "learning_rate": 4.09557854447634e-05, + "loss": 0.0158, + "step": 48520 + }, + { + "epoch": 10.012836483778367, + "grad_norm": 0.0024533167015761137, + "learning_rate": 4.095277642371831e-05, + "loss": 0.0815, + "step": 48530 + }, + { + "epoch": 10.01289064615718, + "grad_norm": 0.012150618247687817, + "learning_rate": 4.094976740267322e-05, + "loss": 0.0088, + "step": 48540 + }, + { + "epoch": 10.012944808535991, + "grad_norm": 0.04833807051181793, + "learning_rate": 4.0946758381628124e-05, + "loss": 0.0747, + "step": 48550 + }, + { + "epoch": 10.012998970914802, + "grad_norm": 0.005443444009870291, + "learning_rate": 4.094374936058303e-05, + "loss": 0.142, + "step": 48560 + }, + { + "epoch": 10.013053133293614, + "grad_norm": 0.8502508401870728, + "learning_rate": 4.0940740339537936e-05, + "loss": 0.1212, + "step": 48570 + }, + { + "epoch": 10.013107295672427, + "grad_norm": 2.0966076850891113, + "learning_rate": 4.093773131849284e-05, + "loss": 0.0273, + "step": 48580 + }, + { + "epoch": 10.013161458051238, + "grad_norm": 5.796025276184082, + "learning_rate": 4.0934722297447755e-05, + "loss": 0.1203, + "step": 48590 + }, + { + "epoch": 10.01321562043005, + "grad_norm": 0.5089326500892639, + "learning_rate": 4.0931713276402655e-05, + "loss": 0.0513, + "step": 48600 + }, + { + "epoch": 10.01326978280886, + "grad_norm": 0.013809886761009693, + "learning_rate": 4.092870425535756e-05, + "loss": 0.0441, + "step": 48610 + }, + { + "epoch": 10.013323945187672, + "grad_norm": 0.024201642721891403, + "learning_rate": 4.0925695234312474e-05, + "loss": 0.0451, + "step": 48620 + }, + { + "epoch": 10.013378107566485, + "grad_norm": 0.10463598370552063, + "learning_rate": 4.0922686213267374e-05, + "loss": 0.1016, + "step": 48630 + }, + { + "epoch": 10.013432269945296, + "grad_norm": 5.244426250457764, + "learning_rate": 4.091967719222228e-05, + "loss": 0.0914, + "step": 48640 + }, + { + "epoch": 10.013486432324108, + "grad_norm": 0.05622430518269539, + "learning_rate": 4.091666817117719e-05, + "loss": 0.0188, + "step": 48650 + }, + { + "epoch": 10.013540594702919, + "grad_norm": 0.10398077964782715, + "learning_rate": 4.09136591501321e-05, + "loss": 0.0039, + "step": 48660 + }, + { + "epoch": 10.013594757081732, + "grad_norm": 2.819870710372925, + "learning_rate": 4.0910650129087006e-05, + "loss": 0.0504, + "step": 48670 + }, + { + "epoch": 10.013648919460543, + "grad_norm": 0.003645712975412607, + "learning_rate": 4.090764110804191e-05, + "loss": 0.045, + "step": 48680 + }, + { + "epoch": 10.013703081839354, + "grad_norm": 0.6024671792984009, + "learning_rate": 4.090463208699682e-05, + "loss": 0.0321, + "step": 48690 + }, + { + "epoch": 10.013757244218166, + "grad_norm": 0.013776702806353569, + "learning_rate": 4.0901623065951725e-05, + "loss": 0.0796, + "step": 48700 + }, + { + "epoch": 10.013811406596977, + "grad_norm": 0.04654248058795929, + "learning_rate": 4.089861404490663e-05, + "loss": 0.0261, + "step": 48710 + }, + { + "epoch": 10.01386556897579, + "grad_norm": 0.05543602257966995, + "learning_rate": 4.089560502386154e-05, + "loss": 0.0645, + "step": 48720 + }, + { + "epoch": 10.013919731354601, + "grad_norm": 0.09526011347770691, + "learning_rate": 4.089259600281645e-05, + "loss": 0.0013, + "step": 48730 + }, + { + "epoch": 
10.013973893733413, + "grad_norm": 0.022010982036590576, + "learning_rate": 4.0889586981771356e-05, + "loss": 0.0044, + "step": 48740 + }, + { + "epoch": 10.014028056112224, + "grad_norm": 2.875633955001831, + "learning_rate": 4.0886577960726256e-05, + "loss": 0.1408, + "step": 48750 + }, + { + "epoch": 10.014082218491037, + "grad_norm": 0.2527202069759369, + "learning_rate": 4.088356893968117e-05, + "loss": 0.0041, + "step": 48760 + }, + { + "epoch": 10.014136380869848, + "grad_norm": 0.08106648176908493, + "learning_rate": 4.0880559918636075e-05, + "loss": 0.0831, + "step": 48770 + }, + { + "epoch": 10.01419054324866, + "grad_norm": 0.16235600411891937, + "learning_rate": 4.0877550897590975e-05, + "loss": 0.0932, + "step": 48780 + }, + { + "epoch": 10.01424470562747, + "grad_norm": 0.06908094882965088, + "learning_rate": 4.087454187654589e-05, + "loss": 0.1542, + "step": 48790 + }, + { + "epoch": 10.014298868006282, + "grad_norm": 10.337505340576172, + "learning_rate": 4.0871532855500794e-05, + "loss": 0.0837, + "step": 48800 + }, + { + "epoch": 10.014353030385095, + "grad_norm": 0.8141332268714905, + "learning_rate": 4.08685238344557e-05, + "loss": 0.0617, + "step": 48810 + }, + { + "epoch": 10.014407192763906, + "grad_norm": 0.006217394955456257, + "learning_rate": 4.0865514813410607e-05, + "loss": 0.0525, + "step": 48820 + }, + { + "epoch": 10.014461355142718, + "grad_norm": 0.0026550479233264923, + "learning_rate": 4.086250579236551e-05, + "loss": 0.0451, + "step": 48830 + }, + { + "epoch": 10.014515517521529, + "grad_norm": 3.895418167114258, + "learning_rate": 4.085949677132042e-05, + "loss": 0.0673, + "step": 48840 + }, + { + "epoch": 10.014569679900342, + "grad_norm": 2.2671470642089844, + "learning_rate": 4.085648775027533e-05, + "loss": 0.0786, + "step": 48850 + }, + { + "epoch": 10.014623842279153, + "grad_norm": 0.052950240671634674, + "learning_rate": 4.085347872923023e-05, + "loss": 0.0474, + "step": 48860 + }, + { + "epoch": 10.014678004657965, + "grad_norm": 0.07040934264659882, + "learning_rate": 4.085046970818514e-05, + "loss": 0.04, + "step": 48870 + }, + { + "epoch": 10.014732167036776, + "grad_norm": 1.9072537422180176, + "learning_rate": 4.084746068714005e-05, + "loss": 0.1552, + "step": 48880 + }, + { + "epoch": 10.014786329415587, + "grad_norm": 9.062711715698242, + "learning_rate": 4.084445166609496e-05, + "loss": 0.071, + "step": 48890 + }, + { + "epoch": 10.0148404917944, + "grad_norm": 0.008413503877818584, + "learning_rate": 4.084144264504986e-05, + "loss": 0.0406, + "step": 48900 + }, + { + "epoch": 10.014894654173212, + "grad_norm": 0.033535126596689224, + "learning_rate": 4.083843362400477e-05, + "loss": 0.0195, + "step": 48910 + }, + { + "epoch": 10.014948816552023, + "grad_norm": 3.97084379196167, + "learning_rate": 4.0835424602959676e-05, + "loss": 0.1133, + "step": 48920 + }, + { + "epoch": 10.015002978930834, + "grad_norm": 0.0056270151399075985, + "learning_rate": 4.083241558191458e-05, + "loss": 0.2441, + "step": 48930 + }, + { + "epoch": 10.015057141309647, + "grad_norm": 0.18806566298007965, + "learning_rate": 4.082940656086949e-05, + "loss": 0.0903, + "step": 48940 + }, + { + "epoch": 10.015111303688458, + "grad_norm": 1.82109797000885, + "learning_rate": 4.0826397539824395e-05, + "loss": 0.0578, + "step": 48950 + }, + { + "epoch": 10.01516546606727, + "grad_norm": 0.005544185638427734, + "learning_rate": 4.08233885187793e-05, + "loss": 0.0669, + "step": 48960 + }, + { + "epoch": 10.015219628446081, + "grad_norm": 0.23535628616809845, + 
"learning_rate": 4.082037949773421e-05, + "loss": 0.0057, + "step": 48970 + }, + { + "epoch": 10.015273790824892, + "grad_norm": 0.02922077104449272, + "learning_rate": 4.0817370476689114e-05, + "loss": 0.0423, + "step": 48980 + }, + { + "epoch": 10.015327953203705, + "grad_norm": 1.0765583515167236, + "learning_rate": 4.081436145564403e-05, + "loss": 0.0428, + "step": 48990 + }, + { + "epoch": 10.015382115582517, + "grad_norm": 0.0033637143205851316, + "learning_rate": 4.081135243459893e-05, + "loss": 0.1341, + "step": 49000 + }, + { + "epoch": 10.015436277961328, + "grad_norm": 1.9743785858154297, + "learning_rate": 4.080834341355383e-05, + "loss": 0.076, + "step": 49010 + }, + { + "epoch": 10.01549044034014, + "grad_norm": 0.591798722743988, + "learning_rate": 4.0805334392508746e-05, + "loss": 0.0706, + "step": 49020 + }, + { + "epoch": 10.01554460271895, + "grad_norm": 0.49939045310020447, + "learning_rate": 4.080232537146365e-05, + "loss": 0.053, + "step": 49030 + }, + { + "epoch": 10.015598765097764, + "grad_norm": 0.6584284901618958, + "learning_rate": 4.079931635041856e-05, + "loss": 0.1357, + "step": 49040 + }, + { + "epoch": 10.015652927476575, + "grad_norm": 1.0996339321136475, + "learning_rate": 4.0796307329373464e-05, + "loss": 0.0612, + "step": 49050 + }, + { + "epoch": 10.015707089855386, + "grad_norm": 0.011647461913526058, + "learning_rate": 4.079329830832837e-05, + "loss": 0.0885, + "step": 49060 + }, + { + "epoch": 10.015761252234197, + "grad_norm": 0.4268173277378082, + "learning_rate": 4.079028928728328e-05, + "loss": 0.0677, + "step": 49070 + }, + { + "epoch": 10.01581541461301, + "grad_norm": 0.014323830604553223, + "learning_rate": 4.078728026623818e-05, + "loss": 0.0126, + "step": 49080 + }, + { + "epoch": 10.015869576991822, + "grad_norm": 6.057983875274658, + "learning_rate": 4.078427124519309e-05, + "loss": 0.0307, + "step": 49090 + }, + { + "epoch": 10.015923739370633, + "grad_norm": 2.2261409759521484, + "learning_rate": 4.0781262224147996e-05, + "loss": 0.0478, + "step": 49100 + }, + { + "epoch": 10.015977901749444, + "grad_norm": 0.0015811447519809008, + "learning_rate": 4.077825320310291e-05, + "loss": 0.2768, + "step": 49110 + }, + { + "epoch": 10.016032064128256, + "grad_norm": 0.005121550057083368, + "learning_rate": 4.077524418205781e-05, + "loss": 0.075, + "step": 49120 + }, + { + "epoch": 10.016086226507069, + "grad_norm": 0.03575936332345009, + "learning_rate": 4.0772235161012715e-05, + "loss": 0.0088, + "step": 49130 + }, + { + "epoch": 10.01614038888588, + "grad_norm": 0.002633066149428487, + "learning_rate": 4.076922613996763e-05, + "loss": 0.0081, + "step": 49140 + }, + { + "epoch": 10.016194551264691, + "grad_norm": 0.04711422696709633, + "learning_rate": 4.0766217118922534e-05, + "loss": 0.0183, + "step": 49150 + }, + { + "epoch": 10.016248713643503, + "grad_norm": 0.020852793008089066, + "learning_rate": 4.0763208097877433e-05, + "loss": 0.091, + "step": 49160 + }, + { + "epoch": 10.016302876022316, + "grad_norm": 0.008428491652011871, + "learning_rate": 4.0760199076832347e-05, + "loss": 0.1069, + "step": 49170 + }, + { + "epoch": 10.016357038401127, + "grad_norm": 0.0766456201672554, + "learning_rate": 4.075719005578725e-05, + "loss": 0.0832, + "step": 49180 + }, + { + "epoch": 10.016411200779938, + "grad_norm": 0.3724918067455292, + "learning_rate": 4.075418103474216e-05, + "loss": 0.0055, + "step": 49190 + }, + { + "epoch": 10.01646536315875, + "grad_norm": 0.002385895000770688, + "learning_rate": 4.0751172013697065e-05, + "loss": 0.0489, + 
"step": 49200 + }, + { + "epoch": 10.01651952553756, + "grad_norm": 0.8162288069725037, + "learning_rate": 4.074816299265197e-05, + "loss": 0.15, + "step": 49210 + }, + { + "epoch": 10.016573687916374, + "grad_norm": 0.014117170125246048, + "learning_rate": 4.074515397160688e-05, + "loss": 0.0775, + "step": 49220 + }, + { + "epoch": 10.016627850295185, + "grad_norm": 0.4107426702976227, + "learning_rate": 4.0742144950561784e-05, + "loss": 0.0524, + "step": 49230 + }, + { + "epoch": 10.016682012673996, + "grad_norm": 4.342434406280518, + "learning_rate": 4.073913592951669e-05, + "loss": 0.0678, + "step": 49240 + }, + { + "epoch": 10.016736175052808, + "grad_norm": 0.3544078469276428, + "learning_rate": 4.0736126908471603e-05, + "loss": 0.1327, + "step": 49250 + }, + { + "epoch": 10.01679033743162, + "grad_norm": 0.0612882599234581, + "learning_rate": 4.073311788742651e-05, + "loss": 0.0108, + "step": 49260 + }, + { + "epoch": 10.016844499810432, + "grad_norm": 0.0017901004757732153, + "learning_rate": 4.073010886638141e-05, + "loss": 0.0758, + "step": 49270 + }, + { + "epoch": 10.016898662189243, + "grad_norm": 0.06534647941589355, + "learning_rate": 4.072709984533632e-05, + "loss": 0.0234, + "step": 49280 + }, + { + "epoch": 10.016952824568055, + "grad_norm": 0.0027164306957274675, + "learning_rate": 4.072409082429123e-05, + "loss": 0.0817, + "step": 49290 + }, + { + "epoch": 10.017006986946866, + "grad_norm": 0.030154960229992867, + "learning_rate": 4.0721081803246135e-05, + "loss": 0.0435, + "step": 49300 + }, + { + "epoch": 10.017061149325679, + "grad_norm": 0.0019939884077757597, + "learning_rate": 4.071807278220104e-05, + "loss": 0.0827, + "step": 49310 + }, + { + "epoch": 10.01711531170449, + "grad_norm": 2.3509418964385986, + "learning_rate": 4.071506376115595e-05, + "loss": 0.0157, + "step": 49320 + }, + { + "epoch": 10.017169474083301, + "grad_norm": 0.001941135386005044, + "learning_rate": 4.0712054740110854e-05, + "loss": 0.0106, + "step": 49330 + }, + { + "epoch": 10.017223636462113, + "grad_norm": 0.0022525957319885492, + "learning_rate": 4.070904571906577e-05, + "loss": 0.0036, + "step": 49340 + }, + { + "epoch": 10.017277798840926, + "grad_norm": 1.1974984407424927, + "learning_rate": 4.0706036698020666e-05, + "loss": 0.0843, + "step": 49350 + }, + { + "epoch": 10.017331961219737, + "grad_norm": 0.11737464368343353, + "learning_rate": 4.070302767697557e-05, + "loss": 0.077, + "step": 49360 + }, + { + "epoch": 10.017386123598548, + "grad_norm": 1.7210078239440918, + "learning_rate": 4.0700018655930486e-05, + "loss": 0.1365, + "step": 49370 + }, + { + "epoch": 10.01744028597736, + "grad_norm": 1.4985378980636597, + "learning_rate": 4.0697009634885385e-05, + "loss": 0.0548, + "step": 49380 + }, + { + "epoch": 10.017494448356171, + "grad_norm": 0.04647146537899971, + "learning_rate": 4.069400061384029e-05, + "loss": 0.056, + "step": 49390 + }, + { + "epoch": 10.017548610734984, + "grad_norm": 1.2010245323181152, + "learning_rate": 4.0690991592795204e-05, + "loss": 0.0999, + "step": 49400 + }, + { + "epoch": 10.017602773113795, + "grad_norm": 0.24550384283065796, + "learning_rate": 4.068798257175011e-05, + "loss": 0.0963, + "step": 49410 + }, + { + "epoch": 10.017656935492607, + "grad_norm": 0.06872320175170898, + "learning_rate": 4.068497355070501e-05, + "loss": 0.0692, + "step": 49420 + }, + { + "epoch": 10.017711097871418, + "grad_norm": 0.2598312199115753, + "learning_rate": 4.068196452965992e-05, + "loss": 0.0162, + "step": 49430 + }, + { + "epoch": 10.017765260250231, + 
"grad_norm": 3.6944379806518555, + "learning_rate": 4.067895550861483e-05, + "loss": 0.0338, + "step": 49440 + }, + { + "epoch": 10.017819422629042, + "grad_norm": 0.014063097536563873, + "learning_rate": 4.0675946487569736e-05, + "loss": 0.174, + "step": 49450 + }, + { + "epoch": 10.017873585007854, + "grad_norm": 0.16254960000514984, + "learning_rate": 4.067293746652464e-05, + "loss": 0.0081, + "step": 49460 + }, + { + "epoch": 10.017927747386665, + "grad_norm": 3.199373722076416, + "learning_rate": 4.066992844547955e-05, + "loss": 0.0915, + "step": 49470 + }, + { + "epoch": 10.017981909765476, + "grad_norm": 0.08431226760149002, + "learning_rate": 4.0666919424434455e-05, + "loss": 0.0603, + "step": 49480 + }, + { + "epoch": 10.01803607214429, + "grad_norm": 0.002008831826969981, + "learning_rate": 4.066391040338937e-05, + "loss": 0.1368, + "step": 49490 + }, + { + "epoch": 10.0180902345231, + "grad_norm": 0.15621599555015564, + "learning_rate": 4.066090138234427e-05, + "loss": 0.1492, + "step": 49500 + }, + { + "epoch": 10.018144396901912, + "grad_norm": 0.002244803588837385, + "learning_rate": 4.065789236129918e-05, + "loss": 0.0651, + "step": 49510 + }, + { + "epoch": 10.018198559280723, + "grad_norm": 3.364553928375244, + "learning_rate": 4.0654883340254086e-05, + "loss": 0.0976, + "step": 49520 + }, + { + "epoch": 10.018252721659536, + "grad_norm": 0.003533244365826249, + "learning_rate": 4.0651874319208986e-05, + "loss": 0.0555, + "step": 49530 + }, + { + "epoch": 10.018306884038347, + "grad_norm": 2.2137644290924072, + "learning_rate": 4.06488652981639e-05, + "loss": 0.0485, + "step": 49540 + }, + { + "epoch": 10.018361046417159, + "grad_norm": 0.11909620463848114, + "learning_rate": 4.0645856277118805e-05, + "loss": 0.0324, + "step": 49550 + }, + { + "epoch": 10.01841520879597, + "grad_norm": 0.004290079232305288, + "learning_rate": 4.064284725607371e-05, + "loss": 0.0987, + "step": 49560 + }, + { + "epoch": 10.018469371174781, + "grad_norm": 1.7092424631118774, + "learning_rate": 4.063983823502862e-05, + "loss": 0.1276, + "step": 49570 + }, + { + "epoch": 10.018523533553594, + "grad_norm": 1.5080595016479492, + "learning_rate": 4.0636829213983524e-05, + "loss": 0.1294, + "step": 49580 + }, + { + "epoch": 10.018577695932406, + "grad_norm": 0.29947417974472046, + "learning_rate": 4.063382019293843e-05, + "loss": 0.1667, + "step": 49590 + }, + { + "epoch": 10.018631858311217, + "grad_norm": 0.12344956398010254, + "learning_rate": 4.0630811171893343e-05, + "loss": 0.0259, + "step": 49600 + }, + { + "epoch": 10.018686020690028, + "grad_norm": 0.0367908775806427, + "learning_rate": 4.062780215084824e-05, + "loss": 0.024, + "step": 49610 + }, + { + "epoch": 10.018740183068841, + "grad_norm": 0.02425079420208931, + "learning_rate": 4.062479312980315e-05, + "loss": 0.0294, + "step": 49620 + }, + { + "epoch": 10.018794345447652, + "grad_norm": 3.0060203075408936, + "learning_rate": 4.062178410875806e-05, + "loss": 0.0499, + "step": 49630 + }, + { + "epoch": 10.018848507826464, + "grad_norm": 0.9236656427383423, + "learning_rate": 4.061877508771297e-05, + "loss": 0.0176, + "step": 49640 + }, + { + "epoch": 10.018902670205275, + "grad_norm": 0.6415642499923706, + "learning_rate": 4.061576606666787e-05, + "loss": 0.1275, + "step": 49650 + }, + { + "epoch": 10.018956832584086, + "grad_norm": 0.32716548442840576, + "learning_rate": 4.061275704562278e-05, + "loss": 0.0021, + "step": 49660 + }, + { + "epoch": 10.0190109949629, + "grad_norm": 0.0018276943592354655, + "learning_rate": 
4.060974802457769e-05, + "loss": 0.0568, + "step": 49670 + }, + { + "epoch": 10.01906515734171, + "grad_norm": 0.06457546353340149, + "learning_rate": 4.060673900353259e-05, + "loss": 0.0322, + "step": 49680 + }, + { + "epoch": 10.019119319720522, + "grad_norm": 0.028092626482248306, + "learning_rate": 4.06037299824875e-05, + "loss": 0.1202, + "step": 49690 + }, + { + "epoch": 10.019173482099333, + "grad_norm": 0.0019137212075293064, + "learning_rate": 4.0600720961442406e-05, + "loss": 0.0793, + "step": 49700 + }, + { + "epoch": 10.019227644478146, + "grad_norm": 0.0027676888275891542, + "learning_rate": 4.059771194039731e-05, + "loss": 0.0767, + "step": 49710 + }, + { + "epoch": 10.019281806856958, + "grad_norm": 0.726177990436554, + "learning_rate": 4.059470291935222e-05, + "loss": 0.0121, + "step": 49720 + }, + { + "epoch": 10.019335969235769, + "grad_norm": 3.332289457321167, + "learning_rate": 4.0591693898307125e-05, + "loss": 0.2179, + "step": 49730 + }, + { + "epoch": 10.01939013161458, + "grad_norm": 0.014621107839047909, + "learning_rate": 4.058868487726203e-05, + "loss": 0.0306, + "step": 49740 + }, + { + "epoch": 10.019444293993391, + "grad_norm": 0.7106156945228577, + "learning_rate": 4.0585675856216944e-05, + "loss": 0.1214, + "step": 49750 + }, + { + "epoch": 10.019498456372204, + "grad_norm": 7.642689228057861, + "learning_rate": 4.0582666835171844e-05, + "loss": 0.0665, + "step": 49760 + }, + { + "epoch": 10.019552618751016, + "grad_norm": 0.006402674131095409, + "learning_rate": 4.057965781412676e-05, + "loss": 0.1748, + "step": 49770 + }, + { + "epoch": 10.019606781129827, + "grad_norm": 0.1825428307056427, + "learning_rate": 4.057664879308166e-05, + "loss": 0.1444, + "step": 49780 + }, + { + "epoch": 10.019660943508638, + "grad_norm": 0.008628861978650093, + "learning_rate": 4.057363977203657e-05, + "loss": 0.1415, + "step": 49790 + }, + { + "epoch": 10.019715105887451, + "grad_norm": 0.07763411849737167, + "learning_rate": 4.0570630750991476e-05, + "loss": 0.0129, + "step": 49800 + }, + { + "epoch": 10.019769268266263, + "grad_norm": 0.12984521687030792, + "learning_rate": 4.056762172994638e-05, + "loss": 0.0749, + "step": 49810 + }, + { + "epoch": 10.019823430645074, + "grad_norm": 7.526409149169922, + "learning_rate": 4.056461270890129e-05, + "loss": 0.2732, + "step": 49820 + }, + { + "epoch": 10.019877593023885, + "grad_norm": 0.09108039736747742, + "learning_rate": 4.0561603687856195e-05, + "loss": 0.028, + "step": 49830 + }, + { + "epoch": 10.019931755402697, + "grad_norm": 0.0065018050372600555, + "learning_rate": 4.05585946668111e-05, + "loss": 0.0257, + "step": 49840 + }, + { + "epoch": 10.01998591778151, + "grad_norm": 2.3280367851257324, + "learning_rate": 4.055558564576601e-05, + "loss": 0.023, + "step": 49850 + }, + { + "epoch": 10.02004008016032, + "grad_norm": 0.022868754342198372, + "learning_rate": 4.055257662472092e-05, + "loss": 0.0485, + "step": 49860 + }, + { + "epoch": 10.020094242539132, + "grad_norm": 2.544497013092041, + "learning_rate": 4.054956760367582e-05, + "loss": 0.2365, + "step": 49870 + }, + { + "epoch": 10.020148404917943, + "grad_norm": 3.6118791103363037, + "learning_rate": 4.0546558582630726e-05, + "loss": 0.0805, + "step": 49880 + }, + { + "epoch": 10.020202567296757, + "grad_norm": 2.522512435913086, + "learning_rate": 4.054354956158564e-05, + "loss": 0.0878, + "step": 49890 + }, + { + "epoch": 10.020256729675568, + "grad_norm": 0.16849112510681152, + "learning_rate": 4.0540540540540545e-05, + "loss": 0.1227, + "step": 49900 + }, + 
{ + "epoch": 10.020310892054379, + "grad_norm": 0.37450864911079407, + "learning_rate": 4.0537531519495445e-05, + "loss": 0.0248, + "step": 49910 + }, + { + "epoch": 10.02036505443319, + "grad_norm": 5.67677640914917, + "learning_rate": 4.053452249845036e-05, + "loss": 0.0334, + "step": 49920 + }, + { + "epoch": 10.020419216812002, + "grad_norm": 0.014927553944289684, + "learning_rate": 4.0531513477405264e-05, + "loss": 0.0121, + "step": 49930 + }, + { + "epoch": 10.020473379190815, + "grad_norm": 0.1272651106119156, + "learning_rate": 4.052850445636017e-05, + "loss": 0.2062, + "step": 49940 + }, + { + "epoch": 10.020527541569626, + "grad_norm": 0.045924823731184006, + "learning_rate": 4.0525495435315077e-05, + "loss": 0.0852, + "step": 49950 + }, + { + "epoch": 10.020581703948437, + "grad_norm": 6.033161163330078, + "learning_rate": 4.052248641426998e-05, + "loss": 0.1465, + "step": 49960 + }, + { + "epoch": 10.020635866327249, + "grad_norm": 0.025782998651266098, + "learning_rate": 4.051947739322489e-05, + "loss": 0.0318, + "step": 49970 + }, + { + "epoch": 10.020690028706062, + "grad_norm": 0.3811088800430298, + "learning_rate": 4.0516468372179795e-05, + "loss": 0.0832, + "step": 49980 + }, + { + "epoch": 10.020744191084873, + "grad_norm": 0.6300578117370605, + "learning_rate": 4.05134593511347e-05, + "loss": 0.0838, + "step": 49990 + }, + { + "epoch": 10.020798353463684, + "grad_norm": 0.022780926898121834, + "learning_rate": 4.051045033008961e-05, + "loss": 0.0772, + "step": 50000 + }, + { + "epoch": 10.020852515842495, + "grad_norm": 0.05382383242249489, + "learning_rate": 4.050744130904452e-05, + "loss": 0.0458, + "step": 50010 + }, + { + "epoch": 10.020906678221307, + "grad_norm": 0.10528208315372467, + "learning_rate": 4.050443228799942e-05, + "loss": 0.1597, + "step": 50020 + }, + { + "epoch": 10.02096084060012, + "grad_norm": 0.006945919711142778, + "learning_rate": 4.0501423266954334e-05, + "loss": 0.032, + "step": 50030 + }, + { + "epoch": 10.021015002978931, + "grad_norm": 0.009389112703502178, + "learning_rate": 4.049841424590924e-05, + "loss": 0.007, + "step": 50040 + }, + { + "epoch": 10.021069165357742, + "grad_norm": 0.019663551822304726, + "learning_rate": 4.0495405224864146e-05, + "loss": 0.0348, + "step": 50050 + }, + { + "epoch": 10.021123327736554, + "grad_norm": 0.0049071405082941055, + "learning_rate": 4.049239620381905e-05, + "loss": 0.1147, + "step": 50060 + }, + { + "epoch": 10.021177490115367, + "grad_norm": 0.39649122953414917, + "learning_rate": 4.048938718277396e-05, + "loss": 0.0991, + "step": 50070 + }, + { + "epoch": 10.021231652494178, + "grad_norm": 0.21227023005485535, + "learning_rate": 4.0486378161728865e-05, + "loss": 0.0479, + "step": 50080 + }, + { + "epoch": 10.02128581487299, + "grad_norm": 0.08331764489412308, + "learning_rate": 4.048336914068377e-05, + "loss": 0.0494, + "step": 50090 + }, + { + "epoch": 10.0213399772518, + "grad_norm": 3.703216075897217, + "learning_rate": 4.048036011963868e-05, + "loss": 0.0817, + "step": 50100 + }, + { + "epoch": 10.021394139630612, + "grad_norm": 0.27193915843963623, + "learning_rate": 4.0477351098593584e-05, + "loss": 0.0148, + "step": 50110 + }, + { + "epoch": 10.021448302009425, + "grad_norm": 0.0055092559196054935, + "learning_rate": 4.04743420775485e-05, + "loss": 0.0863, + "step": 50120 + }, + { + "epoch": 10.021502464388236, + "grad_norm": 2.589442491531372, + "learning_rate": 4.0471333056503396e-05, + "loss": 0.0942, + "step": 50130 + }, + { + "epoch": 10.021556626767048, + "grad_norm": 
2.180060863494873, + "learning_rate": 4.04683240354583e-05, + "loss": 0.1359, + "step": 50140 + }, + { + "epoch": 10.021610789145859, + "grad_norm": 0.12292593717575073, + "learning_rate": 4.0465315014413216e-05, + "loss": 0.0366, + "step": 50150 + }, + { + "epoch": 10.021664951524672, + "grad_norm": 0.30599209666252136, + "learning_rate": 4.046230599336812e-05, + "loss": 0.0853, + "step": 50160 + }, + { + "epoch": 10.021719113903483, + "grad_norm": 4.593895435333252, + "learning_rate": 4.045929697232302e-05, + "loss": 0.1599, + "step": 50170 + }, + { + "epoch": 10.021773276282294, + "grad_norm": 0.07981256395578384, + "learning_rate": 4.0456287951277934e-05, + "loss": 0.0777, + "step": 50180 + }, + { + "epoch": 10.021827438661106, + "grad_norm": 2.306502103805542, + "learning_rate": 4.045327893023284e-05, + "loss": 0.0584, + "step": 50190 + }, + { + "epoch": 10.021881601039917, + "grad_norm": 1.9280561208724976, + "learning_rate": 4.045026990918775e-05, + "loss": 0.0494, + "step": 50200 + }, + { + "epoch": 10.02193576341873, + "grad_norm": 8.44832706451416, + "learning_rate": 4.044726088814265e-05, + "loss": 0.1232, + "step": 50210 + }, + { + "epoch": 10.021989925797541, + "grad_norm": 0.00333092431537807, + "learning_rate": 4.044425186709756e-05, + "loss": 0.039, + "step": 50220 + }, + { + "epoch": 10.022044088176353, + "grad_norm": 0.004839963745325804, + "learning_rate": 4.0441242846052466e-05, + "loss": 0.0444, + "step": 50230 + }, + { + "epoch": 10.022098250555164, + "grad_norm": 0.1908179670572281, + "learning_rate": 4.043823382500738e-05, + "loss": 0.0137, + "step": 50240 + }, + { + "epoch": 10.022152412933975, + "grad_norm": 8.64019775390625, + "learning_rate": 4.043522480396228e-05, + "loss": 0.023, + "step": 50250 + }, + { + "epoch": 10.022206575312788, + "grad_norm": 3.603058338165283, + "learning_rate": 4.0432215782917185e-05, + "loss": 0.0633, + "step": 50260 + }, + { + "epoch": 10.0222607376916, + "grad_norm": 0.10241689532995224, + "learning_rate": 4.04292067618721e-05, + "loss": 0.0536, + "step": 50270 + }, + { + "epoch": 10.02231490007041, + "grad_norm": 0.39221182465553284, + "learning_rate": 4.0426197740827e-05, + "loss": 0.0417, + "step": 50280 + }, + { + "epoch": 10.022369062449222, + "grad_norm": 0.02243003062903881, + "learning_rate": 4.042318871978191e-05, + "loss": 0.1443, + "step": 50290 + }, + { + "epoch": 10.022423224828035, + "grad_norm": 0.21002520620822906, + "learning_rate": 4.0420179698736817e-05, + "loss": 0.0827, + "step": 50300 + }, + { + "epoch": 10.022477387206846, + "grad_norm": 2.312347173690796, + "learning_rate": 4.041717067769172e-05, + "loss": 0.0617, + "step": 50310 + }, + { + "epoch": 10.022531549585658, + "grad_norm": 0.22625309228897095, + "learning_rate": 4.041416165664663e-05, + "loss": 0.0406, + "step": 50320 + }, + { + "epoch": 10.022585711964469, + "grad_norm": 0.06220250949263573, + "learning_rate": 4.0411152635601535e-05, + "loss": 0.007, + "step": 50330 + }, + { + "epoch": 10.02263987434328, + "grad_norm": 0.2538309693336487, + "learning_rate": 4.040814361455644e-05, + "loss": 0.0043, + "step": 50340 + }, + { + "epoch": 10.022694036722093, + "grad_norm": 0.02449863962829113, + "learning_rate": 4.040513459351135e-05, + "loss": 0.0716, + "step": 50350 + }, + { + "epoch": 10.022748199100905, + "grad_norm": 0.021721355617046356, + "learning_rate": 4.0402125572466254e-05, + "loss": 0.0026, + "step": 50360 + }, + { + "epoch": 10.022802361479716, + "grad_norm": 0.7437312602996826, + "learning_rate": 4.039911655142116e-05, + "loss": 0.0078, 
+ "step": 50370 + }, + { + "epoch": 10.022856523858527, + "grad_norm": 0.004593001678586006, + "learning_rate": 4.0396107530376074e-05, + "loss": 0.1467, + "step": 50380 + }, + { + "epoch": 10.02291068623734, + "grad_norm": 2.189980983734131, + "learning_rate": 4.039309850933098e-05, + "loss": 0.0966, + "step": 50390 + }, + { + "epoch": 10.022964848616152, + "grad_norm": 1.0710569620132446, + "learning_rate": 4.039008948828588e-05, + "loss": 0.1604, + "step": 50400 + }, + { + "epoch": 10.023019010994963, + "grad_norm": 0.15087834000587463, + "learning_rate": 4.038708046724079e-05, + "loss": 0.1093, + "step": 50410 + }, + { + "epoch": 10.023073173373774, + "grad_norm": 0.03215958550572395, + "learning_rate": 4.03840714461957e-05, + "loss": 0.0617, + "step": 50420 + }, + { + "epoch": 10.023127335752585, + "grad_norm": 0.1301591396331787, + "learning_rate": 4.03810624251506e-05, + "loss": 0.1648, + "step": 50430 + }, + { + "epoch": 10.023181498131398, + "grad_norm": 0.11133243888616562, + "learning_rate": 4.037805340410551e-05, + "loss": 0.1566, + "step": 50440 + }, + { + "epoch": 10.02323566051021, + "grad_norm": 0.9624205231666565, + "learning_rate": 4.037504438306042e-05, + "loss": 0.0315, + "step": 50450 + }, + { + "epoch": 10.023289822889021, + "grad_norm": 0.017259076237678528, + "learning_rate": 4.0372035362015324e-05, + "loss": 0.0753, + "step": 50460 + }, + { + "epoch": 10.023343985267832, + "grad_norm": 0.9234528541564941, + "learning_rate": 4.036902634097023e-05, + "loss": 0.0679, + "step": 50470 + }, + { + "epoch": 10.023398147646645, + "grad_norm": 7.415946006774902, + "learning_rate": 4.0366017319925136e-05, + "loss": 0.1707, + "step": 50480 + }, + { + "epoch": 10.023452310025457, + "grad_norm": 2.699087381362915, + "learning_rate": 4.036300829888004e-05, + "loss": 0.0753, + "step": 50490 + }, + { + "epoch": 10.023506472404268, + "grad_norm": 0.024260826408863068, + "learning_rate": 4.0359999277834956e-05, + "loss": 0.0495, + "step": 50500 + }, + { + "epoch": 10.02356063478308, + "grad_norm": 0.45467302203178406, + "learning_rate": 4.0356990256789855e-05, + "loss": 0.2074, + "step": 50510 + }, + { + "epoch": 10.02361479716189, + "grad_norm": 0.4678584337234497, + "learning_rate": 4.035398123574476e-05, + "loss": 0.1072, + "step": 50520 + }, + { + "epoch": 10.023668959540704, + "grad_norm": 0.41723212599754333, + "learning_rate": 4.0350972214699674e-05, + "loss": 0.0333, + "step": 50530 + }, + { + "epoch": 10.023723121919515, + "grad_norm": 1.6041773557662964, + "learning_rate": 4.034796319365458e-05, + "loss": 0.0789, + "step": 50540 + }, + { + "epoch": 10.023777284298326, + "grad_norm": 0.14680305123329163, + "learning_rate": 4.034495417260949e-05, + "loss": 0.0377, + "step": 50550 + }, + { + "epoch": 10.023831446677137, + "grad_norm": 0.04829634353518486, + "learning_rate": 4.034194515156439e-05, + "loss": 0.1301, + "step": 50560 + }, + { + "epoch": 10.02388560905595, + "grad_norm": 0.00358721986413002, + "learning_rate": 4.03389361305193e-05, + "loss": 0.0359, + "step": 50570 + }, + { + "epoch": 10.023939771434762, + "grad_norm": 0.30371853709220886, + "learning_rate": 4.0335927109474206e-05, + "loss": 0.0683, + "step": 50580 + }, + { + "epoch": 10.023993933813573, + "grad_norm": 0.14631915092468262, + "learning_rate": 4.033291808842911e-05, + "loss": 0.0545, + "step": 50590 + }, + { + "epoch": 10.024048096192384, + "grad_norm": 0.05995945259928703, + "learning_rate": 4.032990906738402e-05, + "loss": 0.1541, + "step": 50600 + }, + { + "epoch": 10.024102258571196, + 
"grad_norm": 1.5260201692581177, + "learning_rate": 4.0326900046338925e-05, + "loss": 0.0702, + "step": 50610 + }, + { + "epoch": 10.024156420950009, + "grad_norm": 0.185951367020607, + "learning_rate": 4.032389102529383e-05, + "loss": 0.1151, + "step": 50620 + }, + { + "epoch": 10.02421058332882, + "grad_norm": 0.04841591790318489, + "learning_rate": 4.032088200424874e-05, + "loss": 0.0616, + "step": 50630 + }, + { + "epoch": 10.024264745707631, + "grad_norm": 0.05505834147334099, + "learning_rate": 4.031787298320365e-05, + "loss": 0.0164, + "step": 50640 + }, + { + "epoch": 10.024318908086443, + "grad_norm": 0.1028497964143753, + "learning_rate": 4.0314863962158556e-05, + "loss": 0.0656, + "step": 50650 + }, + { + "epoch": 10.024373070465256, + "grad_norm": 0.23948128521442413, + "learning_rate": 4.0311854941113456e-05, + "loss": 0.1016, + "step": 50660 + }, + { + "epoch": 10.024427232844067, + "grad_norm": 1.3530036211013794, + "learning_rate": 4.030884592006837e-05, + "loss": 0.1085, + "step": 50670 + }, + { + "epoch": 10.024481395222878, + "grad_norm": 19.197324752807617, + "learning_rate": 4.0305836899023275e-05, + "loss": 0.1109, + "step": 50680 + }, + { + "epoch": 10.02453555760169, + "grad_norm": 1.3458722829818726, + "learning_rate": 4.030282787797818e-05, + "loss": 0.0644, + "step": 50690 + }, + { + "epoch": 10.0245897199805, + "grad_norm": 0.03339705243706703, + "learning_rate": 4.029981885693309e-05, + "loss": 0.1473, + "step": 50700 + }, + { + "epoch": 10.024643882359314, + "grad_norm": 1.7036327123641968, + "learning_rate": 4.0296809835887994e-05, + "loss": 0.0787, + "step": 50710 + }, + { + "epoch": 10.024698044738125, + "grad_norm": 0.4063590168952942, + "learning_rate": 4.02938008148429e-05, + "loss": 0.0147, + "step": 50720 + }, + { + "epoch": 10.024752207116936, + "grad_norm": 0.009174877777695656, + "learning_rate": 4.029079179379781e-05, + "loss": 0.1113, + "step": 50730 + }, + { + "epoch": 10.024806369495748, + "grad_norm": 0.0043348828330636024, + "learning_rate": 4.028778277275271e-05, + "loss": 0.0727, + "step": 50740 + }, + { + "epoch": 10.02486053187456, + "grad_norm": 0.003596716094762087, + "learning_rate": 4.028477375170762e-05, + "loss": 0.0035, + "step": 50750 + }, + { + "epoch": 10.024914694253372, + "grad_norm": 0.0027468486223369837, + "learning_rate": 4.028176473066253e-05, + "loss": 0.0632, + "step": 50760 + }, + { + "epoch": 10.024968856632183, + "grad_norm": 0.19920006394386292, + "learning_rate": 4.027875570961743e-05, + "loss": 0.0676, + "step": 50770 + }, + { + "epoch": 10.025001354059471, + "eval_accuracy": 0.8729588504245591, + "eval_loss": 0.4410460293292999, + "eval_runtime": 117.363, + "eval_samples_per_second": 26.09, + "eval_steps_per_second": 3.263, + "step": 50776 + }, + { + "epoch": 11.000021664951525, + "grad_norm": 0.09824378788471222, + "learning_rate": 4.027574668857234e-05, + "loss": 0.1791, + "step": 50780 + }, + { + "epoch": 11.000075827330337, + "grad_norm": 0.003252102294936776, + "learning_rate": 4.027273766752725e-05, + "loss": 0.0433, + "step": 50790 + }, + { + "epoch": 11.000129989709148, + "grad_norm": 2.799994945526123, + "learning_rate": 4.026972864648216e-05, + "loss": 0.0449, + "step": 50800 + }, + { + "epoch": 11.000184152087959, + "grad_norm": 0.024125155061483383, + "learning_rate": 4.0266719625437064e-05, + "loss": 0.0094, + "step": 50810 + }, + { + "epoch": 11.000238314466772, + "grad_norm": 0.02142154797911644, + "learning_rate": 4.026371060439197e-05, + "loss": 0.0215, + "step": 50820 + }, + { + "epoch": 
11.000292476845583, + "grad_norm": 0.7292550802230835, + "learning_rate": 4.0260701583346876e-05, + "loss": 0.1141, + "step": 50830 + }, + { + "epoch": 11.000346639224395, + "grad_norm": 4.307636260986328, + "learning_rate": 4.025769256230178e-05, + "loss": 0.2331, + "step": 50840 + }, + { + "epoch": 11.000400801603206, + "grad_norm": 0.4282737970352173, + "learning_rate": 4.025468354125669e-05, + "loss": 0.0347, + "step": 50850 + }, + { + "epoch": 11.000454963982017, + "grad_norm": 0.026575967669487, + "learning_rate": 4.0251674520211595e-05, + "loss": 0.0384, + "step": 50860 + }, + { + "epoch": 11.00050912636083, + "grad_norm": 0.0035994125064462423, + "learning_rate": 4.02486654991665e-05, + "loss": 0.0542, + "step": 50870 + }, + { + "epoch": 11.000563288739642, + "grad_norm": 0.4917740225791931, + "learning_rate": 4.024565647812141e-05, + "loss": 0.0737, + "step": 50880 + }, + { + "epoch": 11.000617451118453, + "grad_norm": 0.053789205849170685, + "learning_rate": 4.0242647457076314e-05, + "loss": 0.0442, + "step": 50890 + }, + { + "epoch": 11.000671613497264, + "grad_norm": 0.004132461734116077, + "learning_rate": 4.023963843603123e-05, + "loss": 0.1083, + "step": 50900 + }, + { + "epoch": 11.000725775876077, + "grad_norm": 3.3279504776000977, + "learning_rate": 4.023662941498613e-05, + "loss": 0.0735, + "step": 50910 + }, + { + "epoch": 11.000779938254889, + "grad_norm": 0.15383370220661163, + "learning_rate": 4.023362039394103e-05, + "loss": 0.0606, + "step": 50920 + }, + { + "epoch": 11.0008341006337, + "grad_norm": 0.23067085444927216, + "learning_rate": 4.0230611372895946e-05, + "loss": 0.0578, + "step": 50930 + }, + { + "epoch": 11.000888263012511, + "grad_norm": 0.4015330672264099, + "learning_rate": 4.022760235185085e-05, + "loss": 0.0041, + "step": 50940 + }, + { + "epoch": 11.000942425391322, + "grad_norm": 0.8150196075439453, + "learning_rate": 4.022459333080576e-05, + "loss": 0.04, + "step": 50950 + }, + { + "epoch": 11.000996587770135, + "grad_norm": 0.3331051468849182, + "learning_rate": 4.0221584309760665e-05, + "loss": 0.086, + "step": 50960 + }, + { + "epoch": 11.001050750148947, + "grad_norm": 0.10296165198087692, + "learning_rate": 4.021857528871557e-05, + "loss": 0.1131, + "step": 50970 + }, + { + "epoch": 11.001104912527758, + "grad_norm": 0.11021499335765839, + "learning_rate": 4.021556626767048e-05, + "loss": 0.0653, + "step": 50980 + }, + { + "epoch": 11.00115907490657, + "grad_norm": 0.002723643323406577, + "learning_rate": 4.021255724662539e-05, + "loss": 0.0862, + "step": 50990 + }, + { + "epoch": 11.001213237285382, + "grad_norm": 3.2350263595581055, + "learning_rate": 4.020954822558029e-05, + "loss": 0.0813, + "step": 51000 + }, + { + "epoch": 11.001267399664194, + "grad_norm": 3.3458549976348877, + "learning_rate": 4.0206539204535196e-05, + "loss": 0.075, + "step": 51010 + }, + { + "epoch": 11.001321562043005, + "grad_norm": 0.10157891362905502, + "learning_rate": 4.020353018349011e-05, + "loss": 0.0834, + "step": 51020 + }, + { + "epoch": 11.001375724421816, + "grad_norm": 2.1687135696411133, + "learning_rate": 4.020052116244501e-05, + "loss": 0.0884, + "step": 51030 + }, + { + "epoch": 11.001429886800628, + "grad_norm": 0.33839327096939087, + "learning_rate": 4.0197512141399915e-05, + "loss": 0.0301, + "step": 51040 + }, + { + "epoch": 11.00148404917944, + "grad_norm": 0.47227588295936584, + "learning_rate": 4.019450312035483e-05, + "loss": 0.0981, + "step": 51050 + }, + { + "epoch": 11.001538211558252, + "grad_norm": 0.927602231502533, + 
"learning_rate": 4.0191494099309734e-05, + "loss": 0.03, + "step": 51060 + }, + { + "epoch": 11.001592373937063, + "grad_norm": 2.8890938758850098, + "learning_rate": 4.018848507826464e-05, + "loss": 0.0572, + "step": 51070 + }, + { + "epoch": 11.001646536315874, + "grad_norm": 0.04626524820923805, + "learning_rate": 4.018547605721955e-05, + "loss": 0.0426, + "step": 51080 + }, + { + "epoch": 11.001700698694687, + "grad_norm": 0.05267039313912392, + "learning_rate": 4.018246703617445e-05, + "loss": 0.0482, + "step": 51090 + }, + { + "epoch": 11.001754861073499, + "grad_norm": 0.7513546347618103, + "learning_rate": 4.017945801512936e-05, + "loss": 0.0345, + "step": 51100 + }, + { + "epoch": 11.00180902345231, + "grad_norm": 0.1611502766609192, + "learning_rate": 4.0176448994084265e-05, + "loss": 0.0224, + "step": 51110 + }, + { + "epoch": 11.001863185831121, + "grad_norm": 0.0021808522287756205, + "learning_rate": 4.017343997303917e-05, + "loss": 0.0161, + "step": 51120 + }, + { + "epoch": 11.001917348209933, + "grad_norm": 0.7375098466873169, + "learning_rate": 4.017043095199408e-05, + "loss": 0.0822, + "step": 51130 + }, + { + "epoch": 11.001971510588746, + "grad_norm": 0.17642173171043396, + "learning_rate": 4.016742193094899e-05, + "loss": 0.0355, + "step": 51140 + }, + { + "epoch": 11.002025672967557, + "grad_norm": 1.7692643404006958, + "learning_rate": 4.016441290990389e-05, + "loss": 0.0455, + "step": 51150 + }, + { + "epoch": 11.002079835346368, + "grad_norm": 0.006207084283232689, + "learning_rate": 4.0161403888858804e-05, + "loss": 0.0058, + "step": 51160 + }, + { + "epoch": 11.00213399772518, + "grad_norm": 0.001840585027821362, + "learning_rate": 4.015839486781371e-05, + "loss": 0.041, + "step": 51170 + }, + { + "epoch": 11.002188160103993, + "grad_norm": 0.20525860786437988, + "learning_rate": 4.015538584676861e-05, + "loss": 0.1342, + "step": 51180 + }, + { + "epoch": 11.002242322482804, + "grad_norm": 0.031001821160316467, + "learning_rate": 4.015237682572352e-05, + "loss": 0.0416, + "step": 51190 + }, + { + "epoch": 11.002296484861615, + "grad_norm": 2.6194844245910645, + "learning_rate": 4.014936780467843e-05, + "loss": 0.2252, + "step": 51200 + }, + { + "epoch": 11.002350647240426, + "grad_norm": 0.002254023216664791, + "learning_rate": 4.0146358783633335e-05, + "loss": 0.1008, + "step": 51210 + }, + { + "epoch": 11.002404809619238, + "grad_norm": 0.002375541953369975, + "learning_rate": 4.014334976258824e-05, + "loss": 0.127, + "step": 51220 + }, + { + "epoch": 11.00245897199805, + "grad_norm": 0.6328808069229126, + "learning_rate": 4.014034074154315e-05, + "loss": 0.0779, + "step": 51230 + }, + { + "epoch": 11.002513134376862, + "grad_norm": 3.527773380279541, + "learning_rate": 4.0137331720498054e-05, + "loss": 0.0402, + "step": 51240 + }, + { + "epoch": 11.002567296755673, + "grad_norm": 0.003123653819784522, + "learning_rate": 4.013432269945297e-05, + "loss": 0.0363, + "step": 51250 + }, + { + "epoch": 11.002621459134485, + "grad_norm": 0.22607599198818207, + "learning_rate": 4.0131313678407866e-05, + "loss": 0.0438, + "step": 51260 + }, + { + "epoch": 11.002675621513298, + "grad_norm": 0.007674460299313068, + "learning_rate": 4.012830465736277e-05, + "loss": 0.1098, + "step": 51270 + }, + { + "epoch": 11.002729783892109, + "grad_norm": 0.05064313858747482, + "learning_rate": 4.0125295636317686e-05, + "loss": 0.1146, + "step": 51280 + }, + { + "epoch": 11.00278394627092, + "grad_norm": 0.24506054818630219, + "learning_rate": 4.012228661527259e-05, + "loss": 0.0586, + 
"step": 51290 + }, + { + "epoch": 11.002838108649732, + "grad_norm": 3.578449249267578, + "learning_rate": 4.011927759422749e-05, + "loss": 0.077, + "step": 51300 + }, + { + "epoch": 11.002892271028543, + "grad_norm": 0.005380630027502775, + "learning_rate": 4.0116268573182404e-05, + "loss": 0.1204, + "step": 51310 + }, + { + "epoch": 11.002946433407356, + "grad_norm": 0.14114144444465637, + "learning_rate": 4.011325955213731e-05, + "loss": 0.1723, + "step": 51320 + }, + { + "epoch": 11.003000595786167, + "grad_norm": 5.479059219360352, + "learning_rate": 4.011025053109222e-05, + "loss": 0.1712, + "step": 51330 + }, + { + "epoch": 11.003054758164978, + "grad_norm": 0.2525990307331085, + "learning_rate": 4.010724151004712e-05, + "loss": 0.0944, + "step": 51340 + }, + { + "epoch": 11.00310892054379, + "grad_norm": 0.03899822756648064, + "learning_rate": 4.010423248900203e-05, + "loss": 0.0085, + "step": 51350 + }, + { + "epoch": 11.003163082922603, + "grad_norm": 0.01750885881483555, + "learning_rate": 4.0101223467956936e-05, + "loss": 0.1172, + "step": 51360 + }, + { + "epoch": 11.003217245301414, + "grad_norm": 0.8440488576889038, + "learning_rate": 4.009821444691184e-05, + "loss": 0.0484, + "step": 51370 + }, + { + "epoch": 11.003271407680225, + "grad_norm": 1.4987514019012451, + "learning_rate": 4.009520542586675e-05, + "loss": 0.0716, + "step": 51380 + }, + { + "epoch": 11.003325570059037, + "grad_norm": 9.400592803955078, + "learning_rate": 4.0092196404821655e-05, + "loss": 0.1216, + "step": 51390 + }, + { + "epoch": 11.003379732437848, + "grad_norm": 2.164231061935425, + "learning_rate": 4.008918738377657e-05, + "loss": 0.0482, + "step": 51400 + }, + { + "epoch": 11.003433894816661, + "grad_norm": 1.345765471458435, + "learning_rate": 4.008617836273147e-05, + "loss": 0.0511, + "step": 51410 + }, + { + "epoch": 11.003488057195472, + "grad_norm": 0.003996038809418678, + "learning_rate": 4.008316934168638e-05, + "loss": 0.0475, + "step": 51420 + }, + { + "epoch": 11.003542219574284, + "grad_norm": 0.029927201569080353, + "learning_rate": 4.0080160320641287e-05, + "loss": 0.0762, + "step": 51430 + }, + { + "epoch": 11.003596381953095, + "grad_norm": 5.727200508117676, + "learning_rate": 4.007715129959619e-05, + "loss": 0.1191, + "step": 51440 + }, + { + "epoch": 11.003650544331908, + "grad_norm": 0.1573026329278946, + "learning_rate": 4.00741422785511e-05, + "loss": 0.0181, + "step": 51450 + }, + { + "epoch": 11.00370470671072, + "grad_norm": 0.1852237433195114, + "learning_rate": 4.0071133257506005e-05, + "loss": 0.0307, + "step": 51460 + }, + { + "epoch": 11.00375886908953, + "grad_norm": 0.20369574427604675, + "learning_rate": 4.006812423646091e-05, + "loss": 0.0381, + "step": 51470 + }, + { + "epoch": 11.003813031468342, + "grad_norm": 0.6637476682662964, + "learning_rate": 4.006511521541582e-05, + "loss": 0.0503, + "step": 51480 + }, + { + "epoch": 11.003867193847153, + "grad_norm": 0.0034167482517659664, + "learning_rate": 4.0062106194370724e-05, + "loss": 0.0101, + "step": 51490 + }, + { + "epoch": 11.003921356225966, + "grad_norm": 0.03425341844558716, + "learning_rate": 4.005909717332563e-05, + "loss": 0.061, + "step": 51500 + }, + { + "epoch": 11.003975518604777, + "grad_norm": 0.04155370593070984, + "learning_rate": 4.0056088152280544e-05, + "loss": 0.1406, + "step": 51510 + }, + { + "epoch": 11.004029680983589, + "grad_norm": 0.009944060817360878, + "learning_rate": 4.005307913123544e-05, + "loss": 0.0612, + "step": 51520 + }, + { + "epoch": 11.0040838433624, + "grad_norm": 
0.0895211398601532, + "learning_rate": 4.005007011019035e-05, + "loss": 0.0393, + "step": 51530 + }, + { + "epoch": 11.004138005741213, + "grad_norm": 0.05982888117432594, + "learning_rate": 4.004706108914526e-05, + "loss": 0.0428, + "step": 51540 + }, + { + "epoch": 11.004192168120024, + "grad_norm": 0.24427324533462524, + "learning_rate": 4.004405206810017e-05, + "loss": 0.0733, + "step": 51550 + }, + { + "epoch": 11.004246330498836, + "grad_norm": 0.1785033941268921, + "learning_rate": 4.004104304705507e-05, + "loss": 0.0376, + "step": 51560 + }, + { + "epoch": 11.004300492877647, + "grad_norm": 0.0024230023846030235, + "learning_rate": 4.003803402600998e-05, + "loss": 0.0241, + "step": 51570 + }, + { + "epoch": 11.004354655256458, + "grad_norm": 1.307356595993042, + "learning_rate": 4.003502500496489e-05, + "loss": 0.0459, + "step": 51580 + }, + { + "epoch": 11.004408817635271, + "grad_norm": 2.6487433910369873, + "learning_rate": 4.0032015983919794e-05, + "loss": 0.13, + "step": 51590 + }, + { + "epoch": 11.004462980014083, + "grad_norm": 0.6372749209403992, + "learning_rate": 4.00290069628747e-05, + "loss": 0.0785, + "step": 51600 + }, + { + "epoch": 11.004517142392894, + "grad_norm": 0.08896489441394806, + "learning_rate": 4.0025997941829606e-05, + "loss": 0.0184, + "step": 51610 + }, + { + "epoch": 11.004571304771705, + "grad_norm": 0.20163726806640625, + "learning_rate": 4.002298892078451e-05, + "loss": 0.0052, + "step": 51620 + }, + { + "epoch": 11.004625467150516, + "grad_norm": 0.6932989954948425, + "learning_rate": 4.001997989973942e-05, + "loss": 0.0064, + "step": 51630 + }, + { + "epoch": 11.00467962952933, + "grad_norm": 0.009715406224131584, + "learning_rate": 4.0016970878694325e-05, + "loss": 0.0028, + "step": 51640 + }, + { + "epoch": 11.00473379190814, + "grad_norm": 0.050229910761117935, + "learning_rate": 4.001396185764924e-05, + "loss": 0.0858, + "step": 51650 + }, + { + "epoch": 11.004787954286952, + "grad_norm": 1.6944022178649902, + "learning_rate": 4.0010952836604144e-05, + "loss": 0.1331, + "step": 51660 + }, + { + "epoch": 11.004842116665763, + "grad_norm": 0.17007513344287872, + "learning_rate": 4.0007943815559044e-05, + "loss": 0.0091, + "step": 51670 + }, + { + "epoch": 11.004896279044576, + "grad_norm": 0.03052065148949623, + "learning_rate": 4.000493479451396e-05, + "loss": 0.0556, + "step": 51680 + }, + { + "epoch": 11.004950441423388, + "grad_norm": 0.2385074943304062, + "learning_rate": 4.000192577346886e-05, + "loss": 0.0701, + "step": 51690 + }, + { + "epoch": 11.005004603802199, + "grad_norm": 0.16462232172489166, + "learning_rate": 3.999891675242377e-05, + "loss": 0.0025, + "step": 51700 + }, + { + "epoch": 11.00505876618101, + "grad_norm": 0.6967549324035645, + "learning_rate": 3.9995907731378676e-05, + "loss": 0.1251, + "step": 51710 + }, + { + "epoch": 11.005112928559821, + "grad_norm": 0.16840876638889313, + "learning_rate": 3.999289871033358e-05, + "loss": 0.0034, + "step": 51720 + }, + { + "epoch": 11.005167090938635, + "grad_norm": 0.27236202359199524, + "learning_rate": 3.998988968928849e-05, + "loss": 0.0829, + "step": 51730 + }, + { + "epoch": 11.005221253317446, + "grad_norm": 0.012058223597705364, + "learning_rate": 3.99868806682434e-05, + "loss": 0.094, + "step": 51740 + }, + { + "epoch": 11.005275415696257, + "grad_norm": 0.2565356492996216, + "learning_rate": 3.99838716471983e-05, + "loss": 0.0367, + "step": 51750 + }, + { + "epoch": 11.005329578075068, + "grad_norm": 0.5295913815498352, + "learning_rate": 3.998086262615321e-05, + 
"loss": 0.0705, + "step": 51760 + }, + { + "epoch": 11.005383740453881, + "grad_norm": 0.15927888453006744, + "learning_rate": 3.997785360510812e-05, + "loss": 0.0668, + "step": 51770 + }, + { + "epoch": 11.005437902832693, + "grad_norm": 0.0023638831917196512, + "learning_rate": 3.997484458406302e-05, + "loss": 0.0362, + "step": 51780 + }, + { + "epoch": 11.005492065211504, + "grad_norm": 0.18489177525043488, + "learning_rate": 3.9971835563017926e-05, + "loss": 0.063, + "step": 51790 + }, + { + "epoch": 11.005546227590315, + "grad_norm": 0.0017424989491701126, + "learning_rate": 3.996882654197284e-05, + "loss": 0.0768, + "step": 51800 + }, + { + "epoch": 11.005600389969127, + "grad_norm": 0.03263901546597481, + "learning_rate": 3.9965817520927745e-05, + "loss": 0.0255, + "step": 51810 + }, + { + "epoch": 11.00565455234794, + "grad_norm": 0.08267682045698166, + "learning_rate": 3.9962808499882645e-05, + "loss": 0.0343, + "step": 51820 + }, + { + "epoch": 11.005708714726751, + "grad_norm": 0.08107724785804749, + "learning_rate": 3.995979947883756e-05, + "loss": 0.1245, + "step": 51830 + }, + { + "epoch": 11.005762877105562, + "grad_norm": 0.002472478663548827, + "learning_rate": 3.9956790457792464e-05, + "loss": 0.0035, + "step": 51840 + }, + { + "epoch": 11.005817039484374, + "grad_norm": 2.627796173095703, + "learning_rate": 3.995378143674737e-05, + "loss": 0.1379, + "step": 51850 + }, + { + "epoch": 11.005871201863187, + "grad_norm": 5.110692977905273, + "learning_rate": 3.995077241570228e-05, + "loss": 0.0853, + "step": 51860 + }, + { + "epoch": 11.005925364241998, + "grad_norm": 0.010907129384577274, + "learning_rate": 3.994776339465718e-05, + "loss": 0.0305, + "step": 51870 + }, + { + "epoch": 11.00597952662081, + "grad_norm": 0.18140248954296112, + "learning_rate": 3.994475437361209e-05, + "loss": 0.043, + "step": 51880 + }, + { + "epoch": 11.00603368899962, + "grad_norm": 0.2724294364452362, + "learning_rate": 3.9941745352567e-05, + "loss": 0.0638, + "step": 51890 + }, + { + "epoch": 11.006087851378432, + "grad_norm": 5.358048915863037, + "learning_rate": 3.99387363315219e-05, + "loss": 0.0375, + "step": 51900 + }, + { + "epoch": 11.006142013757245, + "grad_norm": 0.1266140639781952, + "learning_rate": 3.9935727310476815e-05, + "loss": 0.0279, + "step": 51910 + }, + { + "epoch": 11.006196176136056, + "grad_norm": 0.004550204146653414, + "learning_rate": 3.993271828943172e-05, + "loss": 0.0092, + "step": 51920 + }, + { + "epoch": 11.006250338514867, + "grad_norm": 0.048486385494470596, + "learning_rate": 3.992970926838662e-05, + "loss": 0.1446, + "step": 51930 + }, + { + "epoch": 11.006304500893679, + "grad_norm": 0.00431641424074769, + "learning_rate": 3.9926700247341534e-05, + "loss": 0.0259, + "step": 51940 + }, + { + "epoch": 11.006358663272492, + "grad_norm": 0.0075751133263111115, + "learning_rate": 3.992369122629644e-05, + "loss": 0.162, + "step": 51950 + }, + { + "epoch": 11.006412825651303, + "grad_norm": 0.21019573509693146, + "learning_rate": 3.9920682205251346e-05, + "loss": 0.0225, + "step": 51960 + }, + { + "epoch": 11.006466988030114, + "grad_norm": 0.005932370200753212, + "learning_rate": 3.991767318420625e-05, + "loss": 0.0747, + "step": 51970 + }, + { + "epoch": 11.006521150408926, + "grad_norm": 0.01106444001197815, + "learning_rate": 3.991466416316116e-05, + "loss": 0.0348, + "step": 51980 + }, + { + "epoch": 11.006575312787737, + "grad_norm": 0.28837913274765015, + "learning_rate": 3.9911655142116065e-05, + "loss": 0.0403, + "step": 51990 + }, + { + "epoch": 
11.00662947516655, + "grad_norm": 0.4599083960056305, + "learning_rate": 3.990864612107098e-05, + "loss": 0.0832, + "step": 52000 + }, + { + "epoch": 11.006683637545361, + "grad_norm": 0.002757913898676634, + "learning_rate": 3.990563710002588e-05, + "loss": 0.0008, + "step": 52010 + }, + { + "epoch": 11.006737799924172, + "grad_norm": 0.4972393810749054, + "learning_rate": 3.9902628078980784e-05, + "loss": 0.0299, + "step": 52020 + }, + { + "epoch": 11.006791962302984, + "grad_norm": 4.596846580505371, + "learning_rate": 3.98996190579357e-05, + "loss": 0.0879, + "step": 52030 + }, + { + "epoch": 11.006846124681797, + "grad_norm": 0.011933870613574982, + "learning_rate": 3.98966100368906e-05, + "loss": 0.0141, + "step": 52040 + }, + { + "epoch": 11.006900287060608, + "grad_norm": 0.06356264650821686, + "learning_rate": 3.98936010158455e-05, + "loss": 0.0321, + "step": 52050 + }, + { + "epoch": 11.00695444943942, + "grad_norm": 0.05816834419965744, + "learning_rate": 3.9890591994800416e-05, + "loss": 0.0455, + "step": 52060 + }, + { + "epoch": 11.00700861181823, + "grad_norm": 0.015776189044117928, + "learning_rate": 3.988758297375532e-05, + "loss": 0.0348, + "step": 52070 + }, + { + "epoch": 11.007062774197042, + "grad_norm": 0.06608841568231583, + "learning_rate": 3.988457395271022e-05, + "loss": 0.0728, + "step": 52080 + }, + { + "epoch": 11.007116936575855, + "grad_norm": 0.001876529073342681, + "learning_rate": 3.9881564931665135e-05, + "loss": 0.1507, + "step": 52090 + }, + { + "epoch": 11.007171098954666, + "grad_norm": 0.056324552744627, + "learning_rate": 3.987855591062004e-05, + "loss": 0.0416, + "step": 52100 + }, + { + "epoch": 11.007225261333478, + "grad_norm": 0.007725932635366917, + "learning_rate": 3.987554688957495e-05, + "loss": 0.0245, + "step": 52110 + }, + { + "epoch": 11.007279423712289, + "grad_norm": 0.07142583280801773, + "learning_rate": 3.9872537868529853e-05, + "loss": 0.044, + "step": 52120 + }, + { + "epoch": 11.007333586091102, + "grad_norm": 0.049402397125959396, + "learning_rate": 3.986952884748476e-05, + "loss": 0.037, + "step": 52130 + }, + { + "epoch": 11.007387748469913, + "grad_norm": 0.6743549108505249, + "learning_rate": 3.9866519826439666e-05, + "loss": 0.041, + "step": 52140 + }, + { + "epoch": 11.007441910848724, + "grad_norm": 0.17244429886341095, + "learning_rate": 3.986351080539458e-05, + "loss": 0.0775, + "step": 52150 + }, + { + "epoch": 11.007496073227536, + "grad_norm": 0.026806343346834183, + "learning_rate": 3.986050178434948e-05, + "loss": 0.065, + "step": 52160 + }, + { + "epoch": 11.007550235606347, + "grad_norm": 1.853583574295044, + "learning_rate": 3.985749276330439e-05, + "loss": 0.186, + "step": 52170 + }, + { + "epoch": 11.00760439798516, + "grad_norm": 3.87713623046875, + "learning_rate": 3.98544837422593e-05, + "loss": 0.0357, + "step": 52180 + }, + { + "epoch": 11.007658560363971, + "grad_norm": 0.002484536962583661, + "learning_rate": 3.9851474721214204e-05, + "loss": 0.0597, + "step": 52190 + }, + { + "epoch": 11.007712722742783, + "grad_norm": 1.4792591333389282, + "learning_rate": 3.984846570016911e-05, + "loss": 0.0562, + "step": 52200 + }, + { + "epoch": 11.007766885121594, + "grad_norm": 6.323559284210205, + "learning_rate": 3.984545667912402e-05, + "loss": 0.073, + "step": 52210 + }, + { + "epoch": 11.007821047500407, + "grad_norm": 0.005276564974337816, + "learning_rate": 3.984244765807892e-05, + "loss": 0.0374, + "step": 52220 + }, + { + "epoch": 11.007875209879218, + "grad_norm": 6.198297500610352, + "learning_rate": 
3.983943863703383e-05, + "loss": 0.2268, + "step": 52230 + }, + { + "epoch": 11.00792937225803, + "grad_norm": 3.5001349449157715, + "learning_rate": 3.9836429615988735e-05, + "loss": 0.0478, + "step": 52240 + }, + { + "epoch": 11.00798353463684, + "grad_norm": 2.3731820583343506, + "learning_rate": 3.983342059494364e-05, + "loss": 0.1069, + "step": 52250 + }, + { + "epoch": 11.008037697015652, + "grad_norm": 0.8208462595939636, + "learning_rate": 3.9830411573898555e-05, + "loss": 0.1093, + "step": 52260 + }, + { + "epoch": 11.008091859394465, + "grad_norm": 0.2855898141860962, + "learning_rate": 3.9827402552853454e-05, + "loss": 0.0858, + "step": 52270 + }, + { + "epoch": 11.008146021773277, + "grad_norm": 0.0021286201663315296, + "learning_rate": 3.982439353180836e-05, + "loss": 0.0146, + "step": 52280 + }, + { + "epoch": 11.008200184152088, + "grad_norm": 0.05347241088747978, + "learning_rate": 3.9821384510763274e-05, + "loss": 0.0404, + "step": 52290 + }, + { + "epoch": 11.008254346530899, + "grad_norm": 0.6567937135696411, + "learning_rate": 3.981837548971818e-05, + "loss": 0.1018, + "step": 52300 + }, + { + "epoch": 11.008308508909712, + "grad_norm": 3.30086350440979, + "learning_rate": 3.981536646867308e-05, + "loss": 0.1323, + "step": 52310 + }, + { + "epoch": 11.008362671288523, + "grad_norm": 0.001825400977395475, + "learning_rate": 3.981235744762799e-05, + "loss": 0.035, + "step": 52320 + }, + { + "epoch": 11.008416833667335, + "grad_norm": 0.8916699886322021, + "learning_rate": 3.98093484265829e-05, + "loss": 0.0261, + "step": 52330 + }, + { + "epoch": 11.008470996046146, + "grad_norm": 0.004482999909669161, + "learning_rate": 3.9806339405537805e-05, + "loss": 0.0551, + "step": 52340 + }, + { + "epoch": 11.008525158424957, + "grad_norm": 0.004654277581721544, + "learning_rate": 3.980333038449271e-05, + "loss": 0.0247, + "step": 52350 + }, + { + "epoch": 11.00857932080377, + "grad_norm": 0.0018908655038103461, + "learning_rate": 3.980032136344762e-05, + "loss": 0.0841, + "step": 52360 + }, + { + "epoch": 11.008633483182582, + "grad_norm": 8.73569393157959, + "learning_rate": 3.9797312342402524e-05, + "loss": 0.1635, + "step": 52370 + }, + { + "epoch": 11.008687645561393, + "grad_norm": 0.5565881729125977, + "learning_rate": 3.979430332135743e-05, + "loss": 0.0706, + "step": 52380 + }, + { + "epoch": 11.008741807940204, + "grad_norm": 0.0893617570400238, + "learning_rate": 3.9791294300312336e-05, + "loss": 0.0384, + "step": 52390 + }, + { + "epoch": 11.008795970319017, + "grad_norm": 0.07455389201641083, + "learning_rate": 3.978828527926724e-05, + "loss": 0.0585, + "step": 52400 + }, + { + "epoch": 11.008850132697829, + "grad_norm": 0.18927039206027985, + "learning_rate": 3.9785276258222156e-05, + "loss": 0.1507, + "step": 52410 + }, + { + "epoch": 11.00890429507664, + "grad_norm": 0.5656338930130005, + "learning_rate": 3.9782267237177055e-05, + "loss": 0.0578, + "step": 52420 + }, + { + "epoch": 11.008958457455451, + "grad_norm": 0.003477222053334117, + "learning_rate": 3.977925821613197e-05, + "loss": 0.0743, + "step": 52430 + }, + { + "epoch": 11.009012619834262, + "grad_norm": 26.272859573364258, + "learning_rate": 3.9776249195086875e-05, + "loss": 0.05, + "step": 52440 + }, + { + "epoch": 11.009066782213075, + "grad_norm": 0.01173341739922762, + "learning_rate": 3.977324017404178e-05, + "loss": 0.0753, + "step": 52450 + }, + { + "epoch": 11.009120944591887, + "grad_norm": 0.4607820212841034, + "learning_rate": 3.977023115299669e-05, + "loss": 0.0076, + "step": 52460 + }, + { 
+ "epoch": 11.009175106970698, + "grad_norm": 0.11965042352676392, + "learning_rate": 3.976722213195159e-05, + "loss": 0.1407, + "step": 52470 + }, + { + "epoch": 11.00922926934951, + "grad_norm": 0.16001008450984955, + "learning_rate": 3.97642131109065e-05, + "loss": 0.0712, + "step": 52480 + }, + { + "epoch": 11.009283431728322, + "grad_norm": 0.3022940158843994, + "learning_rate": 3.9761204089861406e-05, + "loss": 0.1547, + "step": 52490 + }, + { + "epoch": 11.009337594107134, + "grad_norm": 0.04503113031387329, + "learning_rate": 3.975819506881631e-05, + "loss": 0.096, + "step": 52500 + }, + { + "epoch": 11.009391756485945, + "grad_norm": 0.004265111871063709, + "learning_rate": 3.975518604777122e-05, + "loss": 0.0821, + "step": 52510 + }, + { + "epoch": 11.009445918864756, + "grad_norm": 0.09352822601795197, + "learning_rate": 3.975217702672613e-05, + "loss": 0.0331, + "step": 52520 + }, + { + "epoch": 11.009500081243567, + "grad_norm": 0.623750627040863, + "learning_rate": 3.974916800568103e-05, + "loss": 0.1251, + "step": 52530 + }, + { + "epoch": 11.00955424362238, + "grad_norm": 0.6866241693496704, + "learning_rate": 3.974615898463594e-05, + "loss": 0.0422, + "step": 52540 + }, + { + "epoch": 11.009608406001192, + "grad_norm": 2.3677921295166016, + "learning_rate": 3.974314996359085e-05, + "loss": 0.0943, + "step": 52550 + }, + { + "epoch": 11.009662568380003, + "grad_norm": 0.11374738067388535, + "learning_rate": 3.9740140942545757e-05, + "loss": 0.124, + "step": 52560 + }, + { + "epoch": 11.009716730758814, + "grad_norm": 0.05435345694422722, + "learning_rate": 3.9737131921500656e-05, + "loss": 0.1527, + "step": 52570 + }, + { + "epoch": 11.009770893137627, + "grad_norm": 1.1152544021606445, + "learning_rate": 3.973412290045557e-05, + "loss": 0.1468, + "step": 52580 + }, + { + "epoch": 11.009825055516439, + "grad_norm": 0.4689778685569763, + "learning_rate": 3.9731113879410475e-05, + "loss": 0.0642, + "step": 52590 + }, + { + "epoch": 11.00987921789525, + "grad_norm": 2.6293959617614746, + "learning_rate": 3.972810485836538e-05, + "loss": 0.0405, + "step": 52600 + }, + { + "epoch": 11.009933380274061, + "grad_norm": 0.0020316478330641985, + "learning_rate": 3.972509583732029e-05, + "loss": 0.0589, + "step": 52610 + }, + { + "epoch": 11.009987542652873, + "grad_norm": 4.048202991485596, + "learning_rate": 3.9722086816275194e-05, + "loss": 0.079, + "step": 52620 + }, + { + "epoch": 11.010041705031686, + "grad_norm": 0.3555006682872772, + "learning_rate": 3.97190777952301e-05, + "loss": 0.0192, + "step": 52630 + }, + { + "epoch": 11.010095867410497, + "grad_norm": 0.017698638141155243, + "learning_rate": 3.9716068774185014e-05, + "loss": 0.0046, + "step": 52640 + }, + { + "epoch": 11.010150029789308, + "grad_norm": 0.005577436648309231, + "learning_rate": 3.971305975313991e-05, + "loss": 0.0514, + "step": 52650 + }, + { + "epoch": 11.01020419216812, + "grad_norm": 17.513460159301758, + "learning_rate": 3.971005073209482e-05, + "loss": 0.1376, + "step": 52660 + }, + { + "epoch": 11.010258354546933, + "grad_norm": 0.10386881232261658, + "learning_rate": 3.970704171104973e-05, + "loss": 0.0481, + "step": 52670 + }, + { + "epoch": 11.010312516925744, + "grad_norm": 0.0017838813364505768, + "learning_rate": 3.970403269000463e-05, + "loss": 0.0468, + "step": 52680 + }, + { + "epoch": 11.010366679304555, + "grad_norm": 1.7241969108581543, + "learning_rate": 3.9701023668959545e-05, + "loss": 0.0262, + "step": 52690 + }, + { + "epoch": 11.010420841683366, + "grad_norm": 0.10604594647884369, 
+ "learning_rate": 3.969801464791445e-05, + "loss": 0.0046, + "step": 52700 + }, + { + "epoch": 11.010475004062178, + "grad_norm": 0.0018172899726778269, + "learning_rate": 3.969500562686936e-05, + "loss": 0.0163, + "step": 52710 + }, + { + "epoch": 11.01052916644099, + "grad_norm": 0.083160899579525, + "learning_rate": 3.9691996605824264e-05, + "loss": 0.1142, + "step": 52720 + }, + { + "epoch": 11.010583328819802, + "grad_norm": 1.7244106531143188, + "learning_rate": 3.968898758477917e-05, + "loss": 0.1228, + "step": 52730 + }, + { + "epoch": 11.010637491198613, + "grad_norm": 0.05122191086411476, + "learning_rate": 3.9685978563734076e-05, + "loss": 0.0865, + "step": 52740 + }, + { + "epoch": 11.010691653577425, + "grad_norm": 0.05866334214806557, + "learning_rate": 3.968296954268898e-05, + "loss": 0.0528, + "step": 52750 + }, + { + "epoch": 11.010745815956236, + "grad_norm": 0.01404863316565752, + "learning_rate": 3.967996052164389e-05, + "loss": 0.0045, + "step": 52760 + }, + { + "epoch": 11.010799978335049, + "grad_norm": 0.3357931971549988, + "learning_rate": 3.9676951500598795e-05, + "loss": 0.0681, + "step": 52770 + }, + { + "epoch": 11.01085414071386, + "grad_norm": 2.9276137351989746, + "learning_rate": 3.967394247955371e-05, + "loss": 0.1051, + "step": 52780 + }, + { + "epoch": 11.010908303092672, + "grad_norm": 0.2545166611671448, + "learning_rate": 3.9670933458508614e-05, + "loss": 0.0128, + "step": 52790 + }, + { + "epoch": 11.010962465471483, + "grad_norm": 0.08438225835561752, + "learning_rate": 3.9667924437463514e-05, + "loss": 0.0112, + "step": 52800 + }, + { + "epoch": 11.011016627850296, + "grad_norm": 0.001734160934574902, + "learning_rate": 3.966491541641843e-05, + "loss": 0.1382, + "step": 52810 + }, + { + "epoch": 11.011070790229107, + "grad_norm": 0.22631904482841492, + "learning_rate": 3.966190639537333e-05, + "loss": 0.039, + "step": 52820 + }, + { + "epoch": 11.011124952607918, + "grad_norm": 0.009871766902506351, + "learning_rate": 3.965889737432823e-05, + "loss": 0.0886, + "step": 52830 + }, + { + "epoch": 11.01117911498673, + "grad_norm": 0.0032166773453354836, + "learning_rate": 3.9655888353283146e-05, + "loss": 0.1175, + "step": 52840 + }, + { + "epoch": 11.011233277365541, + "grad_norm": 0.002312248107045889, + "learning_rate": 3.965287933223805e-05, + "loss": 0.0266, + "step": 52850 + }, + { + "epoch": 11.011287439744354, + "grad_norm": 0.18052837252616882, + "learning_rate": 3.964987031119296e-05, + "loss": 0.0755, + "step": 52860 + }, + { + "epoch": 11.011341602123165, + "grad_norm": 0.07786682993173599, + "learning_rate": 3.9646861290147865e-05, + "loss": 0.0293, + "step": 52870 + }, + { + "epoch": 11.011395764501977, + "grad_norm": 2.8108394145965576, + "learning_rate": 3.964385226910277e-05, + "loss": 0.0688, + "step": 52880 + }, + { + "epoch": 11.011449926880788, + "grad_norm": 0.022442888468503952, + "learning_rate": 3.964084324805768e-05, + "loss": 0.0773, + "step": 52890 + }, + { + "epoch": 11.011504089259601, + "grad_norm": 0.002155882306396961, + "learning_rate": 3.963783422701259e-05, + "loss": 0.018, + "step": 52900 + }, + { + "epoch": 11.011558251638412, + "grad_norm": 2.396878957748413, + "learning_rate": 3.963482520596749e-05, + "loss": 0.0411, + "step": 52910 + }, + { + "epoch": 11.011612414017224, + "grad_norm": 2.317451000213623, + "learning_rate": 3.9631816184922396e-05, + "loss": 0.0948, + "step": 52920 + }, + { + "epoch": 11.011666576396035, + "grad_norm": 0.07505404204130173, + "learning_rate": 3.962880716387731e-05, + "loss": 
0.0345, + "step": 52930 + }, + { + "epoch": 11.011720738774846, + "grad_norm": 0.6227481365203857, + "learning_rate": 3.9625798142832215e-05, + "loss": 0.2007, + "step": 52940 + }, + { + "epoch": 11.01177490115366, + "grad_norm": 0.09713336080312729, + "learning_rate": 3.962278912178712e-05, + "loss": 0.0682, + "step": 52950 + }, + { + "epoch": 11.01182906353247, + "grad_norm": 0.00534628564491868, + "learning_rate": 3.961978010074203e-05, + "loss": 0.0663, + "step": 52960 + }, + { + "epoch": 11.011883225911282, + "grad_norm": 0.15777051448822021, + "learning_rate": 3.9616771079696934e-05, + "loss": 0.0089, + "step": 52970 + }, + { + "epoch": 11.011937388290093, + "grad_norm": 0.09446561336517334, + "learning_rate": 3.961376205865184e-05, + "loss": 0.0103, + "step": 52980 + }, + { + "epoch": 11.011991550668906, + "grad_norm": 0.013115441426634789, + "learning_rate": 3.961075303760675e-05, + "loss": 0.0036, + "step": 52990 + }, + { + "epoch": 11.012045713047717, + "grad_norm": 0.006545274518430233, + "learning_rate": 3.960774401656165e-05, + "loss": 0.0928, + "step": 53000 + }, + { + "epoch": 11.012099875426529, + "grad_norm": 1.907798171043396, + "learning_rate": 3.960473499551656e-05, + "loss": 0.1149, + "step": 53010 + }, + { + "epoch": 11.01215403780534, + "grad_norm": 0.2754882872104645, + "learning_rate": 3.9601725974471466e-05, + "loss": 0.0612, + "step": 53020 + }, + { + "epoch": 11.012208200184151, + "grad_norm": 4.525589466094971, + "learning_rate": 3.959871695342637e-05, + "loss": 0.1216, + "step": 53030 + }, + { + "epoch": 11.012262362562964, + "grad_norm": 0.0019479263573884964, + "learning_rate": 3.9595707932381285e-05, + "loss": 0.0127, + "step": 53040 + }, + { + "epoch": 11.012316524941776, + "grad_norm": 1.6697386503219604, + "learning_rate": 3.959269891133619e-05, + "loss": 0.1355, + "step": 53050 + }, + { + "epoch": 11.012370687320587, + "grad_norm": 0.004218722227960825, + "learning_rate": 3.958968989029109e-05, + "loss": 0.0222, + "step": 53060 + }, + { + "epoch": 11.012424849699398, + "grad_norm": 0.0019182641990482807, + "learning_rate": 3.9586680869246004e-05, + "loss": 0.153, + "step": 53070 + }, + { + "epoch": 11.012479012078211, + "grad_norm": 0.1443922072649002, + "learning_rate": 3.958367184820091e-05, + "loss": 0.0211, + "step": 53080 + }, + { + "epoch": 11.012533174457023, + "grad_norm": 0.02341126836836338, + "learning_rate": 3.9580662827155816e-05, + "loss": 0.0273, + "step": 53090 + }, + { + "epoch": 11.012587336835834, + "grad_norm": 0.011513935402035713, + "learning_rate": 3.957765380611072e-05, + "loss": 0.1327, + "step": 53100 + }, + { + "epoch": 11.012641499214645, + "grad_norm": 0.013326723128557205, + "learning_rate": 3.957464478506563e-05, + "loss": 0.0712, + "step": 53110 + }, + { + "epoch": 11.012695661593456, + "grad_norm": 2.4982757568359375, + "learning_rate": 3.9571635764020535e-05, + "loss": 0.1469, + "step": 53120 + }, + { + "epoch": 11.01274982397227, + "grad_norm": 0.4912095367908478, + "learning_rate": 3.956862674297544e-05, + "loss": 0.0151, + "step": 53130 + }, + { + "epoch": 11.01280398635108, + "grad_norm": 0.03220800310373306, + "learning_rate": 3.956561772193035e-05, + "loss": 0.0854, + "step": 53140 + }, + { + "epoch": 11.012858148729892, + "grad_norm": 0.4053754508495331, + "learning_rate": 3.9562608700885254e-05, + "loss": 0.0454, + "step": 53150 + }, + { + "epoch": 11.012912311108703, + "grad_norm": 0.04823783412575722, + "learning_rate": 3.955959967984017e-05, + "loss": 0.1476, + "step": 53160 + }, + { + "epoch": 
11.012966473487516, + "grad_norm": 0.5519379377365112, + "learning_rate": 3.9556590658795066e-05, + "loss": 0.0091, + "step": 53170 + }, + { + "epoch": 11.013020635866328, + "grad_norm": 0.004854319617152214, + "learning_rate": 3.955358163774997e-05, + "loss": 0.0685, + "step": 53180 + }, + { + "epoch": 11.013074798245139, + "grad_norm": 0.018282949924468994, + "learning_rate": 3.9550572616704886e-05, + "loss": 0.007, + "step": 53190 + }, + { + "epoch": 11.01312896062395, + "grad_norm": 0.07109515368938446, + "learning_rate": 3.954756359565979e-05, + "loss": 0.0019, + "step": 53200 + }, + { + "epoch": 11.013183123002761, + "grad_norm": 0.013940347358584404, + "learning_rate": 3.95445545746147e-05, + "loss": 0.0811, + "step": 53210 + }, + { + "epoch": 11.013237285381575, + "grad_norm": 0.06923342496156693, + "learning_rate": 3.9541545553569605e-05, + "loss": 0.0421, + "step": 53220 + }, + { + "epoch": 11.013291447760386, + "grad_norm": 0.022714413702487946, + "learning_rate": 3.953853653252451e-05, + "loss": 0.0149, + "step": 53230 + }, + { + "epoch": 11.013345610139197, + "grad_norm": 11.39561939239502, + "learning_rate": 3.953552751147942e-05, + "loss": 0.1153, + "step": 53240 + }, + { + "epoch": 11.013399772518008, + "grad_norm": 0.0018296890193596482, + "learning_rate": 3.9532518490434323e-05, + "loss": 0.1875, + "step": 53250 + }, + { + "epoch": 11.013453934896821, + "grad_norm": 0.005941553972661495, + "learning_rate": 3.952950946938923e-05, + "loss": 0.0295, + "step": 53260 + }, + { + "epoch": 11.013508097275633, + "grad_norm": 0.007333850022405386, + "learning_rate": 3.9526500448344136e-05, + "loss": 0.1558, + "step": 53270 + }, + { + "epoch": 11.013562259654444, + "grad_norm": 0.0017476726789027452, + "learning_rate": 3.952349142729904e-05, + "loss": 0.0391, + "step": 53280 + }, + { + "epoch": 11.013616422033255, + "grad_norm": 1.685943603515625, + "learning_rate": 3.952048240625395e-05, + "loss": 0.0324, + "step": 53290 + }, + { + "epoch": 11.013670584412067, + "grad_norm": 0.045395296066999435, + "learning_rate": 3.951747338520886e-05, + "loss": 0.1018, + "step": 53300 + }, + { + "epoch": 11.01372474679088, + "grad_norm": 0.32314756512641907, + "learning_rate": 3.951446436416377e-05, + "loss": 0.0944, + "step": 53310 + }, + { + "epoch": 11.013778909169691, + "grad_norm": 0.06010466814041138, + "learning_rate": 3.951145534311867e-05, + "loss": 0.0506, + "step": 53320 + }, + { + "epoch": 11.013833071548502, + "grad_norm": 9.42709732055664, + "learning_rate": 3.950844632207358e-05, + "loss": 0.0953, + "step": 53330 + }, + { + "epoch": 11.013887233927314, + "grad_norm": 0.002046035835519433, + "learning_rate": 3.950543730102849e-05, + "loss": 0.1699, + "step": 53340 + }, + { + "epoch": 11.013941396306127, + "grad_norm": 0.04443958401679993, + "learning_rate": 3.950242827998339e-05, + "loss": 0.0557, + "step": 53350 + }, + { + "epoch": 11.013995558684938, + "grad_norm": 0.13038615882396698, + "learning_rate": 3.94994192589383e-05, + "loss": 0.2221, + "step": 53360 + }, + { + "epoch": 11.01404972106375, + "grad_norm": 0.33697769045829773, + "learning_rate": 3.9496410237893205e-05, + "loss": 0.0627, + "step": 53370 + }, + { + "epoch": 11.01410388344256, + "grad_norm": 0.027842005714774132, + "learning_rate": 3.949340121684811e-05, + "loss": 0.0279, + "step": 53380 + }, + { + "epoch": 11.014158045821372, + "grad_norm": 1.035765528678894, + "learning_rate": 3.9490392195803025e-05, + "loss": 0.0907, + "step": 53390 + }, + { + "epoch": 11.014212208200185, + "grad_norm": 5.436892986297607, + 
"learning_rate": 3.9487383174757924e-05, + "loss": 0.0722, + "step": 53400 + }, + { + "epoch": 11.014266370578996, + "grad_norm": 1.4263745546340942, + "learning_rate": 3.948437415371283e-05, + "loss": 0.0552, + "step": 53410 + }, + { + "epoch": 11.014320532957807, + "grad_norm": 0.7639369964599609, + "learning_rate": 3.9481365132667744e-05, + "loss": 0.0192, + "step": 53420 + }, + { + "epoch": 11.014374695336619, + "grad_norm": 0.021959898993372917, + "learning_rate": 3.947835611162264e-05, + "loss": 0.0396, + "step": 53430 + }, + { + "epoch": 11.014428857715432, + "grad_norm": 0.1872323900461197, + "learning_rate": 3.947534709057755e-05, + "loss": 0.0095, + "step": 53440 + }, + { + "epoch": 11.014483020094243, + "grad_norm": 10.616137504577637, + "learning_rate": 3.947233806953246e-05, + "loss": 0.1294, + "step": 53450 + }, + { + "epoch": 11.014537182473054, + "grad_norm": 0.0025559309870004654, + "learning_rate": 3.946932904848737e-05, + "loss": 0.1672, + "step": 53460 + }, + { + "epoch": 11.014591344851866, + "grad_norm": 0.0028896424919366837, + "learning_rate": 3.9466320027442275e-05, + "loss": 0.1392, + "step": 53470 + }, + { + "epoch": 11.014645507230677, + "grad_norm": 0.20474186539649963, + "learning_rate": 3.946331100639718e-05, + "loss": 0.0677, + "step": 53480 + }, + { + "epoch": 11.01469966960949, + "grad_norm": 0.040628302842378616, + "learning_rate": 3.946030198535209e-05, + "loss": 0.1039, + "step": 53490 + }, + { + "epoch": 11.014753831988301, + "grad_norm": 0.3200567662715912, + "learning_rate": 3.9457292964306994e-05, + "loss": 0.0514, + "step": 53500 + }, + { + "epoch": 11.014807994367112, + "grad_norm": 0.3654983639717102, + "learning_rate": 3.94542839432619e-05, + "loss": 0.0374, + "step": 53510 + }, + { + "epoch": 11.014862156745924, + "grad_norm": 0.17201575636863708, + "learning_rate": 3.9451274922216806e-05, + "loss": 0.0532, + "step": 53520 + }, + { + "epoch": 11.014916319124737, + "grad_norm": 2.484048843383789, + "learning_rate": 3.944826590117171e-05, + "loss": 0.0465, + "step": 53530 + }, + { + "epoch": 11.014970481503548, + "grad_norm": 0.007185881957411766, + "learning_rate": 3.9445256880126626e-05, + "loss": 0.0487, + "step": 53540 + }, + { + "epoch": 11.01502464388236, + "grad_norm": 0.07524707168340683, + "learning_rate": 3.9442247859081525e-05, + "loss": 0.0588, + "step": 53550 + }, + { + "epoch": 11.01507880626117, + "grad_norm": 0.15789344906806946, + "learning_rate": 3.943923883803644e-05, + "loss": 0.0243, + "step": 53560 + }, + { + "epoch": 11.015132968639982, + "grad_norm": 0.016346881166100502, + "learning_rate": 3.9436229816991345e-05, + "loss": 0.159, + "step": 53570 + }, + { + "epoch": 11.015187131018795, + "grad_norm": 3.0110373497009277, + "learning_rate": 3.9433220795946244e-05, + "loss": 0.1953, + "step": 53580 + }, + { + "epoch": 11.015241293397606, + "grad_norm": 4.226968288421631, + "learning_rate": 3.943021177490116e-05, + "loss": 0.1118, + "step": 53590 + }, + { + "epoch": 11.015295455776418, + "grad_norm": 0.003489407477900386, + "learning_rate": 3.942720275385606e-05, + "loss": 0.1392, + "step": 53600 + }, + { + "epoch": 11.015349618155229, + "grad_norm": 4.439580917358398, + "learning_rate": 3.942419373281097e-05, + "loss": 0.045, + "step": 53610 + }, + { + "epoch": 11.015403780534042, + "grad_norm": 0.005514693446457386, + "learning_rate": 3.9421184711765876e-05, + "loss": 0.0154, + "step": 53620 + }, + { + "epoch": 11.015457942912853, + "grad_norm": 0.11619826406240463, + "learning_rate": 3.941817569072078e-05, + "loss": 0.0204, 
+ "step": 53630 + }, + { + "epoch": 11.015512105291664, + "grad_norm": 2.4950451850891113, + "learning_rate": 3.941516666967569e-05, + "loss": 0.2381, + "step": 53640 + }, + { + "epoch": 11.015566267670476, + "grad_norm": 2.2625911235809326, + "learning_rate": 3.94121576486306e-05, + "loss": 0.0464, + "step": 53650 + }, + { + "epoch": 11.015620430049287, + "grad_norm": 0.258502721786499, + "learning_rate": 3.94091486275855e-05, + "loss": 0.0819, + "step": 53660 + }, + { + "epoch": 11.0156745924281, + "grad_norm": 1.674443244934082, + "learning_rate": 3.940613960654041e-05, + "loss": 0.0715, + "step": 53670 + }, + { + "epoch": 11.015728754806911, + "grad_norm": 22.26910400390625, + "learning_rate": 3.940313058549532e-05, + "loss": 0.0905, + "step": 53680 + }, + { + "epoch": 11.015782917185723, + "grad_norm": 0.284634530544281, + "learning_rate": 3.9400121564450227e-05, + "loss": 0.0525, + "step": 53690 + }, + { + "epoch": 11.015837079564534, + "grad_norm": 0.5921415686607361, + "learning_rate": 3.9397112543405126e-05, + "loss": 0.0481, + "step": 53700 + }, + { + "epoch": 11.015891241943347, + "grad_norm": 0.08321887254714966, + "learning_rate": 3.939410352236004e-05, + "loss": 0.0551, + "step": 53710 + }, + { + "epoch": 11.015945404322158, + "grad_norm": 0.014783253893256187, + "learning_rate": 3.9391094501314945e-05, + "loss": 0.0494, + "step": 53720 + }, + { + "epoch": 11.01599956670097, + "grad_norm": 1.307984709739685, + "learning_rate": 3.938808548026985e-05, + "loss": 0.0408, + "step": 53730 + }, + { + "epoch": 11.01605372907978, + "grad_norm": 8.344470024108887, + "learning_rate": 3.938507645922476e-05, + "loss": 0.0615, + "step": 53740 + }, + { + "epoch": 11.016107891458592, + "grad_norm": 0.1119021400809288, + "learning_rate": 3.9382067438179664e-05, + "loss": 0.1375, + "step": 53750 + }, + { + "epoch": 11.016162053837405, + "grad_norm": 0.03775536268949509, + "learning_rate": 3.937905841713457e-05, + "loss": 0.0329, + "step": 53760 + }, + { + "epoch": 11.016216216216216, + "grad_norm": 0.7404919266700745, + "learning_rate": 3.937604939608948e-05, + "loss": 0.0302, + "step": 53770 + }, + { + "epoch": 11.016270378595028, + "grad_norm": 0.006996020674705505, + "learning_rate": 3.937304037504438e-05, + "loss": 0.0569, + "step": 53780 + }, + { + "epoch": 11.016324540973839, + "grad_norm": 0.002445648657158017, + "learning_rate": 3.937003135399929e-05, + "loss": 0.0671, + "step": 53790 + }, + { + "epoch": 11.016378703352652, + "grad_norm": 0.03295496106147766, + "learning_rate": 3.93670223329542e-05, + "loss": 0.0388, + "step": 53800 + }, + { + "epoch": 11.016432865731463, + "grad_norm": 0.15949882566928864, + "learning_rate": 3.93640133119091e-05, + "loss": 0.0388, + "step": 53810 + }, + { + "epoch": 11.016487028110275, + "grad_norm": 0.0029634679667651653, + "learning_rate": 3.9361004290864015e-05, + "loss": 0.0322, + "step": 53820 + }, + { + "epoch": 11.016541190489086, + "grad_norm": 1.686767816543579, + "learning_rate": 3.935799526981892e-05, + "loss": 0.03, + "step": 53830 + }, + { + "epoch": 11.016595352867897, + "grad_norm": 0.04941008985042572, + "learning_rate": 3.935498624877383e-05, + "loss": 0.0503, + "step": 53840 + }, + { + "epoch": 11.01664951524671, + "grad_norm": 0.19636055827140808, + "learning_rate": 3.9351977227728734e-05, + "loss": 0.0536, + "step": 53850 + }, + { + "epoch": 11.016703677625522, + "grad_norm": 0.002400684868916869, + "learning_rate": 3.934896820668364e-05, + "loss": 0.1204, + "step": 53860 + }, + { + "epoch": 11.016757840004333, + "grad_norm": 
0.002439529402181506, + "learning_rate": 3.9345959185638546e-05, + "loss": 0.0616, + "step": 53870 + }, + { + "epoch": 11.016812002383144, + "grad_norm": 0.006577162072062492, + "learning_rate": 3.934295016459345e-05, + "loss": 0.1583, + "step": 53880 + }, + { + "epoch": 11.016866164761955, + "grad_norm": 0.1685038059949875, + "learning_rate": 3.933994114354836e-05, + "loss": 0.01, + "step": 53890 + }, + { + "epoch": 11.016920327140769, + "grad_norm": 0.014141522347927094, + "learning_rate": 3.9336932122503265e-05, + "loss": 0.1318, + "step": 53900 + }, + { + "epoch": 11.01697448951958, + "grad_norm": 0.6843713521957397, + "learning_rate": 3.933392310145818e-05, + "loss": 0.074, + "step": 53910 + }, + { + "epoch": 11.017028651898391, + "grad_norm": 0.8329265713691711, + "learning_rate": 3.933091408041308e-05, + "loss": 0.008, + "step": 53920 + }, + { + "epoch": 11.017082814277202, + "grad_norm": 0.0028675473295152187, + "learning_rate": 3.9327905059367984e-05, + "loss": 0.0221, + "step": 53930 + }, + { + "epoch": 11.017136976656015, + "grad_norm": 1.9493027925491333, + "learning_rate": 3.93248960383229e-05, + "loss": 0.0587, + "step": 53940 + }, + { + "epoch": 11.017191139034827, + "grad_norm": 0.12319260090589523, + "learning_rate": 3.93218870172778e-05, + "loss": 0.124, + "step": 53950 + }, + { + "epoch": 11.017245301413638, + "grad_norm": 0.030824244022369385, + "learning_rate": 3.93188779962327e-05, + "loss": 0.0745, + "step": 53960 + }, + { + "epoch": 11.01729946379245, + "grad_norm": 0.0030284833628684282, + "learning_rate": 3.9315868975187616e-05, + "loss": 0.1374, + "step": 53970 + }, + { + "epoch": 11.01735362617126, + "grad_norm": 0.049599677324295044, + "learning_rate": 3.931285995414252e-05, + "loss": 0.0033, + "step": 53980 + }, + { + "epoch": 11.017407788550074, + "grad_norm": 0.4736911952495575, + "learning_rate": 3.930985093309743e-05, + "loss": 0.0325, + "step": 53990 + }, + { + "epoch": 11.017461950928885, + "grad_norm": 0.002570857061073184, + "learning_rate": 3.9306841912052335e-05, + "loss": 0.0747, + "step": 54000 + }, + { + "epoch": 11.017516113307696, + "grad_norm": 0.41239243745803833, + "learning_rate": 3.930383289100724e-05, + "loss": 0.0932, + "step": 54010 + }, + { + "epoch": 11.017570275686507, + "grad_norm": 0.5706081986427307, + "learning_rate": 3.930082386996215e-05, + "loss": 0.0068, + "step": 54020 + }, + { + "epoch": 11.01762443806532, + "grad_norm": 0.0022722259163856506, + "learning_rate": 3.9297814848917053e-05, + "loss": 0.1265, + "step": 54030 + }, + { + "epoch": 11.017678600444132, + "grad_norm": 0.059007205069065094, + "learning_rate": 3.929480582787196e-05, + "loss": 0.0901, + "step": 54040 + }, + { + "epoch": 11.017732762822943, + "grad_norm": 0.45051416754722595, + "learning_rate": 3.9291796806826866e-05, + "loss": 0.0469, + "step": 54050 + }, + { + "epoch": 11.017786925201754, + "grad_norm": 0.35539981722831726, + "learning_rate": 3.928878778578178e-05, + "loss": 0.0648, + "step": 54060 + }, + { + "epoch": 11.017841087580566, + "grad_norm": 0.19473238289356232, + "learning_rate": 3.928577876473668e-05, + "loss": 0.0828, + "step": 54070 + }, + { + "epoch": 11.017895249959379, + "grad_norm": 0.1774217039346695, + "learning_rate": 3.928276974369159e-05, + "loss": 0.0641, + "step": 54080 + }, + { + "epoch": 11.01794941233819, + "grad_norm": 0.0035952050238847733, + "learning_rate": 3.92797607226465e-05, + "loss": 0.0814, + "step": 54090 + }, + { + "epoch": 11.018003574717001, + "grad_norm": 2.5425198078155518, + "learning_rate": 
3.9276751701601404e-05, + "loss": 0.0578, + "step": 54100 + }, + { + "epoch": 11.018057737095813, + "grad_norm": 0.0027033579535782337, + "learning_rate": 3.927374268055631e-05, + "loss": 0.0648, + "step": 54110 + }, + { + "epoch": 11.018111899474626, + "grad_norm": 0.5011308789253235, + "learning_rate": 3.927073365951122e-05, + "loss": 0.0516, + "step": 54120 + }, + { + "epoch": 11.018166061853437, + "grad_norm": 0.5829753875732422, + "learning_rate": 3.926772463846612e-05, + "loss": 0.0242, + "step": 54130 + }, + { + "epoch": 11.018220224232248, + "grad_norm": 0.0025204108096659184, + "learning_rate": 3.9264715617421036e-05, + "loss": 0.0331, + "step": 54140 + }, + { + "epoch": 11.01827438661106, + "grad_norm": 0.23819896578788757, + "learning_rate": 3.9261706596375936e-05, + "loss": 0.0398, + "step": 54150 + }, + { + "epoch": 11.01832854898987, + "grad_norm": 0.007759080734103918, + "learning_rate": 3.925869757533084e-05, + "loss": 0.0509, + "step": 54160 + }, + { + "epoch": 11.018382711368684, + "grad_norm": 1.4921166896820068, + "learning_rate": 3.9255688554285755e-05, + "loss": 0.0623, + "step": 54170 + }, + { + "epoch": 11.018436873747495, + "grad_norm": 0.0016493565635755658, + "learning_rate": 3.9252679533240654e-05, + "loss": 0.0226, + "step": 54180 + }, + { + "epoch": 11.018491036126306, + "grad_norm": 0.003224099986255169, + "learning_rate": 3.924967051219556e-05, + "loss": 0.0379, + "step": 54190 + }, + { + "epoch": 11.018545198505118, + "grad_norm": 0.8071425557136536, + "learning_rate": 3.9246661491150474e-05, + "loss": 0.0706, + "step": 54200 + }, + { + "epoch": 11.01859936088393, + "grad_norm": 0.0019497666507959366, + "learning_rate": 3.924365247010538e-05, + "loss": 0.0299, + "step": 54210 + }, + { + "epoch": 11.018653523262742, + "grad_norm": 0.46146291494369507, + "learning_rate": 3.924064344906028e-05, + "loss": 0.2557, + "step": 54220 + }, + { + "epoch": 11.018707685641553, + "grad_norm": 0.33842095732688904, + "learning_rate": 3.923763442801519e-05, + "loss": 0.0912, + "step": 54230 + }, + { + "epoch": 11.018761848020365, + "grad_norm": 3.126516819000244, + "learning_rate": 3.92346254069701e-05, + "loss": 0.0906, + "step": 54240 + }, + { + "epoch": 11.018816010399176, + "grad_norm": 0.28408363461494446, + "learning_rate": 3.9231616385925005e-05, + "loss": 0.0392, + "step": 54250 + }, + { + "epoch": 11.018870172777989, + "grad_norm": 0.09306799620389938, + "learning_rate": 3.922860736487991e-05, + "loss": 0.1118, + "step": 54260 + }, + { + "epoch": 11.0189243351568, + "grad_norm": 8.530693054199219, + "learning_rate": 3.922559834383482e-05, + "loss": 0.124, + "step": 54270 + }, + { + "epoch": 11.018978497535612, + "grad_norm": 0.052759695798158646, + "learning_rate": 3.9222589322789724e-05, + "loss": 0.1115, + "step": 54280 + }, + { + "epoch": 11.019032659914423, + "grad_norm": 0.004049794748425484, + "learning_rate": 3.921958030174464e-05, + "loss": 0.039, + "step": 54290 + }, + { + "epoch": 11.019086822293236, + "grad_norm": 0.12743379175662994, + "learning_rate": 3.9216571280699536e-05, + "loss": 0.1274, + "step": 54300 + }, + { + "epoch": 11.019140984672047, + "grad_norm": 2.401773691177368, + "learning_rate": 3.921356225965444e-05, + "loss": 0.0601, + "step": 54310 + }, + { + "epoch": 11.019195147050858, + "grad_norm": 6.239317417144775, + "learning_rate": 3.9210553238609356e-05, + "loss": 0.1061, + "step": 54320 + }, + { + "epoch": 11.01924930942967, + "grad_norm": 1.1176725625991821, + "learning_rate": 3.9207544217564255e-05, + "loss": 0.0483, + "step": 54330 
+ }, + { + "epoch": 11.019303471808481, + "grad_norm": 0.0034144804812967777, + "learning_rate": 3.920453519651917e-05, + "loss": 0.1299, + "step": 54340 + }, + { + "epoch": 11.019357634187294, + "grad_norm": 0.00683739734813571, + "learning_rate": 3.9201526175474075e-05, + "loss": 0.1093, + "step": 54350 + }, + { + "epoch": 11.019411796566105, + "grad_norm": 4.609752655029297, + "learning_rate": 3.919851715442898e-05, + "loss": 0.1806, + "step": 54360 + }, + { + "epoch": 11.019465958944917, + "grad_norm": 0.3074767291545868, + "learning_rate": 3.919550813338389e-05, + "loss": 0.0513, + "step": 54370 + }, + { + "epoch": 11.019520121323728, + "grad_norm": 1.133823275566101, + "learning_rate": 3.9192499112338793e-05, + "loss": 0.0909, + "step": 54380 + }, + { + "epoch": 11.019574283702541, + "grad_norm": 0.3671674430370331, + "learning_rate": 3.91894900912937e-05, + "loss": 0.0202, + "step": 54390 + }, + { + "epoch": 11.019628446081352, + "grad_norm": 0.012207545340061188, + "learning_rate": 3.918648107024861e-05, + "loss": 0.0882, + "step": 54400 + }, + { + "epoch": 11.019682608460164, + "grad_norm": 0.04838712885975838, + "learning_rate": 3.918347204920351e-05, + "loss": 0.0452, + "step": 54410 + }, + { + "epoch": 11.019736770838975, + "grad_norm": 6.757564067840576, + "learning_rate": 3.918046302815842e-05, + "loss": 0.1079, + "step": 54420 + }, + { + "epoch": 11.019790933217786, + "grad_norm": 0.013720150105655193, + "learning_rate": 3.917745400711333e-05, + "loss": 0.0097, + "step": 54430 + }, + { + "epoch": 11.0198450955966, + "grad_norm": 0.026220010593533516, + "learning_rate": 3.917444498606824e-05, + "loss": 0.0152, + "step": 54440 + }, + { + "epoch": 11.01989925797541, + "grad_norm": 10.282723426818848, + "learning_rate": 3.917143596502314e-05, + "loss": 0.0536, + "step": 54450 + }, + { + "epoch": 11.019953420354222, + "grad_norm": 0.13745513558387756, + "learning_rate": 3.916842694397805e-05, + "loss": 0.0957, + "step": 54460 + }, + { + "epoch": 11.020007582733033, + "grad_norm": 0.01254974864423275, + "learning_rate": 3.916541792293296e-05, + "loss": 0.0206, + "step": 54470 + }, + { + "epoch": 11.020061745111846, + "grad_norm": 0.033167075365781784, + "learning_rate": 3.9162408901887856e-05, + "loss": 0.0862, + "step": 54480 + }, + { + "epoch": 11.020115907490657, + "grad_norm": 0.29792821407318115, + "learning_rate": 3.915939988084277e-05, + "loss": 0.1166, + "step": 54490 + }, + { + "epoch": 11.020170069869469, + "grad_norm": 0.437986820936203, + "learning_rate": 3.9156390859797676e-05, + "loss": 0.0539, + "step": 54500 + }, + { + "epoch": 11.02022423224828, + "grad_norm": 0.05003907531499863, + "learning_rate": 3.915338183875258e-05, + "loss": 0.1489, + "step": 54510 + }, + { + "epoch": 11.020278394627091, + "grad_norm": 1.249622106552124, + "learning_rate": 3.915037281770749e-05, + "loss": 0.1133, + "step": 54520 + }, + { + "epoch": 11.020332557005904, + "grad_norm": 0.16795100271701813, + "learning_rate": 3.9147363796662394e-05, + "loss": 0.0619, + "step": 54530 + }, + { + "epoch": 11.020386719384716, + "grad_norm": 0.005901938769966364, + "learning_rate": 3.91443547756173e-05, + "loss": 0.0395, + "step": 54540 + }, + { + "epoch": 11.020440881763527, + "grad_norm": 0.20587900280952454, + "learning_rate": 3.9141345754572214e-05, + "loss": 0.0679, + "step": 54550 + }, + { + "epoch": 11.020495044142338, + "grad_norm": 0.030430087819695473, + "learning_rate": 3.913833673352711e-05, + "loss": 0.0692, + "step": 54560 + }, + { + "epoch": 11.020549206521151, + "grad_norm": 
0.06412339955568314, + "learning_rate": 3.913532771248202e-05, + "loss": 0.0599, + "step": 54570 + }, + { + "epoch": 11.020603368899963, + "grad_norm": 3.459017038345337, + "learning_rate": 3.913231869143693e-05, + "loss": 0.1012, + "step": 54580 + }, + { + "epoch": 11.020657531278774, + "grad_norm": 0.2324037402868271, + "learning_rate": 3.912930967039184e-05, + "loss": 0.0427, + "step": 54590 + }, + { + "epoch": 11.020711693657585, + "grad_norm": 0.10759806632995605, + "learning_rate": 3.9126300649346745e-05, + "loss": 0.0683, + "step": 54600 + }, + { + "epoch": 11.020765856036396, + "grad_norm": 0.338003545999527, + "learning_rate": 3.912329162830165e-05, + "loss": 0.0157, + "step": 54610 + }, + { + "epoch": 11.02082001841521, + "grad_norm": 2.061779022216797, + "learning_rate": 3.912028260725656e-05, + "loss": 0.0273, + "step": 54620 + }, + { + "epoch": 11.02087418079402, + "grad_norm": 0.0058855521492660046, + "learning_rate": 3.9117273586211464e-05, + "loss": 0.0797, + "step": 54630 + }, + { + "epoch": 11.020928343172832, + "grad_norm": 0.07113871723413467, + "learning_rate": 3.911426456516637e-05, + "loss": 0.0762, + "step": 54640 + }, + { + "epoch": 11.020982505551643, + "grad_norm": 0.047400280833244324, + "learning_rate": 3.9111255544121276e-05, + "loss": 0.0541, + "step": 54650 + }, + { + "epoch": 11.021036667930456, + "grad_norm": 0.05375489220023155, + "learning_rate": 3.910824652307619e-05, + "loss": 0.0664, + "step": 54660 + }, + { + "epoch": 11.021090830309268, + "grad_norm": 0.003960954025387764, + "learning_rate": 3.910523750203109e-05, + "loss": 0.0558, + "step": 54670 + }, + { + "epoch": 11.021144992688079, + "grad_norm": 0.005626515485346317, + "learning_rate": 3.9102228480985995e-05, + "loss": 0.0769, + "step": 54680 + }, + { + "epoch": 11.02119915506689, + "grad_norm": 0.42901673913002014, + "learning_rate": 3.909921945994091e-05, + "loss": 0.0628, + "step": 54690 + }, + { + "epoch": 11.021253317445701, + "grad_norm": 0.022078711539506912, + "learning_rate": 3.9096210438895815e-05, + "loss": 0.0861, + "step": 54700 + }, + { + "epoch": 11.021307479824515, + "grad_norm": 6.464361190795898, + "learning_rate": 3.9093201417850714e-05, + "loss": 0.0879, + "step": 54710 + }, + { + "epoch": 11.021361642203326, + "grad_norm": 0.05138465389609337, + "learning_rate": 3.909019239680563e-05, + "loss": 0.131, + "step": 54720 + }, + { + "epoch": 11.021415804582137, + "grad_norm": 0.003568660467863083, + "learning_rate": 3.908718337576053e-05, + "loss": 0.0035, + "step": 54730 + }, + { + "epoch": 11.021469966960948, + "grad_norm": 0.08471604436635971, + "learning_rate": 3.908417435471543e-05, + "loss": 0.0627, + "step": 54740 + }, + { + "epoch": 11.021524129339761, + "grad_norm": 0.04296814650297165, + "learning_rate": 3.9081165333670346e-05, + "loss": 0.2003, + "step": 54750 + }, + { + "epoch": 11.021578291718573, + "grad_norm": 0.24642521142959595, + "learning_rate": 3.907815631262525e-05, + "loss": 0.0655, + "step": 54760 + }, + { + "epoch": 11.021632454097384, + "grad_norm": 0.06630987673997879, + "learning_rate": 3.907514729158016e-05, + "loss": 0.0324, + "step": 54770 + }, + { + "epoch": 11.021686616476195, + "grad_norm": 0.015641571953892708, + "learning_rate": 3.9072138270535065e-05, + "loss": 0.1167, + "step": 54780 + }, + { + "epoch": 11.021740778855007, + "grad_norm": 0.8007751107215881, + "learning_rate": 3.906912924948997e-05, + "loss": 0.0192, + "step": 54790 + }, + { + "epoch": 11.02179494123382, + "grad_norm": 0.02597595751285553, + "learning_rate": 
3.906612022844488e-05, + "loss": 0.0062, + "step": 54800 + }, + { + "epoch": 11.021849103612631, + "grad_norm": 0.0023136604577302933, + "learning_rate": 3.906311120739979e-05, + "loss": 0.0041, + "step": 54810 + }, + { + "epoch": 11.021903265991442, + "grad_norm": 0.014175334945321083, + "learning_rate": 3.906010218635469e-05, + "loss": 0.0795, + "step": 54820 + }, + { + "epoch": 11.021957428370253, + "grad_norm": 0.024130240082740784, + "learning_rate": 3.9057093165309596e-05, + "loss": 0.1161, + "step": 54830 + }, + { + "epoch": 11.022011590749067, + "grad_norm": 0.8564153909683228, + "learning_rate": 3.905408414426451e-05, + "loss": 0.0364, + "step": 54840 + }, + { + "epoch": 11.022065753127878, + "grad_norm": 0.08866216987371445, + "learning_rate": 3.9051075123219415e-05, + "loss": 0.1338, + "step": 54850 + }, + { + "epoch": 11.02211991550669, + "grad_norm": 0.17677521705627441, + "learning_rate": 3.904806610217432e-05, + "loss": 0.0252, + "step": 54860 + }, + { + "epoch": 11.0221740778855, + "grad_norm": 0.0024649780243635178, + "learning_rate": 3.904505708112923e-05, + "loss": 0.0271, + "step": 54870 + }, + { + "epoch": 11.022228240264312, + "grad_norm": 3.4466145038604736, + "learning_rate": 3.9042048060084134e-05, + "loss": 0.0949, + "step": 54880 + }, + { + "epoch": 11.022282402643125, + "grad_norm": 6.500361442565918, + "learning_rate": 3.903903903903904e-05, + "loss": 0.1397, + "step": 54890 + }, + { + "epoch": 11.022336565021936, + "grad_norm": 0.17708049714565277, + "learning_rate": 3.903603001799395e-05, + "loss": 0.0823, + "step": 54900 + }, + { + "epoch": 11.022390727400747, + "grad_norm": 0.7922281622886658, + "learning_rate": 3.903302099694885e-05, + "loss": 0.1103, + "step": 54910 + }, + { + "epoch": 11.022444889779559, + "grad_norm": 0.19143812358379364, + "learning_rate": 3.9030011975903766e-05, + "loss": 0.2079, + "step": 54920 + }, + { + "epoch": 11.022499052158372, + "grad_norm": 0.03402164578437805, + "learning_rate": 3.9027002954858666e-05, + "loss": 0.1688, + "step": 54930 + }, + { + "epoch": 11.022553214537183, + "grad_norm": 0.30268844962120056, + "learning_rate": 3.902399393381357e-05, + "loss": 0.0563, + "step": 54940 + }, + { + "epoch": 11.022607376915994, + "grad_norm": 0.006825704127550125, + "learning_rate": 3.9020984912768485e-05, + "loss": 0.0615, + "step": 54950 + }, + { + "epoch": 11.022661539294806, + "grad_norm": 0.8985452055931091, + "learning_rate": 3.901797589172339e-05, + "loss": 0.0558, + "step": 54960 + }, + { + "epoch": 11.022715701673617, + "grad_norm": 0.0029880329966545105, + "learning_rate": 3.901496687067829e-05, + "loss": 0.0388, + "step": 54970 + }, + { + "epoch": 11.02276986405243, + "grad_norm": 1.8556480407714844, + "learning_rate": 3.9011957849633204e-05, + "loss": 0.0336, + "step": 54980 + }, + { + "epoch": 11.022824026431241, + "grad_norm": 0.07185778021812439, + "learning_rate": 3.900894882858811e-05, + "loss": 0.08, + "step": 54990 + }, + { + "epoch": 11.022878188810052, + "grad_norm": 0.4958845376968384, + "learning_rate": 3.9005939807543016e-05, + "loss": 0.0703, + "step": 55000 + }, + { + "epoch": 11.022932351188864, + "grad_norm": 0.02341638319194317, + "learning_rate": 3.900293078649792e-05, + "loss": 0.0089, + "step": 55010 + }, + { + "epoch": 11.022986513567677, + "grad_norm": 0.43208691477775574, + "learning_rate": 3.899992176545283e-05, + "loss": 0.069, + "step": 55020 + }, + { + "epoch": 11.023040675946488, + "grad_norm": 0.25901198387145996, + "learning_rate": 3.8996912744407735e-05, + "loss": 0.0852, + "step": 
55030 + }, + { + "epoch": 11.0230948383253, + "grad_norm": 0.044111110270023346, + "learning_rate": 3.899390372336264e-05, + "loss": 0.0714, + "step": 55040 + }, + { + "epoch": 11.02314900070411, + "grad_norm": 0.16852636635303497, + "learning_rate": 3.899089470231755e-05, + "loss": 0.0816, + "step": 55050 + }, + { + "epoch": 11.023203163082922, + "grad_norm": 1.0539885759353638, + "learning_rate": 3.8987885681272454e-05, + "loss": 0.1434, + "step": 55060 + }, + { + "epoch": 11.023257325461735, + "grad_norm": 0.009286528453230858, + "learning_rate": 3.898487666022737e-05, + "loss": 0.0981, + "step": 55070 + }, + { + "epoch": 11.023311487840546, + "grad_norm": 4.070906639099121, + "learning_rate": 3.8981867639182267e-05, + "loss": 0.0872, + "step": 55080 + }, + { + "epoch": 11.023365650219358, + "grad_norm": 0.007816577330231667, + "learning_rate": 3.897885861813718e-05, + "loss": 0.1142, + "step": 55090 + }, + { + "epoch": 11.023419812598169, + "grad_norm": 0.022553281858563423, + "learning_rate": 3.8975849597092086e-05, + "loss": 0.0442, + "step": 55100 + }, + { + "epoch": 11.02347397497698, + "grad_norm": 4.413430213928223, + "learning_rate": 3.897284057604699e-05, + "loss": 0.0741, + "step": 55110 + }, + { + "epoch": 11.023528137355793, + "grad_norm": 0.028173336759209633, + "learning_rate": 3.89698315550019e-05, + "loss": 0.0111, + "step": 55120 + }, + { + "epoch": 11.023582299734604, + "grad_norm": 0.4079621732234955, + "learning_rate": 3.8966822533956805e-05, + "loss": 0.0725, + "step": 55130 + }, + { + "epoch": 11.023636462113416, + "grad_norm": 2.835272789001465, + "learning_rate": 3.896381351291171e-05, + "loss": 0.0667, + "step": 55140 + }, + { + "epoch": 11.023690624492227, + "grad_norm": 4.335601329803467, + "learning_rate": 3.896080449186662e-05, + "loss": 0.052, + "step": 55150 + }, + { + "epoch": 11.02374478687104, + "grad_norm": 1.0505143404006958, + "learning_rate": 3.8957795470821524e-05, + "loss": 0.0397, + "step": 55160 + }, + { + "epoch": 11.023798949249851, + "grad_norm": 0.08349642157554626, + "learning_rate": 3.895478644977643e-05, + "loss": 0.1001, + "step": 55170 + }, + { + "epoch": 11.023853111628663, + "grad_norm": 0.01324869692325592, + "learning_rate": 3.895177742873134e-05, + "loss": 0.0091, + "step": 55180 + }, + { + "epoch": 11.023907274007474, + "grad_norm": 1.294811487197876, + "learning_rate": 3.894876840768624e-05, + "loss": 0.0265, + "step": 55190 + }, + { + "epoch": 11.023961436386285, + "grad_norm": 0.06839901953935623, + "learning_rate": 3.894575938664115e-05, + "loss": 0.0722, + "step": 55200 + }, + { + "epoch": 11.024015598765098, + "grad_norm": 4.895219326019287, + "learning_rate": 3.894275036559606e-05, + "loss": 0.1098, + "step": 55210 + }, + { + "epoch": 11.02406976114391, + "grad_norm": 0.02583489939570427, + "learning_rate": 3.893974134455097e-05, + "loss": 0.0089, + "step": 55220 + }, + { + "epoch": 11.02412392352272, + "grad_norm": 3.2927520275115967, + "learning_rate": 3.893673232350587e-05, + "loss": 0.0306, + "step": 55230 + }, + { + "epoch": 11.024178085901532, + "grad_norm": 0.02771841175854206, + "learning_rate": 3.893372330246078e-05, + "loss": 0.005, + "step": 55240 + }, + { + "epoch": 11.024232248280345, + "grad_norm": 0.38728827238082886, + "learning_rate": 3.893071428141569e-05, + "loss": 0.1687, + "step": 55250 + }, + { + "epoch": 11.024286410659156, + "grad_norm": 0.008784724399447441, + "learning_rate": 3.892770526037059e-05, + "loss": 0.0396, + "step": 55260 + }, + { + "epoch": 11.024340573037968, + "grad_norm": 
0.038386233150959015, + "learning_rate": 3.89246962393255e-05, + "loss": 0.0247, + "step": 55270 + }, + { + "epoch": 11.024394735416779, + "grad_norm": 0.040568750351667404, + "learning_rate": 3.8921687218280406e-05, + "loss": 0.0028, + "step": 55280 + }, + { + "epoch": 11.02444889779559, + "grad_norm": 0.0026181850116699934, + "learning_rate": 3.891867819723531e-05, + "loss": 0.0585, + "step": 55290 + }, + { + "epoch": 11.024503060174403, + "grad_norm": 0.002419266849756241, + "learning_rate": 3.8915669176190225e-05, + "loss": 0.186, + "step": 55300 + }, + { + "epoch": 11.024557222553215, + "grad_norm": 0.025561917573213577, + "learning_rate": 3.8912660155145124e-05, + "loss": 0.0895, + "step": 55310 + }, + { + "epoch": 11.024611384932026, + "grad_norm": 0.37550804018974304, + "learning_rate": 3.890965113410003e-05, + "loss": 0.0769, + "step": 55320 + }, + { + "epoch": 11.024665547310837, + "grad_norm": 0.028525473549962044, + "learning_rate": 3.8906642113054944e-05, + "loss": 0.0522, + "step": 55330 + }, + { + "epoch": 11.02471970968965, + "grad_norm": 3.2874367237091064, + "learning_rate": 3.890363309200984e-05, + "loss": 0.0967, + "step": 55340 + }, + { + "epoch": 11.024773872068462, + "grad_norm": 0.01069074310362339, + "learning_rate": 3.8900624070964756e-05, + "loss": 0.0452, + "step": 55350 + }, + { + "epoch": 11.024828034447273, + "grad_norm": 0.06622570008039474, + "learning_rate": 3.889761504991966e-05, + "loss": 0.1431, + "step": 55360 + }, + { + "epoch": 11.024882196826084, + "grad_norm": 0.08823999017477036, + "learning_rate": 3.889460602887457e-05, + "loss": 0.0053, + "step": 55370 + }, + { + "epoch": 11.024936359204895, + "grad_norm": 0.057463765144348145, + "learning_rate": 3.8891597007829475e-05, + "loss": 0.0337, + "step": 55380 + }, + { + "epoch": 11.024990521583709, + "grad_norm": 0.009358290582895279, + "learning_rate": 3.888858798678438e-05, + "loss": 0.019, + "step": 55390 + }, + { + "epoch": 11.025001354059471, + "eval_accuracy": 0.813847158719791, + "eval_loss": 0.7983429431915283, + "eval_runtime": 115.7343, + "eval_samples_per_second": 26.457, + "eval_steps_per_second": 3.309, + "step": 55392 + }, + { + "epoch": 12.000043329903049, + "grad_norm": 2.9739983081817627, + "learning_rate": 3.888557896573929e-05, + "loss": 0.0862, + "step": 55400 + }, + { + "epoch": 12.000097492281862, + "grad_norm": 0.007043239660561085, + "learning_rate": 3.8882569944694194e-05, + "loss": 0.1014, + "step": 55410 + }, + { + "epoch": 12.000151654660673, + "grad_norm": 0.035893380641937256, + "learning_rate": 3.88795609236491e-05, + "loss": 0.0134, + "step": 55420 + }, + { + "epoch": 12.000205817039484, + "grad_norm": 0.0914359912276268, + "learning_rate": 3.8876551902604006e-05, + "loss": 0.0579, + "step": 55430 + }, + { + "epoch": 12.000259979418296, + "grad_norm": 0.8219583630561829, + "learning_rate": 3.887354288155892e-05, + "loss": 0.033, + "step": 55440 + }, + { + "epoch": 12.000314141797107, + "grad_norm": 0.003915607463568449, + "learning_rate": 3.8870533860513826e-05, + "loss": 0.0759, + "step": 55450 + }, + { + "epoch": 12.00036830417592, + "grad_norm": 2.479973793029785, + "learning_rate": 3.8867524839468725e-05, + "loss": 0.1081, + "step": 55460 + }, + { + "epoch": 12.000422466554731, + "grad_norm": 0.07215862721204758, + "learning_rate": 3.886451581842364e-05, + "loss": 0.0788, + "step": 55470 + }, + { + "epoch": 12.000476628933543, + "grad_norm": 0.032415665686130524, + "learning_rate": 3.8861506797378545e-05, + "loss": 0.0487, + "step": 55480 + }, + { + "epoch": 
12.000530791312354, + "grad_norm": 0.0731772854924202, + "learning_rate": 3.8858497776333444e-05, + "loss": 0.0082, + "step": 55490 + }, + { + "epoch": 12.000584953691167, + "grad_norm": 0.07334107905626297, + "learning_rate": 3.885548875528836e-05, + "loss": 0.0689, + "step": 55500 + }, + { + "epoch": 12.000639116069978, + "grad_norm": 0.050965774804353714, + "learning_rate": 3.8852479734243263e-05, + "loss": 0.01, + "step": 55510 + }, + { + "epoch": 12.00069327844879, + "grad_norm": 0.002281946362927556, + "learning_rate": 3.884947071319817e-05, + "loss": 0.0078, + "step": 55520 + }, + { + "epoch": 12.0007474408276, + "grad_norm": 0.09916464239358902, + "learning_rate": 3.8846461692153076e-05, + "loss": 0.0039, + "step": 55530 + }, + { + "epoch": 12.000801603206412, + "grad_norm": 1.452404499053955, + "learning_rate": 3.884345267110798e-05, + "loss": 0.1627, + "step": 55540 + }, + { + "epoch": 12.000855765585225, + "grad_norm": 0.006693873088806868, + "learning_rate": 3.884044365006289e-05, + "loss": 0.0404, + "step": 55550 + }, + { + "epoch": 12.000909927964036, + "grad_norm": 0.0657752975821495, + "learning_rate": 3.88374346290178e-05, + "loss": 0.0569, + "step": 55560 + }, + { + "epoch": 12.000964090342848, + "grad_norm": 0.045311372727155685, + "learning_rate": 3.88344256079727e-05, + "loss": 0.024, + "step": 55570 + }, + { + "epoch": 12.001018252721659, + "grad_norm": 0.002018796280026436, + "learning_rate": 3.883141658692761e-05, + "loss": 0.0078, + "step": 55580 + }, + { + "epoch": 12.001072415100472, + "grad_norm": 0.009723776951432228, + "learning_rate": 3.882840756588252e-05, + "loss": 0.0616, + "step": 55590 + }, + { + "epoch": 12.001126577479283, + "grad_norm": 0.050287432968616486, + "learning_rate": 3.882539854483743e-05, + "loss": 0.036, + "step": 55600 + }, + { + "epoch": 12.001180739858095, + "grad_norm": 0.3954840898513794, + "learning_rate": 3.882238952379233e-05, + "loss": 0.0651, + "step": 55610 + }, + { + "epoch": 12.001234902236906, + "grad_norm": 0.002940947888419032, + "learning_rate": 3.881938050274724e-05, + "loss": 0.0849, + "step": 55620 + }, + { + "epoch": 12.001289064615717, + "grad_norm": 0.1567300409078598, + "learning_rate": 3.8816371481702146e-05, + "loss": 0.0104, + "step": 55630 + }, + { + "epoch": 12.00134322699453, + "grad_norm": 0.9998345971107483, + "learning_rate": 3.881336246065705e-05, + "loss": 0.1451, + "step": 55640 + }, + { + "epoch": 12.001397389373341, + "grad_norm": 0.0023947192821651697, + "learning_rate": 3.881035343961196e-05, + "loss": 0.0134, + "step": 55650 + }, + { + "epoch": 12.001451551752153, + "grad_norm": 5.9732842445373535, + "learning_rate": 3.8807344418566864e-05, + "loss": 0.2801, + "step": 55660 + }, + { + "epoch": 12.001505714130964, + "grad_norm": 0.034055065363645554, + "learning_rate": 3.880433539752177e-05, + "loss": 0.0051, + "step": 55670 + }, + { + "epoch": 12.001559876509777, + "grad_norm": 1.4728646278381348, + "learning_rate": 3.880132637647668e-05, + "loss": 0.0653, + "step": 55680 + }, + { + "epoch": 12.001614038888588, + "grad_norm": 0.008423425257205963, + "learning_rate": 3.879831735543158e-05, + "loss": 0.0099, + "step": 55690 + }, + { + "epoch": 12.0016682012674, + "grad_norm": 0.006893581245094538, + "learning_rate": 3.8795308334386496e-05, + "loss": 0.0409, + "step": 55700 + }, + { + "epoch": 12.001722363646211, + "grad_norm": 0.013606692664325237, + "learning_rate": 3.87922993133414e-05, + "loss": 0.0022, + "step": 55710 + }, + { + "epoch": 12.001776526025022, + "grad_norm": 0.2703230381011963, + 
"learning_rate": 3.87892902922963e-05, + "loss": 0.0468, + "step": 55720 + }, + { + "epoch": 12.001830688403835, + "grad_norm": 0.0079299071803689, + "learning_rate": 3.8786281271251215e-05, + "loss": 0.0879, + "step": 55730 + }, + { + "epoch": 12.001884850782647, + "grad_norm": 0.005391342099756002, + "learning_rate": 3.878327225020612e-05, + "loss": 0.0408, + "step": 55740 + }, + { + "epoch": 12.001939013161458, + "grad_norm": 0.029875673353672028, + "learning_rate": 3.878026322916103e-05, + "loss": 0.0202, + "step": 55750 + }, + { + "epoch": 12.00199317554027, + "grad_norm": 0.0690455287694931, + "learning_rate": 3.8777254208115934e-05, + "loss": 0.0068, + "step": 55760 + }, + { + "epoch": 12.002047337919082, + "grad_norm": 0.003759262152016163, + "learning_rate": 3.877424518707084e-05, + "loss": 0.0022, + "step": 55770 + }, + { + "epoch": 12.002101500297893, + "grad_norm": 0.02002102881669998, + "learning_rate": 3.8771236166025746e-05, + "loss": 0.0751, + "step": 55780 + }, + { + "epoch": 12.002155662676705, + "grad_norm": 14.319259643554688, + "learning_rate": 3.876822714498065e-05, + "loss": 0.1779, + "step": 55790 + }, + { + "epoch": 12.002209825055516, + "grad_norm": 0.014349968172609806, + "learning_rate": 3.876521812393556e-05, + "loss": 0.0272, + "step": 55800 + }, + { + "epoch": 12.002263987434327, + "grad_norm": 0.003104139817878604, + "learning_rate": 3.8762209102890465e-05, + "loss": 0.0036, + "step": 55810 + }, + { + "epoch": 12.00231814981314, + "grad_norm": 0.046450257301330566, + "learning_rate": 3.875920008184538e-05, + "loss": 0.083, + "step": 55820 + }, + { + "epoch": 12.002372312191952, + "grad_norm": 0.10923498868942261, + "learning_rate": 3.875619106080028e-05, + "loss": 0.1127, + "step": 55830 + }, + { + "epoch": 12.002426474570763, + "grad_norm": 0.12005839496850967, + "learning_rate": 3.8753182039755184e-05, + "loss": 0.001, + "step": 55840 + }, + { + "epoch": 12.002480636949574, + "grad_norm": 0.025170499458909035, + "learning_rate": 3.87501730187101e-05, + "loss": 0.0194, + "step": 55850 + }, + { + "epoch": 12.002534799328387, + "grad_norm": 0.029833538457751274, + "learning_rate": 3.8747163997665003e-05, + "loss": 0.0518, + "step": 55860 + }, + { + "epoch": 12.002588961707199, + "grad_norm": 0.01750842295587063, + "learning_rate": 3.874415497661991e-05, + "loss": 0.0434, + "step": 55870 + }, + { + "epoch": 12.00264312408601, + "grad_norm": 0.2621007263660431, + "learning_rate": 3.8741145955574816e-05, + "loss": 0.0029, + "step": 55880 + }, + { + "epoch": 12.002697286464821, + "grad_norm": 5.108348369598389, + "learning_rate": 3.873813693452972e-05, + "loss": 0.2763, + "step": 55890 + }, + { + "epoch": 12.002751448843632, + "grad_norm": 0.01859648898243904, + "learning_rate": 3.873512791348463e-05, + "loss": 0.0287, + "step": 55900 + }, + { + "epoch": 12.002805611222445, + "grad_norm": 1.1293766498565674, + "learning_rate": 3.8732118892439535e-05, + "loss": 0.0476, + "step": 55910 + }, + { + "epoch": 12.002859773601257, + "grad_norm": 1.3758430480957031, + "learning_rate": 3.872910987139444e-05, + "loss": 0.0018, + "step": 55920 + }, + { + "epoch": 12.002913935980068, + "grad_norm": 1.4568406343460083, + "learning_rate": 3.872610085034935e-05, + "loss": 0.0043, + "step": 55930 + }, + { + "epoch": 12.00296809835888, + "grad_norm": 1.5031250715255737, + "learning_rate": 3.8723091829304254e-05, + "loss": 0.0495, + "step": 55940 + }, + { + "epoch": 12.003022260737692, + "grad_norm": 0.8030693531036377, + "learning_rate": 3.872008280825916e-05, + "loss": 0.1001, + 
"step": 55950 + }, + { + "epoch": 12.003076423116504, + "grad_norm": 0.08513473719358444, + "learning_rate": 3.871707378721407e-05, + "loss": 0.1762, + "step": 55960 + }, + { + "epoch": 12.003130585495315, + "grad_norm": 0.6567205786705017, + "learning_rate": 3.871406476616898e-05, + "loss": 0.0293, + "step": 55970 + }, + { + "epoch": 12.003184747874126, + "grad_norm": 0.007213204633444548, + "learning_rate": 3.871105574512388e-05, + "loss": 0.061, + "step": 55980 + }, + { + "epoch": 12.003238910252938, + "grad_norm": 0.03178811073303223, + "learning_rate": 3.870804672407879e-05, + "loss": 0.0357, + "step": 55990 + }, + { + "epoch": 12.00329307263175, + "grad_norm": 0.028318414464592934, + "learning_rate": 3.87050377030337e-05, + "loss": 0.0836, + "step": 56000 + }, + { + "epoch": 12.003347235010562, + "grad_norm": 0.8120859861373901, + "learning_rate": 3.8702028681988604e-05, + "loss": 0.0713, + "step": 56010 + }, + { + "epoch": 12.003401397389373, + "grad_norm": 0.010600728914141655, + "learning_rate": 3.869901966094351e-05, + "loss": 0.0409, + "step": 56020 + }, + { + "epoch": 12.003455559768184, + "grad_norm": 0.005975477863103151, + "learning_rate": 3.869601063989842e-05, + "loss": 0.0181, + "step": 56030 + }, + { + "epoch": 12.003509722146998, + "grad_norm": 0.01810312271118164, + "learning_rate": 3.869300161885332e-05, + "loss": 0.0047, + "step": 56040 + }, + { + "epoch": 12.003563884525809, + "grad_norm": 2.5843088626861572, + "learning_rate": 3.8689992597808236e-05, + "loss": 0.0493, + "step": 56050 + }, + { + "epoch": 12.00361804690462, + "grad_norm": 15.676197052001953, + "learning_rate": 3.8686983576763136e-05, + "loss": 0.1271, + "step": 56060 + }, + { + "epoch": 12.003672209283431, + "grad_norm": 0.21804746985435486, + "learning_rate": 3.868397455571804e-05, + "loss": 0.1907, + "step": 56070 + }, + { + "epoch": 12.003726371662243, + "grad_norm": 0.09262972325086594, + "learning_rate": 3.8680965534672955e-05, + "loss": 0.0595, + "step": 56080 + }, + { + "epoch": 12.003780534041056, + "grad_norm": 0.26343271136283875, + "learning_rate": 3.8677956513627854e-05, + "loss": 0.0407, + "step": 56090 + }, + { + "epoch": 12.003834696419867, + "grad_norm": 0.004627629648894072, + "learning_rate": 3.867494749258276e-05, + "loss": 0.0058, + "step": 56100 + }, + { + "epoch": 12.003888858798678, + "grad_norm": 0.19376277923583984, + "learning_rate": 3.8671938471537674e-05, + "loss": 0.0624, + "step": 56110 + }, + { + "epoch": 12.00394302117749, + "grad_norm": 0.003782058134675026, + "learning_rate": 3.866892945049258e-05, + "loss": 0.1204, + "step": 56120 + }, + { + "epoch": 12.003997183556303, + "grad_norm": 0.024374786764383316, + "learning_rate": 3.8665920429447486e-05, + "loss": 0.0024, + "step": 56130 + }, + { + "epoch": 12.004051345935114, + "grad_norm": 0.01193908415734768, + "learning_rate": 3.866291140840239e-05, + "loss": 0.1176, + "step": 56140 + }, + { + "epoch": 12.004105508313925, + "grad_norm": 0.317576140165329, + "learning_rate": 3.86599023873573e-05, + "loss": 0.0857, + "step": 56150 + }, + { + "epoch": 12.004159670692736, + "grad_norm": 0.2156895250082016, + "learning_rate": 3.8656893366312205e-05, + "loss": 0.0465, + "step": 56160 + }, + { + "epoch": 12.004213833071548, + "grad_norm": 1.691094994544983, + "learning_rate": 3.865388434526711e-05, + "loss": 0.139, + "step": 56170 + }, + { + "epoch": 12.00426799545036, + "grad_norm": 0.3912772536277771, + "learning_rate": 3.865087532422202e-05, + "loss": 0.1201, + "step": 56180 + }, + { + "epoch": 12.004322157829172, + 
"grad_norm": 1.3565691709518433, + "learning_rate": 3.8647866303176924e-05, + "loss": 0.124, + "step": 56190 + }, + { + "epoch": 12.004376320207983, + "grad_norm": 0.060963116586208344, + "learning_rate": 3.864485728213184e-05, + "loss": 0.03, + "step": 56200 + }, + { + "epoch": 12.004430482586795, + "grad_norm": 0.18696655333042145, + "learning_rate": 3.8641848261086737e-05, + "loss": 0.1014, + "step": 56210 + }, + { + "epoch": 12.004484644965608, + "grad_norm": 0.1930093616247177, + "learning_rate": 3.863883924004165e-05, + "loss": 0.0108, + "step": 56220 + }, + { + "epoch": 12.004538807344419, + "grad_norm": 0.1641521006822586, + "learning_rate": 3.8635830218996556e-05, + "loss": 0.009, + "step": 56230 + }, + { + "epoch": 12.00459296972323, + "grad_norm": 0.11930553615093231, + "learning_rate": 3.8632821197951455e-05, + "loss": 0.0132, + "step": 56240 + }, + { + "epoch": 12.004647132102042, + "grad_norm": 0.03908658027648926, + "learning_rate": 3.862981217690637e-05, + "loss": 0.0822, + "step": 56250 + }, + { + "epoch": 12.004701294480853, + "grad_norm": 0.011940887197852135, + "learning_rate": 3.8626803155861275e-05, + "loss": 0.0313, + "step": 56260 + }, + { + "epoch": 12.004755456859666, + "grad_norm": 0.019222375005483627, + "learning_rate": 3.862379413481618e-05, + "loss": 0.1153, + "step": 56270 + }, + { + "epoch": 12.004809619238477, + "grad_norm": 0.01034393161535263, + "learning_rate": 3.862078511377109e-05, + "loss": 0.0364, + "step": 56280 + }, + { + "epoch": 12.004863781617289, + "grad_norm": 0.028203604742884636, + "learning_rate": 3.8617776092725994e-05, + "loss": 0.0054, + "step": 56290 + }, + { + "epoch": 12.0049179439961, + "grad_norm": 0.028987528756260872, + "learning_rate": 3.86147670716809e-05, + "loss": 0.0062, + "step": 56300 + }, + { + "epoch": 12.004972106374913, + "grad_norm": 0.024197395890951157, + "learning_rate": 3.861175805063581e-05, + "loss": 0.0654, + "step": 56310 + }, + { + "epoch": 12.005026268753724, + "grad_norm": 0.0026142343413084745, + "learning_rate": 3.860874902959071e-05, + "loss": 0.0965, + "step": 56320 + }, + { + "epoch": 12.005080431132535, + "grad_norm": 0.3418773114681244, + "learning_rate": 3.860574000854562e-05, + "loss": 0.0822, + "step": 56330 + }, + { + "epoch": 12.005134593511347, + "grad_norm": 0.03159237653017044, + "learning_rate": 3.860273098750053e-05, + "loss": 0.0649, + "step": 56340 + }, + { + "epoch": 12.005188755890158, + "grad_norm": 0.002352628158405423, + "learning_rate": 3.859972196645544e-05, + "loss": 0.037, + "step": 56350 + }, + { + "epoch": 12.005242918268971, + "grad_norm": 4.025827407836914, + "learning_rate": 3.859671294541034e-05, + "loss": 0.0464, + "step": 56360 + }, + { + "epoch": 12.005297080647782, + "grad_norm": 0.6563068628311157, + "learning_rate": 3.859370392436525e-05, + "loss": 0.0694, + "step": 56370 + }, + { + "epoch": 12.005351243026594, + "grad_norm": 0.05467509478330612, + "learning_rate": 3.859069490332016e-05, + "loss": 0.0723, + "step": 56380 + }, + { + "epoch": 12.005405405405405, + "grad_norm": 2.8279988765716553, + "learning_rate": 3.858768588227506e-05, + "loss": 0.1687, + "step": 56390 + }, + { + "epoch": 12.005459567784218, + "grad_norm": 0.002745603211224079, + "learning_rate": 3.858467686122997e-05, + "loss": 0.0113, + "step": 56400 + }, + { + "epoch": 12.00551373016303, + "grad_norm": 0.17366205155849457, + "learning_rate": 3.8581667840184876e-05, + "loss": 0.0778, + "step": 56410 + }, + { + "epoch": 12.00556789254184, + "grad_norm": 0.2539968490600586, + "learning_rate": 
3.857865881913978e-05, + "loss": 0.0063, + "step": 56420 + }, + { + "epoch": 12.005622054920652, + "grad_norm": 0.0025455625727772713, + "learning_rate": 3.857564979809469e-05, + "loss": 0.0713, + "step": 56430 + }, + { + "epoch": 12.005676217299463, + "grad_norm": 0.0027434551157057285, + "learning_rate": 3.8572640777049594e-05, + "loss": 0.0214, + "step": 56440 + }, + { + "epoch": 12.005730379678276, + "grad_norm": 110.12211608886719, + "learning_rate": 3.85696317560045e-05, + "loss": 0.1318, + "step": 56450 + }, + { + "epoch": 12.005784542057087, + "grad_norm": 0.0648273378610611, + "learning_rate": 3.8566622734959414e-05, + "loss": 0.0405, + "step": 56460 + }, + { + "epoch": 12.005838704435899, + "grad_norm": 9.74371337890625, + "learning_rate": 3.856361371391431e-05, + "loss": 0.0958, + "step": 56470 + }, + { + "epoch": 12.00589286681471, + "grad_norm": 2.176037073135376, + "learning_rate": 3.8560604692869226e-05, + "loss": 0.0226, + "step": 56480 + }, + { + "epoch": 12.005947029193521, + "grad_norm": 5.002775192260742, + "learning_rate": 3.855759567182413e-05, + "loss": 0.1719, + "step": 56490 + }, + { + "epoch": 12.006001191572334, + "grad_norm": 0.06783021241426468, + "learning_rate": 3.855458665077904e-05, + "loss": 0.0116, + "step": 56500 + }, + { + "epoch": 12.006055353951146, + "grad_norm": 2.5360798835754395, + "learning_rate": 3.8551577629733945e-05, + "loss": 0.0821, + "step": 56510 + }, + { + "epoch": 12.006109516329957, + "grad_norm": 0.2378576397895813, + "learning_rate": 3.854856860868885e-05, + "loss": 0.0336, + "step": 56520 + }, + { + "epoch": 12.006163678708768, + "grad_norm": 0.0025522042997181416, + "learning_rate": 3.854555958764376e-05, + "loss": 0.1292, + "step": 56530 + }, + { + "epoch": 12.006217841087581, + "grad_norm": 0.07833196222782135, + "learning_rate": 3.8542550566598664e-05, + "loss": 0.0492, + "step": 56540 + }, + { + "epoch": 12.006272003466393, + "grad_norm": 0.005712743848562241, + "learning_rate": 3.853954154555357e-05, + "loss": 0.1772, + "step": 56550 + }, + { + "epoch": 12.006326165845204, + "grad_norm": 0.011246801353991032, + "learning_rate": 3.8536532524508476e-05, + "loss": 0.0815, + "step": 56560 + }, + { + "epoch": 12.006380328224015, + "grad_norm": 0.04398467019200325, + "learning_rate": 3.853352350346339e-05, + "loss": 0.0719, + "step": 56570 + }, + { + "epoch": 12.006434490602826, + "grad_norm": 0.006323957350105047, + "learning_rate": 3.853051448241829e-05, + "loss": 0.1817, + "step": 56580 + }, + { + "epoch": 12.00648865298164, + "grad_norm": 5.497435569763184, + "learning_rate": 3.8527505461373195e-05, + "loss": 0.0847, + "step": 56590 + }, + { + "epoch": 12.00654281536045, + "grad_norm": 0.16200244426727295, + "learning_rate": 3.852449644032811e-05, + "loss": 0.005, + "step": 56600 + }, + { + "epoch": 12.006596977739262, + "grad_norm": 1.258240818977356, + "learning_rate": 3.8521487419283015e-05, + "loss": 0.0941, + "step": 56610 + }, + { + "epoch": 12.006651140118073, + "grad_norm": 0.14619404077529907, + "learning_rate": 3.8518478398237914e-05, + "loss": 0.0134, + "step": 56620 + }, + { + "epoch": 12.006705302496886, + "grad_norm": 1.8619837760925293, + "learning_rate": 3.851546937719283e-05, + "loss": 0.0294, + "step": 56630 + }, + { + "epoch": 12.006759464875698, + "grad_norm": 0.03811756148934364, + "learning_rate": 3.8512460356147733e-05, + "loss": 0.045, + "step": 56640 + }, + { + "epoch": 12.006813627254509, + "grad_norm": 0.004387996159493923, + "learning_rate": 3.850945133510264e-05, + "loss": 0.0397, + "step": 56650 + 
}, + { + "epoch": 12.00686778963332, + "grad_norm": 0.0030211242847144604, + "learning_rate": 3.8506442314057546e-05, + "loss": 0.1357, + "step": 56660 + }, + { + "epoch": 12.006921952012132, + "grad_norm": 2.990678071975708, + "learning_rate": 3.850343329301245e-05, + "loss": 0.1366, + "step": 56670 + }, + { + "epoch": 12.006976114390945, + "grad_norm": 0.008051756769418716, + "learning_rate": 3.850042427196736e-05, + "loss": 0.0249, + "step": 56680 + }, + { + "epoch": 12.007030276769756, + "grad_norm": 0.0028707869350910187, + "learning_rate": 3.8497415250922265e-05, + "loss": 0.023, + "step": 56690 + }, + { + "epoch": 12.007084439148567, + "grad_norm": 0.048442430794239044, + "learning_rate": 3.849440622987717e-05, + "loss": 0.0055, + "step": 56700 + }, + { + "epoch": 12.007138601527378, + "grad_norm": 1.7939084768295288, + "learning_rate": 3.849139720883208e-05, + "loss": 0.065, + "step": 56710 + }, + { + "epoch": 12.007192763906192, + "grad_norm": 0.041416965425014496, + "learning_rate": 3.848838818778699e-05, + "loss": 0.0131, + "step": 56720 + }, + { + "epoch": 12.007246926285003, + "grad_norm": 1.854256272315979, + "learning_rate": 3.848537916674189e-05, + "loss": 0.1299, + "step": 56730 + }, + { + "epoch": 12.007301088663814, + "grad_norm": 0.05167490616440773, + "learning_rate": 3.84823701456968e-05, + "loss": 0.0547, + "step": 56740 + }, + { + "epoch": 12.007355251042625, + "grad_norm": 0.23358307778835297, + "learning_rate": 3.847936112465171e-05, + "loss": 0.0695, + "step": 56750 + }, + { + "epoch": 12.007409413421437, + "grad_norm": 0.9774928092956543, + "learning_rate": 3.8476352103606616e-05, + "loss": 0.1024, + "step": 56760 + }, + { + "epoch": 12.00746357580025, + "grad_norm": 0.06905080378055573, + "learning_rate": 3.847334308256152e-05, + "loss": 0.0386, + "step": 56770 + }, + { + "epoch": 12.007517738179061, + "grad_norm": 4.163247108459473, + "learning_rate": 3.847033406151643e-05, + "loss": 0.096, + "step": 56780 + }, + { + "epoch": 12.007571900557872, + "grad_norm": 1.1866700649261475, + "learning_rate": 3.8467325040471334e-05, + "loss": 0.0182, + "step": 56790 + }, + { + "epoch": 12.007626062936684, + "grad_norm": 0.1969834715127945, + "learning_rate": 3.846431601942625e-05, + "loss": 0.0174, + "step": 56800 + }, + { + "epoch": 12.007680225315497, + "grad_norm": 0.3112381100654602, + "learning_rate": 3.846130699838115e-05, + "loss": 0.0323, + "step": 56810 + }, + { + "epoch": 12.007734387694308, + "grad_norm": 0.0024311619345098734, + "learning_rate": 3.845829797733605e-05, + "loss": 0.0339, + "step": 56820 + }, + { + "epoch": 12.00778855007312, + "grad_norm": 2.03141188621521, + "learning_rate": 3.8455288956290966e-05, + "loss": 0.0665, + "step": 56830 + }, + { + "epoch": 12.00784271245193, + "grad_norm": 0.10797475278377533, + "learning_rate": 3.8452279935245866e-05, + "loss": 0.0782, + "step": 56840 + }, + { + "epoch": 12.007896874830742, + "grad_norm": 0.0023334461729973555, + "learning_rate": 3.844927091420077e-05, + "loss": 0.0619, + "step": 56850 + }, + { + "epoch": 12.007951037209555, + "grad_norm": 0.038000013679265976, + "learning_rate": 3.8446261893155685e-05, + "loss": 0.157, + "step": 56860 + }, + { + "epoch": 12.008005199588366, + "grad_norm": 29.064180374145508, + "learning_rate": 3.844325287211059e-05, + "loss": 0.0887, + "step": 56870 + }, + { + "epoch": 12.008059361967177, + "grad_norm": 0.3226475417613983, + "learning_rate": 3.844024385106549e-05, + "loss": 0.0695, + "step": 56880 + }, + { + "epoch": 12.008113524345989, + "grad_norm": 
1.3502049446105957, + "learning_rate": 3.8437234830020404e-05, + "loss": 0.0603, + "step": 56890 + }, + { + "epoch": 12.008167686724802, + "grad_norm": 0.009061298333108425, + "learning_rate": 3.843422580897531e-05, + "loss": 0.101, + "step": 56900 + }, + { + "epoch": 12.008221849103613, + "grad_norm": 0.2796061038970947, + "learning_rate": 3.8431216787930216e-05, + "loss": 0.0356, + "step": 56910 + }, + { + "epoch": 12.008276011482424, + "grad_norm": 39.38706588745117, + "learning_rate": 3.842820776688512e-05, + "loss": 0.1664, + "step": 56920 + }, + { + "epoch": 12.008330173861236, + "grad_norm": 0.016626255586743355, + "learning_rate": 3.842519874584003e-05, + "loss": 0.013, + "step": 56930 + }, + { + "epoch": 12.008384336240047, + "grad_norm": 0.1799662709236145, + "learning_rate": 3.8422189724794935e-05, + "loss": 0.0069, + "step": 56940 + }, + { + "epoch": 12.00843849861886, + "grad_norm": 0.12680260837078094, + "learning_rate": 3.841918070374985e-05, + "loss": 0.0213, + "step": 56950 + }, + { + "epoch": 12.008492660997671, + "grad_norm": 0.019270896911621094, + "learning_rate": 3.841617168270475e-05, + "loss": 0.0017, + "step": 56960 + }, + { + "epoch": 12.008546823376482, + "grad_norm": 3.0224978923797607, + "learning_rate": 3.8413162661659654e-05, + "loss": 0.081, + "step": 56970 + }, + { + "epoch": 12.008600985755294, + "grad_norm": 0.0032994234934449196, + "learning_rate": 3.841015364061457e-05, + "loss": 0.1147, + "step": 56980 + }, + { + "epoch": 12.008655148134107, + "grad_norm": 0.00815607514232397, + "learning_rate": 3.840714461956947e-05, + "loss": 0.0404, + "step": 56990 + }, + { + "epoch": 12.008709310512918, + "grad_norm": 0.2927594482898712, + "learning_rate": 3.840413559852438e-05, + "loss": 0.0673, + "step": 57000 + }, + { + "epoch": 12.00876347289173, + "grad_norm": 4.210506916046143, + "learning_rate": 3.8401126577479286e-05, + "loss": 0.0544, + "step": 57010 + }, + { + "epoch": 12.00881763527054, + "grad_norm": 0.38757818937301636, + "learning_rate": 3.839811755643419e-05, + "loss": 0.0719, + "step": 57020 + }, + { + "epoch": 12.008871797649352, + "grad_norm": 0.8786349296569824, + "learning_rate": 3.83951085353891e-05, + "loss": 0.074, + "step": 57030 + }, + { + "epoch": 12.008925960028165, + "grad_norm": 0.6758280992507935, + "learning_rate": 3.8392099514344005e-05, + "loss": 0.076, + "step": 57040 + }, + { + "epoch": 12.008980122406976, + "grad_norm": 0.13112138211727142, + "learning_rate": 3.838909049329891e-05, + "loss": 0.0904, + "step": 57050 + }, + { + "epoch": 12.009034284785788, + "grad_norm": 14.452859878540039, + "learning_rate": 3.8386081472253824e-05, + "loss": 0.1568, + "step": 57060 + }, + { + "epoch": 12.009088447164599, + "grad_norm": 1.3843826055526733, + "learning_rate": 3.8383072451208724e-05, + "loss": 0.0487, + "step": 57070 + }, + { + "epoch": 12.009142609543412, + "grad_norm": 0.056470487266778946, + "learning_rate": 3.838006343016363e-05, + "loss": 0.112, + "step": 57080 + }, + { + "epoch": 12.009196771922223, + "grad_norm": 0.09719572216272354, + "learning_rate": 3.837705440911854e-05, + "loss": 0.0397, + "step": 57090 + }, + { + "epoch": 12.009250934301035, + "grad_norm": 0.8942016363143921, + "learning_rate": 3.837404538807345e-05, + "loss": 0.0447, + "step": 57100 + }, + { + "epoch": 12.009305096679846, + "grad_norm": 0.0035243332386016846, + "learning_rate": 3.837103636702835e-05, + "loss": 0.0663, + "step": 57110 + }, + { + "epoch": 12.009359259058657, + "grad_norm": 0.3655202090740204, + "learning_rate": 3.836802734598326e-05, + 
"loss": 0.0917, + "step": 57120 + }, + { + "epoch": 12.00941342143747, + "grad_norm": 1.0588548183441162, + "learning_rate": 3.836501832493817e-05, + "loss": 0.0299, + "step": 57130 + }, + { + "epoch": 12.009467583816281, + "grad_norm": 0.015578561462461948, + "learning_rate": 3.836200930389307e-05, + "loss": 0.0556, + "step": 57140 + }, + { + "epoch": 12.009521746195093, + "grad_norm": 0.0031830996740609407, + "learning_rate": 3.835900028284798e-05, + "loss": 0.1071, + "step": 57150 + }, + { + "epoch": 12.009575908573904, + "grad_norm": 0.0032198315020650625, + "learning_rate": 3.835599126180289e-05, + "loss": 0.086, + "step": 57160 + }, + { + "epoch": 12.009630070952717, + "grad_norm": 16.36029624938965, + "learning_rate": 3.835298224075779e-05, + "loss": 0.0684, + "step": 57170 + }, + { + "epoch": 12.009684233331528, + "grad_norm": 1.3470654487609863, + "learning_rate": 3.83499732197127e-05, + "loss": 0.1632, + "step": 57180 + }, + { + "epoch": 12.00973839571034, + "grad_norm": 0.08750702440738678, + "learning_rate": 3.8346964198667606e-05, + "loss": 0.0145, + "step": 57190 + }, + { + "epoch": 12.009792558089151, + "grad_norm": 2.885678291320801, + "learning_rate": 3.834395517762251e-05, + "loss": 0.1339, + "step": 57200 + }, + { + "epoch": 12.009846720467962, + "grad_norm": 0.3501625061035156, + "learning_rate": 3.8340946156577425e-05, + "loss": 0.0063, + "step": 57210 + }, + { + "epoch": 12.009900882846775, + "grad_norm": 3.5399367809295654, + "learning_rate": 3.8337937135532325e-05, + "loss": 0.0899, + "step": 57220 + }, + { + "epoch": 12.009955045225587, + "grad_norm": 0.12790562212467194, + "learning_rate": 3.833492811448723e-05, + "loss": 0.036, + "step": 57230 + }, + { + "epoch": 12.010009207604398, + "grad_norm": 0.330437570810318, + "learning_rate": 3.8331919093442144e-05, + "loss": 0.2233, + "step": 57240 + }, + { + "epoch": 12.01006336998321, + "grad_norm": 0.07868137210607529, + "learning_rate": 3.832891007239705e-05, + "loss": 0.0277, + "step": 57250 + }, + { + "epoch": 12.010117532362022, + "grad_norm": 0.0462774783372879, + "learning_rate": 3.8325901051351956e-05, + "loss": 0.0717, + "step": 57260 + }, + { + "epoch": 12.010171694740833, + "grad_norm": 0.17883354425430298, + "learning_rate": 3.832289203030686e-05, + "loss": 0.03, + "step": 57270 + }, + { + "epoch": 12.010225857119645, + "grad_norm": 2.136997699737549, + "learning_rate": 3.831988300926177e-05, + "loss": 0.091, + "step": 57280 + }, + { + "epoch": 12.010280019498456, + "grad_norm": 0.0035218666307628155, + "learning_rate": 3.8316873988216675e-05, + "loss": 0.0488, + "step": 57290 + }, + { + "epoch": 12.010334181877267, + "grad_norm": 2.5807974338531494, + "learning_rate": 3.831386496717158e-05, + "loss": 0.0946, + "step": 57300 + }, + { + "epoch": 12.01038834425608, + "grad_norm": 0.20799492299556732, + "learning_rate": 3.831085594612649e-05, + "loss": 0.0677, + "step": 57310 + }, + { + "epoch": 12.010442506634892, + "grad_norm": 0.004223010502755642, + "learning_rate": 3.83078469250814e-05, + "loss": 0.028, + "step": 57320 + }, + { + "epoch": 12.010496669013703, + "grad_norm": 0.2520821988582611, + "learning_rate": 3.83048379040363e-05, + "loss": 0.0069, + "step": 57330 + }, + { + "epoch": 12.010550831392514, + "grad_norm": 0.005608646664768457, + "learning_rate": 3.8301828882991207e-05, + "loss": 0.0447, + "step": 57340 + }, + { + "epoch": 12.010604993771327, + "grad_norm": 0.10002730786800385, + "learning_rate": 3.829881986194612e-05, + "loss": 0.1219, + "step": 57350 + }, + { + "epoch": 
12.010659156150139, + "grad_norm": 0.011032520793378353, + "learning_rate": 3.8295810840901026e-05, + "loss": 0.1583, + "step": 57360 + }, + { + "epoch": 12.01071331852895, + "grad_norm": 2.4444477558135986, + "learning_rate": 3.8292801819855925e-05, + "loss": 0.0515, + "step": 57370 + }, + { + "epoch": 12.010767480907761, + "grad_norm": 0.07157866656780243, + "learning_rate": 3.828979279881084e-05, + "loss": 0.054, + "step": 57380 + }, + { + "epoch": 12.010821643286572, + "grad_norm": 0.004293117206543684, + "learning_rate": 3.8286783777765745e-05, + "loss": 0.0243, + "step": 57390 + }, + { + "epoch": 12.010875805665385, + "grad_norm": 8.61030101776123, + "learning_rate": 3.828377475672065e-05, + "loss": 0.1771, + "step": 57400 + }, + { + "epoch": 12.010929968044197, + "grad_norm": 0.08579348027706146, + "learning_rate": 3.828076573567556e-05, + "loss": 0.0859, + "step": 57410 + }, + { + "epoch": 12.010984130423008, + "grad_norm": 0.9949190020561218, + "learning_rate": 3.8277756714630464e-05, + "loss": 0.0058, + "step": 57420 + }, + { + "epoch": 12.01103829280182, + "grad_norm": 0.046186525374650955, + "learning_rate": 3.827474769358537e-05, + "loss": 0.1054, + "step": 57430 + }, + { + "epoch": 12.011092455180632, + "grad_norm": 0.013178369030356407, + "learning_rate": 3.8271738672540276e-05, + "loss": 0.1027, + "step": 57440 + }, + { + "epoch": 12.011146617559444, + "grad_norm": 2.4575634002685547, + "learning_rate": 3.826872965149518e-05, + "loss": 0.1518, + "step": 57450 + }, + { + "epoch": 12.011200779938255, + "grad_norm": 0.5665506720542908, + "learning_rate": 3.826572063045009e-05, + "loss": 0.1469, + "step": 57460 + }, + { + "epoch": 12.011254942317066, + "grad_norm": 0.013368207029998302, + "learning_rate": 3.8262711609405e-05, + "loss": 0.0486, + "step": 57470 + }, + { + "epoch": 12.011309104695878, + "grad_norm": 2.8997561931610107, + "learning_rate": 3.82597025883599e-05, + "loss": 0.01, + "step": 57480 + }, + { + "epoch": 12.01136326707469, + "grad_norm": 0.036793459206819534, + "learning_rate": 3.825669356731481e-05, + "loss": 0.0204, + "step": 57490 + }, + { + "epoch": 12.011417429453502, + "grad_norm": 0.0077902087941765785, + "learning_rate": 3.825368454626972e-05, + "loss": 0.004, + "step": 57500 + }, + { + "epoch": 12.011471591832313, + "grad_norm": 0.005464301444590092, + "learning_rate": 3.825067552522463e-05, + "loss": 0.0911, + "step": 57510 + }, + { + "epoch": 12.011525754211124, + "grad_norm": 0.009342980571091175, + "learning_rate": 3.824766650417953e-05, + "loss": 0.091, + "step": 57520 + }, + { + "epoch": 12.011579916589938, + "grad_norm": 0.0856112614274025, + "learning_rate": 3.824465748313444e-05, + "loss": 0.0097, + "step": 57530 + }, + { + "epoch": 12.011634078968749, + "grad_norm": 0.27713608741760254, + "learning_rate": 3.8241648462089346e-05, + "loss": 0.1051, + "step": 57540 + }, + { + "epoch": 12.01168824134756, + "grad_norm": 0.0037320710252970457, + "learning_rate": 3.823863944104425e-05, + "loss": 0.0576, + "step": 57550 + }, + { + "epoch": 12.011742403726371, + "grad_norm": 2.7484230995178223, + "learning_rate": 3.823563041999916e-05, + "loss": 0.0483, + "step": 57560 + }, + { + "epoch": 12.011796566105183, + "grad_norm": 0.006126706022769213, + "learning_rate": 3.8232621398954064e-05, + "loss": 0.0106, + "step": 57570 + }, + { + "epoch": 12.011850728483996, + "grad_norm": 4.500759601593018, + "learning_rate": 3.822961237790898e-05, + "loss": 0.1504, + "step": 57580 + }, + { + "epoch": 12.011904890862807, + "grad_norm": 0.0343351773917675, + 
"learning_rate": 3.822660335686388e-05, + "loss": 0.0164, + "step": 57590 + }, + { + "epoch": 12.011959053241618, + "grad_norm": 0.005459360312670469, + "learning_rate": 3.822359433581878e-05, + "loss": 0.1133, + "step": 57600 + }, + { + "epoch": 12.01201321562043, + "grad_norm": 0.3798907697200775, + "learning_rate": 3.8220585314773696e-05, + "loss": 0.0388, + "step": 57610 + }, + { + "epoch": 12.01206737799924, + "grad_norm": 0.008089321665465832, + "learning_rate": 3.82175762937286e-05, + "loss": 0.141, + "step": 57620 + }, + { + "epoch": 12.012121540378054, + "grad_norm": 0.24103856086730957, + "learning_rate": 3.82145672726835e-05, + "loss": 0.0046, + "step": 57630 + }, + { + "epoch": 12.012175702756865, + "grad_norm": 0.3732927441596985, + "learning_rate": 3.8211558251638415e-05, + "loss": 0.0454, + "step": 57640 + }, + { + "epoch": 12.012229865135676, + "grad_norm": 1.3136354684829712, + "learning_rate": 3.820854923059332e-05, + "loss": 0.085, + "step": 57650 + }, + { + "epoch": 12.012284027514488, + "grad_norm": 3.383213758468628, + "learning_rate": 3.820554020954823e-05, + "loss": 0.0607, + "step": 57660 + }, + { + "epoch": 12.0123381898933, + "grad_norm": 0.3438917100429535, + "learning_rate": 3.8202531188503134e-05, + "loss": 0.169, + "step": 57670 + }, + { + "epoch": 12.012392352272112, + "grad_norm": 0.11273457854986191, + "learning_rate": 3.819952216745804e-05, + "loss": 0.0009, + "step": 57680 + }, + { + "epoch": 12.012446514650923, + "grad_norm": 0.025266224518418312, + "learning_rate": 3.8196513146412947e-05, + "loss": 0.0872, + "step": 57690 + }, + { + "epoch": 12.012500677029735, + "grad_norm": 0.012008165009319782, + "learning_rate": 3.819350412536786e-05, + "loss": 0.004, + "step": 57700 + }, + { + "epoch": 12.012554839408546, + "grad_norm": 0.030836133286356926, + "learning_rate": 3.819049510432276e-05, + "loss": 0.0231, + "step": 57710 + }, + { + "epoch": 12.012609001787359, + "grad_norm": 0.004082262981683016, + "learning_rate": 3.8187486083277665e-05, + "loss": 0.0032, + "step": 57720 + }, + { + "epoch": 12.01266316416617, + "grad_norm": 0.05669916048645973, + "learning_rate": 3.818447706223258e-05, + "loss": 0.0719, + "step": 57730 + }, + { + "epoch": 12.012717326544982, + "grad_norm": 0.005215510725975037, + "learning_rate": 3.818146804118748e-05, + "loss": 0.0845, + "step": 57740 + }, + { + "epoch": 12.012771488923793, + "grad_norm": 0.2321937382221222, + "learning_rate": 3.8178459020142384e-05, + "loss": 0.0879, + "step": 57750 + }, + { + "epoch": 12.012825651302606, + "grad_norm": 0.08275746554136276, + "learning_rate": 3.81754499990973e-05, + "loss": 0.1135, + "step": 57760 + }, + { + "epoch": 12.012879813681417, + "grad_norm": 0.014598067849874496, + "learning_rate": 3.8172440978052203e-05, + "loss": 0.0385, + "step": 57770 + }, + { + "epoch": 12.012933976060229, + "grad_norm": 0.014976456761360168, + "learning_rate": 3.816943195700711e-05, + "loss": 0.026, + "step": 57780 + }, + { + "epoch": 12.01298813843904, + "grad_norm": 0.05543295294046402, + "learning_rate": 3.8166422935962016e-05, + "loss": 0.0155, + "step": 57790 + }, + { + "epoch": 12.013042300817851, + "grad_norm": 1.7950737476348877, + "learning_rate": 3.816341391491692e-05, + "loss": 0.0567, + "step": 57800 + }, + { + "epoch": 12.013096463196664, + "grad_norm": 0.014186268672347069, + "learning_rate": 3.816040489387183e-05, + "loss": 0.0064, + "step": 57810 + }, + { + "epoch": 12.013150625575475, + "grad_norm": 0.010247809812426567, + "learning_rate": 3.8157395872826735e-05, + "loss": 0.05, + 
"step": 57820 + }, + { + "epoch": 12.013204787954287, + "grad_norm": 0.00888357125222683, + "learning_rate": 3.815438685178164e-05, + "loss": 0.106, + "step": 57830 + }, + { + "epoch": 12.013258950333098, + "grad_norm": 1.0848100185394287, + "learning_rate": 3.8151377830736554e-05, + "loss": 0.1058, + "step": 57840 + }, + { + "epoch": 12.013313112711911, + "grad_norm": 0.0258562583476305, + "learning_rate": 3.814836880969146e-05, + "loss": 0.0278, + "step": 57850 + }, + { + "epoch": 12.013367275090722, + "grad_norm": 0.17555420100688934, + "learning_rate": 3.814535978864636e-05, + "loss": 0.086, + "step": 57860 + }, + { + "epoch": 12.013421437469534, + "grad_norm": 0.1849963217973709, + "learning_rate": 3.814235076760127e-05, + "loss": 0.0462, + "step": 57870 + }, + { + "epoch": 12.013475599848345, + "grad_norm": 0.010784992016851902, + "learning_rate": 3.813934174655618e-05, + "loss": 0.0147, + "step": 57880 + }, + { + "epoch": 12.013529762227156, + "grad_norm": 0.010228067636489868, + "learning_rate": 3.813633272551108e-05, + "loss": 0.0594, + "step": 57890 + }, + { + "epoch": 12.01358392460597, + "grad_norm": 0.15176540613174438, + "learning_rate": 3.813332370446599e-05, + "loss": 0.0994, + "step": 57900 + }, + { + "epoch": 12.01363808698478, + "grad_norm": 0.058962076902389526, + "learning_rate": 3.81303146834209e-05, + "loss": 0.064, + "step": 57910 + }, + { + "epoch": 12.013692249363592, + "grad_norm": 1.0877000093460083, + "learning_rate": 3.8127305662375804e-05, + "loss": 0.0823, + "step": 57920 + }, + { + "epoch": 12.013746411742403, + "grad_norm": 0.07254647463560104, + "learning_rate": 3.812429664133071e-05, + "loss": 0.1286, + "step": 57930 + }, + { + "epoch": 12.013800574121216, + "grad_norm": 0.00560758076608181, + "learning_rate": 3.812128762028562e-05, + "loss": 0.0604, + "step": 57940 + }, + { + "epoch": 12.013854736500027, + "grad_norm": 0.003356798319146037, + "learning_rate": 3.811827859924052e-05, + "loss": 0.0109, + "step": 57950 + }, + { + "epoch": 12.013908898878839, + "grad_norm": 0.008793523535132408, + "learning_rate": 3.8115269578195436e-05, + "loss": 0.0533, + "step": 57960 + }, + { + "epoch": 12.01396306125765, + "grad_norm": 0.14125093817710876, + "learning_rate": 3.8112260557150336e-05, + "loss": 0.0071, + "step": 57970 + }, + { + "epoch": 12.014017223636461, + "grad_norm": 0.003592077409848571, + "learning_rate": 3.810925153610524e-05, + "loss": 0.0458, + "step": 57980 + }, + { + "epoch": 12.014071386015274, + "grad_norm": 0.014048637822270393, + "learning_rate": 3.8106242515060155e-05, + "loss": 0.1471, + "step": 57990 + }, + { + "epoch": 12.014125548394086, + "grad_norm": 19.955726623535156, + "learning_rate": 3.810323349401506e-05, + "loss": 0.1333, + "step": 58000 + }, + { + "epoch": 12.014179710772897, + "grad_norm": 0.2470199018716812, + "learning_rate": 3.810022447296996e-05, + "loss": 0.0892, + "step": 58010 + }, + { + "epoch": 12.014233873151708, + "grad_norm": 0.7488190531730652, + "learning_rate": 3.8097215451924874e-05, + "loss": 0.028, + "step": 58020 + }, + { + "epoch": 12.014288035530521, + "grad_norm": 0.004245310556143522, + "learning_rate": 3.809420643087978e-05, + "loss": 0.0853, + "step": 58030 + }, + { + "epoch": 12.014342197909333, + "grad_norm": 7.968141555786133, + "learning_rate": 3.8091197409834686e-05, + "loss": 0.0398, + "step": 58040 + }, + { + "epoch": 12.014396360288144, + "grad_norm": 0.09358683228492737, + "learning_rate": 3.808818838878959e-05, + "loss": 0.0097, + "step": 58050 + }, + { + "epoch": 12.014450522666955, + 
"grad_norm": 1.1874570846557617, + "learning_rate": 3.80851793677445e-05, + "loss": 0.0191, + "step": 58060 + }, + { + "epoch": 12.014504685045766, + "grad_norm": 0.002378737088292837, + "learning_rate": 3.8082170346699405e-05, + "loss": 0.0486, + "step": 58070 + }, + { + "epoch": 12.01455884742458, + "grad_norm": 0.6925317645072937, + "learning_rate": 3.807916132565431e-05, + "loss": 0.0632, + "step": 58080 + }, + { + "epoch": 12.01461300980339, + "grad_norm": 0.03704174607992172, + "learning_rate": 3.807615230460922e-05, + "loss": 0.0129, + "step": 58090 + }, + { + "epoch": 12.014667172182202, + "grad_norm": 0.0026441130321472883, + "learning_rate": 3.807314328356413e-05, + "loss": 0.0478, + "step": 58100 + }, + { + "epoch": 12.014721334561013, + "grad_norm": 0.0020595185924321413, + "learning_rate": 3.807013426251904e-05, + "loss": 0.0057, + "step": 58110 + }, + { + "epoch": 12.014775496939826, + "grad_norm": 0.0021494608372449875, + "learning_rate": 3.806712524147394e-05, + "loss": 0.0312, + "step": 58120 + }, + { + "epoch": 12.014829659318638, + "grad_norm": 0.006535181775689125, + "learning_rate": 3.806411622042885e-05, + "loss": 0.0754, + "step": 58130 + }, + { + "epoch": 12.014883821697449, + "grad_norm": 0.002245508134365082, + "learning_rate": 3.8061107199383756e-05, + "loss": 0.0895, + "step": 58140 + }, + { + "epoch": 12.01493798407626, + "grad_norm": 0.18429788947105408, + "learning_rate": 3.805809817833866e-05, + "loss": 0.0418, + "step": 58150 + }, + { + "epoch": 12.014992146455072, + "grad_norm": 0.07889723777770996, + "learning_rate": 3.805508915729357e-05, + "loss": 0.0428, + "step": 58160 + }, + { + "epoch": 12.015046308833885, + "grad_norm": 0.14790724217891693, + "learning_rate": 3.8052080136248475e-05, + "loss": 0.1422, + "step": 58170 + }, + { + "epoch": 12.015100471212696, + "grad_norm": 0.15957283973693848, + "learning_rate": 3.804907111520338e-05, + "loss": 0.1182, + "step": 58180 + }, + { + "epoch": 12.015154633591507, + "grad_norm": 2.2170441150665283, + "learning_rate": 3.804606209415829e-05, + "loss": 0.0491, + "step": 58190 + }, + { + "epoch": 12.015208795970318, + "grad_norm": 0.431670218706131, + "learning_rate": 3.8043053073113194e-05, + "loss": 0.1771, + "step": 58200 + }, + { + "epoch": 12.015262958349131, + "grad_norm": 0.04533345252275467, + "learning_rate": 3.80400440520681e-05, + "loss": 0.0563, + "step": 58210 + }, + { + "epoch": 12.015317120727943, + "grad_norm": 0.12421765923500061, + "learning_rate": 3.803703503102301e-05, + "loss": 0.0914, + "step": 58220 + }, + { + "epoch": 12.015371283106754, + "grad_norm": 0.0026700033340603113, + "learning_rate": 3.803402600997791e-05, + "loss": 0.0314, + "step": 58230 + }, + { + "epoch": 12.015425445485565, + "grad_norm": 0.10411274433135986, + "learning_rate": 3.803101698893282e-05, + "loss": 0.1952, + "step": 58240 + }, + { + "epoch": 12.015479607864377, + "grad_norm": 0.10718102008104324, + "learning_rate": 3.802800796788773e-05, + "loss": 0.0877, + "step": 58250 + }, + { + "epoch": 12.01553377024319, + "grad_norm": 0.0043409522622823715, + "learning_rate": 3.802499894684264e-05, + "loss": 0.0299, + "step": 58260 + }, + { + "epoch": 12.015587932622001, + "grad_norm": 0.10679816454648972, + "learning_rate": 3.802198992579754e-05, + "loss": 0.0287, + "step": 58270 + }, + { + "epoch": 12.015642095000812, + "grad_norm": 1.0436044931411743, + "learning_rate": 3.801898090475245e-05, + "loss": 0.0979, + "step": 58280 + }, + { + "epoch": 12.015696257379624, + "grad_norm": 0.0029565722215920687, + 
"learning_rate": 3.801597188370736e-05, + "loss": 0.0282, + "step": 58290 + }, + { + "epoch": 12.015750419758437, + "grad_norm": 0.11231399327516556, + "learning_rate": 3.801296286266226e-05, + "loss": 0.0486, + "step": 58300 + }, + { + "epoch": 12.015804582137248, + "grad_norm": 0.06941176950931549, + "learning_rate": 3.800995384161717e-05, + "loss": 0.0625, + "step": 58310 + }, + { + "epoch": 12.01585874451606, + "grad_norm": 0.09309997409582138, + "learning_rate": 3.8006944820572076e-05, + "loss": 0.0915, + "step": 58320 + }, + { + "epoch": 12.01591290689487, + "grad_norm": 0.07464683800935745, + "learning_rate": 3.800393579952698e-05, + "loss": 0.0788, + "step": 58330 + }, + { + "epoch": 12.015967069273682, + "grad_norm": 0.07689011842012405, + "learning_rate": 3.800092677848189e-05, + "loss": 0.0685, + "step": 58340 + }, + { + "epoch": 12.016021231652495, + "grad_norm": 0.06558187305927277, + "learning_rate": 3.7997917757436795e-05, + "loss": 0.0137, + "step": 58350 + }, + { + "epoch": 12.016075394031306, + "grad_norm": 0.682136058807373, + "learning_rate": 3.799490873639171e-05, + "loss": 0.0716, + "step": 58360 + }, + { + "epoch": 12.016129556410117, + "grad_norm": 0.03329591453075409, + "learning_rate": 3.7991899715346614e-05, + "loss": 0.0348, + "step": 58370 + }, + { + "epoch": 12.016183718788929, + "grad_norm": 0.6136599779129028, + "learning_rate": 3.798889069430151e-05, + "loss": 0.0581, + "step": 58380 + }, + { + "epoch": 12.016237881167742, + "grad_norm": 0.0024306869599968195, + "learning_rate": 3.7985881673256426e-05, + "loss": 0.0736, + "step": 58390 + }, + { + "epoch": 12.016292043546553, + "grad_norm": 0.002223298652097583, + "learning_rate": 3.798287265221133e-05, + "loss": 0.0441, + "step": 58400 + }, + { + "epoch": 12.016346205925364, + "grad_norm": 0.12092767655849457, + "learning_rate": 3.797986363116624e-05, + "loss": 0.0314, + "step": 58410 + }, + { + "epoch": 12.016400368304176, + "grad_norm": 0.06005418300628662, + "learning_rate": 3.7976854610121145e-05, + "loss": 0.0723, + "step": 58420 + }, + { + "epoch": 12.016454530682987, + "grad_norm": 0.2258281111717224, + "learning_rate": 3.797384558907605e-05, + "loss": 0.0779, + "step": 58430 + }, + { + "epoch": 12.0165086930618, + "grad_norm": 0.002733479719609022, + "learning_rate": 3.797083656803096e-05, + "loss": 0.0398, + "step": 58440 + }, + { + "epoch": 12.016562855440611, + "grad_norm": 2.801384687423706, + "learning_rate": 3.796782754698587e-05, + "loss": 0.0557, + "step": 58450 + }, + { + "epoch": 12.016617017819422, + "grad_norm": 0.20938754081726074, + "learning_rate": 3.796481852594077e-05, + "loss": 0.0081, + "step": 58460 + }, + { + "epoch": 12.016671180198234, + "grad_norm": 0.00213070772588253, + "learning_rate": 3.7961809504895677e-05, + "loss": 0.0835, + "step": 58470 + }, + { + "epoch": 12.016725342577047, + "grad_norm": 2.266772985458374, + "learning_rate": 3.795880048385059e-05, + "loss": 0.2305, + "step": 58480 + }, + { + "epoch": 12.016779504955858, + "grad_norm": 0.0034431153908371925, + "learning_rate": 3.795579146280549e-05, + "loss": 0.0336, + "step": 58490 + }, + { + "epoch": 12.01683366733467, + "grad_norm": 0.09353676438331604, + "learning_rate": 3.7952782441760395e-05, + "loss": 0.0771, + "step": 58500 + }, + { + "epoch": 12.01688782971348, + "grad_norm": 0.1152757778763771, + "learning_rate": 3.794977342071531e-05, + "loss": 0.0556, + "step": 58510 + }, + { + "epoch": 12.016941992092292, + "grad_norm": 0.24220092594623566, + "learning_rate": 3.7946764399670215e-05, + "loss": 0.1223, + 
"step": 58520 + }, + { + "epoch": 12.016996154471105, + "grad_norm": 0.6115189790725708, + "learning_rate": 3.794375537862512e-05, + "loss": 0.0392, + "step": 58530 + }, + { + "epoch": 12.017050316849916, + "grad_norm": 0.0037481221370399, + "learning_rate": 3.794074635758003e-05, + "loss": 0.0122, + "step": 58540 + }, + { + "epoch": 12.017104479228728, + "grad_norm": 0.008501212112605572, + "learning_rate": 3.7937737336534934e-05, + "loss": 0.0619, + "step": 58550 + }, + { + "epoch": 12.017158641607539, + "grad_norm": 0.002282607601955533, + "learning_rate": 3.793472831548984e-05, + "loss": 0.0423, + "step": 58560 + }, + { + "epoch": 12.017212803986352, + "grad_norm": 0.35390719771385193, + "learning_rate": 3.7931719294444746e-05, + "loss": 0.0144, + "step": 58570 + }, + { + "epoch": 12.017266966365163, + "grad_norm": 0.0036749260034412146, + "learning_rate": 3.792871027339965e-05, + "loss": 0.0681, + "step": 58580 + }, + { + "epoch": 12.017321128743975, + "grad_norm": 0.5217741131782532, + "learning_rate": 3.792570125235456e-05, + "loss": 0.0024, + "step": 58590 + }, + { + "epoch": 12.017375291122786, + "grad_norm": 0.00856512039899826, + "learning_rate": 3.792269223130947e-05, + "loss": 0.1358, + "step": 58600 + }, + { + "epoch": 12.017429453501597, + "grad_norm": 1.6419920921325684, + "learning_rate": 3.791968321026437e-05, + "loss": 0.075, + "step": 58610 + }, + { + "epoch": 12.01748361588041, + "grad_norm": 0.23168998956680298, + "learning_rate": 3.7916674189219284e-05, + "loss": 0.0641, + "step": 58620 + }, + { + "epoch": 12.017537778259221, + "grad_norm": 0.1177639439702034, + "learning_rate": 3.791366516817419e-05, + "loss": 0.1168, + "step": 58630 + }, + { + "epoch": 12.017591940638033, + "grad_norm": 0.3803172707557678, + "learning_rate": 3.791065614712909e-05, + "loss": 0.0172, + "step": 58640 + }, + { + "epoch": 12.017646103016844, + "grad_norm": 0.3481379747390747, + "learning_rate": 3.7907647126084e-05, + "loss": 0.0683, + "step": 58650 + }, + { + "epoch": 12.017700265395657, + "grad_norm": 0.03338577598333359, + "learning_rate": 3.790463810503891e-05, + "loss": 0.0727, + "step": 58660 + }, + { + "epoch": 12.017754427774468, + "grad_norm": 0.11061809957027435, + "learning_rate": 3.7901629083993816e-05, + "loss": 0.0749, + "step": 58670 + }, + { + "epoch": 12.01780859015328, + "grad_norm": 0.2479606568813324, + "learning_rate": 3.789862006294872e-05, + "loss": 0.0445, + "step": 58680 + }, + { + "epoch": 12.017862752532091, + "grad_norm": 0.6353053450584412, + "learning_rate": 3.789561104190363e-05, + "loss": 0.1079, + "step": 58690 + }, + { + "epoch": 12.017916914910902, + "grad_norm": 0.002080328995361924, + "learning_rate": 3.7892602020858534e-05, + "loss": 0.0233, + "step": 58700 + }, + { + "epoch": 12.017971077289715, + "grad_norm": 0.017155662178993225, + "learning_rate": 3.788959299981345e-05, + "loss": 0.0481, + "step": 58710 + }, + { + "epoch": 12.018025239668527, + "grad_norm": 0.18821431696414948, + "learning_rate": 3.788658397876835e-05, + "loss": 0.0502, + "step": 58720 + }, + { + "epoch": 12.018079402047338, + "grad_norm": 0.06067870929837227, + "learning_rate": 3.788357495772325e-05, + "loss": 0.0859, + "step": 58730 + }, + { + "epoch": 12.018133564426149, + "grad_norm": 0.2835168242454529, + "learning_rate": 3.7880565936678166e-05, + "loss": 0.0123, + "step": 58740 + }, + { + "epoch": 12.01818772680496, + "grad_norm": 0.05223696306347847, + "learning_rate": 3.787755691563307e-05, + "loss": 0.029, + "step": 58750 + }, + { + "epoch": 12.018241889183773, + 
"grad_norm": 0.13070213794708252, + "learning_rate": 3.787454789458797e-05, + "loss": 0.2451, + "step": 58760 + }, + { + "epoch": 12.018296051562585, + "grad_norm": 17.11248016357422, + "learning_rate": 3.7871538873542885e-05, + "loss": 0.1192, + "step": 58770 + }, + { + "epoch": 12.018350213941396, + "grad_norm": 1.8217995166778564, + "learning_rate": 3.786852985249779e-05, + "loss": 0.0656, + "step": 58780 + }, + { + "epoch": 12.018404376320207, + "grad_norm": 0.4523073136806488, + "learning_rate": 3.78655208314527e-05, + "loss": 0.0297, + "step": 58790 + }, + { + "epoch": 12.01845853869902, + "grad_norm": 0.5759626030921936, + "learning_rate": 3.7862511810407604e-05, + "loss": 0.0505, + "step": 58800 + }, + { + "epoch": 12.018512701077832, + "grad_norm": 0.32369565963745117, + "learning_rate": 3.785950278936251e-05, + "loss": 0.1295, + "step": 58810 + }, + { + "epoch": 12.018566863456643, + "grad_norm": 0.061983622610569, + "learning_rate": 3.7856493768317417e-05, + "loss": 0.1418, + "step": 58820 + }, + { + "epoch": 12.018621025835454, + "grad_norm": 2.3043203353881836, + "learning_rate": 3.785348474727232e-05, + "loss": 0.0765, + "step": 58830 + }, + { + "epoch": 12.018675188214266, + "grad_norm": 0.07889227569103241, + "learning_rate": 3.785047572622723e-05, + "loss": 0.0574, + "step": 58840 + }, + { + "epoch": 12.018729350593079, + "grad_norm": 0.02061455510556698, + "learning_rate": 3.7847466705182135e-05, + "loss": 0.0905, + "step": 58850 + }, + { + "epoch": 12.01878351297189, + "grad_norm": 0.17631949484348297, + "learning_rate": 3.784445768413705e-05, + "loss": 0.1071, + "step": 58860 + }, + { + "epoch": 12.018837675350701, + "grad_norm": 0.022423341870307922, + "learning_rate": 3.784144866309195e-05, + "loss": 0.0179, + "step": 58870 + }, + { + "epoch": 12.018891837729512, + "grad_norm": 0.026822395622730255, + "learning_rate": 3.783843964204686e-05, + "loss": 0.0301, + "step": 58880 + }, + { + "epoch": 12.018946000108325, + "grad_norm": 8.612110137939453, + "learning_rate": 3.783543062100177e-05, + "loss": 0.0513, + "step": 58890 + }, + { + "epoch": 12.019000162487137, + "grad_norm": 0.41014569997787476, + "learning_rate": 3.7832421599956674e-05, + "loss": 0.1369, + "step": 58900 + }, + { + "epoch": 12.019054324865948, + "grad_norm": 0.002666891785338521, + "learning_rate": 3.782941257891158e-05, + "loss": 0.0719, + "step": 58910 + }, + { + "epoch": 12.01910848724476, + "grad_norm": 0.04378601908683777, + "learning_rate": 3.7826403557866486e-05, + "loss": 0.1057, + "step": 58920 + }, + { + "epoch": 12.01916264962357, + "grad_norm": 0.41140493750572205, + "learning_rate": 3.782339453682139e-05, + "loss": 0.0627, + "step": 58930 + }, + { + "epoch": 12.019216812002384, + "grad_norm": 0.42408132553100586, + "learning_rate": 3.78203855157763e-05, + "loss": 0.0073, + "step": 58940 + }, + { + "epoch": 12.019270974381195, + "grad_norm": 0.6350623965263367, + "learning_rate": 3.7817376494731205e-05, + "loss": 0.0159, + "step": 58950 + }, + { + "epoch": 12.019325136760006, + "grad_norm": 0.014456123113632202, + "learning_rate": 3.781436747368611e-05, + "loss": 0.0131, + "step": 58960 + }, + { + "epoch": 12.019379299138818, + "grad_norm": 0.02912428230047226, + "learning_rate": 3.7811358452641024e-05, + "loss": 0.0617, + "step": 58970 + }, + { + "epoch": 12.01943346151763, + "grad_norm": 0.33499476313591003, + "learning_rate": 3.7808349431595924e-05, + "loss": 0.0606, + "step": 58980 + }, + { + "epoch": 12.019487623896442, + "grad_norm": 1.3465502262115479, + "learning_rate": 
3.780534041055083e-05, + "loss": 0.2178, + "step": 58990 + }, + { + "epoch": 12.019541786275253, + "grad_norm": 0.044335443526506424, + "learning_rate": 3.780233138950574e-05, + "loss": 0.0355, + "step": 59000 + }, + { + "epoch": 12.019595948654064, + "grad_norm": 0.7059941291809082, + "learning_rate": 3.779932236846065e-05, + "loss": 0.1743, + "step": 59010 + }, + { + "epoch": 12.019650111032876, + "grad_norm": 0.06174507364630699, + "learning_rate": 3.779631334741555e-05, + "loss": 0.1879, + "step": 59020 + }, + { + "epoch": 12.019704273411689, + "grad_norm": 1.584817886352539, + "learning_rate": 3.779330432637046e-05, + "loss": 0.0859, + "step": 59030 + }, + { + "epoch": 12.0197584357905, + "grad_norm": 0.06104975566267967, + "learning_rate": 3.779029530532537e-05, + "loss": 0.0326, + "step": 59040 + }, + { + "epoch": 12.019812598169311, + "grad_norm": 1.2845083475112915, + "learning_rate": 3.7787286284280274e-05, + "loss": 0.0674, + "step": 59050 + }, + { + "epoch": 12.019866760548123, + "grad_norm": 0.6174518465995789, + "learning_rate": 3.778427726323518e-05, + "loss": 0.0673, + "step": 59060 + }, + { + "epoch": 12.019920922926936, + "grad_norm": 0.016569139435887337, + "learning_rate": 3.778126824219009e-05, + "loss": 0.1304, + "step": 59070 + }, + { + "epoch": 12.019975085305747, + "grad_norm": 0.5474854111671448, + "learning_rate": 3.777825922114499e-05, + "loss": 0.0736, + "step": 59080 + }, + { + "epoch": 12.020029247684558, + "grad_norm": 2.1259403228759766, + "learning_rate": 3.77752502000999e-05, + "loss": 0.0967, + "step": 59090 + }, + { + "epoch": 12.02008341006337, + "grad_norm": 0.005942412652075291, + "learning_rate": 3.7772241179054806e-05, + "loss": 0.0621, + "step": 59100 + }, + { + "epoch": 12.02013757244218, + "grad_norm": 0.0054880003444850445, + "learning_rate": 3.776923215800971e-05, + "loss": 0.007, + "step": 59110 + }, + { + "epoch": 12.020191734820994, + "grad_norm": 0.00993729755282402, + "learning_rate": 3.7766223136964625e-05, + "loss": 0.0116, + "step": 59120 + }, + { + "epoch": 12.020245897199805, + "grad_norm": 0.003834392409771681, + "learning_rate": 3.7763214115919525e-05, + "loss": 0.1108, + "step": 59130 + }, + { + "epoch": 12.020300059578616, + "grad_norm": 0.09911950677633286, + "learning_rate": 3.776020509487444e-05, + "loss": 0.0988, + "step": 59140 + }, + { + "epoch": 12.020354221957428, + "grad_norm": 0.031919367611408234, + "learning_rate": 3.7757196073829344e-05, + "loss": 0.0328, + "step": 59150 + }, + { + "epoch": 12.02040838433624, + "grad_norm": 0.033664822578430176, + "learning_rate": 3.775418705278425e-05, + "loss": 0.0117, + "step": 59160 + }, + { + "epoch": 12.020462546715052, + "grad_norm": 1.3551996946334839, + "learning_rate": 3.7751178031739156e-05, + "loss": 0.0585, + "step": 59170 + }, + { + "epoch": 12.020516709093863, + "grad_norm": 0.003075802931562066, + "learning_rate": 3.774816901069406e-05, + "loss": 0.0195, + "step": 59180 + }, + { + "epoch": 12.020570871472675, + "grad_norm": 0.6108631491661072, + "learning_rate": 3.774515998964897e-05, + "loss": 0.0658, + "step": 59190 + }, + { + "epoch": 12.020625033851486, + "grad_norm": 0.0024947216734290123, + "learning_rate": 3.774215096860388e-05, + "loss": 0.0367, + "step": 59200 + }, + { + "epoch": 12.020679196230299, + "grad_norm": 2.010735511779785, + "learning_rate": 3.773914194755878e-05, + "loss": 0.0275, + "step": 59210 + }, + { + "epoch": 12.02073335860911, + "grad_norm": 0.007301524747163057, + "learning_rate": 3.773613292651369e-05, + "loss": 0.0647, + "step": 59220 + 
}, + { + "epoch": 12.020787520987922, + "grad_norm": 0.0027816076762974262, + "learning_rate": 3.77331239054686e-05, + "loss": 0.0989, + "step": 59230 + }, + { + "epoch": 12.020841683366733, + "grad_norm": 8.84852409362793, + "learning_rate": 3.77301148844235e-05, + "loss": 0.1567, + "step": 59240 + }, + { + "epoch": 12.020895845745546, + "grad_norm": 0.12345722317695618, + "learning_rate": 3.772710586337841e-05, + "loss": 0.0408, + "step": 59250 + }, + { + "epoch": 12.020950008124357, + "grad_norm": 2.5554025173187256, + "learning_rate": 3.772409684233332e-05, + "loss": 0.0518, + "step": 59260 + }, + { + "epoch": 12.021004170503168, + "grad_norm": 0.05884522572159767, + "learning_rate": 3.7721087821288226e-05, + "loss": 0.0633, + "step": 59270 + }, + { + "epoch": 12.02105833288198, + "grad_norm": 5.839078903198242, + "learning_rate": 3.7718078800243125e-05, + "loss": 0.1524, + "step": 59280 + }, + { + "epoch": 12.021112495260791, + "grad_norm": 0.034284114837646484, + "learning_rate": 3.771506977919804e-05, + "loss": 0.0729, + "step": 59290 + }, + { + "epoch": 12.021166657639604, + "grad_norm": 0.265641987323761, + "learning_rate": 3.7712060758152945e-05, + "loss": 0.0891, + "step": 59300 + }, + { + "epoch": 12.021220820018415, + "grad_norm": 0.2678193151950836, + "learning_rate": 3.770905173710785e-05, + "loss": 0.0748, + "step": 59310 + }, + { + "epoch": 12.021274982397227, + "grad_norm": 0.036483775824308395, + "learning_rate": 3.770604271606276e-05, + "loss": 0.0754, + "step": 59320 + }, + { + "epoch": 12.021329144776038, + "grad_norm": 0.0028388118371367455, + "learning_rate": 3.7703033695017664e-05, + "loss": 0.1342, + "step": 59330 + }, + { + "epoch": 12.021383307154851, + "grad_norm": 0.4902924597263336, + "learning_rate": 3.770002467397257e-05, + "loss": 0.1413, + "step": 59340 + }, + { + "epoch": 12.021437469533662, + "grad_norm": 0.00373762845993042, + "learning_rate": 3.769701565292748e-05, + "loss": 0.2148, + "step": 59350 + }, + { + "epoch": 12.021491631912474, + "grad_norm": 2.9387247562408447, + "learning_rate": 3.769400663188238e-05, + "loss": 0.0576, + "step": 59360 + }, + { + "epoch": 12.021545794291285, + "grad_norm": 0.004384258762001991, + "learning_rate": 3.769099761083729e-05, + "loss": 0.0482, + "step": 59370 + }, + { + "epoch": 12.021599956670096, + "grad_norm": 0.22146141529083252, + "learning_rate": 3.76879885897922e-05, + "loss": 0.085, + "step": 59380 + }, + { + "epoch": 12.02165411904891, + "grad_norm": 0.07093430310487747, + "learning_rate": 3.76849795687471e-05, + "loss": 0.0256, + "step": 59390 + }, + { + "epoch": 12.02170828142772, + "grad_norm": 0.00311722862534225, + "learning_rate": 3.7681970547702014e-05, + "loss": 0.0382, + "step": 59400 + }, + { + "epoch": 12.021762443806532, + "grad_norm": 5.025403022766113, + "learning_rate": 3.767896152665692e-05, + "loss": 0.0867, + "step": 59410 + }, + { + "epoch": 12.021816606185343, + "grad_norm": 5.718755722045898, + "learning_rate": 3.767595250561183e-05, + "loss": 0.0507, + "step": 59420 + }, + { + "epoch": 12.021870768564156, + "grad_norm": 0.21814852952957153, + "learning_rate": 3.767294348456673e-05, + "loss": 0.1053, + "step": 59430 + }, + { + "epoch": 12.021924930942967, + "grad_norm": 1.4842605590820312, + "learning_rate": 3.766993446352164e-05, + "loss": 0.1655, + "step": 59440 + }, + { + "epoch": 12.021979093321779, + "grad_norm": 0.022775234654545784, + "learning_rate": 3.7666925442476546e-05, + "loss": 0.1332, + "step": 59450 + }, + { + "epoch": 12.02203325570059, + "grad_norm": 
0.1891983449459076, + "learning_rate": 3.766391642143146e-05, + "loss": 0.0593, + "step": 59460 + }, + { + "epoch": 12.022087418079401, + "grad_norm": 0.1258162558078766, + "learning_rate": 3.766090740038636e-05, + "loss": 0.0533, + "step": 59470 + }, + { + "epoch": 12.022141580458214, + "grad_norm": 0.0033561941236257553, + "learning_rate": 3.7657898379341265e-05, + "loss": 0.0052, + "step": 59480 + }, + { + "epoch": 12.022195742837026, + "grad_norm": 0.0050775413401424885, + "learning_rate": 3.765488935829618e-05, + "loss": 0.0591, + "step": 59490 + }, + { + "epoch": 12.022249905215837, + "grad_norm": 0.3800457715988159, + "learning_rate": 3.7651880337251084e-05, + "loss": 0.0754, + "step": 59500 + }, + { + "epoch": 12.022304067594648, + "grad_norm": 0.2778203785419464, + "learning_rate": 3.764887131620598e-05, + "loss": 0.0732, + "step": 59510 + }, + { + "epoch": 12.022358229973461, + "grad_norm": 0.03427019715309143, + "learning_rate": 3.7645862295160896e-05, + "loss": 0.075, + "step": 59520 + }, + { + "epoch": 12.022412392352273, + "grad_norm": 0.07010390609502792, + "learning_rate": 3.76428532741158e-05, + "loss": 0.0463, + "step": 59530 + }, + { + "epoch": 12.022466554731084, + "grad_norm": 0.00445995619520545, + "learning_rate": 3.76398442530707e-05, + "loss": 0.0324, + "step": 59540 + }, + { + "epoch": 12.022520717109895, + "grad_norm": 6.092194557189941, + "learning_rate": 3.7636835232025615e-05, + "loss": 0.0458, + "step": 59550 + }, + { + "epoch": 12.022574879488706, + "grad_norm": 0.03242335468530655, + "learning_rate": 3.763382621098052e-05, + "loss": 0.0948, + "step": 59560 + }, + { + "epoch": 12.02262904186752, + "grad_norm": 0.2137550264596939, + "learning_rate": 3.763081718993543e-05, + "loss": 0.103, + "step": 59570 + }, + { + "epoch": 12.02268320424633, + "grad_norm": 0.24608875811100006, + "learning_rate": 3.7627808168890334e-05, + "loss": 0.0839, + "step": 59580 + }, + { + "epoch": 12.022737366625142, + "grad_norm": 0.1488351970911026, + "learning_rate": 3.762479914784524e-05, + "loss": 0.0254, + "step": 59590 + }, + { + "epoch": 12.022791529003953, + "grad_norm": 4.117066383361816, + "learning_rate": 3.762179012680015e-05, + "loss": 0.0692, + "step": 59600 + }, + { + "epoch": 12.022845691382766, + "grad_norm": 0.13741368055343628, + "learning_rate": 3.761878110575506e-05, + "loss": 0.0824, + "step": 59610 + }, + { + "epoch": 12.022899853761578, + "grad_norm": 0.08426281809806824, + "learning_rate": 3.761577208470996e-05, + "loss": 0.0303, + "step": 59620 + }, + { + "epoch": 12.022954016140389, + "grad_norm": 0.007731171790510416, + "learning_rate": 3.7612763063664865e-05, + "loss": 0.0423, + "step": 59630 + }, + { + "epoch": 12.0230081785192, + "grad_norm": 0.004443040117621422, + "learning_rate": 3.760975404261978e-05, + "loss": 0.1009, + "step": 59640 + }, + { + "epoch": 12.023062340898012, + "grad_norm": 1.2430890798568726, + "learning_rate": 3.7606745021574685e-05, + "loss": 0.0702, + "step": 59650 + }, + { + "epoch": 12.023116503276825, + "grad_norm": 0.013343242928385735, + "learning_rate": 3.760373600052959e-05, + "loss": 0.1082, + "step": 59660 + }, + { + "epoch": 12.023170665655636, + "grad_norm": 0.1263381689786911, + "learning_rate": 3.76007269794845e-05, + "loss": 0.078, + "step": 59670 + }, + { + "epoch": 12.023224828034447, + "grad_norm": 2.1942660808563232, + "learning_rate": 3.7597717958439404e-05, + "loss": 0.0223, + "step": 59680 + }, + { + "epoch": 12.023278990413258, + "grad_norm": 0.7269461154937744, + "learning_rate": 3.759470893739431e-05, + 
"loss": 0.0244, + "step": 59690 + }, + { + "epoch": 12.023333152792071, + "grad_norm": 0.0976325273513794, + "learning_rate": 3.7591699916349216e-05, + "loss": 0.0201, + "step": 59700 + }, + { + "epoch": 12.023387315170883, + "grad_norm": 3.9471185207366943, + "learning_rate": 3.758869089530412e-05, + "loss": 0.0729, + "step": 59710 + }, + { + "epoch": 12.023441477549694, + "grad_norm": 0.028233278542757034, + "learning_rate": 3.7585681874259035e-05, + "loss": 0.0085, + "step": 59720 + }, + { + "epoch": 12.023495639928505, + "grad_norm": 0.09579507261514664, + "learning_rate": 3.7582672853213935e-05, + "loss": 0.0661, + "step": 59730 + }, + { + "epoch": 12.023549802307317, + "grad_norm": 0.009651805274188519, + "learning_rate": 3.757966383216884e-05, + "loss": 0.0038, + "step": 59740 + }, + { + "epoch": 12.02360396468613, + "grad_norm": 0.34944599866867065, + "learning_rate": 3.7576654811123754e-05, + "loss": 0.053, + "step": 59750 + }, + { + "epoch": 12.023658127064941, + "grad_norm": 0.0018142089247703552, + "learning_rate": 3.757364579007866e-05, + "loss": 0.0462, + "step": 59760 + }, + { + "epoch": 12.023712289443752, + "grad_norm": 0.01992926001548767, + "learning_rate": 3.757063676903356e-05, + "loss": 0.0163, + "step": 59770 + }, + { + "epoch": 12.023766451822564, + "grad_norm": 0.022873448207974434, + "learning_rate": 3.756762774798847e-05, + "loss": 0.0648, + "step": 59780 + }, + { + "epoch": 12.023820614201377, + "grad_norm": 0.5190929174423218, + "learning_rate": 3.756461872694338e-05, + "loss": 0.1406, + "step": 59790 + }, + { + "epoch": 12.023874776580188, + "grad_norm": 2.1210761070251465, + "learning_rate": 3.7561609705898286e-05, + "loss": 0.0566, + "step": 59800 + }, + { + "epoch": 12.023928938959, + "grad_norm": 0.17606382071971893, + "learning_rate": 3.755860068485319e-05, + "loss": 0.079, + "step": 59810 + }, + { + "epoch": 12.02398310133781, + "grad_norm": 0.02733912132680416, + "learning_rate": 3.75555916638081e-05, + "loss": 0.0811, + "step": 59820 + }, + { + "epoch": 12.024037263716622, + "grad_norm": 0.5483922958374023, + "learning_rate": 3.7552582642763004e-05, + "loss": 0.1918, + "step": 59830 + }, + { + "epoch": 12.024091426095435, + "grad_norm": 0.002457784488797188, + "learning_rate": 3.754957362171791e-05, + "loss": 0.0541, + "step": 59840 + }, + { + "epoch": 12.024145588474246, + "grad_norm": 3.6737475395202637, + "learning_rate": 3.754656460067282e-05, + "loss": 0.1519, + "step": 59850 + }, + { + "epoch": 12.024199750853057, + "grad_norm": 0.36016178131103516, + "learning_rate": 3.754355557962772e-05, + "loss": 0.0792, + "step": 59860 + }, + { + "epoch": 12.024253913231869, + "grad_norm": 0.09477025270462036, + "learning_rate": 3.7540546558582636e-05, + "loss": 0.0911, + "step": 59870 + }, + { + "epoch": 12.02430807561068, + "grad_norm": 0.4160229563713074, + "learning_rate": 3.7537537537537536e-05, + "loss": 0.0846, + "step": 59880 + }, + { + "epoch": 12.024362237989493, + "grad_norm": 0.11907178163528442, + "learning_rate": 3.753452851649244e-05, + "loss": 0.115, + "step": 59890 + }, + { + "epoch": 12.024416400368304, + "grad_norm": 0.04686322808265686, + "learning_rate": 3.7531519495447355e-05, + "loss": 0.032, + "step": 59900 + }, + { + "epoch": 12.024470562747116, + "grad_norm": 0.005115265026688576, + "learning_rate": 3.752851047440226e-05, + "loss": 0.0615, + "step": 59910 + }, + { + "epoch": 12.024524725125927, + "grad_norm": 0.07348153740167618, + "learning_rate": 3.752550145335717e-05, + "loss": 0.0466, + "step": 59920 + }, + { + "epoch": 
12.02457888750474, + "grad_norm": 0.751617968082428, + "learning_rate": 3.7522492432312074e-05, + "loss": 0.0174, + "step": 59930 + }, + { + "epoch": 12.024633049883551, + "grad_norm": 0.002724389312788844, + "learning_rate": 3.751948341126698e-05, + "loss": 0.0909, + "step": 59940 + }, + { + "epoch": 12.024687212262362, + "grad_norm": 1.0604246854782104, + "learning_rate": 3.7516474390221887e-05, + "loss": 0.1341, + "step": 59950 + }, + { + "epoch": 12.024741374641174, + "grad_norm": 0.6433671712875366, + "learning_rate": 3.751346536917679e-05, + "loss": 0.0376, + "step": 59960 + }, + { + "epoch": 12.024795537019985, + "grad_norm": 0.11947473138570786, + "learning_rate": 3.75104563481317e-05, + "loss": 0.0365, + "step": 59970 + }, + { + "epoch": 12.024849699398798, + "grad_norm": 0.002105499617755413, + "learning_rate": 3.750744732708661e-05, + "loss": 0.1783, + "step": 59980 + }, + { + "epoch": 12.02490386177761, + "grad_norm": 0.30424466729164124, + "learning_rate": 3.750443830604151e-05, + "loss": 0.0044, + "step": 59990 + }, + { + "epoch": 12.02495802415642, + "grad_norm": 0.2941819429397583, + "learning_rate": 3.750142928499642e-05, + "loss": 0.026, + "step": 60000 + }, + { + "epoch": 12.025001354059471, + "eval_accuracy": 0.8357282821685174, + "eval_loss": 0.6877176761627197, + "eval_runtime": 116.0134, + "eval_samples_per_second": 26.394, + "eval_steps_per_second": 3.301, + "step": 60008 + }, + { + "epoch": 13.000010832475763, + "grad_norm": 0.011731157079339027, + "learning_rate": 3.749842026395133e-05, + "loss": 0.1243, + "step": 60010 + }, + { + "epoch": 13.000064994854574, + "grad_norm": 0.0062983171083033085, + "learning_rate": 3.749541124290624e-05, + "loss": 0.0095, + "step": 60020 + }, + { + "epoch": 13.000119157233385, + "grad_norm": 4.019908905029297, + "learning_rate": 3.749240222186114e-05, + "loss": 0.1757, + "step": 60030 + }, + { + "epoch": 13.000173319612198, + "grad_norm": 0.08693371713161469, + "learning_rate": 3.748939320081605e-05, + "loss": 0.0479, + "step": 60040 + }, + { + "epoch": 13.00022748199101, + "grad_norm": 0.16534559428691864, + "learning_rate": 3.7486384179770956e-05, + "loss": 0.0201, + "step": 60050 + }, + { + "epoch": 13.00028164436982, + "grad_norm": 0.07257046550512314, + "learning_rate": 3.748337515872586e-05, + "loss": 0.1211, + "step": 60060 + }, + { + "epoch": 13.000335806748632, + "grad_norm": 5.985147953033447, + "learning_rate": 3.748036613768077e-05, + "loss": 0.0762, + "step": 60070 + }, + { + "epoch": 13.000389969127443, + "grad_norm": 1.3022407293319702, + "learning_rate": 3.7477357116635675e-05, + "loss": 0.0294, + "step": 60080 + }, + { + "epoch": 13.000444131506256, + "grad_norm": 0.8055107593536377, + "learning_rate": 3.747434809559058e-05, + "loss": 0.0095, + "step": 60090 + }, + { + "epoch": 13.000498293885068, + "grad_norm": 3.308295726776123, + "learning_rate": 3.7471339074545494e-05, + "loss": 0.0884, + "step": 60100 + }, + { + "epoch": 13.000552456263879, + "grad_norm": 0.003296217881143093, + "learning_rate": 3.7468330053500394e-05, + "loss": 0.0248, + "step": 60110 + }, + { + "epoch": 13.00060661864269, + "grad_norm": 0.004649667534977198, + "learning_rate": 3.74653210324553e-05, + "loss": 0.1129, + "step": 60120 + }, + { + "epoch": 13.000660781021502, + "grad_norm": 0.04186655953526497, + "learning_rate": 3.746231201141021e-05, + "loss": 0.0145, + "step": 60130 + }, + { + "epoch": 13.000714943400315, + "grad_norm": 0.003856499446555972, + "learning_rate": 3.745930299036511e-05, + "loss": 0.0447, + "step": 60140 + }, + { 
+ "epoch": 13.000769105779126, + "grad_norm": 0.04871063679456711, + "learning_rate": 3.745629396932002e-05, + "loss": 0.0437, + "step": 60150 + }, + { + "epoch": 13.000823268157937, + "grad_norm": 0.03662370517849922, + "learning_rate": 3.745328494827493e-05, + "loss": 0.0079, + "step": 60160 + }, + { + "epoch": 13.000877430536748, + "grad_norm": 0.003178161336109042, + "learning_rate": 3.745027592722984e-05, + "loss": 0.0453, + "step": 60170 + }, + { + "epoch": 13.000931592915562, + "grad_norm": 0.006899307016283274, + "learning_rate": 3.7447266906184744e-05, + "loss": 0.0186, + "step": 60180 + }, + { + "epoch": 13.000985755294373, + "grad_norm": 2.7362582683563232, + "learning_rate": 3.744425788513965e-05, + "loss": 0.0847, + "step": 60190 + }, + { + "epoch": 13.001039917673184, + "grad_norm": 0.004149832297116518, + "learning_rate": 3.744124886409456e-05, + "loss": 0.0363, + "step": 60200 + }, + { + "epoch": 13.001094080051995, + "grad_norm": 0.024759195744991302, + "learning_rate": 3.743823984304946e-05, + "loss": 0.0338, + "step": 60210 + }, + { + "epoch": 13.001148242430807, + "grad_norm": 0.0021187402307987213, + "learning_rate": 3.743523082200437e-05, + "loss": 0.0203, + "step": 60220 + }, + { + "epoch": 13.00120240480962, + "grad_norm": 0.0020041693933308125, + "learning_rate": 3.7432221800959276e-05, + "loss": 0.1843, + "step": 60230 + }, + { + "epoch": 13.001256567188431, + "grad_norm": 0.0026356996968388557, + "learning_rate": 3.742921277991419e-05, + "loss": 0.1014, + "step": 60240 + }, + { + "epoch": 13.001310729567242, + "grad_norm": 0.0928870141506195, + "learning_rate": 3.7426203758869095e-05, + "loss": 0.1183, + "step": 60250 + }, + { + "epoch": 13.001364891946054, + "grad_norm": 0.04743451997637749, + "learning_rate": 3.7423194737823995e-05, + "loss": 0.0185, + "step": 60260 + }, + { + "epoch": 13.001419054324867, + "grad_norm": 0.01638159342110157, + "learning_rate": 3.742018571677891e-05, + "loss": 0.0396, + "step": 60270 + }, + { + "epoch": 13.001473216703678, + "grad_norm": 0.007583598606288433, + "learning_rate": 3.7417176695733814e-05, + "loss": 0.0255, + "step": 60280 + }, + { + "epoch": 13.00152737908249, + "grad_norm": 2.793285846710205, + "learning_rate": 3.7414167674688713e-05, + "loss": 0.108, + "step": 60290 + }, + { + "epoch": 13.0015815414613, + "grad_norm": 0.31219884753227234, + "learning_rate": 3.7411158653643626e-05, + "loss": 0.0136, + "step": 60300 + }, + { + "epoch": 13.001635703840112, + "grad_norm": 0.0077928099781274796, + "learning_rate": 3.740814963259853e-05, + "loss": 0.07, + "step": 60310 + }, + { + "epoch": 13.001689866218925, + "grad_norm": 2.657029867172241, + "learning_rate": 3.740514061155344e-05, + "loss": 0.0121, + "step": 60320 + }, + { + "epoch": 13.001744028597736, + "grad_norm": 5.98693323135376, + "learning_rate": 3.7402131590508345e-05, + "loss": 0.0759, + "step": 60330 + }, + { + "epoch": 13.001798190976547, + "grad_norm": 5.8365678787231445, + "learning_rate": 3.739912256946325e-05, + "loss": 0.0885, + "step": 60340 + }, + { + "epoch": 13.001852353355359, + "grad_norm": 0.014692327938973904, + "learning_rate": 3.739611354841816e-05, + "loss": 0.0497, + "step": 60350 + }, + { + "epoch": 13.001906515734172, + "grad_norm": 0.005435094237327576, + "learning_rate": 3.739310452737307e-05, + "loss": 0.0316, + "step": 60360 + }, + { + "epoch": 13.001960678112983, + "grad_norm": 0.00862909760326147, + "learning_rate": 3.739009550632797e-05, + "loss": 0.0019, + "step": 60370 + }, + { + "epoch": 13.002014840491794, + "grad_norm": 
0.021649204194545746, + "learning_rate": 3.738708648528288e-05, + "loss": 0.0675, + "step": 60380 + }, + { + "epoch": 13.002069002870606, + "grad_norm": 2.9878220558166504, + "learning_rate": 3.738407746423779e-05, + "loss": 0.1653, + "step": 60390 + }, + { + "epoch": 13.002123165249417, + "grad_norm": 0.0032821535132825375, + "learning_rate": 3.7381068443192696e-05, + "loss": 0.1064, + "step": 60400 + }, + { + "epoch": 13.00217732762823, + "grad_norm": 3.4498419761657715, + "learning_rate": 3.7378059422147596e-05, + "loss": 0.0314, + "step": 60410 + }, + { + "epoch": 13.002231490007041, + "grad_norm": 0.012393893674015999, + "learning_rate": 3.737505040110251e-05, + "loss": 0.006, + "step": 60420 + }, + { + "epoch": 13.002285652385853, + "grad_norm": 0.002882422646507621, + "learning_rate": 3.7372041380057415e-05, + "loss": 0.006, + "step": 60430 + }, + { + "epoch": 13.002339814764664, + "grad_norm": 0.05132400617003441, + "learning_rate": 3.736903235901232e-05, + "loss": 0.0693, + "step": 60440 + }, + { + "epoch": 13.002393977143477, + "grad_norm": 3.18452525138855, + "learning_rate": 3.736602333796723e-05, + "loss": 0.0655, + "step": 60450 + }, + { + "epoch": 13.002448139522288, + "grad_norm": 0.10570094734430313, + "learning_rate": 3.7363014316922134e-05, + "loss": 0.0646, + "step": 60460 + }, + { + "epoch": 13.0025023019011, + "grad_norm": 0.03740456700325012, + "learning_rate": 3.736000529587704e-05, + "loss": 0.1131, + "step": 60470 + }, + { + "epoch": 13.00255646427991, + "grad_norm": 0.9479131698608398, + "learning_rate": 3.7356996274831946e-05, + "loss": 0.0344, + "step": 60480 + }, + { + "epoch": 13.002610626658722, + "grad_norm": 1.4475935697555542, + "learning_rate": 3.735398725378685e-05, + "loss": 0.0111, + "step": 60490 + }, + { + "epoch": 13.002664789037535, + "grad_norm": 1.083619236946106, + "learning_rate": 3.7350978232741766e-05, + "loss": 0.0863, + "step": 60500 + }, + { + "epoch": 13.002718951416346, + "grad_norm": 0.0045271264389157295, + "learning_rate": 3.734796921169667e-05, + "loss": 0.0195, + "step": 60510 + }, + { + "epoch": 13.002773113795158, + "grad_norm": 0.23423512279987335, + "learning_rate": 3.734496019065157e-05, + "loss": 0.0787, + "step": 60520 + }, + { + "epoch": 13.002827276173969, + "grad_norm": 0.43022409081459045, + "learning_rate": 3.7341951169606484e-05, + "loss": 0.1737, + "step": 60530 + }, + { + "epoch": 13.002881438552782, + "grad_norm": 4.206092357635498, + "learning_rate": 3.733894214856139e-05, + "loss": 0.121, + "step": 60540 + }, + { + "epoch": 13.002935600931593, + "grad_norm": 0.03737100958824158, + "learning_rate": 3.73359331275163e-05, + "loss": 0.073, + "step": 60550 + }, + { + "epoch": 13.002989763310405, + "grad_norm": 0.5835838913917542, + "learning_rate": 3.73329241064712e-05, + "loss": 0.0457, + "step": 60560 + }, + { + "epoch": 13.003043925689216, + "grad_norm": 1.5924822092056274, + "learning_rate": 3.732991508542611e-05, + "loss": 0.0156, + "step": 60570 + }, + { + "epoch": 13.003098088068027, + "grad_norm": 1.8804144859313965, + "learning_rate": 3.7326906064381016e-05, + "loss": 0.0778, + "step": 60580 + }, + { + "epoch": 13.00315225044684, + "grad_norm": 0.15152095258235931, + "learning_rate": 3.732389704333592e-05, + "loss": 0.0062, + "step": 60590 + }, + { + "epoch": 13.003206412825651, + "grad_norm": 0.011376074515283108, + "learning_rate": 3.732088802229083e-05, + "loss": 0.0087, + "step": 60600 + }, + { + "epoch": 13.003260575204463, + "grad_norm": 0.006093652453273535, + "learning_rate": 3.7317879001245735e-05, + 
"loss": 0.0164, + "step": 60610 + }, + { + "epoch": 13.003314737583274, + "grad_norm": 0.002700440352782607, + "learning_rate": 3.731486998020065e-05, + "loss": 0.1204, + "step": 60620 + }, + { + "epoch": 13.003368899962087, + "grad_norm": 0.013263346627354622, + "learning_rate": 3.731186095915555e-05, + "loss": 0.1184, + "step": 60630 + }, + { + "epoch": 13.003423062340898, + "grad_norm": 0.39369702339172363, + "learning_rate": 3.730885193811045e-05, + "loss": 0.0777, + "step": 60640 + }, + { + "epoch": 13.00347722471971, + "grad_norm": 5.308866500854492, + "learning_rate": 3.7305842917065366e-05, + "loss": 0.2037, + "step": 60650 + }, + { + "epoch": 13.003531387098521, + "grad_norm": 1.6058367490768433, + "learning_rate": 3.730283389602027e-05, + "loss": 0.1276, + "step": 60660 + }, + { + "epoch": 13.003585549477332, + "grad_norm": 0.007870699279010296, + "learning_rate": 3.729982487497517e-05, + "loss": 0.0227, + "step": 60670 + }, + { + "epoch": 13.003639711856145, + "grad_norm": 2.8317930698394775, + "learning_rate": 3.7296815853930085e-05, + "loss": 0.1172, + "step": 60680 + }, + { + "epoch": 13.003693874234957, + "grad_norm": 0.13893862068653107, + "learning_rate": 3.729380683288499e-05, + "loss": 0.0208, + "step": 60690 + }, + { + "epoch": 13.003748036613768, + "grad_norm": 0.12810073792934418, + "learning_rate": 3.72907978118399e-05, + "loss": 0.0928, + "step": 60700 + }, + { + "epoch": 13.00380219899258, + "grad_norm": 1.2644611597061157, + "learning_rate": 3.7287788790794804e-05, + "loss": 0.0243, + "step": 60710 + }, + { + "epoch": 13.003856361371392, + "grad_norm": 8.261479377746582, + "learning_rate": 3.728477976974971e-05, + "loss": 0.0313, + "step": 60720 + }, + { + "epoch": 13.003910523750204, + "grad_norm": 0.13592664897441864, + "learning_rate": 3.728177074870462e-05, + "loss": 0.0641, + "step": 60730 + }, + { + "epoch": 13.003964686129015, + "grad_norm": 17.016496658325195, + "learning_rate": 3.727876172765952e-05, + "loss": 0.1012, + "step": 60740 + }, + { + "epoch": 13.004018848507826, + "grad_norm": 0.0046323928982019424, + "learning_rate": 3.727575270661443e-05, + "loss": 0.0419, + "step": 60750 + }, + { + "epoch": 13.004073010886637, + "grad_norm": 1.1688082218170166, + "learning_rate": 3.727274368556934e-05, + "loss": 0.1307, + "step": 60760 + }, + { + "epoch": 13.00412717326545, + "grad_norm": 0.008310860022902489, + "learning_rate": 3.726973466452425e-05, + "loss": 0.0435, + "step": 60770 + }, + { + "epoch": 13.004181335644262, + "grad_norm": 0.22816349565982819, + "learning_rate": 3.726672564347915e-05, + "loss": 0.104, + "step": 60780 + }, + { + "epoch": 13.004235498023073, + "grad_norm": 9.461713790893555, + "learning_rate": 3.726371662243406e-05, + "loss": 0.0317, + "step": 60790 + }, + { + "epoch": 13.004289660401884, + "grad_norm": 0.002917839679867029, + "learning_rate": 3.726070760138897e-05, + "loss": 0.0219, + "step": 60800 + }, + { + "epoch": 13.004343822780697, + "grad_norm": 0.002409673063084483, + "learning_rate": 3.7257698580343874e-05, + "loss": 0.0812, + "step": 60810 + }, + { + "epoch": 13.004397985159509, + "grad_norm": 2.7563209533691406, + "learning_rate": 3.725468955929878e-05, + "loss": 0.0793, + "step": 60820 + }, + { + "epoch": 13.00445214753832, + "grad_norm": 0.002987515414133668, + "learning_rate": 3.7251680538253686e-05, + "loss": 0.0179, + "step": 60830 + }, + { + "epoch": 13.004506309917131, + "grad_norm": 0.004823054186999798, + "learning_rate": 3.724867151720859e-05, + "loss": 0.0425, + "step": 60840 + }, + { + "epoch": 
13.004560472295942, + "grad_norm": 0.5943058133125305, + "learning_rate": 3.7245662496163505e-05, + "loss": 0.0214, + "step": 60850 + }, + { + "epoch": 13.004614634674756, + "grad_norm": 0.36892107129096985, + "learning_rate": 3.7242653475118405e-05, + "loss": 0.0468, + "step": 60860 + }, + { + "epoch": 13.004668797053567, + "grad_norm": 0.0073524462059140205, + "learning_rate": 3.723964445407331e-05, + "loss": 0.023, + "step": 60870 + }, + { + "epoch": 13.004722959432378, + "grad_norm": 0.03721236810088158, + "learning_rate": 3.7236635433028224e-05, + "loss": 0.1116, + "step": 60880 + }, + { + "epoch": 13.00477712181119, + "grad_norm": 5.515860080718994, + "learning_rate": 3.7233626411983124e-05, + "loss": 0.044, + "step": 60890 + }, + { + "epoch": 13.004831284190002, + "grad_norm": 1.331365704536438, + "learning_rate": 3.723061739093803e-05, + "loss": 0.2581, + "step": 60900 + }, + { + "epoch": 13.004885446568814, + "grad_norm": 0.239400714635849, + "learning_rate": 3.722760836989294e-05, + "loss": 0.0207, + "step": 60910 + }, + { + "epoch": 13.004939608947625, + "grad_norm": 1.6875578165054321, + "learning_rate": 3.722459934884785e-05, + "loss": 0.0992, + "step": 60920 + }, + { + "epoch": 13.004993771326436, + "grad_norm": 0.0024923691526055336, + "learning_rate": 3.722159032780275e-05, + "loss": 0.1057, + "step": 60930 + }, + { + "epoch": 13.005047933705248, + "grad_norm": 0.03388165682554245, + "learning_rate": 3.721858130675766e-05, + "loss": 0.0053, + "step": 60940 + }, + { + "epoch": 13.00510209608406, + "grad_norm": 0.09156761318445206, + "learning_rate": 3.721557228571257e-05, + "loss": 0.0202, + "step": 60950 + }, + { + "epoch": 13.005156258462872, + "grad_norm": 0.18347327411174774, + "learning_rate": 3.7212563264667475e-05, + "loss": 0.014, + "step": 60960 + }, + { + "epoch": 13.005210420841683, + "grad_norm": 0.08523253351449966, + "learning_rate": 3.720955424362238e-05, + "loss": 0.0121, + "step": 60970 + }, + { + "epoch": 13.005264583220495, + "grad_norm": 0.04943469539284706, + "learning_rate": 3.720654522257729e-05, + "loss": 0.0389, + "step": 60980 + }, + { + "epoch": 13.005318745599308, + "grad_norm": 0.0022525833919644356, + "learning_rate": 3.720353620153219e-05, + "loss": 0.076, + "step": 60990 + }, + { + "epoch": 13.005372907978119, + "grad_norm": 0.03869262710213661, + "learning_rate": 3.7200527180487106e-05, + "loss": 0.0224, + "step": 61000 + }, + { + "epoch": 13.00542707035693, + "grad_norm": 0.5177867412567139, + "learning_rate": 3.7197518159442006e-05, + "loss": 0.0198, + "step": 61010 + }, + { + "epoch": 13.005481232735741, + "grad_norm": 1.0529487133026123, + "learning_rate": 3.719450913839692e-05, + "loss": 0.1086, + "step": 61020 + }, + { + "epoch": 13.005535395114553, + "grad_norm": 1.9126176834106445, + "learning_rate": 3.7191500117351825e-05, + "loss": 0.0847, + "step": 61030 + }, + { + "epoch": 13.005589557493366, + "grad_norm": 0.006794221233576536, + "learning_rate": 3.7188491096306725e-05, + "loss": 0.0362, + "step": 61040 + }, + { + "epoch": 13.005643719872177, + "grad_norm": 0.04007340222597122, + "learning_rate": 3.718548207526164e-05, + "loss": 0.0112, + "step": 61050 + }, + { + "epoch": 13.005697882250988, + "grad_norm": 3.973076820373535, + "learning_rate": 3.7182473054216544e-05, + "loss": 0.137, + "step": 61060 + }, + { + "epoch": 13.0057520446298, + "grad_norm": 0.17803701758384705, + "learning_rate": 3.717946403317145e-05, + "loss": 0.0227, + "step": 61070 + }, + { + "epoch": 13.005806207008613, + "grad_norm": 0.9495514035224915, + 
"learning_rate": 3.7176455012126357e-05, + "loss": 0.0404, + "step": 61080 + }, + { + "epoch": 13.005860369387424, + "grad_norm": 0.011145670898258686, + "learning_rate": 3.717344599108126e-05, + "loss": 0.0678, + "step": 61090 + }, + { + "epoch": 13.005914531766235, + "grad_norm": 0.0021872592624276876, + "learning_rate": 3.717043697003617e-05, + "loss": 0.0644, + "step": 61100 + }, + { + "epoch": 13.005968694145047, + "grad_norm": 1.7712222337722778, + "learning_rate": 3.716742794899108e-05, + "loss": 0.0595, + "step": 61110 + }, + { + "epoch": 13.006022856523858, + "grad_norm": 0.001728959963656962, + "learning_rate": 3.716441892794598e-05, + "loss": 0.0067, + "step": 61120 + }, + { + "epoch": 13.00607701890267, + "grad_norm": 0.05366012081503868, + "learning_rate": 3.716140990690089e-05, + "loss": 0.0647, + "step": 61130 + }, + { + "epoch": 13.006131181281482, + "grad_norm": 0.035909224301576614, + "learning_rate": 3.71584008858558e-05, + "loss": 0.0239, + "step": 61140 + }, + { + "epoch": 13.006185343660293, + "grad_norm": 0.07131689041852951, + "learning_rate": 3.715539186481071e-05, + "loss": 0.0974, + "step": 61150 + }, + { + "epoch": 13.006239506039105, + "grad_norm": 0.6142590641975403, + "learning_rate": 3.715238284376561e-05, + "loss": 0.0316, + "step": 61160 + }, + { + "epoch": 13.006293668417918, + "grad_norm": 1.8069034814834595, + "learning_rate": 3.714937382272052e-05, + "loss": 0.1002, + "step": 61170 + }, + { + "epoch": 13.006347830796729, + "grad_norm": 1.2370322942733765, + "learning_rate": 3.7146364801675426e-05, + "loss": 0.0168, + "step": 61180 + }, + { + "epoch": 13.00640199317554, + "grad_norm": 0.043640103191137314, + "learning_rate": 3.7143355780630326e-05, + "loss": 0.0071, + "step": 61190 + }, + { + "epoch": 13.006456155554352, + "grad_norm": 0.08165459334850311, + "learning_rate": 3.714034675958524e-05, + "loss": 0.151, + "step": 61200 + }, + { + "epoch": 13.006510317933163, + "grad_norm": 0.14461901783943176, + "learning_rate": 3.7137337738540145e-05, + "loss": 0.1258, + "step": 61210 + }, + { + "epoch": 13.006564480311976, + "grad_norm": 0.014223051257431507, + "learning_rate": 3.713432871749505e-05, + "loss": 0.0377, + "step": 61220 + }, + { + "epoch": 13.006618642690787, + "grad_norm": 0.07342901825904846, + "learning_rate": 3.713131969644996e-05, + "loss": 0.0677, + "step": 61230 + }, + { + "epoch": 13.006672805069599, + "grad_norm": 3.927818536758423, + "learning_rate": 3.7128310675404864e-05, + "loss": 0.0622, + "step": 61240 + }, + { + "epoch": 13.00672696744841, + "grad_norm": 3.158637523651123, + "learning_rate": 3.712530165435977e-05, + "loss": 0.0678, + "step": 61250 + }, + { + "epoch": 13.006781129827223, + "grad_norm": 3.3259754180908203, + "learning_rate": 3.712229263331468e-05, + "loss": 0.0922, + "step": 61260 + }, + { + "epoch": 13.006835292206034, + "grad_norm": 0.002236516447737813, + "learning_rate": 3.711928361226958e-05, + "loss": 0.0139, + "step": 61270 + }, + { + "epoch": 13.006889454584845, + "grad_norm": 4.316728591918945, + "learning_rate": 3.7116274591224496e-05, + "loss": 0.1259, + "step": 61280 + }, + { + "epoch": 13.006943616963657, + "grad_norm": 0.0028969282284379005, + "learning_rate": 3.71132655701794e-05, + "loss": 0.0072, + "step": 61290 + }, + { + "epoch": 13.006997779342468, + "grad_norm": 0.07819832116365433, + "learning_rate": 3.711025654913431e-05, + "loss": 0.0661, + "step": 61300 + }, + { + "epoch": 13.007051941721281, + "grad_norm": 0.0022290002088993788, + "learning_rate": 3.7107247528089214e-05, + "loss": 
0.0538, + "step": 61310 + }, + { + "epoch": 13.007106104100092, + "grad_norm": 0.029155319556593895, + "learning_rate": 3.710423850704412e-05, + "loss": 0.0594, + "step": 61320 + }, + { + "epoch": 13.007160266478904, + "grad_norm": 0.00635391753166914, + "learning_rate": 3.710122948599903e-05, + "loss": 0.0011, + "step": 61330 + }, + { + "epoch": 13.007214428857715, + "grad_norm": 0.0025265810545533895, + "learning_rate": 3.709822046495393e-05, + "loss": 0.0471, + "step": 61340 + }, + { + "epoch": 13.007268591236526, + "grad_norm": 0.5591217875480652, + "learning_rate": 3.709521144390884e-05, + "loss": 0.1112, + "step": 61350 + }, + { + "epoch": 13.00732275361534, + "grad_norm": 1.1696579456329346, + "learning_rate": 3.7092202422863746e-05, + "loss": 0.0174, + "step": 61360 + }, + { + "epoch": 13.00737691599415, + "grad_norm": 0.042594362050294876, + "learning_rate": 3.708919340181866e-05, + "loss": 0.1526, + "step": 61370 + }, + { + "epoch": 13.007431078372962, + "grad_norm": 0.025486966595053673, + "learning_rate": 3.708618438077356e-05, + "loss": 0.0432, + "step": 61380 + }, + { + "epoch": 13.007485240751773, + "grad_norm": 3.3957176208496094, + "learning_rate": 3.7083175359728465e-05, + "loss": 0.1036, + "step": 61390 + }, + { + "epoch": 13.007539403130586, + "grad_norm": 8.278672218322754, + "learning_rate": 3.708016633868338e-05, + "loss": 0.1538, + "step": 61400 + }, + { + "epoch": 13.007593565509397, + "grad_norm": 0.003496391698718071, + "learning_rate": 3.7077157317638284e-05, + "loss": 0.0361, + "step": 61410 + }, + { + "epoch": 13.007647727888209, + "grad_norm": 0.0985279455780983, + "learning_rate": 3.7074148296593183e-05, + "loss": 0.0763, + "step": 61420 + }, + { + "epoch": 13.00770189026702, + "grad_norm": 0.6311261057853699, + "learning_rate": 3.7071139275548097e-05, + "loss": 0.0189, + "step": 61430 + }, + { + "epoch": 13.007756052645831, + "grad_norm": 0.34100109338760376, + "learning_rate": 3.7068130254503e-05, + "loss": 0.0308, + "step": 61440 + }, + { + "epoch": 13.007810215024644, + "grad_norm": 1.4490957260131836, + "learning_rate": 3.706512123345791e-05, + "loss": 0.0096, + "step": 61450 + }, + { + "epoch": 13.007864377403456, + "grad_norm": 2.9138143062591553, + "learning_rate": 3.7062112212412815e-05, + "loss": 0.091, + "step": 61460 + }, + { + "epoch": 13.007918539782267, + "grad_norm": 0.04174404963850975, + "learning_rate": 3.705910319136772e-05, + "loss": 0.0197, + "step": 61470 + }, + { + "epoch": 13.007972702161078, + "grad_norm": 0.035801827907562256, + "learning_rate": 3.705609417032263e-05, + "loss": 0.0309, + "step": 61480 + }, + { + "epoch": 13.008026864539891, + "grad_norm": 0.07634179294109344, + "learning_rate": 3.7053085149277534e-05, + "loss": 0.0044, + "step": 61490 + }, + { + "epoch": 13.008081026918703, + "grad_norm": 0.022117922082543373, + "learning_rate": 3.705007612823244e-05, + "loss": 0.0438, + "step": 61500 + }, + { + "epoch": 13.008135189297514, + "grad_norm": 1.1045000553131104, + "learning_rate": 3.704706710718735e-05, + "loss": 0.1292, + "step": 61510 + }, + { + "epoch": 13.008189351676325, + "grad_norm": 2.201932430267334, + "learning_rate": 3.704405808614226e-05, + "loss": 0.2016, + "step": 61520 + }, + { + "epoch": 13.008243514055136, + "grad_norm": 2.60272479057312, + "learning_rate": 3.704104906509716e-05, + "loss": 0.0736, + "step": 61530 + }, + { + "epoch": 13.00829767643395, + "grad_norm": 0.023215876892209053, + "learning_rate": 3.703804004405207e-05, + "loss": 0.1327, + "step": 61540 + }, + { + "epoch": 13.00835183881276, + 
"grad_norm": 1.1065462827682495, + "learning_rate": 3.703503102300698e-05, + "loss": 0.0324, + "step": 61550 + }, + { + "epoch": 13.008406001191572, + "grad_norm": 0.41570499539375305, + "learning_rate": 3.7032022001961885e-05, + "loss": 0.0797, + "step": 61560 + }, + { + "epoch": 13.008460163570383, + "grad_norm": 0.8123894333839417, + "learning_rate": 3.702901298091679e-05, + "loss": 0.0688, + "step": 61570 + }, + { + "epoch": 13.008514325949196, + "grad_norm": 0.0028009458910673857, + "learning_rate": 3.70260039598717e-05, + "loss": 0.0254, + "step": 61580 + }, + { + "epoch": 13.008568488328008, + "grad_norm": 0.41220197081565857, + "learning_rate": 3.7022994938826604e-05, + "loss": 0.125, + "step": 61590 + }, + { + "epoch": 13.008622650706819, + "grad_norm": 0.00410320283845067, + "learning_rate": 3.701998591778152e-05, + "loss": 0.0678, + "step": 61600 + }, + { + "epoch": 13.00867681308563, + "grad_norm": 1.8987531661987305, + "learning_rate": 3.7016976896736416e-05, + "loss": 0.0405, + "step": 61610 + }, + { + "epoch": 13.008730975464442, + "grad_norm": 0.02111344411969185, + "learning_rate": 3.701396787569132e-05, + "loss": 0.1303, + "step": 61620 + }, + { + "epoch": 13.008785137843255, + "grad_norm": 0.15481685101985931, + "learning_rate": 3.7010958854646236e-05, + "loss": 0.1043, + "step": 61630 + }, + { + "epoch": 13.008839300222066, + "grad_norm": 8.752594947814941, + "learning_rate": 3.7007949833601135e-05, + "loss": 0.0561, + "step": 61640 + }, + { + "epoch": 13.008893462600877, + "grad_norm": 0.008434886112809181, + "learning_rate": 3.700494081255604e-05, + "loss": 0.0536, + "step": 61650 + }, + { + "epoch": 13.008947624979688, + "grad_norm": 0.4908641278743744, + "learning_rate": 3.7001931791510954e-05, + "loss": 0.1097, + "step": 61660 + }, + { + "epoch": 13.009001787358502, + "grad_norm": 0.004455775022506714, + "learning_rate": 3.699892277046586e-05, + "loss": 0.0702, + "step": 61670 + }, + { + "epoch": 13.009055949737313, + "grad_norm": 0.17821364104747772, + "learning_rate": 3.699591374942076e-05, + "loss": 0.0506, + "step": 61680 + }, + { + "epoch": 13.009110112116124, + "grad_norm": 0.14176948368549347, + "learning_rate": 3.699290472837567e-05, + "loss": 0.0788, + "step": 61690 + }, + { + "epoch": 13.009164274494935, + "grad_norm": 0.003587106941267848, + "learning_rate": 3.698989570733058e-05, + "loss": 0.1166, + "step": 61700 + }, + { + "epoch": 13.009218436873747, + "grad_norm": 0.8899877667427063, + "learning_rate": 3.6986886686285486e-05, + "loss": 0.0363, + "step": 61710 + }, + { + "epoch": 13.00927259925256, + "grad_norm": 0.29622963070869446, + "learning_rate": 3.698387766524039e-05, + "loss": 0.0474, + "step": 61720 + }, + { + "epoch": 13.009326761631371, + "grad_norm": 0.13120439648628235, + "learning_rate": 3.69808686441953e-05, + "loss": 0.0372, + "step": 61730 + }, + { + "epoch": 13.009380924010182, + "grad_norm": 1.041977047920227, + "learning_rate": 3.6977859623150205e-05, + "loss": 0.0391, + "step": 61740 + }, + { + "epoch": 13.009435086388994, + "grad_norm": 3.6385457515716553, + "learning_rate": 3.697485060210512e-05, + "loss": 0.0398, + "step": 61750 + }, + { + "epoch": 13.009489248767807, + "grad_norm": 0.06810825318098068, + "learning_rate": 3.697184158106002e-05, + "loss": 0.0417, + "step": 61760 + }, + { + "epoch": 13.009543411146618, + "grad_norm": 0.03393168747425079, + "learning_rate": 3.6968832560014923e-05, + "loss": 0.027, + "step": 61770 + }, + { + "epoch": 13.00959757352543, + "grad_norm": 1.497735857963562, + "learning_rate": 
3.6965823538969836e-05, + "loss": 0.0769, + "step": 61780 + }, + { + "epoch": 13.00965173590424, + "grad_norm": 0.03923302888870239, + "learning_rate": 3.6962814517924736e-05, + "loss": 0.0406, + "step": 61790 + }, + { + "epoch": 13.009705898283052, + "grad_norm": 0.0021897756960242987, + "learning_rate": 3.695980549687965e-05, + "loss": 0.0671, + "step": 61800 + }, + { + "epoch": 13.009760060661865, + "grad_norm": 0.13036786019802094, + "learning_rate": 3.6956796475834555e-05, + "loss": 0.0091, + "step": 61810 + }, + { + "epoch": 13.009814223040676, + "grad_norm": 0.05382382869720459, + "learning_rate": 3.695378745478946e-05, + "loss": 0.0189, + "step": 61820 + }, + { + "epoch": 13.009868385419487, + "grad_norm": 0.3566938042640686, + "learning_rate": 3.695077843374437e-05, + "loss": 0.0634, + "step": 61830 + }, + { + "epoch": 13.009922547798299, + "grad_norm": 0.059538986533880234, + "learning_rate": 3.6947769412699274e-05, + "loss": 0.0485, + "step": 61840 + }, + { + "epoch": 13.009976710177112, + "grad_norm": 0.0162473414093256, + "learning_rate": 3.694476039165418e-05, + "loss": 0.0831, + "step": 61850 + }, + { + "epoch": 13.010030872555923, + "grad_norm": 0.04184069484472275, + "learning_rate": 3.6941751370609093e-05, + "loss": 0.0164, + "step": 61860 + }, + { + "epoch": 13.010085034934734, + "grad_norm": 0.0919475182890892, + "learning_rate": 3.693874234956399e-05, + "loss": 0.0378, + "step": 61870 + }, + { + "epoch": 13.010139197313546, + "grad_norm": 0.00891643762588501, + "learning_rate": 3.69357333285189e-05, + "loss": 0.0057, + "step": 61880 + }, + { + "epoch": 13.010193359692357, + "grad_norm": 0.0018312711035832763, + "learning_rate": 3.693272430747381e-05, + "loss": 0.0362, + "step": 61890 + }, + { + "epoch": 13.01024752207117, + "grad_norm": 0.050824638456106186, + "learning_rate": 3.692971528642872e-05, + "loss": 0.011, + "step": 61900 + }, + { + "epoch": 13.010301684449981, + "grad_norm": 0.002025444759055972, + "learning_rate": 3.692670626538362e-05, + "loss": 0.0496, + "step": 61910 + }, + { + "epoch": 13.010355846828793, + "grad_norm": 0.0017225549090653658, + "learning_rate": 3.692369724433853e-05, + "loss": 0.0901, + "step": 61920 + }, + { + "epoch": 13.010410009207604, + "grad_norm": 2.28306245803833, + "learning_rate": 3.692068822329344e-05, + "loss": 0.1513, + "step": 61930 + }, + { + "epoch": 13.010464171586417, + "grad_norm": 1.6759953498840332, + "learning_rate": 3.691767920224834e-05, + "loss": 0.0105, + "step": 61940 + }, + { + "epoch": 13.010518333965228, + "grad_norm": 0.06686744838953018, + "learning_rate": 3.691467018120325e-05, + "loss": 0.0432, + "step": 61950 + }, + { + "epoch": 13.01057249634404, + "grad_norm": 0.006218569353222847, + "learning_rate": 3.6911661160158156e-05, + "loss": 0.1586, + "step": 61960 + }, + { + "epoch": 13.01062665872285, + "grad_norm": 0.047681815922260284, + "learning_rate": 3.690865213911306e-05, + "loss": 0.0022, + "step": 61970 + }, + { + "epoch": 13.010680821101662, + "grad_norm": 0.04896952584385872, + "learning_rate": 3.690564311806797e-05, + "loss": 0.0677, + "step": 61980 + }, + { + "epoch": 13.010734983480475, + "grad_norm": 0.014650138095021248, + "learning_rate": 3.6902634097022875e-05, + "loss": 0.0146, + "step": 61990 + }, + { + "epoch": 13.010789145859286, + "grad_norm": 0.13055719435214996, + "learning_rate": 3.689962507597778e-05, + "loss": 0.0108, + "step": 62000 + }, + { + "epoch": 13.010843308238098, + "grad_norm": 0.8465781211853027, + "learning_rate": 3.6896616054932694e-05, + "loss": 0.1136, + "step": 
62010 + }, + { + "epoch": 13.010897470616909, + "grad_norm": 0.0025980626232922077, + "learning_rate": 3.6893607033887594e-05, + "loss": 0.1012, + "step": 62020 + }, + { + "epoch": 13.010951632995722, + "grad_norm": 0.18904224038124084, + "learning_rate": 3.68905980128425e-05, + "loss": 0.029, + "step": 62030 + }, + { + "epoch": 13.011005795374533, + "grad_norm": 0.09484024345874786, + "learning_rate": 3.688758899179741e-05, + "loss": 0.1159, + "step": 62040 + }, + { + "epoch": 13.011059957753345, + "grad_norm": 0.0073375520296394825, + "learning_rate": 3.688457997075232e-05, + "loss": 0.101, + "step": 62050 + }, + { + "epoch": 13.011114120132156, + "grad_norm": 0.3369237184524536, + "learning_rate": 3.6881570949707226e-05, + "loss": 0.0818, + "step": 62060 + }, + { + "epoch": 13.011168282510967, + "grad_norm": 0.026528846472501755, + "learning_rate": 3.687856192866213e-05, + "loss": 0.0687, + "step": 62070 + }, + { + "epoch": 13.01122244488978, + "grad_norm": 10.455812454223633, + "learning_rate": 3.687555290761704e-05, + "loss": 0.0668, + "step": 62080 + }, + { + "epoch": 13.011276607268591, + "grad_norm": 0.03794071450829506, + "learning_rate": 3.6872543886571945e-05, + "loss": 0.149, + "step": 62090 + }, + { + "epoch": 13.011330769647403, + "grad_norm": 1.6820719242095947, + "learning_rate": 3.686953486552685e-05, + "loss": 0.0521, + "step": 62100 + }, + { + "epoch": 13.011384932026214, + "grad_norm": 0.07294031232595444, + "learning_rate": 3.686652584448176e-05, + "loss": 0.0569, + "step": 62110 + }, + { + "epoch": 13.011439094405027, + "grad_norm": 0.018439998850226402, + "learning_rate": 3.686351682343667e-05, + "loss": 0.006, + "step": 62120 + }, + { + "epoch": 13.011493256783838, + "grad_norm": 0.043175481259822845, + "learning_rate": 3.686050780239157e-05, + "loss": 0.0675, + "step": 62130 + }, + { + "epoch": 13.01154741916265, + "grad_norm": 80.64580535888672, + "learning_rate": 3.6857498781346476e-05, + "loss": 0.1408, + "step": 62140 + }, + { + "epoch": 13.011601581541461, + "grad_norm": 0.0632072240114212, + "learning_rate": 3.685448976030139e-05, + "loss": 0.0584, + "step": 62150 + }, + { + "epoch": 13.011655743920272, + "grad_norm": 0.10744617134332657, + "learning_rate": 3.6851480739256295e-05, + "loss": 0.0745, + "step": 62160 + }, + { + "epoch": 13.011709906299085, + "grad_norm": 0.14678551256656647, + "learning_rate": 3.6848471718211195e-05, + "loss": 0.1306, + "step": 62170 + }, + { + "epoch": 13.011764068677897, + "grad_norm": 2.7536230087280273, + "learning_rate": 3.684546269716611e-05, + "loss": 0.0399, + "step": 62180 + }, + { + "epoch": 13.011818231056708, + "grad_norm": 0.018707459792494774, + "learning_rate": 3.6842453676121014e-05, + "loss": 0.0568, + "step": 62190 + }, + { + "epoch": 13.01187239343552, + "grad_norm": 0.09083480387926102, + "learning_rate": 3.683944465507592e-05, + "loss": 0.1568, + "step": 62200 + }, + { + "epoch": 13.011926555814332, + "grad_norm": 5.0539326667785645, + "learning_rate": 3.6836435634030827e-05, + "loss": 0.1827, + "step": 62210 + }, + { + "epoch": 13.011980718193144, + "grad_norm": 0.14463338255882263, + "learning_rate": 3.683342661298573e-05, + "loss": 0.0442, + "step": 62220 + }, + { + "epoch": 13.012034880571955, + "grad_norm": 0.39473414421081543, + "learning_rate": 3.683041759194064e-05, + "loss": 0.0485, + "step": 62230 + }, + { + "epoch": 13.012089042950766, + "grad_norm": 0.026918739080429077, + "learning_rate": 3.6827408570895545e-05, + "loss": 0.0513, + "step": 62240 + }, + { + "epoch": 13.012143205329577, + 
"grad_norm": 0.009362959302961826, + "learning_rate": 3.682439954985045e-05, + "loss": 0.0426, + "step": 62250 + }, + { + "epoch": 13.01219736770839, + "grad_norm": 0.017273638397455215, + "learning_rate": 3.682139052880536e-05, + "loss": 0.0192, + "step": 62260 + }, + { + "epoch": 13.012251530087202, + "grad_norm": 0.2821428179740906, + "learning_rate": 3.681838150776027e-05, + "loss": 0.0468, + "step": 62270 + }, + { + "epoch": 13.012305692466013, + "grad_norm": 0.10498348623514175, + "learning_rate": 3.681537248671517e-05, + "loss": 0.0616, + "step": 62280 + }, + { + "epoch": 13.012359854844824, + "grad_norm": 0.4627566933631897, + "learning_rate": 3.681236346567008e-05, + "loss": 0.1607, + "step": 62290 + }, + { + "epoch": 13.012414017223637, + "grad_norm": 0.006585961207747459, + "learning_rate": 3.680935444462499e-05, + "loss": 0.0954, + "step": 62300 + }, + { + "epoch": 13.012468179602449, + "grad_norm": 0.1990136206150055, + "learning_rate": 3.6806345423579896e-05, + "loss": 0.1161, + "step": 62310 + }, + { + "epoch": 13.01252234198126, + "grad_norm": 0.2241993248462677, + "learning_rate": 3.68033364025348e-05, + "loss": 0.1696, + "step": 62320 + }, + { + "epoch": 13.012576504360071, + "grad_norm": 0.42871880531311035, + "learning_rate": 3.680032738148971e-05, + "loss": 0.0463, + "step": 62330 + }, + { + "epoch": 13.012630666738882, + "grad_norm": 0.010591535829007626, + "learning_rate": 3.6797318360444615e-05, + "loss": 0.0978, + "step": 62340 + }, + { + "epoch": 13.012684829117696, + "grad_norm": 0.09234021604061127, + "learning_rate": 3.679430933939952e-05, + "loss": 0.0917, + "step": 62350 + }, + { + "epoch": 13.012738991496507, + "grad_norm": 0.09224644303321838, + "learning_rate": 3.679130031835443e-05, + "loss": 0.0848, + "step": 62360 + }, + { + "epoch": 13.012793153875318, + "grad_norm": 0.8685188889503479, + "learning_rate": 3.6788291297309334e-05, + "loss": 0.0777, + "step": 62370 + }, + { + "epoch": 13.01284731625413, + "grad_norm": 0.08288077265024185, + "learning_rate": 3.678528227626425e-05, + "loss": 0.0408, + "step": 62380 + }, + { + "epoch": 13.012901478632942, + "grad_norm": 0.25372079014778137, + "learning_rate": 3.6782273255219146e-05, + "loss": 0.0527, + "step": 62390 + }, + { + "epoch": 13.012955641011754, + "grad_norm": 0.10010088235139847, + "learning_rate": 3.677926423417405e-05, + "loss": 0.0354, + "step": 62400 + }, + { + "epoch": 13.013009803390565, + "grad_norm": 0.6852490305900574, + "learning_rate": 3.6776255213128966e-05, + "loss": 0.1012, + "step": 62410 + }, + { + "epoch": 13.013063965769376, + "grad_norm": 5.88931941986084, + "learning_rate": 3.677324619208387e-05, + "loss": 0.0795, + "step": 62420 + }, + { + "epoch": 13.013118128148188, + "grad_norm": 0.021334584802389145, + "learning_rate": 3.677023717103877e-05, + "loss": 0.1038, + "step": 62430 + }, + { + "epoch": 13.013172290527, + "grad_norm": 0.016060365363955498, + "learning_rate": 3.6767228149993684e-05, + "loss": 0.0614, + "step": 62440 + }, + { + "epoch": 13.013226452905812, + "grad_norm": 0.012243784964084625, + "learning_rate": 3.676421912894859e-05, + "loss": 0.0807, + "step": 62450 + }, + { + "epoch": 13.013280615284623, + "grad_norm": 1.553003191947937, + "learning_rate": 3.67612101079035e-05, + "loss": 0.087, + "step": 62460 + }, + { + "epoch": 13.013334777663434, + "grad_norm": 0.01858610473573208, + "learning_rate": 3.67582010868584e-05, + "loss": 0.0331, + "step": 62470 + }, + { + "epoch": 13.013388940042246, + "grad_norm": 1.6866861581802368, + "learning_rate": 
3.675519206581331e-05, + "loss": 0.1385, + "step": 62480 + }, + { + "epoch": 13.013443102421059, + "grad_norm": 0.07244337350130081, + "learning_rate": 3.6752183044768216e-05, + "loss": 0.0041, + "step": 62490 + }, + { + "epoch": 13.01349726479987, + "grad_norm": 1.464648962020874, + "learning_rate": 3.674917402372313e-05, + "loss": 0.0341, + "step": 62500 + }, + { + "epoch": 13.013551427178681, + "grad_norm": 0.8674750328063965, + "learning_rate": 3.674616500267803e-05, + "loss": 0.133, + "step": 62510 + }, + { + "epoch": 13.013605589557493, + "grad_norm": 1.2387090921401978, + "learning_rate": 3.6743155981632935e-05, + "loss": 0.2143, + "step": 62520 + }, + { + "epoch": 13.013659751936306, + "grad_norm": 0.9618995785713196, + "learning_rate": 3.674014696058785e-05, + "loss": 0.0827, + "step": 62530 + }, + { + "epoch": 13.013713914315117, + "grad_norm": 0.921850323677063, + "learning_rate": 3.673713793954275e-05, + "loss": 0.1081, + "step": 62540 + }, + { + "epoch": 13.013768076693928, + "grad_norm": 0.016212884336709976, + "learning_rate": 3.6734128918497653e-05, + "loss": 0.047, + "step": 62550 + }, + { + "epoch": 13.01382223907274, + "grad_norm": 0.3263496458530426, + "learning_rate": 3.6731119897452567e-05, + "loss": 0.0546, + "step": 62560 + }, + { + "epoch": 13.013876401451551, + "grad_norm": 0.22416262328624725, + "learning_rate": 3.672811087640747e-05, + "loss": 0.0612, + "step": 62570 + }, + { + "epoch": 13.013930563830364, + "grad_norm": 0.004525887779891491, + "learning_rate": 3.672510185536238e-05, + "loss": 0.0091, + "step": 62580 + }, + { + "epoch": 13.013984726209175, + "grad_norm": 0.2463218867778778, + "learning_rate": 3.6722092834317285e-05, + "loss": 0.0538, + "step": 62590 + }, + { + "epoch": 13.014038888587987, + "grad_norm": 0.10274290293455124, + "learning_rate": 3.671908381327219e-05, + "loss": 0.0076, + "step": 62600 + }, + { + "epoch": 13.014093050966798, + "grad_norm": 0.04484580084681511, + "learning_rate": 3.67160747922271e-05, + "loss": 0.0051, + "step": 62610 + }, + { + "epoch": 13.01414721334561, + "grad_norm": 4.447408199310303, + "learning_rate": 3.6713065771182004e-05, + "loss": 0.0456, + "step": 62620 + }, + { + "epoch": 13.014201375724422, + "grad_norm": 0.0025764917954802513, + "learning_rate": 3.671005675013691e-05, + "loss": 0.0106, + "step": 62630 + }, + { + "epoch": 13.014255538103233, + "grad_norm": 0.0941862165927887, + "learning_rate": 3.6707047729091824e-05, + "loss": 0.0212, + "step": 62640 + }, + { + "epoch": 13.014309700482045, + "grad_norm": 11.29944896697998, + "learning_rate": 3.670403870804673e-05, + "loss": 0.0751, + "step": 62650 + }, + { + "epoch": 13.014363862860856, + "grad_norm": 1.9246063232421875, + "learning_rate": 3.670102968700163e-05, + "loss": 0.1569, + "step": 62660 + }, + { + "epoch": 13.014418025239669, + "grad_norm": 2.8442935943603516, + "learning_rate": 3.669802066595654e-05, + "loss": 0.05, + "step": 62670 + }, + { + "epoch": 13.01447218761848, + "grad_norm": 1.0062083005905151, + "learning_rate": 3.669501164491145e-05, + "loss": 0.072, + "step": 62680 + }, + { + "epoch": 13.014526349997292, + "grad_norm": 2.5132522583007812, + "learning_rate": 3.669200262386635e-05, + "loss": 0.1168, + "step": 62690 + }, + { + "epoch": 13.014580512376103, + "grad_norm": 0.1012909859418869, + "learning_rate": 3.668899360282126e-05, + "loss": 0.1081, + "step": 62700 + }, + { + "epoch": 13.014634674754916, + "grad_norm": 0.09659434109926224, + "learning_rate": 3.668598458177617e-05, + "loss": 0.0593, + "step": 62710 + }, + { + "epoch": 
13.014688837133727, + "grad_norm": 13.115408897399902, + "learning_rate": 3.6682975560731074e-05, + "loss": 0.0399, + "step": 62720 + }, + { + "epoch": 13.014742999512539, + "grad_norm": 0.12743361294269562, + "learning_rate": 3.667996653968598e-05, + "loss": 0.0213, + "step": 62730 + }, + { + "epoch": 13.01479716189135, + "grad_norm": 0.07555092126131058, + "learning_rate": 3.6676957518640886e-05, + "loss": 0.012, + "step": 62740 + }, + { + "epoch": 13.014851324270161, + "grad_norm": 3.5188803672790527, + "learning_rate": 3.667394849759579e-05, + "loss": 0.1449, + "step": 62750 + }, + { + "epoch": 13.014905486648974, + "grad_norm": 0.03315355256199837, + "learning_rate": 3.6670939476550706e-05, + "loss": 0.0936, + "step": 62760 + }, + { + "epoch": 13.014959649027785, + "grad_norm": 0.006225817836821079, + "learning_rate": 3.6667930455505605e-05, + "loss": 0.0906, + "step": 62770 + }, + { + "epoch": 13.015013811406597, + "grad_norm": 0.004487162455916405, + "learning_rate": 3.666492143446051e-05, + "loss": 0.0535, + "step": 62780 + }, + { + "epoch": 13.015067973785408, + "grad_norm": 0.006372994277626276, + "learning_rate": 3.6661912413415424e-05, + "loss": 0.0037, + "step": 62790 + }, + { + "epoch": 13.015122136164221, + "grad_norm": 0.0030905597377568483, + "learning_rate": 3.665890339237033e-05, + "loss": 0.1323, + "step": 62800 + }, + { + "epoch": 13.015176298543032, + "grad_norm": 11.985771179199219, + "learning_rate": 3.665589437132523e-05, + "loss": 0.1648, + "step": 62810 + }, + { + "epoch": 13.015230460921844, + "grad_norm": 1.0355321168899536, + "learning_rate": 3.665288535028014e-05, + "loss": 0.0287, + "step": 62820 + }, + { + "epoch": 13.015284623300655, + "grad_norm": 2.5228159427642822, + "learning_rate": 3.664987632923505e-05, + "loss": 0.1331, + "step": 62830 + }, + { + "epoch": 13.015338785679466, + "grad_norm": 0.006353544536978006, + "learning_rate": 3.6646867308189956e-05, + "loss": 0.089, + "step": 62840 + }, + { + "epoch": 13.01539294805828, + "grad_norm": 0.029994867742061615, + "learning_rate": 3.664385828714486e-05, + "loss": 0.0929, + "step": 62850 + }, + { + "epoch": 13.01544711043709, + "grad_norm": 0.09364192187786102, + "learning_rate": 3.664084926609977e-05, + "loss": 0.0877, + "step": 62860 + }, + { + "epoch": 13.015501272815902, + "grad_norm": 0.008222671225667, + "learning_rate": 3.6637840245054675e-05, + "loss": 0.0697, + "step": 62870 + }, + { + "epoch": 13.015555435194713, + "grad_norm": 0.09198040515184402, + "learning_rate": 3.663483122400958e-05, + "loss": 0.0771, + "step": 62880 + }, + { + "epoch": 13.015609597573526, + "grad_norm": 0.15988701581954956, + "learning_rate": 3.663182220296449e-05, + "loss": 0.0141, + "step": 62890 + }, + { + "epoch": 13.015663759952337, + "grad_norm": 11.20113754272461, + "learning_rate": 3.66288131819194e-05, + "loss": 0.1101, + "step": 62900 + }, + { + "epoch": 13.015717922331149, + "grad_norm": 0.1265808790922165, + "learning_rate": 3.6625804160874306e-05, + "loss": 0.1419, + "step": 62910 + }, + { + "epoch": 13.01577208470996, + "grad_norm": 0.004203906282782555, + "learning_rate": 3.6622795139829206e-05, + "loss": 0.1398, + "step": 62920 + }, + { + "epoch": 13.015826247088771, + "grad_norm": 0.07935135066509247, + "learning_rate": 3.661978611878412e-05, + "loss": 0.0335, + "step": 62930 + }, + { + "epoch": 13.015880409467584, + "grad_norm": 0.2389329969882965, + "learning_rate": 3.6616777097739025e-05, + "loss": 0.0611, + "step": 62940 + }, + { + "epoch": 13.015934571846396, + "grad_norm": 0.003011518158018589, + 
"learning_rate": 3.661376807669393e-05, + "loss": 0.0388, + "step": 62950 + }, + { + "epoch": 13.015988734225207, + "grad_norm": 0.45236465334892273, + "learning_rate": 3.661075905564884e-05, + "loss": 0.0634, + "step": 62960 + }, + { + "epoch": 13.016042896604018, + "grad_norm": 0.1066950112581253, + "learning_rate": 3.6607750034603744e-05, + "loss": 0.0305, + "step": 62970 + }, + { + "epoch": 13.016097058982831, + "grad_norm": 2.4677488803863525, + "learning_rate": 3.660474101355865e-05, + "loss": 0.0457, + "step": 62980 + }, + { + "epoch": 13.016151221361643, + "grad_norm": 0.0029778797179460526, + "learning_rate": 3.660173199251356e-05, + "loss": 0.0143, + "step": 62990 + }, + { + "epoch": 13.016205383740454, + "grad_norm": 0.18412569165229797, + "learning_rate": 3.659872297146846e-05, + "loss": 0.0502, + "step": 63000 + }, + { + "epoch": 13.016259546119265, + "grad_norm": 0.10968642681837082, + "learning_rate": 3.659571395042337e-05, + "loss": 0.0403, + "step": 63010 + }, + { + "epoch": 13.016313708498076, + "grad_norm": 0.21636603772640228, + "learning_rate": 3.659270492937828e-05, + "loss": 0.071, + "step": 63020 + }, + { + "epoch": 13.01636787087689, + "grad_norm": 3.161679983139038, + "learning_rate": 3.658969590833318e-05, + "loss": 0.1598, + "step": 63030 + }, + { + "epoch": 13.0164220332557, + "grad_norm": 0.022817781195044518, + "learning_rate": 3.658668688728809e-05, + "loss": 0.0368, + "step": 63040 + }, + { + "epoch": 13.016476195634512, + "grad_norm": 0.1429600715637207, + "learning_rate": 3.6583677866243e-05, + "loss": 0.0601, + "step": 63050 + }, + { + "epoch": 13.016530358013323, + "grad_norm": 0.7477318048477173, + "learning_rate": 3.658066884519791e-05, + "loss": 0.073, + "step": 63060 + }, + { + "epoch": 13.016584520392136, + "grad_norm": 0.002685021376237273, + "learning_rate": 3.657765982415281e-05, + "loss": 0.0124, + "step": 63070 + }, + { + "epoch": 13.016638682770948, + "grad_norm": 0.3567957282066345, + "learning_rate": 3.657465080310772e-05, + "loss": 0.1285, + "step": 63080 + }, + { + "epoch": 13.016692845149759, + "grad_norm": 0.45182791352272034, + "learning_rate": 3.6571641782062626e-05, + "loss": 0.086, + "step": 63090 + }, + { + "epoch": 13.01674700752857, + "grad_norm": 0.00835437048226595, + "learning_rate": 3.656863276101753e-05, + "loss": 0.0616, + "step": 63100 + }, + { + "epoch": 13.016801169907382, + "grad_norm": 0.004863766022026539, + "learning_rate": 3.656562373997244e-05, + "loss": 0.074, + "step": 63110 + }, + { + "epoch": 13.016855332286195, + "grad_norm": 0.8851223587989807, + "learning_rate": 3.6562614718927345e-05, + "loss": 0.07, + "step": 63120 + }, + { + "epoch": 13.016909494665006, + "grad_norm": 0.3396582305431366, + "learning_rate": 3.655960569788225e-05, + "loss": 0.0839, + "step": 63130 + }, + { + "epoch": 13.016963657043817, + "grad_norm": 3.709404945373535, + "learning_rate": 3.655659667683716e-05, + "loss": 0.1224, + "step": 63140 + }, + { + "epoch": 13.017017819422628, + "grad_norm": 0.04345041140913963, + "learning_rate": 3.6553587655792064e-05, + "loss": 0.0713, + "step": 63150 + }, + { + "epoch": 13.017071981801442, + "grad_norm": 0.20941586792469025, + "learning_rate": 3.655057863474698e-05, + "loss": 0.0147, + "step": 63160 + }, + { + "epoch": 13.017126144180253, + "grad_norm": 0.07286406308412552, + "learning_rate": 3.654756961370188e-05, + "loss": 0.0608, + "step": 63170 + }, + { + "epoch": 13.017180306559064, + "grad_norm": 0.09273598343133926, + "learning_rate": 3.654456059265678e-05, + "loss": 0.1204, + "step": 
63180 + }, + { + "epoch": 13.017234468937875, + "grad_norm": 0.007483617402613163, + "learning_rate": 3.6541551571611696e-05, + "loss": 0.0148, + "step": 63190 + }, + { + "epoch": 13.017288631316687, + "grad_norm": 0.03667241707444191, + "learning_rate": 3.65385425505666e-05, + "loss": 0.0021, + "step": 63200 + }, + { + "epoch": 13.0173427936955, + "grad_norm": 0.06307690590620041, + "learning_rate": 3.653553352952151e-05, + "loss": 0.0367, + "step": 63210 + }, + { + "epoch": 13.017396956074311, + "grad_norm": 0.3578941822052002, + "learning_rate": 3.6532524508476415e-05, + "loss": 0.0741, + "step": 63220 + }, + { + "epoch": 13.017451118453122, + "grad_norm": 0.8953835964202881, + "learning_rate": 3.652951548743132e-05, + "loss": 0.0193, + "step": 63230 + }, + { + "epoch": 13.017505280831934, + "grad_norm": 0.0021727478597313166, + "learning_rate": 3.652650646638623e-05, + "loss": 0.0221, + "step": 63240 + }, + { + "epoch": 13.017559443210747, + "grad_norm": 0.15429706871509552, + "learning_rate": 3.652349744534114e-05, + "loss": 0.0945, + "step": 63250 + }, + { + "epoch": 13.017613605589558, + "grad_norm": 0.0022872542031109333, + "learning_rate": 3.652048842429604e-05, + "loss": 0.1057, + "step": 63260 + }, + { + "epoch": 13.01766776796837, + "grad_norm": 1.3290404081344604, + "learning_rate": 3.6517479403250946e-05, + "loss": 0.0511, + "step": 63270 + }, + { + "epoch": 13.01772193034718, + "grad_norm": 0.021435171365737915, + "learning_rate": 3.651447038220586e-05, + "loss": 0.1468, + "step": 63280 + }, + { + "epoch": 13.017776092725992, + "grad_norm": 0.007358397822827101, + "learning_rate": 3.651146136116076e-05, + "loss": 0.0262, + "step": 63290 + }, + { + "epoch": 13.017830255104805, + "grad_norm": 0.8694303631782532, + "learning_rate": 3.6508452340115665e-05, + "loss": 0.0779, + "step": 63300 + }, + { + "epoch": 13.017884417483616, + "grad_norm": 0.27883434295654297, + "learning_rate": 3.650544331907058e-05, + "loss": 0.0858, + "step": 63310 + }, + { + "epoch": 13.017938579862427, + "grad_norm": 9.501397132873535, + "learning_rate": 3.6502434298025484e-05, + "loss": 0.1433, + "step": 63320 + }, + { + "epoch": 13.017992742241239, + "grad_norm": 0.044218510389328, + "learning_rate": 3.6499425276980384e-05, + "loss": 0.0186, + "step": 63330 + }, + { + "epoch": 13.018046904620052, + "grad_norm": 0.05600558593869209, + "learning_rate": 3.64964162559353e-05, + "loss": 0.0711, + "step": 63340 + }, + { + "epoch": 13.018101066998863, + "grad_norm": 0.030728023499250412, + "learning_rate": 3.64934072348902e-05, + "loss": 0.0063, + "step": 63350 + }, + { + "epoch": 13.018155229377674, + "grad_norm": 0.040977220982313156, + "learning_rate": 3.649039821384511e-05, + "loss": 0.0841, + "step": 63360 + }, + { + "epoch": 13.018209391756486, + "grad_norm": 0.3829326629638672, + "learning_rate": 3.6487389192800015e-05, + "loss": 0.0675, + "step": 63370 + }, + { + "epoch": 13.018263554135297, + "grad_norm": 3.3688876628875732, + "learning_rate": 3.648438017175492e-05, + "loss": 0.0113, + "step": 63380 + }, + { + "epoch": 13.01831771651411, + "grad_norm": 0.004550074692815542, + "learning_rate": 3.648137115070983e-05, + "loss": 0.0527, + "step": 63390 + }, + { + "epoch": 13.018371878892921, + "grad_norm": 0.05917298421263695, + "learning_rate": 3.647836212966474e-05, + "loss": 0.1015, + "step": 63400 + }, + { + "epoch": 13.018426041271733, + "grad_norm": 0.0446011982858181, + "learning_rate": 3.647535310861964e-05, + "loss": 0.0586, + "step": 63410 + }, + { + "epoch": 13.018480203650544, + "grad_norm": 
0.053426824510097504, + "learning_rate": 3.6472344087574554e-05, + "loss": 0.1649, + "step": 63420 + }, + { + "epoch": 13.018534366029357, + "grad_norm": 7.665705680847168, + "learning_rate": 3.646933506652946e-05, + "loss": 0.0439, + "step": 63430 + }, + { + "epoch": 13.018588528408168, + "grad_norm": 0.1317548304796219, + "learning_rate": 3.646632604548436e-05, + "loss": 0.0751, + "step": 63440 + }, + { + "epoch": 13.01864269078698, + "grad_norm": 0.05991506204009056, + "learning_rate": 3.646331702443927e-05, + "loss": 0.062, + "step": 63450 + }, + { + "epoch": 13.01869685316579, + "grad_norm": 0.411733478307724, + "learning_rate": 3.646030800339418e-05, + "loss": 0.0667, + "step": 63460 + }, + { + "epoch": 13.018751015544602, + "grad_norm": 0.08116250485181808, + "learning_rate": 3.6457298982349085e-05, + "loss": 0.0921, + "step": 63470 + }, + { + "epoch": 13.018805177923415, + "grad_norm": 0.6564339399337769, + "learning_rate": 3.645428996130399e-05, + "loss": 0.1114, + "step": 63480 + }, + { + "epoch": 13.018859340302226, + "grad_norm": 0.8439661860466003, + "learning_rate": 3.64512809402589e-05, + "loss": 0.0514, + "step": 63490 + }, + { + "epoch": 13.018913502681038, + "grad_norm": 1.3046767711639404, + "learning_rate": 3.6448271919213804e-05, + "loss": 0.0764, + "step": 63500 + }, + { + "epoch": 13.018967665059849, + "grad_norm": 0.0033090487122535706, + "learning_rate": 3.644526289816872e-05, + "loss": 0.0234, + "step": 63510 + }, + { + "epoch": 13.019021827438662, + "grad_norm": 0.841461718082428, + "learning_rate": 3.6442253877123616e-05, + "loss": 0.0317, + "step": 63520 + }, + { + "epoch": 13.019075989817473, + "grad_norm": 0.04020598158240318, + "learning_rate": 3.643924485607852e-05, + "loss": 0.0541, + "step": 63530 + }, + { + "epoch": 13.019130152196285, + "grad_norm": 0.0026297434233129025, + "learning_rate": 3.6436235835033436e-05, + "loss": 0.0656, + "step": 63540 + }, + { + "epoch": 13.019184314575096, + "grad_norm": 1.0628321170806885, + "learning_rate": 3.643322681398834e-05, + "loss": 0.0345, + "step": 63550 + }, + { + "epoch": 13.019238476953907, + "grad_norm": 1.1066761016845703, + "learning_rate": 3.643021779294324e-05, + "loss": 0.0481, + "step": 63560 + }, + { + "epoch": 13.01929263933272, + "grad_norm": 0.0021468279883265495, + "learning_rate": 3.6427208771898154e-05, + "loss": 0.0209, + "step": 63570 + }, + { + "epoch": 13.019346801711531, + "grad_norm": 3.688584327697754, + "learning_rate": 3.642419975085306e-05, + "loss": 0.2595, + "step": 63580 + }, + { + "epoch": 13.019400964090343, + "grad_norm": 0.1457432061433792, + "learning_rate": 3.642119072980796e-05, + "loss": 0.0266, + "step": 63590 + }, + { + "epoch": 13.019455126469154, + "grad_norm": 0.16459429264068604, + "learning_rate": 3.641818170876287e-05, + "loss": 0.0422, + "step": 63600 + }, + { + "epoch": 13.019509288847965, + "grad_norm": 0.8301411867141724, + "learning_rate": 3.641517268771778e-05, + "loss": 0.0083, + "step": 63610 + }, + { + "epoch": 13.019563451226778, + "grad_norm": 0.009149542078375816, + "learning_rate": 3.6412163666672686e-05, + "loss": 0.0014, + "step": 63620 + }, + { + "epoch": 13.01961761360559, + "grad_norm": 0.057608168572187424, + "learning_rate": 3.640915464562759e-05, + "loss": 0.0275, + "step": 63630 + }, + { + "epoch": 13.019671775984401, + "grad_norm": 0.07857226580381393, + "learning_rate": 3.64061456245825e-05, + "loss": 0.0044, + "step": 63640 + }, + { + "epoch": 13.019725938363212, + "grad_norm": 3.3325283527374268, + "learning_rate": 3.6403136603537405e-05, + 
"loss": 0.1059, + "step": 63650 + }, + { + "epoch": 13.019780100742025, + "grad_norm": 0.07278865575790405, + "learning_rate": 3.640012758249232e-05, + "loss": 0.1001, + "step": 63660 + }, + { + "epoch": 13.019834263120837, + "grad_norm": 2.997187614440918, + "learning_rate": 3.639711856144722e-05, + "loss": 0.0635, + "step": 63670 + }, + { + "epoch": 13.019888425499648, + "grad_norm": 0.38384321331977844, + "learning_rate": 3.639410954040213e-05, + "loss": 0.0706, + "step": 63680 + }, + { + "epoch": 13.01994258787846, + "grad_norm": 0.0021855004597455263, + "learning_rate": 3.6391100519357037e-05, + "loss": 0.1077, + "step": 63690 + }, + { + "epoch": 13.01999675025727, + "grad_norm": 1.9560720920562744, + "learning_rate": 3.638809149831194e-05, + "loss": 0.0684, + "step": 63700 + }, + { + "epoch": 13.020050912636083, + "grad_norm": 0.2640744745731354, + "learning_rate": 3.638508247726685e-05, + "loss": 0.0445, + "step": 63710 + }, + { + "epoch": 13.020105075014895, + "grad_norm": 0.026275265961885452, + "learning_rate": 3.6382073456221755e-05, + "loss": 0.0494, + "step": 63720 + }, + { + "epoch": 13.020159237393706, + "grad_norm": 0.0025028756354004145, + "learning_rate": 3.637906443517666e-05, + "loss": 0.0073, + "step": 63730 + }, + { + "epoch": 13.020213399772517, + "grad_norm": 0.03924001753330231, + "learning_rate": 3.637605541413157e-05, + "loss": 0.031, + "step": 63740 + }, + { + "epoch": 13.02026756215133, + "grad_norm": 0.1927756369113922, + "learning_rate": 3.6373046393086474e-05, + "loss": 0.0582, + "step": 63750 + }, + { + "epoch": 13.020321724530142, + "grad_norm": 0.002669468056410551, + "learning_rate": 3.637003737204138e-05, + "loss": 0.0089, + "step": 63760 + }, + { + "epoch": 13.020375886908953, + "grad_norm": 0.04066235572099686, + "learning_rate": 3.6367028350996294e-05, + "loss": 0.0121, + "step": 63770 + }, + { + "epoch": 13.020430049287764, + "grad_norm": 0.002642818260937929, + "learning_rate": 3.636401932995119e-05, + "loss": 0.0245, + "step": 63780 + }, + { + "epoch": 13.020484211666576, + "grad_norm": 0.0024527052883058786, + "learning_rate": 3.63610103089061e-05, + "loss": 0.0451, + "step": 63790 + }, + { + "epoch": 13.020538374045389, + "grad_norm": 0.051615484058856964, + "learning_rate": 3.635800128786101e-05, + "loss": 0.0182, + "step": 63800 + }, + { + "epoch": 13.0205925364242, + "grad_norm": 0.06923660635948181, + "learning_rate": 3.635499226681592e-05, + "loss": 0.0163, + "step": 63810 + }, + { + "epoch": 13.020646698803011, + "grad_norm": 0.0018932081293314695, + "learning_rate": 3.635198324577082e-05, + "loss": 0.0023, + "step": 63820 + }, + { + "epoch": 13.020700861181822, + "grad_norm": 0.02221406251192093, + "learning_rate": 3.634897422472573e-05, + "loss": 0.0721, + "step": 63830 + }, + { + "epoch": 13.020755023560636, + "grad_norm": 0.3926054835319519, + "learning_rate": 3.634596520368064e-05, + "loss": 0.0412, + "step": 63840 + }, + { + "epoch": 13.020809185939447, + "grad_norm": 0.06510842591524124, + "learning_rate": 3.6342956182635544e-05, + "loss": 0.0935, + "step": 63850 + }, + { + "epoch": 13.020863348318258, + "grad_norm": 7.777527809143066, + "learning_rate": 3.633994716159045e-05, + "loss": 0.1088, + "step": 63860 + }, + { + "epoch": 13.02091751069707, + "grad_norm": 0.001873997040092945, + "learning_rate": 3.6336938140545356e-05, + "loss": 0.0329, + "step": 63870 + }, + { + "epoch": 13.02097167307588, + "grad_norm": 0.08201992511749268, + "learning_rate": 3.633392911950026e-05, + "loss": 0.0188, + "step": 63880 + }, + { + "epoch": 
13.021025835454694, + "grad_norm": 0.8437464833259583, + "learning_rate": 3.633092009845517e-05, + "loss": 0.0879, + "step": 63890 + }, + { + "epoch": 13.021079997833505, + "grad_norm": 0.003887464525178075, + "learning_rate": 3.6327911077410075e-05, + "loss": 0.0492, + "step": 63900 + }, + { + "epoch": 13.021134160212316, + "grad_norm": 0.0019384264014661312, + "learning_rate": 3.632490205636498e-05, + "loss": 0.0589, + "step": 63910 + }, + { + "epoch": 13.021188322591128, + "grad_norm": 2.1142663955688477, + "learning_rate": 3.6321893035319894e-05, + "loss": 0.0586, + "step": 63920 + }, + { + "epoch": 13.02124248496994, + "grad_norm": 0.48501667380332947, + "learning_rate": 3.6318884014274794e-05, + "loss": 0.0622, + "step": 63930 + }, + { + "epoch": 13.021296647348752, + "grad_norm": 0.08959810435771942, + "learning_rate": 3.631587499322971e-05, + "loss": 0.0354, + "step": 63940 + }, + { + "epoch": 13.021350809727563, + "grad_norm": 0.03782942518591881, + "learning_rate": 3.631286597218461e-05, + "loss": 0.0186, + "step": 63950 + }, + { + "epoch": 13.021404972106374, + "grad_norm": 0.3014518916606903, + "learning_rate": 3.630985695113952e-05, + "loss": 0.2115, + "step": 63960 + }, + { + "epoch": 13.021459134485186, + "grad_norm": 0.5246817469596863, + "learning_rate": 3.6306847930094426e-05, + "loss": 0.025, + "step": 63970 + }, + { + "epoch": 13.021513296863999, + "grad_norm": 0.0019777039997279644, + "learning_rate": 3.630383890904933e-05, + "loss": 0.0131, + "step": 63980 + }, + { + "epoch": 13.02156745924281, + "grad_norm": 6.926818370819092, + "learning_rate": 3.630082988800424e-05, + "loss": 0.1012, + "step": 63990 + }, + { + "epoch": 13.021621621621621, + "grad_norm": 0.015199114568531513, + "learning_rate": 3.6297820866959145e-05, + "loss": 0.009, + "step": 64000 + }, + { + "epoch": 13.021675784000433, + "grad_norm": 0.031117483973503113, + "learning_rate": 3.629481184591405e-05, + "loss": 0.1751, + "step": 64010 + }, + { + "epoch": 13.021729946379246, + "grad_norm": 1.007501482963562, + "learning_rate": 3.629180282486896e-05, + "loss": 0.0091, + "step": 64020 + }, + { + "epoch": 13.021784108758057, + "grad_norm": 0.052474137395620346, + "learning_rate": 3.628879380382387e-05, + "loss": 0.0218, + "step": 64030 + }, + { + "epoch": 13.021838271136868, + "grad_norm": 0.0029598723631352186, + "learning_rate": 3.628578478277877e-05, + "loss": 0.0434, + "step": 64040 + }, + { + "epoch": 13.02189243351568, + "grad_norm": 0.43480849266052246, + "learning_rate": 3.6282775761733676e-05, + "loss": 0.0208, + "step": 64050 + }, + { + "epoch": 13.02194659589449, + "grad_norm": 0.06698027998209, + "learning_rate": 3.627976674068859e-05, + "loss": 0.0018, + "step": 64060 + }, + { + "epoch": 13.022000758273304, + "grad_norm": 0.013219780288636684, + "learning_rate": 3.6276757719643495e-05, + "loss": 0.0534, + "step": 64070 + }, + { + "epoch": 13.022054920652115, + "grad_norm": 4.044347286224365, + "learning_rate": 3.6273748698598395e-05, + "loss": 0.1599, + "step": 64080 + }, + { + "epoch": 13.022109083030927, + "grad_norm": 0.003325042547658086, + "learning_rate": 3.627073967755331e-05, + "loss": 0.0122, + "step": 64090 + }, + { + "epoch": 13.022163245409738, + "grad_norm": 0.033411648124456406, + "learning_rate": 3.6267730656508214e-05, + "loss": 0.175, + "step": 64100 + }, + { + "epoch": 13.02221740778855, + "grad_norm": 0.6856631636619568, + "learning_rate": 3.626472163546312e-05, + "loss": 0.0154, + "step": 64110 + }, + { + "epoch": 13.022271570167362, + "grad_norm": 0.01164758950471878, + 
"learning_rate": 3.626171261441803e-05, + "loss": 0.0337, + "step": 64120 + }, + { + "epoch": 13.022325732546173, + "grad_norm": 0.5150697231292725, + "learning_rate": 3.625870359337293e-05, + "loss": 0.0043, + "step": 64130 + }, + { + "epoch": 13.022379894924985, + "grad_norm": 7.307633399963379, + "learning_rate": 3.625569457232784e-05, + "loss": 0.1818, + "step": 64140 + }, + { + "epoch": 13.022434057303796, + "grad_norm": 0.11064208298921585, + "learning_rate": 3.625268555128275e-05, + "loss": 0.0175, + "step": 64150 + }, + { + "epoch": 13.022488219682609, + "grad_norm": 0.0023076008073985577, + "learning_rate": 3.624967653023765e-05, + "loss": 0.0036, + "step": 64160 + }, + { + "epoch": 13.02254238206142, + "grad_norm": 0.07793150097131729, + "learning_rate": 3.624666750919256e-05, + "loss": 0.0651, + "step": 64170 + }, + { + "epoch": 13.022596544440232, + "grad_norm": 0.07788438349962234, + "learning_rate": 3.624365848814747e-05, + "loss": 0.0292, + "step": 64180 + }, + { + "epoch": 13.022650706819043, + "grad_norm": 0.074196957051754, + "learning_rate": 3.624064946710237e-05, + "loss": 0.0086, + "step": 64190 + }, + { + "epoch": 13.022704869197856, + "grad_norm": 0.0028054153081029654, + "learning_rate": 3.6237640446057284e-05, + "loss": 0.0077, + "step": 64200 + }, + { + "epoch": 13.022759031576667, + "grad_norm": 0.004000775050371885, + "learning_rate": 3.623463142501219e-05, + "loss": 0.0896, + "step": 64210 + }, + { + "epoch": 13.022813193955479, + "grad_norm": 0.0031889344099909067, + "learning_rate": 3.6231622403967096e-05, + "loss": 0.0881, + "step": 64220 + }, + { + "epoch": 13.02286735633429, + "grad_norm": 3.043365478515625, + "learning_rate": 3.6228613382922e-05, + "loss": 0.1213, + "step": 64230 + }, + { + "epoch": 13.022921518713101, + "grad_norm": 1.1801310777664185, + "learning_rate": 3.622560436187691e-05, + "loss": 0.0036, + "step": 64240 + }, + { + "epoch": 13.022975681091914, + "grad_norm": 0.013032562099397182, + "learning_rate": 3.6222595340831815e-05, + "loss": 0.0705, + "step": 64250 + }, + { + "epoch": 13.023029843470725, + "grad_norm": 0.03223249688744545, + "learning_rate": 3.621958631978672e-05, + "loss": 0.0139, + "step": 64260 + }, + { + "epoch": 13.023084005849537, + "grad_norm": 0.10622381418943405, + "learning_rate": 3.621657729874163e-05, + "loss": 0.0843, + "step": 64270 + }, + { + "epoch": 13.023138168228348, + "grad_norm": 0.0027597825974226, + "learning_rate": 3.6213568277696534e-05, + "loss": 0.0704, + "step": 64280 + }, + { + "epoch": 13.023192330607161, + "grad_norm": 0.00266565615311265, + "learning_rate": 3.621055925665145e-05, + "loss": 0.0013, + "step": 64290 + }, + { + "epoch": 13.023246492985972, + "grad_norm": 0.0380854457616806, + "learning_rate": 3.620755023560635e-05, + "loss": 0.0197, + "step": 64300 + }, + { + "epoch": 13.023300655364784, + "grad_norm": 0.0027255334425717592, + "learning_rate": 3.620454121456125e-05, + "loss": 0.0026, + "step": 64310 + }, + { + "epoch": 13.023354817743595, + "grad_norm": 3.643831729888916, + "learning_rate": 3.6201532193516166e-05, + "loss": 0.0889, + "step": 64320 + }, + { + "epoch": 13.023408980122406, + "grad_norm": 0.10792498290538788, + "learning_rate": 3.619852317247107e-05, + "loss": 0.0332, + "step": 64330 + }, + { + "epoch": 13.02346314250122, + "grad_norm": 0.006014194339513779, + "learning_rate": 3.619551415142597e-05, + "loss": 0.0611, + "step": 64340 + }, + { + "epoch": 13.02351730488003, + "grad_norm": 0.01933242566883564, + "learning_rate": 3.6192505130380885e-05, + "loss": 0.0511, + 
"step": 64350 + }, + { + "epoch": 13.023571467258842, + "grad_norm": 2.899066209793091, + "learning_rate": 3.618949610933579e-05, + "loss": 0.1438, + "step": 64360 + }, + { + "epoch": 13.023625629637653, + "grad_norm": 0.05953129380941391, + "learning_rate": 3.61864870882907e-05, + "loss": 0.0036, + "step": 64370 + }, + { + "epoch": 13.023679792016466, + "grad_norm": 0.024762725457549095, + "learning_rate": 3.61834780672456e-05, + "loss": 0.0279, + "step": 64380 + }, + { + "epoch": 13.023733954395277, + "grad_norm": 0.3717885911464691, + "learning_rate": 3.618046904620051e-05, + "loss": 0.0321, + "step": 64390 + }, + { + "epoch": 13.023788116774089, + "grad_norm": 38.74428939819336, + "learning_rate": 3.6177460025155416e-05, + "loss": 0.0537, + "step": 64400 + }, + { + "epoch": 13.0238422791529, + "grad_norm": 0.0025990279391407967, + "learning_rate": 3.617445100411033e-05, + "loss": 0.0321, + "step": 64410 + }, + { + "epoch": 13.023896441531711, + "grad_norm": 0.003089729230850935, + "learning_rate": 3.617144198306523e-05, + "loss": 0.0074, + "step": 64420 + }, + { + "epoch": 13.023950603910524, + "grad_norm": 2.05604887008667, + "learning_rate": 3.6168432962020135e-05, + "loss": 0.1108, + "step": 64430 + }, + { + "epoch": 13.024004766289336, + "grad_norm": 0.3903493583202362, + "learning_rate": 3.616542394097505e-05, + "loss": 0.0431, + "step": 64440 + }, + { + "epoch": 13.024058928668147, + "grad_norm": 0.02324155531823635, + "learning_rate": 3.6162414919929954e-05, + "loss": 0.076, + "step": 64450 + }, + { + "epoch": 13.024113091046958, + "grad_norm": 0.021919239312410355, + "learning_rate": 3.615940589888486e-05, + "loss": 0.1719, + "step": 64460 + }, + { + "epoch": 13.024167253425771, + "grad_norm": 0.2716754376888275, + "learning_rate": 3.615639687783977e-05, + "loss": 0.0088, + "step": 64470 + }, + { + "epoch": 13.024221415804583, + "grad_norm": 0.3284105956554413, + "learning_rate": 3.615338785679467e-05, + "loss": 0.0087, + "step": 64480 + }, + { + "epoch": 13.024275578183394, + "grad_norm": 3.04841947555542, + "learning_rate": 3.615037883574958e-05, + "loss": 0.0982, + "step": 64490 + }, + { + "epoch": 13.024329740562205, + "grad_norm": 1.0859380960464478, + "learning_rate": 3.6147369814704485e-05, + "loss": 0.1161, + "step": 64500 + }, + { + "epoch": 13.024383902941016, + "grad_norm": 0.0025651955511420965, + "learning_rate": 3.614436079365939e-05, + "loss": 0.0403, + "step": 64510 + }, + { + "epoch": 13.02443806531983, + "grad_norm": 0.0027652594726532698, + "learning_rate": 3.61413517726143e-05, + "loss": 0.0326, + "step": 64520 + }, + { + "epoch": 13.02449222769864, + "grad_norm": 1.6899696588516235, + "learning_rate": 3.6138342751569204e-05, + "loss": 0.0966, + "step": 64530 + }, + { + "epoch": 13.024546390077452, + "grad_norm": 5.984245777130127, + "learning_rate": 3.613533373052411e-05, + "loss": 0.0395, + "step": 64540 + }, + { + "epoch": 13.024600552456263, + "grad_norm": 0.002670355373993516, + "learning_rate": 3.6132324709479024e-05, + "loss": 0.0518, + "step": 64550 + }, + { + "epoch": 13.024654714835076, + "grad_norm": 0.03780616819858551, + "learning_rate": 3.612931568843393e-05, + "loss": 0.0158, + "step": 64560 + }, + { + "epoch": 13.024708877213888, + "grad_norm": 0.0024927090853452682, + "learning_rate": 3.612630666738883e-05, + "loss": 0.0228, + "step": 64570 + }, + { + "epoch": 13.024763039592699, + "grad_norm": 0.01147007942199707, + "learning_rate": 3.612329764634374e-05, + "loss": 0.004, + "step": 64580 + }, + { + "epoch": 13.02481720197151, + "grad_norm": 
6.400564193725586, + "learning_rate": 3.612028862529865e-05, + "loss": 0.0636, + "step": 64590 + }, + { + "epoch": 13.024871364350322, + "grad_norm": 0.804305911064148, + "learning_rate": 3.6117279604253555e-05, + "loss": 0.0102, + "step": 64600 + }, + { + "epoch": 13.024925526729135, + "grad_norm": 4.6574506759643555, + "learning_rate": 3.611427058320846e-05, + "loss": 0.072, + "step": 64610 + }, + { + "epoch": 13.024979689107946, + "grad_norm": 0.08224312961101532, + "learning_rate": 3.611126156216337e-05, + "loss": 0.0095, + "step": 64620 + }, + { + "epoch": 13.025001354059471, + "eval_accuracy": 0.7877204441541477, + "eval_loss": 0.9396129846572876, + "eval_runtime": 117.3181, + "eval_samples_per_second": 26.1, + "eval_steps_per_second": 3.265, + "step": 64624 + }, + { + "epoch": 14.000032497427288, + "grad_norm": 0.053867507725954056, + "learning_rate": 3.6108252541118274e-05, + "loss": 0.0759, + "step": 64630 + }, + { + "epoch": 14.0000866598061, + "grad_norm": 0.0495491549372673, + "learning_rate": 3.610524352007318e-05, + "loss": 0.0932, + "step": 64640 + }, + { + "epoch": 14.00014082218491, + "grad_norm": 1.7535308599472046, + "learning_rate": 3.6102234499028086e-05, + "loss": 0.1574, + "step": 64650 + }, + { + "epoch": 14.000194984563722, + "grad_norm": 1.2763429880142212, + "learning_rate": 3.609922547798299e-05, + "loss": 0.0403, + "step": 64660 + }, + { + "epoch": 14.000249146942533, + "grad_norm": 0.039449095726013184, + "learning_rate": 3.6096216456937906e-05, + "loss": 0.0494, + "step": 64670 + }, + { + "epoch": 14.000303309321346, + "grad_norm": 0.18761277198791504, + "learning_rate": 3.6093207435892805e-05, + "loss": 0.0069, + "step": 64680 + }, + { + "epoch": 14.000357471700157, + "grad_norm": 0.11858753859996796, + "learning_rate": 3.609019841484771e-05, + "loss": 0.0342, + "step": 64690 + }, + { + "epoch": 14.000411634078969, + "grad_norm": 1.076132893562317, + "learning_rate": 3.6087189393802625e-05, + "loss": 0.0506, + "step": 64700 + }, + { + "epoch": 14.00046579645778, + "grad_norm": 0.009732176549732685, + "learning_rate": 3.608418037275753e-05, + "loss": 0.0311, + "step": 64710 + }, + { + "epoch": 14.000519958836593, + "grad_norm": 0.1567910611629486, + "learning_rate": 3.608117135171244e-05, + "loss": 0.0484, + "step": 64720 + }, + { + "epoch": 14.000574121215404, + "grad_norm": 0.056060027331113815, + "learning_rate": 3.607816233066734e-05, + "loss": 0.0218, + "step": 64730 + }, + { + "epoch": 14.000628283594216, + "grad_norm": 0.0809292420744896, + "learning_rate": 3.607515330962225e-05, + "loss": 0.0846, + "step": 64740 + }, + { + "epoch": 14.000682445973027, + "grad_norm": 0.0019335525576025248, + "learning_rate": 3.6072144288577156e-05, + "loss": 0.0314, + "step": 64750 + }, + { + "epoch": 14.000736608351838, + "grad_norm": 0.004593320656567812, + "learning_rate": 3.606913526753206e-05, + "loss": 0.0173, + "step": 64760 + }, + { + "epoch": 14.000790770730651, + "grad_norm": 0.11364643275737762, + "learning_rate": 3.606612624648697e-05, + "loss": 0.0238, + "step": 64770 + }, + { + "epoch": 14.000844933109462, + "grad_norm": 0.5024983286857605, + "learning_rate": 3.606311722544188e-05, + "loss": 0.0254, + "step": 64780 + }, + { + "epoch": 14.000899095488274, + "grad_norm": 0.14966164529323578, + "learning_rate": 3.606010820439678e-05, + "loss": 0.0256, + "step": 64790 + }, + { + "epoch": 14.000953257867085, + "grad_norm": 0.002191508887335658, + "learning_rate": 3.605709918335169e-05, + "loss": 0.0153, + "step": 64800 + }, + { + "epoch": 14.001007420245898, + 
"grad_norm": 0.001569931861013174, + "learning_rate": 3.60540901623066e-05, + "loss": 0.0816, + "step": 64810 + }, + { + "epoch": 14.00106158262471, + "grad_norm": 0.2888840436935425, + "learning_rate": 3.6051081141261507e-05, + "loss": 0.0503, + "step": 64820 + }, + { + "epoch": 14.00111574500352, + "grad_norm": 0.14018045365810394, + "learning_rate": 3.6048072120216406e-05, + "loss": 0.0021, + "step": 64830 + }, + { + "epoch": 14.001169907382332, + "grad_norm": 0.0016614056657999754, + "learning_rate": 3.604506309917132e-05, + "loss": 0.0583, + "step": 64840 + }, + { + "epoch": 14.001224069761143, + "grad_norm": 0.0015906488988548517, + "learning_rate": 3.6042054078126225e-05, + "loss": 0.0039, + "step": 64850 + }, + { + "epoch": 14.001278232139956, + "grad_norm": 0.14152663946151733, + "learning_rate": 3.603904505708113e-05, + "loss": 0.0781, + "step": 64860 + }, + { + "epoch": 14.001332394518768, + "grad_norm": 0.002773679792881012, + "learning_rate": 3.603603603603604e-05, + "loss": 0.021, + "step": 64870 + }, + { + "epoch": 14.001386556897579, + "grad_norm": 0.001675907289609313, + "learning_rate": 3.6033027014990944e-05, + "loss": 0.1247, + "step": 64880 + }, + { + "epoch": 14.00144071927639, + "grad_norm": 0.024807317182421684, + "learning_rate": 3.603001799394585e-05, + "loss": 0.0519, + "step": 64890 + }, + { + "epoch": 14.001494881655203, + "grad_norm": 0.10581862926483154, + "learning_rate": 3.6027008972900764e-05, + "loss": 0.0022, + "step": 64900 + }, + { + "epoch": 14.001549044034014, + "grad_norm": 0.9906142950057983, + "learning_rate": 3.602399995185566e-05, + "loss": 0.0439, + "step": 64910 + }, + { + "epoch": 14.001603206412826, + "grad_norm": 2.2770228385925293, + "learning_rate": 3.602099093081057e-05, + "loss": 0.1013, + "step": 64920 + }, + { + "epoch": 14.001657368791637, + "grad_norm": 0.037905436009168625, + "learning_rate": 3.601798190976548e-05, + "loss": 0.1635, + "step": 64930 + }, + { + "epoch": 14.001711531170448, + "grad_norm": 0.0052741398103535175, + "learning_rate": 3.601497288872038e-05, + "loss": 0.0165, + "step": 64940 + }, + { + "epoch": 14.001765693549261, + "grad_norm": 0.032530393451452255, + "learning_rate": 3.601196386767529e-05, + "loss": 0.0284, + "step": 64950 + }, + { + "epoch": 14.001819855928073, + "grad_norm": 0.006099713034927845, + "learning_rate": 3.60089548466302e-05, + "loss": 0.0395, + "step": 64960 + }, + { + "epoch": 14.001874018306884, + "grad_norm": 0.0017131047789007425, + "learning_rate": 3.600594582558511e-05, + "loss": 0.0922, + "step": 64970 + }, + { + "epoch": 14.001928180685695, + "grad_norm": 2.4087088108062744, + "learning_rate": 3.6002936804540014e-05, + "loss": 0.1126, + "step": 64980 + }, + { + "epoch": 14.001982343064507, + "grad_norm": 0.004570648539811373, + "learning_rate": 3.599992778349492e-05, + "loss": 0.0548, + "step": 64990 + }, + { + "epoch": 14.00203650544332, + "grad_norm": 0.003062628908082843, + "learning_rate": 3.5996918762449826e-05, + "loss": 0.0015, + "step": 65000 + }, + { + "epoch": 14.00209066782213, + "grad_norm": 0.001721653388813138, + "learning_rate": 3.599390974140473e-05, + "loss": 0.1832, + "step": 65010 + }, + { + "epoch": 14.002144830200942, + "grad_norm": 0.0018388128373771906, + "learning_rate": 3.599090072035964e-05, + "loss": 0.0638, + "step": 65020 + }, + { + "epoch": 14.002198992579753, + "grad_norm": 0.0018897259142249823, + "learning_rate": 3.5987891699314545e-05, + "loss": 0.0001, + "step": 65030 + }, + { + "epoch": 14.002253154958566, + "grad_norm": 0.055544909089803696, + 
"learning_rate": 3.598488267826946e-05, + "loss": 0.042, + "step": 65040 + }, + { + "epoch": 14.002307317337378, + "grad_norm": 0.7462114691734314, + "learning_rate": 3.5981873657224364e-05, + "loss": 0.0468, + "step": 65050 + }, + { + "epoch": 14.002361479716189, + "grad_norm": 0.005201704800128937, + "learning_rate": 3.5978864636179264e-05, + "loss": 0.0806, + "step": 65060 + }, + { + "epoch": 14.002415642095, + "grad_norm": 0.002422438934445381, + "learning_rate": 3.597585561513418e-05, + "loss": 0.0222, + "step": 65070 + }, + { + "epoch": 14.002469804473812, + "grad_norm": 0.012228080071508884, + "learning_rate": 3.597284659408908e-05, + "loss": 0.0051, + "step": 65080 + }, + { + "epoch": 14.002523966852625, + "grad_norm": 0.006110442336648703, + "learning_rate": 3.596983757304398e-05, + "loss": 0.0867, + "step": 65090 + }, + { + "epoch": 14.002578129231436, + "grad_norm": 0.013506838120520115, + "learning_rate": 3.5966828551998896e-05, + "loss": 0.0726, + "step": 65100 + }, + { + "epoch": 14.002632291610247, + "grad_norm": 0.002191332168877125, + "learning_rate": 3.59638195309538e-05, + "loss": 0.0231, + "step": 65110 + }, + { + "epoch": 14.002686453989059, + "grad_norm": 1.2548243999481201, + "learning_rate": 3.596081050990871e-05, + "loss": 0.0732, + "step": 65120 + }, + { + "epoch": 14.002740616367872, + "grad_norm": 0.4375666081905365, + "learning_rate": 3.5957801488863615e-05, + "loss": 0.0356, + "step": 65130 + }, + { + "epoch": 14.002794778746683, + "grad_norm": 0.0028896809089928865, + "learning_rate": 3.595479246781852e-05, + "loss": 0.0706, + "step": 65140 + }, + { + "epoch": 14.002848941125494, + "grad_norm": 0.0019597201608121395, + "learning_rate": 3.595178344677343e-05, + "loss": 0.0098, + "step": 65150 + }, + { + "epoch": 14.002903103504305, + "grad_norm": 4.150387763977051, + "learning_rate": 3.594877442572834e-05, + "loss": 0.0871, + "step": 65160 + }, + { + "epoch": 14.002957265883117, + "grad_norm": 8.74413013458252, + "learning_rate": 3.594576540468324e-05, + "loss": 0.1548, + "step": 65170 + }, + { + "epoch": 14.00301142826193, + "grad_norm": 0.013693324290215969, + "learning_rate": 3.5942756383638146e-05, + "loss": 0.1357, + "step": 65180 + }, + { + "epoch": 14.003065590640741, + "grad_norm": 0.008726893924176693, + "learning_rate": 3.593974736259306e-05, + "loss": 0.0464, + "step": 65190 + }, + { + "epoch": 14.003119753019552, + "grad_norm": 0.022781115025281906, + "learning_rate": 3.5936738341547965e-05, + "loss": 0.015, + "step": 65200 + }, + { + "epoch": 14.003173915398364, + "grad_norm": 0.01370113343000412, + "learning_rate": 3.5933729320502865e-05, + "loss": 0.0009, + "step": 65210 + }, + { + "epoch": 14.003228077777177, + "grad_norm": 0.0030485684983432293, + "learning_rate": 3.593072029945778e-05, + "loss": 0.0224, + "step": 65220 + }, + { + "epoch": 14.003282240155988, + "grad_norm": 0.0021558182779699564, + "learning_rate": 3.5927711278412684e-05, + "loss": 0.0047, + "step": 65230 + }, + { + "epoch": 14.0033364025348, + "grad_norm": 0.012629101984202862, + "learning_rate": 3.592470225736759e-05, + "loss": 0.0289, + "step": 65240 + }, + { + "epoch": 14.00339056491361, + "grad_norm": 3.5213816165924072, + "learning_rate": 3.59216932363225e-05, + "loss": 0.1199, + "step": 65250 + }, + { + "epoch": 14.003444727292422, + "grad_norm": 0.05852276086807251, + "learning_rate": 3.59186842152774e-05, + "loss": 0.1302, + "step": 65260 + }, + { + "epoch": 14.003498889671235, + "grad_norm": 0.5318979024887085, + "learning_rate": 3.591567519423231e-05, + "loss": 
0.115, + "step": 65270 + }, + { + "epoch": 14.003553052050046, + "grad_norm": 0.004290642216801643, + "learning_rate": 3.5912666173187216e-05, + "loss": 0.0011, + "step": 65280 + }, + { + "epoch": 14.003607214428857, + "grad_norm": 0.07402566075325012, + "learning_rate": 3.590965715214212e-05, + "loss": 0.0522, + "step": 65290 + }, + { + "epoch": 14.003661376807669, + "grad_norm": 0.07526256144046783, + "learning_rate": 3.5906648131097035e-05, + "loss": 0.0013, + "step": 65300 + }, + { + "epoch": 14.003715539186482, + "grad_norm": 0.2729751467704773, + "learning_rate": 3.590363911005194e-05, + "loss": 0.1021, + "step": 65310 + }, + { + "epoch": 14.003769701565293, + "grad_norm": 2.10016131401062, + "learning_rate": 3.590063008900684e-05, + "loss": 0.1002, + "step": 65320 + }, + { + "epoch": 14.003823863944104, + "grad_norm": 0.10605352371931076, + "learning_rate": 3.5897621067961754e-05, + "loss": 0.068, + "step": 65330 + }, + { + "epoch": 14.003878026322916, + "grad_norm": 0.17539988458156586, + "learning_rate": 3.589461204691666e-05, + "loss": 0.1494, + "step": 65340 + }, + { + "epoch": 14.003932188701727, + "grad_norm": 0.6028537750244141, + "learning_rate": 3.5891603025871566e-05, + "loss": 0.0485, + "step": 65350 + }, + { + "epoch": 14.00398635108054, + "grad_norm": 2.1357977390289307, + "learning_rate": 3.588859400482647e-05, + "loss": 0.1271, + "step": 65360 + }, + { + "epoch": 14.004040513459351, + "grad_norm": 0.07309179753065109, + "learning_rate": 3.588558498378138e-05, + "loss": 0.0347, + "step": 65370 + }, + { + "epoch": 14.004094675838163, + "grad_norm": 0.028737926855683327, + "learning_rate": 3.5882575962736285e-05, + "loss": 0.0109, + "step": 65380 + }, + { + "epoch": 14.004148838216974, + "grad_norm": 2.662179708480835, + "learning_rate": 3.587956694169119e-05, + "loss": 0.0337, + "step": 65390 + }, + { + "epoch": 14.004203000595787, + "grad_norm": 0.03645888343453407, + "learning_rate": 3.58765579206461e-05, + "loss": 0.0519, + "step": 65400 + }, + { + "epoch": 14.004257162974598, + "grad_norm": 3.3465845584869385, + "learning_rate": 3.5873548899601004e-05, + "loss": 0.0763, + "step": 65410 + }, + { + "epoch": 14.00431132535341, + "grad_norm": 0.8649359941482544, + "learning_rate": 3.587053987855592e-05, + "loss": 0.144, + "step": 65420 + }, + { + "epoch": 14.00436548773222, + "grad_norm": 0.04873667657375336, + "learning_rate": 3.5867530857510816e-05, + "loss": 0.0095, + "step": 65430 + }, + { + "epoch": 14.004419650111032, + "grad_norm": 0.0314815454185009, + "learning_rate": 3.586452183646572e-05, + "loss": 0.1013, + "step": 65440 + }, + { + "epoch": 14.004473812489845, + "grad_norm": 0.1375814974308014, + "learning_rate": 3.5861512815420636e-05, + "loss": 0.0025, + "step": 65450 + }, + { + "epoch": 14.004527974868656, + "grad_norm": 0.004623536020517349, + "learning_rate": 3.585850379437554e-05, + "loss": 0.0548, + "step": 65460 + }, + { + "epoch": 14.004582137247468, + "grad_norm": 0.07612382620573044, + "learning_rate": 3.585549477333044e-05, + "loss": 0.1586, + "step": 65470 + }, + { + "epoch": 14.004636299626279, + "grad_norm": 1.7094182968139648, + "learning_rate": 3.5852485752285355e-05, + "loss": 0.0446, + "step": 65480 + }, + { + "epoch": 14.004690462005092, + "grad_norm": 0.008726349100470543, + "learning_rate": 3.584947673124026e-05, + "loss": 0.0584, + "step": 65490 + }, + { + "epoch": 14.004744624383903, + "grad_norm": 0.12596604228019714, + "learning_rate": 3.584646771019517e-05, + "loss": 0.0236, + "step": 65500 + }, + { + "epoch": 14.004798786762715, + 
"grad_norm": 0.09673327207565308, + "learning_rate": 3.5843458689150073e-05, + "loss": 0.0531, + "step": 65510 + }, + { + "epoch": 14.004852949141526, + "grad_norm": 0.235332190990448, + "learning_rate": 3.584044966810498e-05, + "loss": 0.0554, + "step": 65520 + }, + { + "epoch": 14.004907111520337, + "grad_norm": 1.1914558410644531, + "learning_rate": 3.5837440647059886e-05, + "loss": 0.0488, + "step": 65530 + }, + { + "epoch": 14.00496127389915, + "grad_norm": 0.6624277234077454, + "learning_rate": 3.583443162601479e-05, + "loss": 0.0234, + "step": 65540 + }, + { + "epoch": 14.005015436277962, + "grad_norm": 0.008455605246126652, + "learning_rate": 3.58314226049697e-05, + "loss": 0.0531, + "step": 65550 + }, + { + "epoch": 14.005069598656773, + "grad_norm": 0.04724739119410515, + "learning_rate": 3.582841358392461e-05, + "loss": 0.0182, + "step": 65560 + }, + { + "epoch": 14.005123761035584, + "grad_norm": 0.0045446050353348255, + "learning_rate": 3.582540456287952e-05, + "loss": 0.0699, + "step": 65570 + }, + { + "epoch": 14.005177923414397, + "grad_norm": 0.05546635761857033, + "learning_rate": 3.582239554183442e-05, + "loss": 0.1583, + "step": 65580 + }, + { + "epoch": 14.005232085793208, + "grad_norm": 1.4111641645431519, + "learning_rate": 3.581938652078933e-05, + "loss": 0.0774, + "step": 65590 + }, + { + "epoch": 14.00528624817202, + "grad_norm": 0.0029782429337501526, + "learning_rate": 3.581637749974424e-05, + "loss": 0.0286, + "step": 65600 + }, + { + "epoch": 14.005340410550831, + "grad_norm": 0.011178501881659031, + "learning_rate": 3.581336847869914e-05, + "loss": 0.0389, + "step": 65610 + }, + { + "epoch": 14.005394572929642, + "grad_norm": 0.6481369137763977, + "learning_rate": 3.581035945765405e-05, + "loss": 0.0218, + "step": 65620 + }, + { + "epoch": 14.005448735308455, + "grad_norm": 0.003120647044852376, + "learning_rate": 3.5807350436608955e-05, + "loss": 0.0482, + "step": 65630 + }, + { + "epoch": 14.005502897687267, + "grad_norm": 1.003159999847412, + "learning_rate": 3.580434141556386e-05, + "loss": 0.0104, + "step": 65640 + }, + { + "epoch": 14.005557060066078, + "grad_norm": 0.18122360110282898, + "learning_rate": 3.5801332394518775e-05, + "loss": 0.0082, + "step": 65650 + }, + { + "epoch": 14.00561122244489, + "grad_norm": 1.9561954736709595, + "learning_rate": 3.5798323373473674e-05, + "loss": 0.0358, + "step": 65660 + }, + { + "epoch": 14.005665384823702, + "grad_norm": 0.0017264712369069457, + "learning_rate": 3.579531435242858e-05, + "loss": 0.0507, + "step": 65670 + }, + { + "epoch": 14.005719547202514, + "grad_norm": 0.6476720571517944, + "learning_rate": 3.5792305331383494e-05, + "loss": 0.0431, + "step": 65680 + }, + { + "epoch": 14.005773709581325, + "grad_norm": 0.07603123039007187, + "learning_rate": 3.578929631033839e-05, + "loss": 0.1145, + "step": 65690 + }, + { + "epoch": 14.005827871960136, + "grad_norm": 0.0022035003639757633, + "learning_rate": 3.57862872892933e-05, + "loss": 0.0125, + "step": 65700 + }, + { + "epoch": 14.005882034338947, + "grad_norm": 0.012157502584159374, + "learning_rate": 3.578327826824821e-05, + "loss": 0.0041, + "step": 65710 + }, + { + "epoch": 14.00593619671776, + "grad_norm": 0.021773144602775574, + "learning_rate": 3.578026924720312e-05, + "loss": 0.1099, + "step": 65720 + }, + { + "epoch": 14.005990359096572, + "grad_norm": 0.8300250172615051, + "learning_rate": 3.577726022615802e-05, + "loss": 0.0839, + "step": 65730 + }, + { + "epoch": 14.006044521475383, + "grad_norm": 0.45189565420150757, + "learning_rate": 
3.577425120511293e-05, + "loss": 0.0333, + "step": 65740 + }, + { + "epoch": 14.006098683854194, + "grad_norm": 0.0019038324244320393, + "learning_rate": 3.577124218406784e-05, + "loss": 0.0473, + "step": 65750 + }, + { + "epoch": 14.006152846233007, + "grad_norm": 11.723151206970215, + "learning_rate": 3.5768233163022744e-05, + "loss": 0.0057, + "step": 65760 + }, + { + "epoch": 14.006207008611819, + "grad_norm": 0.020411163568496704, + "learning_rate": 3.576522414197765e-05, + "loss": 0.0052, + "step": 65770 + }, + { + "epoch": 14.00626117099063, + "grad_norm": 0.00819197203963995, + "learning_rate": 3.5762215120932556e-05, + "loss": 0.0693, + "step": 65780 + }, + { + "epoch": 14.006315333369441, + "grad_norm": 0.0017040437087416649, + "learning_rate": 3.575920609988746e-05, + "loss": 0.0641, + "step": 65790 + }, + { + "epoch": 14.006369495748253, + "grad_norm": 11.167998313903809, + "learning_rate": 3.5756197078842376e-05, + "loss": 0.0772, + "step": 65800 + }, + { + "epoch": 14.006423658127066, + "grad_norm": 0.017780620604753494, + "learning_rate": 3.5753188057797275e-05, + "loss": 0.0738, + "step": 65810 + }, + { + "epoch": 14.006477820505877, + "grad_norm": 0.283176988363266, + "learning_rate": 3.575017903675219e-05, + "loss": 0.0752, + "step": 65820 + }, + { + "epoch": 14.006531982884688, + "grad_norm": 3.562448263168335, + "learning_rate": 3.5747170015707095e-05, + "loss": 0.1148, + "step": 65830 + }, + { + "epoch": 14.0065861452635, + "grad_norm": 0.0489826500415802, + "learning_rate": 3.5744160994661994e-05, + "loss": 0.0446, + "step": 65840 + }, + { + "epoch": 14.006640307642312, + "grad_norm": 0.043509624898433685, + "learning_rate": 3.574115197361691e-05, + "loss": 0.0121, + "step": 65850 + }, + { + "epoch": 14.006694470021124, + "grad_norm": 0.27297529578208923, + "learning_rate": 3.573814295257181e-05, + "loss": 0.0299, + "step": 65860 + }, + { + "epoch": 14.006748632399935, + "grad_norm": 3.0385079383850098, + "learning_rate": 3.573513393152672e-05, + "loss": 0.1432, + "step": 65870 + }, + { + "epoch": 14.006802794778746, + "grad_norm": 5.287233352661133, + "learning_rate": 3.5732124910481626e-05, + "loss": 0.0386, + "step": 65880 + }, + { + "epoch": 14.006856957157558, + "grad_norm": 3.0921664237976074, + "learning_rate": 3.572911588943653e-05, + "loss": 0.1271, + "step": 65890 + }, + { + "epoch": 14.00691111953637, + "grad_norm": 0.5161243677139282, + "learning_rate": 3.572610686839144e-05, + "loss": 0.0745, + "step": 65900 + }, + { + "epoch": 14.006965281915182, + "grad_norm": 0.0024698448833078146, + "learning_rate": 3.572309784734635e-05, + "loss": 0.0336, + "step": 65910 + }, + { + "epoch": 14.007019444293993, + "grad_norm": 4.517660140991211, + "learning_rate": 3.572008882630125e-05, + "loss": 0.078, + "step": 65920 + }, + { + "epoch": 14.007073606672805, + "grad_norm": 0.0904371440410614, + "learning_rate": 3.571707980525616e-05, + "loss": 0.0063, + "step": 65930 + }, + { + "epoch": 14.007127769051618, + "grad_norm": 0.10805730521678925, + "learning_rate": 3.571407078421107e-05, + "loss": 0.0457, + "step": 65940 + }, + { + "epoch": 14.007181931430429, + "grad_norm": 0.0023731885012239218, + "learning_rate": 3.571106176316597e-05, + "loss": 0.1283, + "step": 65950 + }, + { + "epoch": 14.00723609380924, + "grad_norm": 1.265386939048767, + "learning_rate": 3.5708052742120876e-05, + "loss": 0.066, + "step": 65960 + }, + { + "epoch": 14.007290256188051, + "grad_norm": 0.13762082159519196, + "learning_rate": 3.570504372107579e-05, + "loss": 0.0152, + "step": 65970 + }, + 
{ + "epoch": 14.007344418566863, + "grad_norm": 0.002518306951969862, + "learning_rate": 3.5702034700030695e-05, + "loss": 0.0951, + "step": 65980 + }, + { + "epoch": 14.007398580945676, + "grad_norm": 0.030946820974349976, + "learning_rate": 3.5699025678985595e-05, + "loss": 0.0771, + "step": 65990 + }, + { + "epoch": 14.007452743324487, + "grad_norm": 0.040170054882764816, + "learning_rate": 3.569601665794051e-05, + "loss": 0.0153, + "step": 66000 + }, + { + "epoch": 14.007506905703298, + "grad_norm": 0.23306246101856232, + "learning_rate": 3.5693007636895414e-05, + "loss": 0.041, + "step": 66010 + }, + { + "epoch": 14.00756106808211, + "grad_norm": 2.6509127616882324, + "learning_rate": 3.568999861585032e-05, + "loss": 0.084, + "step": 66020 + }, + { + "epoch": 14.007615230460923, + "grad_norm": 0.028553806245326996, + "learning_rate": 3.568698959480523e-05, + "loss": 0.0657, + "step": 66030 + }, + { + "epoch": 14.007669392839734, + "grad_norm": 0.24344755709171295, + "learning_rate": 3.568398057376013e-05, + "loss": 0.1214, + "step": 66040 + }, + { + "epoch": 14.007723555218545, + "grad_norm": 0.03668375685811043, + "learning_rate": 3.568097155271504e-05, + "loss": 0.0049, + "step": 66050 + }, + { + "epoch": 14.007777717597357, + "grad_norm": 0.053048424422740936, + "learning_rate": 3.567796253166995e-05, + "loss": 0.0049, + "step": 66060 + }, + { + "epoch": 14.007831879976168, + "grad_norm": 8.07988452911377, + "learning_rate": 3.567495351062485e-05, + "loss": 0.1258, + "step": 66070 + }, + { + "epoch": 14.007886042354981, + "grad_norm": 2.5558319091796875, + "learning_rate": 3.5671944489579765e-05, + "loss": 0.0955, + "step": 66080 + }, + { + "epoch": 14.007940204733792, + "grad_norm": 0.005597478244453669, + "learning_rate": 3.566893546853467e-05, + "loss": 0.0091, + "step": 66090 + }, + { + "epoch": 14.007994367112603, + "grad_norm": 0.07203986495733261, + "learning_rate": 3.566592644748957e-05, + "loss": 0.1549, + "step": 66100 + }, + { + "epoch": 14.008048529491415, + "grad_norm": 1.758253812789917, + "learning_rate": 3.5662917426444484e-05, + "loss": 0.0218, + "step": 66110 + }, + { + "epoch": 14.008102691870226, + "grad_norm": 0.4616374671459198, + "learning_rate": 3.565990840539939e-05, + "loss": 0.086, + "step": 66120 + }, + { + "epoch": 14.00815685424904, + "grad_norm": 1.8285512924194336, + "learning_rate": 3.5656899384354296e-05, + "loss": 0.1102, + "step": 66130 + }, + { + "epoch": 14.00821101662785, + "grad_norm": 0.005583695136010647, + "learning_rate": 3.56538903633092e-05, + "loss": 0.0396, + "step": 66140 + }, + { + "epoch": 14.008265179006662, + "grad_norm": 0.25884363055229187, + "learning_rate": 3.565088134226411e-05, + "loss": 0.0248, + "step": 66150 + }, + { + "epoch": 14.008319341385473, + "grad_norm": 0.09887527674436569, + "learning_rate": 3.5647872321219015e-05, + "loss": 0.1073, + "step": 66160 + }, + { + "epoch": 14.008373503764286, + "grad_norm": 0.06443383544683456, + "learning_rate": 3.564486330017393e-05, + "loss": 0.0889, + "step": 66170 + }, + { + "epoch": 14.008427666143097, + "grad_norm": 0.03976134583353996, + "learning_rate": 3.564185427912883e-05, + "loss": 0.0194, + "step": 66180 + }, + { + "epoch": 14.008481828521909, + "grad_norm": 1.8872089385986328, + "learning_rate": 3.5638845258083734e-05, + "loss": 0.1391, + "step": 66190 + }, + { + "epoch": 14.00853599090072, + "grad_norm": 0.21617111563682556, + "learning_rate": 3.563583623703865e-05, + "loss": 0.0141, + "step": 66200 + }, + { + "epoch": 14.008590153279531, + "grad_norm": 
0.09267065674066544, + "learning_rate": 3.563282721599355e-05, + "loss": 0.1487, + "step": 66210 + }, + { + "epoch": 14.008644315658344, + "grad_norm": 2.0311920642852783, + "learning_rate": 3.562981819494845e-05, + "loss": 0.0954, + "step": 66220 + }, + { + "epoch": 14.008698478037156, + "grad_norm": 0.23543210327625275, + "learning_rate": 3.5626809173903366e-05, + "loss": 0.004, + "step": 66230 + }, + { + "epoch": 14.008752640415967, + "grad_norm": 0.0037619764916598797, + "learning_rate": 3.562380015285827e-05, + "loss": 0.0078, + "step": 66240 + }, + { + "epoch": 14.008806802794778, + "grad_norm": 3.001328706741333, + "learning_rate": 3.562079113181317e-05, + "loss": 0.0741, + "step": 66250 + }, + { + "epoch": 14.008860965173591, + "grad_norm": 0.28834277391433716, + "learning_rate": 3.5617782110768085e-05, + "loss": 0.0243, + "step": 66260 + }, + { + "epoch": 14.008915127552402, + "grad_norm": 0.191426619887352, + "learning_rate": 3.561477308972299e-05, + "loss": 0.01, + "step": 66270 + }, + { + "epoch": 14.008969289931214, + "grad_norm": 0.002151745604351163, + "learning_rate": 3.56117640686779e-05, + "loss": 0.0952, + "step": 66280 + }, + { + "epoch": 14.009023452310025, + "grad_norm": 0.0793423280119896, + "learning_rate": 3.5608755047632803e-05, + "loss": 0.0304, + "step": 66290 + }, + { + "epoch": 14.009077614688836, + "grad_norm": 0.002242705086246133, + "learning_rate": 3.560574602658771e-05, + "loss": 0.0607, + "step": 66300 + }, + { + "epoch": 14.00913177706765, + "grad_norm": 1.002663254737854, + "learning_rate": 3.5602737005542616e-05, + "loss": 0.0281, + "step": 66310 + }, + { + "epoch": 14.00918593944646, + "grad_norm": 0.9228360652923584, + "learning_rate": 3.559972798449753e-05, + "loss": 0.0168, + "step": 66320 + }, + { + "epoch": 14.009240101825272, + "grad_norm": 0.15951339900493622, + "learning_rate": 3.559671896345243e-05, + "loss": 0.1604, + "step": 66330 + }, + { + "epoch": 14.009294264204083, + "grad_norm": 0.002303372835740447, + "learning_rate": 3.559370994240734e-05, + "loss": 0.0037, + "step": 66340 + }, + { + "epoch": 14.009348426582896, + "grad_norm": 0.013603420928120613, + "learning_rate": 3.559070092136225e-05, + "loss": 0.0751, + "step": 66350 + }, + { + "epoch": 14.009402588961708, + "grad_norm": 0.5258148908615112, + "learning_rate": 3.5587691900317154e-05, + "loss": 0.0664, + "step": 66360 + }, + { + "epoch": 14.009456751340519, + "grad_norm": 0.07671401649713516, + "learning_rate": 3.558468287927206e-05, + "loss": 0.028, + "step": 66370 + }, + { + "epoch": 14.00951091371933, + "grad_norm": 0.0039547039195895195, + "learning_rate": 3.558167385822697e-05, + "loss": 0.0818, + "step": 66380 + }, + { + "epoch": 14.009565076098141, + "grad_norm": 2.185272216796875, + "learning_rate": 3.557866483718187e-05, + "loss": 0.0652, + "step": 66390 + }, + { + "epoch": 14.009619238476954, + "grad_norm": 0.023357583209872246, + "learning_rate": 3.557565581613678e-05, + "loss": 0.0324, + "step": 66400 + }, + { + "epoch": 14.009673400855766, + "grad_norm": 0.027748318389058113, + "learning_rate": 3.5572646795091686e-05, + "loss": 0.0946, + "step": 66410 + }, + { + "epoch": 14.009727563234577, + "grad_norm": 0.03607930243015289, + "learning_rate": 3.556963777404659e-05, + "loss": 0.1523, + "step": 66420 + }, + { + "epoch": 14.009781725613388, + "grad_norm": 0.0023921073880046606, + "learning_rate": 3.5566628753001505e-05, + "loss": 0.0158, + "step": 66430 + }, + { + "epoch": 14.009835887992201, + "grad_norm": 1.095345139503479, + "learning_rate": 
3.5563619731956404e-05, + "loss": 0.0202, + "step": 66440 + }, + { + "epoch": 14.009890050371013, + "grad_norm": 0.35237249732017517, + "learning_rate": 3.556061071091131e-05, + "loss": 0.0672, + "step": 66450 + }, + { + "epoch": 14.009944212749824, + "grad_norm": 2.074861526489258, + "learning_rate": 3.5557601689866224e-05, + "loss": 0.1489, + "step": 66460 + }, + { + "epoch": 14.009998375128635, + "grad_norm": 0.014140096493065357, + "learning_rate": 3.555459266882113e-05, + "loss": 0.0914, + "step": 66470 + }, + { + "epoch": 14.010052537507447, + "grad_norm": 6.532153606414795, + "learning_rate": 3.555158364777603e-05, + "loss": 0.1583, + "step": 66480 + }, + { + "epoch": 14.01010669988626, + "grad_norm": 0.11231096088886261, + "learning_rate": 3.554857462673094e-05, + "loss": 0.043, + "step": 66490 + }, + { + "epoch": 14.01016086226507, + "grad_norm": 0.04133201763033867, + "learning_rate": 3.554556560568585e-05, + "loss": 0.0326, + "step": 66500 + }, + { + "epoch": 14.010215024643882, + "grad_norm": 0.9714996218681335, + "learning_rate": 3.5542556584640755e-05, + "loss": 0.0269, + "step": 66510 + }, + { + "epoch": 14.010269187022693, + "grad_norm": 0.035958632826805115, + "learning_rate": 3.553954756359566e-05, + "loss": 0.0613, + "step": 66520 + }, + { + "epoch": 14.010323349401506, + "grad_norm": 0.001912723877467215, + "learning_rate": 3.553653854255057e-05, + "loss": 0.0618, + "step": 66530 + }, + { + "epoch": 14.010377511780318, + "grad_norm": 0.0064573441632092, + "learning_rate": 3.5533529521505474e-05, + "loss": 0.0643, + "step": 66540 + }, + { + "epoch": 14.010431674159129, + "grad_norm": 0.02405594289302826, + "learning_rate": 3.553052050046038e-05, + "loss": 0.0491, + "step": 66550 + }, + { + "epoch": 14.01048583653794, + "grad_norm": 1.2256089448928833, + "learning_rate": 3.5527511479415286e-05, + "loss": 0.092, + "step": 66560 + }, + { + "epoch": 14.010539998916752, + "grad_norm": 1.5085865259170532, + "learning_rate": 3.552450245837019e-05, + "loss": 0.0316, + "step": 66570 + }, + { + "epoch": 14.010594161295565, + "grad_norm": 0.018639590591192245, + "learning_rate": 3.5521493437325106e-05, + "loss": 0.1147, + "step": 66580 + }, + { + "epoch": 14.010648323674376, + "grad_norm": 10.146055221557617, + "learning_rate": 3.5518484416280005e-05, + "loss": 0.0734, + "step": 66590 + }, + { + "epoch": 14.010702486053187, + "grad_norm": 0.05364314094185829, + "learning_rate": 3.551547539523492e-05, + "loss": 0.0077, + "step": 66600 + }, + { + "epoch": 14.010756648431999, + "grad_norm": 12.156010627746582, + "learning_rate": 3.5512466374189825e-05, + "loss": 0.028, + "step": 66610 + }, + { + "epoch": 14.010810810810812, + "grad_norm": 1.1625401973724365, + "learning_rate": 3.550945735314473e-05, + "loss": 0.0139, + "step": 66620 + }, + { + "epoch": 14.010864973189623, + "grad_norm": 1.7684699296951294, + "learning_rate": 3.550644833209964e-05, + "loss": 0.0231, + "step": 66630 + }, + { + "epoch": 14.010919135568434, + "grad_norm": 0.005208292510360479, + "learning_rate": 3.5503439311054543e-05, + "loss": 0.0091, + "step": 66640 + }, + { + "epoch": 14.010973297947245, + "grad_norm": 0.014389759860932827, + "learning_rate": 3.550043029000945e-05, + "loss": 0.1035, + "step": 66650 + }, + { + "epoch": 14.011027460326057, + "grad_norm": 0.06594161689281464, + "learning_rate": 3.5497421268964356e-05, + "loss": 0.0006, + "step": 66660 + }, + { + "epoch": 14.01108162270487, + "grad_norm": 1.7129089832305908, + "learning_rate": 3.549441224791926e-05, + "loss": 0.0811, + "step": 66670 + }, 
+ { + "epoch": 14.011135785083681, + "grad_norm": 0.048489611595869064, + "learning_rate": 3.549140322687417e-05, + "loss": 0.046, + "step": 66680 + }, + { + "epoch": 14.011189947462492, + "grad_norm": 0.06672480702400208, + "learning_rate": 3.548839420582908e-05, + "loss": 0.0939, + "step": 66690 + }, + { + "epoch": 14.011244109841304, + "grad_norm": 0.6247575879096985, + "learning_rate": 3.548538518478398e-05, + "loss": 0.0526, + "step": 66700 + }, + { + "epoch": 14.011298272220117, + "grad_norm": 0.03180408105254173, + "learning_rate": 3.548237616373889e-05, + "loss": 0.0849, + "step": 66710 + }, + { + "epoch": 14.011352434598928, + "grad_norm": 2.329821825027466, + "learning_rate": 3.54793671426938e-05, + "loss": 0.0695, + "step": 66720 + }, + { + "epoch": 14.01140659697774, + "grad_norm": 0.3701297342777252, + "learning_rate": 3.547635812164871e-05, + "loss": 0.0828, + "step": 66730 + }, + { + "epoch": 14.01146075935655, + "grad_norm": 1.300112009048462, + "learning_rate": 3.5473349100603606e-05, + "loss": 0.0988, + "step": 66740 + }, + { + "epoch": 14.011514921735362, + "grad_norm": 0.0021301230881363153, + "learning_rate": 3.547034007955852e-05, + "loss": 0.0569, + "step": 66750 + }, + { + "epoch": 14.011569084114175, + "grad_norm": 0.013361272402107716, + "learning_rate": 3.5467331058513425e-05, + "loss": 0.0083, + "step": 66760 + }, + { + "epoch": 14.011623246492986, + "grad_norm": 0.0111166350543499, + "learning_rate": 3.546432203746833e-05, + "loss": 0.0514, + "step": 66770 + }, + { + "epoch": 14.011677408871797, + "grad_norm": 8.655162811279297, + "learning_rate": 3.546131301642324e-05, + "loss": 0.1452, + "step": 66780 + }, + { + "epoch": 14.011731571250609, + "grad_norm": 0.9417712688446045, + "learning_rate": 3.5458303995378144e-05, + "loss": 0.1053, + "step": 66790 + }, + { + "epoch": 14.011785733629422, + "grad_norm": 0.05688268318772316, + "learning_rate": 3.545529497433305e-05, + "loss": 0.0836, + "step": 66800 + }, + { + "epoch": 14.011839896008233, + "grad_norm": 15.783863067626953, + "learning_rate": 3.5452285953287964e-05, + "loss": 0.049, + "step": 66810 + }, + { + "epoch": 14.011894058387044, + "grad_norm": 0.011214025318622589, + "learning_rate": 3.544927693224286e-05, + "loss": 0.0082, + "step": 66820 + }, + { + "epoch": 14.011948220765856, + "grad_norm": 0.09818631410598755, + "learning_rate": 3.544626791119777e-05, + "loss": 0.0303, + "step": 66830 + }, + { + "epoch": 14.012002383144667, + "grad_norm": 0.009335997514426708, + "learning_rate": 3.544325889015268e-05, + "loss": 0.1867, + "step": 66840 + }, + { + "epoch": 14.01205654552348, + "grad_norm": 0.05730559676885605, + "learning_rate": 3.544024986910758e-05, + "loss": 0.1031, + "step": 66850 + }, + { + "epoch": 14.012110707902291, + "grad_norm": 0.0035678830463439226, + "learning_rate": 3.5437240848062495e-05, + "loss": 0.0608, + "step": 66860 + }, + { + "epoch": 14.012164870281103, + "grad_norm": 0.0030012507922947407, + "learning_rate": 3.54342318270174e-05, + "loss": 0.0056, + "step": 66870 + }, + { + "epoch": 14.012219032659914, + "grad_norm": 0.007143942639231682, + "learning_rate": 3.543122280597231e-05, + "loss": 0.0321, + "step": 66880 + }, + { + "epoch": 14.012273195038727, + "grad_norm": 1.8817505836486816, + "learning_rate": 3.5428213784927214e-05, + "loss": 0.0735, + "step": 66890 + }, + { + "epoch": 14.012327357417538, + "grad_norm": 0.08933673053979874, + "learning_rate": 3.542520476388212e-05, + "loss": 0.146, + "step": 66900 + }, + { + "epoch": 14.01238151979635, + "grad_norm": 
0.002325771376490593, + "learning_rate": 3.5422195742837026e-05, + "loss": 0.0401, + "step": 66910 + }, + { + "epoch": 14.01243568217516, + "grad_norm": 2.734161376953125, + "learning_rate": 3.541918672179193e-05, + "loss": 0.0994, + "step": 66920 + }, + { + "epoch": 14.012489844553972, + "grad_norm": 1.5567985773086548, + "learning_rate": 3.541617770074684e-05, + "loss": 0.1179, + "step": 66930 + }, + { + "epoch": 14.012544006932785, + "grad_norm": 1.8434104919433594, + "learning_rate": 3.5413168679701745e-05, + "loss": 0.0356, + "step": 66940 + }, + { + "epoch": 14.012598169311596, + "grad_norm": 4.755203723907471, + "learning_rate": 3.541015965865666e-05, + "loss": 0.1078, + "step": 66950 + }, + { + "epoch": 14.012652331690408, + "grad_norm": 0.007749571464955807, + "learning_rate": 3.5407150637611565e-05, + "loss": 0.0105, + "step": 66960 + }, + { + "epoch": 14.012706494069219, + "grad_norm": 0.03007737174630165, + "learning_rate": 3.5404141616566464e-05, + "loss": 0.0415, + "step": 66970 + }, + { + "epoch": 14.012760656448032, + "grad_norm": 4.654564380645752, + "learning_rate": 3.540113259552138e-05, + "loss": 0.0573, + "step": 66980 + }, + { + "epoch": 14.012814818826843, + "grad_norm": 2.1636359691619873, + "learning_rate": 3.539812357447628e-05, + "loss": 0.0936, + "step": 66990 + }, + { + "epoch": 14.012868981205655, + "grad_norm": 0.026301275938749313, + "learning_rate": 3.539511455343118e-05, + "loss": 0.082, + "step": 67000 + }, + { + "epoch": 14.012923143584466, + "grad_norm": 0.11471811681985855, + "learning_rate": 3.5392105532386096e-05, + "loss": 0.0481, + "step": 67010 + }, + { + "epoch": 14.012977305963277, + "grad_norm": 0.026231897994875908, + "learning_rate": 3.5389096511341e-05, + "loss": 0.0235, + "step": 67020 + }, + { + "epoch": 14.01303146834209, + "grad_norm": 0.011744718998670578, + "learning_rate": 3.538608749029591e-05, + "loss": 0.0729, + "step": 67030 + }, + { + "epoch": 14.013085630720902, + "grad_norm": 0.01631857641041279, + "learning_rate": 3.5383078469250815e-05, + "loss": 0.0942, + "step": 67040 + }, + { + "epoch": 14.013139793099713, + "grad_norm": 5.858458995819092, + "learning_rate": 3.538006944820572e-05, + "loss": 0.1934, + "step": 67050 + }, + { + "epoch": 14.013193955478524, + "grad_norm": 0.011530772782862186, + "learning_rate": 3.537706042716063e-05, + "loss": 0.1607, + "step": 67060 + }, + { + "epoch": 14.013248117857337, + "grad_norm": 0.21055695414543152, + "learning_rate": 3.537405140611554e-05, + "loss": 0.1526, + "step": 67070 + }, + { + "epoch": 14.013302280236148, + "grad_norm": 0.433136522769928, + "learning_rate": 3.537104238507044e-05, + "loss": 0.1666, + "step": 67080 + }, + { + "epoch": 14.01335644261496, + "grad_norm": 0.013467992655932903, + "learning_rate": 3.5368033364025346e-05, + "loss": 0.1211, + "step": 67090 + }, + { + "epoch": 14.013410604993771, + "grad_norm": 0.02370447665452957, + "learning_rate": 3.536502434298026e-05, + "loss": 0.0488, + "step": 67100 + }, + { + "epoch": 14.013464767372582, + "grad_norm": 2.515540361404419, + "learning_rate": 3.5362015321935165e-05, + "loss": 0.0888, + "step": 67110 + }, + { + "epoch": 14.013518929751395, + "grad_norm": 0.019959336146712303, + "learning_rate": 3.535900630089007e-05, + "loss": 0.0541, + "step": 67120 + }, + { + "epoch": 14.013573092130207, + "grad_norm": 0.08573754131793976, + "learning_rate": 3.535599727984498e-05, + "loss": 0.0171, + "step": 67130 + }, + { + "epoch": 14.013627254509018, + "grad_norm": 0.009488804265856743, + "learning_rate": 
3.5352988258799884e-05, + "loss": 0.0446, + "step": 67140 + }, + { + "epoch": 14.01368141688783, + "grad_norm": 1.2910406589508057, + "learning_rate": 3.534997923775479e-05, + "loss": 0.0489, + "step": 67150 + }, + { + "epoch": 14.013735579266642, + "grad_norm": 0.08976460993289948, + "learning_rate": 3.53469702167097e-05, + "loss": 0.0511, + "step": 67160 + }, + { + "epoch": 14.013789741645454, + "grad_norm": 0.0031799678690731525, + "learning_rate": 3.53439611956646e-05, + "loss": 0.0191, + "step": 67170 + }, + { + "epoch": 14.013843904024265, + "grad_norm": 0.07837420701980591, + "learning_rate": 3.534095217461951e-05, + "loss": 0.008, + "step": 67180 + }, + { + "epoch": 14.013898066403076, + "grad_norm": 0.005893607623875141, + "learning_rate": 3.5337943153574416e-05, + "loss": 0.0106, + "step": 67190 + }, + { + "epoch": 14.013952228781887, + "grad_norm": 0.016473762691020966, + "learning_rate": 3.533493413252932e-05, + "loss": 0.045, + "step": 67200 + }, + { + "epoch": 14.0140063911607, + "grad_norm": 0.01664581336081028, + "learning_rate": 3.5331925111484235e-05, + "loss": 0.0251, + "step": 67210 + }, + { + "epoch": 14.014060553539512, + "grad_norm": 0.18821856379508972, + "learning_rate": 3.532891609043914e-05, + "loss": 0.1401, + "step": 67220 + }, + { + "epoch": 14.014114715918323, + "grad_norm": 2.7178826332092285, + "learning_rate": 3.532590706939404e-05, + "loss": 0.0593, + "step": 67230 + }, + { + "epoch": 14.014168878297134, + "grad_norm": 5.40526008605957, + "learning_rate": 3.5322898048348954e-05, + "loss": 0.0832, + "step": 67240 + }, + { + "epoch": 14.014223040675947, + "grad_norm": 1.4034756422042847, + "learning_rate": 3.531988902730386e-05, + "loss": 0.0564, + "step": 67250 + }, + { + "epoch": 14.014277203054759, + "grad_norm": 3.954102039337158, + "learning_rate": 3.5316880006258766e-05, + "loss": 0.0867, + "step": 67260 + }, + { + "epoch": 14.01433136543357, + "grad_norm": 1.2747764587402344, + "learning_rate": 3.531387098521367e-05, + "loss": 0.0706, + "step": 67270 + }, + { + "epoch": 14.014385527812381, + "grad_norm": 0.8000206351280212, + "learning_rate": 3.531086196416858e-05, + "loss": 0.0603, + "step": 67280 + }, + { + "epoch": 14.014439690191193, + "grad_norm": 11.15805435180664, + "learning_rate": 3.5307852943123485e-05, + "loss": 0.0578, + "step": 67290 + }, + { + "epoch": 14.014493852570006, + "grad_norm": 2.840083599090576, + "learning_rate": 3.530484392207839e-05, + "loss": 0.0587, + "step": 67300 + }, + { + "epoch": 14.014548014948817, + "grad_norm": 0.9721812009811401, + "learning_rate": 3.53018349010333e-05, + "loss": 0.0513, + "step": 67310 + }, + { + "epoch": 14.014602177327628, + "grad_norm": 0.20061296224594116, + "learning_rate": 3.5298825879988204e-05, + "loss": 0.1369, + "step": 67320 + }, + { + "epoch": 14.01465633970644, + "grad_norm": 0.0033389029558748007, + "learning_rate": 3.529581685894312e-05, + "loss": 0.1222, + "step": 67330 + }, + { + "epoch": 14.01471050208525, + "grad_norm": 0.3126959800720215, + "learning_rate": 3.5292807837898017e-05, + "loss": 0.0928, + "step": 67340 + }, + { + "epoch": 14.014764664464064, + "grad_norm": 1.0634485483169556, + "learning_rate": 3.528979881685292e-05, + "loss": 0.1146, + "step": 67350 + }, + { + "epoch": 14.014818826842875, + "grad_norm": 1.3921726942062378, + "learning_rate": 3.5286789795807836e-05, + "loss": 0.0799, + "step": 67360 + }, + { + "epoch": 14.014872989221686, + "grad_norm": 2.4824960231781006, + "learning_rate": 3.528378077476274e-05, + "loss": 0.0538, + "step": 67370 + }, + { + 
"epoch": 14.014927151600498, + "grad_norm": 0.8393128514289856, + "learning_rate": 3.528077175371765e-05, + "loss": 0.0361, + "step": 67380 + }, + { + "epoch": 14.01498131397931, + "grad_norm": 0.027402102947235107, + "learning_rate": 3.5277762732672555e-05, + "loss": 0.0564, + "step": 67390 + }, + { + "epoch": 14.015035476358122, + "grad_norm": 2.9625113010406494, + "learning_rate": 3.527475371162746e-05, + "loss": 0.0703, + "step": 67400 + }, + { + "epoch": 14.015089638736933, + "grad_norm": 0.16376882791519165, + "learning_rate": 3.527174469058237e-05, + "loss": 0.2551, + "step": 67410 + }, + { + "epoch": 14.015143801115745, + "grad_norm": 3.0663135051727295, + "learning_rate": 3.5268735669537274e-05, + "loss": 0.0543, + "step": 67420 + }, + { + "epoch": 14.015197963494556, + "grad_norm": 0.14883750677108765, + "learning_rate": 3.526572664849218e-05, + "loss": 0.0367, + "step": 67430 + }, + { + "epoch": 14.015252125873369, + "grad_norm": 0.002829132601618767, + "learning_rate": 3.5262717627447086e-05, + "loss": 0.0264, + "step": 67440 + }, + { + "epoch": 14.01530628825218, + "grad_norm": 0.09328059107065201, + "learning_rate": 3.525970860640199e-05, + "loss": 0.0378, + "step": 67450 + }, + { + "epoch": 14.015360450630991, + "grad_norm": 0.002487578196451068, + "learning_rate": 3.52566995853569e-05, + "loss": 0.0156, + "step": 67460 + }, + { + "epoch": 14.015414613009803, + "grad_norm": 0.2627184987068176, + "learning_rate": 3.525369056431181e-05, + "loss": 0.0645, + "step": 67470 + }, + { + "epoch": 14.015468775388616, + "grad_norm": 0.011717366054654121, + "learning_rate": 3.525068154326672e-05, + "loss": 0.038, + "step": 67480 + }, + { + "epoch": 14.015522937767427, + "grad_norm": 0.028239065781235695, + "learning_rate": 3.524767252222162e-05, + "loss": 0.0219, + "step": 67490 + }, + { + "epoch": 14.015577100146238, + "grad_norm": 1.5559645891189575, + "learning_rate": 3.524466350117653e-05, + "loss": 0.0557, + "step": 67500 + }, + { + "epoch": 14.01563126252505, + "grad_norm": 0.31659069657325745, + "learning_rate": 3.524165448013144e-05, + "loss": 0.0595, + "step": 67510 + }, + { + "epoch": 14.015685424903861, + "grad_norm": 0.0020398262422531843, + "learning_rate": 3.523864545908634e-05, + "loss": 0.0145, + "step": 67520 + }, + { + "epoch": 14.015739587282674, + "grad_norm": 0.022218158468604088, + "learning_rate": 3.523563643804125e-05, + "loss": 0.0113, + "step": 67530 + }, + { + "epoch": 14.015793749661485, + "grad_norm": 1.2416812181472778, + "learning_rate": 3.5232627416996156e-05, + "loss": 0.1444, + "step": 67540 + }, + { + "epoch": 14.015847912040297, + "grad_norm": 1.8710874319076538, + "learning_rate": 3.522961839595106e-05, + "loss": 0.0721, + "step": 67550 + }, + { + "epoch": 14.015902074419108, + "grad_norm": 0.0024036329705268145, + "learning_rate": 3.5226609374905975e-05, + "loss": 0.032, + "step": 67560 + }, + { + "epoch": 14.015956236797921, + "grad_norm": 0.027663182467222214, + "learning_rate": 3.5223600353860874e-05, + "loss": 0.0207, + "step": 67570 + }, + { + "epoch": 14.016010399176732, + "grad_norm": 0.08079852908849716, + "learning_rate": 3.522059133281578e-05, + "loss": 0.0906, + "step": 67580 + }, + { + "epoch": 14.016064561555543, + "grad_norm": 6.004470348358154, + "learning_rate": 3.5217582311770694e-05, + "loss": 0.1321, + "step": 67590 + }, + { + "epoch": 14.016118723934355, + "grad_norm": 0.0019847562070935965, + "learning_rate": 3.521457329072559e-05, + "loss": 0.0074, + "step": 67600 + }, + { + "epoch": 14.016172886313166, + "grad_norm": 
3.0703039169311523, + "learning_rate": 3.52115642696805e-05, + "loss": 0.0968, + "step": 67610 + }, + { + "epoch": 14.016227048691979, + "grad_norm": 0.13135115802288055, + "learning_rate": 3.520855524863541e-05, + "loss": 0.0041, + "step": 67620 + }, + { + "epoch": 14.01628121107079, + "grad_norm": 0.008164403960108757, + "learning_rate": 3.520554622759032e-05, + "loss": 0.018, + "step": 67630 + }, + { + "epoch": 14.016335373449602, + "grad_norm": 0.8517470955848694, + "learning_rate": 3.5202537206545225e-05, + "loss": 0.094, + "step": 67640 + }, + { + "epoch": 14.016389535828413, + "grad_norm": 0.003225624328479171, + "learning_rate": 3.519952818550013e-05, + "loss": 0.0064, + "step": 67650 + }, + { + "epoch": 14.016443698207226, + "grad_norm": 0.08998677879571915, + "learning_rate": 3.519651916445504e-05, + "loss": 0.1403, + "step": 67660 + }, + { + "epoch": 14.016497860586037, + "grad_norm": 0.6385543942451477, + "learning_rate": 3.5193510143409944e-05, + "loss": 0.0672, + "step": 67670 + }, + { + "epoch": 14.016552022964849, + "grad_norm": 0.017960330471396446, + "learning_rate": 3.519050112236485e-05, + "loss": 0.0168, + "step": 67680 + }, + { + "epoch": 14.01660618534366, + "grad_norm": 0.12378832697868347, + "learning_rate": 3.5187492101319756e-05, + "loss": 0.0445, + "step": 67690 + }, + { + "epoch": 14.016660347722471, + "grad_norm": 0.006358519662171602, + "learning_rate": 3.518448308027466e-05, + "loss": 0.0491, + "step": 67700 + }, + { + "epoch": 14.016714510101284, + "grad_norm": 0.055359601974487305, + "learning_rate": 3.5181474059229576e-05, + "loss": 0.086, + "step": 67710 + }, + { + "epoch": 14.016768672480096, + "grad_norm": 0.1125836968421936, + "learning_rate": 3.5178465038184475e-05, + "loss": 0.141, + "step": 67720 + }, + { + "epoch": 14.016822834858907, + "grad_norm": 0.005042908247560263, + "learning_rate": 3.517545601713939e-05, + "loss": 0.0854, + "step": 67730 + }, + { + "epoch": 14.016876997237718, + "grad_norm": 0.012868590652942657, + "learning_rate": 3.5172446996094295e-05, + "loss": 0.1352, + "step": 67740 + }, + { + "epoch": 14.016931159616531, + "grad_norm": 2.2838549613952637, + "learning_rate": 3.5169437975049194e-05, + "loss": 0.1086, + "step": 67750 + }, + { + "epoch": 14.016985321995342, + "grad_norm": 1.0372564792633057, + "learning_rate": 3.516642895400411e-05, + "loss": 0.0402, + "step": 67760 + }, + { + "epoch": 14.017039484374154, + "grad_norm": 0.09536727517843246, + "learning_rate": 3.5163419932959013e-05, + "loss": 0.113, + "step": 67770 + }, + { + "epoch": 14.017093646752965, + "grad_norm": 0.5047188401222229, + "learning_rate": 3.516041091191392e-05, + "loss": 0.0116, + "step": 67780 + }, + { + "epoch": 14.017147809131776, + "grad_norm": 1.1957467794418335, + "learning_rate": 3.5157401890868826e-05, + "loss": 0.116, + "step": 67790 + }, + { + "epoch": 14.01720197151059, + "grad_norm": 0.02592109702527523, + "learning_rate": 3.515439286982373e-05, + "loss": 0.0536, + "step": 67800 + }, + { + "epoch": 14.0172561338894, + "grad_norm": 0.18627530336380005, + "learning_rate": 3.515138384877864e-05, + "loss": 0.0567, + "step": 67810 + }, + { + "epoch": 14.017310296268212, + "grad_norm": 0.01069823931902647, + "learning_rate": 3.514837482773355e-05, + "loss": 0.0224, + "step": 67820 + }, + { + "epoch": 14.017364458647023, + "grad_norm": 0.009979560039937496, + "learning_rate": 3.514536580668845e-05, + "loss": 0.0556, + "step": 67830 + }, + { + "epoch": 14.017418621025836, + "grad_norm": 0.07321822643280029, + "learning_rate": 
3.514235678564336e-05, + "loss": 0.0495, + "step": 67840 + }, + { + "epoch": 14.017472783404648, + "grad_norm": 0.41814985871315, + "learning_rate": 3.513934776459827e-05, + "loss": 0.1563, + "step": 67850 + }, + { + "epoch": 14.017526945783459, + "grad_norm": 0.8829734325408936, + "learning_rate": 3.513633874355318e-05, + "loss": 0.0254, + "step": 67860 + }, + { + "epoch": 14.01758110816227, + "grad_norm": 0.03786894306540489, + "learning_rate": 3.5133329722508076e-05, + "loss": 0.1538, + "step": 67870 + }, + { + "epoch": 14.017635270541081, + "grad_norm": 0.2020547091960907, + "learning_rate": 3.513032070146299e-05, + "loss": 0.0059, + "step": 67880 + }, + { + "epoch": 14.017689432919894, + "grad_norm": 0.13168568909168243, + "learning_rate": 3.5127311680417896e-05, + "loss": 0.0069, + "step": 67890 + }, + { + "epoch": 14.017743595298706, + "grad_norm": 0.03948117420077324, + "learning_rate": 3.51243026593728e-05, + "loss": 0.0847, + "step": 67900 + }, + { + "epoch": 14.017797757677517, + "grad_norm": 2.9292163848876953, + "learning_rate": 3.512129363832771e-05, + "loss": 0.1115, + "step": 67910 + }, + { + "epoch": 14.017851920056328, + "grad_norm": 0.17495961487293243, + "learning_rate": 3.5118284617282614e-05, + "loss": 0.0745, + "step": 67920 + }, + { + "epoch": 14.017906082435141, + "grad_norm": 0.0030950799118727446, + "learning_rate": 3.511527559623752e-05, + "loss": 0.0277, + "step": 67930 + }, + { + "epoch": 14.017960244813953, + "grad_norm": 0.005945639219135046, + "learning_rate": 3.511226657519243e-05, + "loss": 0.0674, + "step": 67940 + }, + { + "epoch": 14.018014407192764, + "grad_norm": 0.022502118721604347, + "learning_rate": 3.510925755414733e-05, + "loss": 0.0064, + "step": 67950 + }, + { + "epoch": 14.018068569571575, + "grad_norm": 0.1379355788230896, + "learning_rate": 3.510624853310224e-05, + "loss": 0.067, + "step": 67960 + }, + { + "epoch": 14.018122731950386, + "grad_norm": 0.13334663212299347, + "learning_rate": 3.510323951205715e-05, + "loss": 0.0134, + "step": 67970 + }, + { + "epoch": 14.0181768943292, + "grad_norm": 0.00245639868080616, + "learning_rate": 3.510023049101205e-05, + "loss": 0.0253, + "step": 67980 + }, + { + "epoch": 14.01823105670801, + "grad_norm": 0.4178299605846405, + "learning_rate": 3.5097221469966965e-05, + "loss": 0.0878, + "step": 67990 + }, + { + "epoch": 14.018285219086822, + "grad_norm": 0.03480077534914017, + "learning_rate": 3.509421244892187e-05, + "loss": 0.073, + "step": 68000 + }, + { + "epoch": 14.018339381465633, + "grad_norm": 0.0062500606290996075, + "learning_rate": 3.509120342787678e-05, + "loss": 0.0608, + "step": 68010 + }, + { + "epoch": 14.018393543844446, + "grad_norm": 2.0523295402526855, + "learning_rate": 3.5088194406831684e-05, + "loss": 0.1704, + "step": 68020 + }, + { + "epoch": 14.018447706223258, + "grad_norm": 0.19508801400661469, + "learning_rate": 3.508518538578659e-05, + "loss": 0.0447, + "step": 68030 + }, + { + "epoch": 14.018501868602069, + "grad_norm": 0.011531739495694637, + "learning_rate": 3.5082176364741496e-05, + "loss": 0.0622, + "step": 68040 + }, + { + "epoch": 14.01855603098088, + "grad_norm": 0.3206077218055725, + "learning_rate": 3.50791673436964e-05, + "loss": 0.0648, + "step": 68050 + }, + { + "epoch": 14.018610193359692, + "grad_norm": 0.2786179780960083, + "learning_rate": 3.507615832265131e-05, + "loss": 0.0392, + "step": 68060 + }, + { + "epoch": 14.018664355738505, + "grad_norm": 7.559998989105225, + "learning_rate": 3.5073149301606215e-05, + "loss": 0.1019, + "step": 68070 + }, + { 
+ "epoch": 14.018718518117316, + "grad_norm": 0.16776715219020844, + "learning_rate": 3.507014028056113e-05, + "loss": 0.0309, + "step": 68080 + }, + { + "epoch": 14.018772680496127, + "grad_norm": 0.2674604058265686, + "learning_rate": 3.506713125951603e-05, + "loss": 0.0754, + "step": 68090 + }, + { + "epoch": 14.018826842874939, + "grad_norm": 0.003878691466525197, + "learning_rate": 3.5064122238470934e-05, + "loss": 0.0378, + "step": 68100 + }, + { + "epoch": 14.018881005253752, + "grad_norm": 0.17869392037391663, + "learning_rate": 3.506111321742585e-05, + "loss": 0.0936, + "step": 68110 + }, + { + "epoch": 14.018935167632563, + "grad_norm": 0.328668475151062, + "learning_rate": 3.505810419638075e-05, + "loss": 0.0509, + "step": 68120 + }, + { + "epoch": 14.018989330011374, + "grad_norm": 0.013309791684150696, + "learning_rate": 3.505509517533565e-05, + "loss": 0.1357, + "step": 68130 + }, + { + "epoch": 14.019043492390185, + "grad_norm": 0.004836323671042919, + "learning_rate": 3.5052086154290566e-05, + "loss": 0.0603, + "step": 68140 + }, + { + "epoch": 14.019097654768997, + "grad_norm": 0.2709469497203827, + "learning_rate": 3.504907713324547e-05, + "loss": 0.0193, + "step": 68150 + }, + { + "epoch": 14.01915181714781, + "grad_norm": 0.2333376705646515, + "learning_rate": 3.504606811220038e-05, + "loss": 0.0509, + "step": 68160 + }, + { + "epoch": 14.019205979526621, + "grad_norm": 0.301514208316803, + "learning_rate": 3.5043059091155285e-05, + "loss": 0.073, + "step": 68170 + }, + { + "epoch": 14.019260141905432, + "grad_norm": 8.183430671691895, + "learning_rate": 3.504005007011019e-05, + "loss": 0.053, + "step": 68180 + }, + { + "epoch": 14.019314304284244, + "grad_norm": 0.009504517540335655, + "learning_rate": 3.50370410490651e-05, + "loss": 0.0441, + "step": 68190 + }, + { + "epoch": 14.019368466663057, + "grad_norm": 0.0021227188408374786, + "learning_rate": 3.5034032028020004e-05, + "loss": 0.022, + "step": 68200 + }, + { + "epoch": 14.019422629041868, + "grad_norm": 0.05075376480817795, + "learning_rate": 3.503102300697491e-05, + "loss": 0.0482, + "step": 68210 + }, + { + "epoch": 14.01947679142068, + "grad_norm": 0.00258288299664855, + "learning_rate": 3.502801398592982e-05, + "loss": 0.0473, + "step": 68220 + }, + { + "epoch": 14.01953095379949, + "grad_norm": 0.04577900096774101, + "learning_rate": 3.502500496488473e-05, + "loss": 0.0321, + "step": 68230 + }, + { + "epoch": 14.019585116178302, + "grad_norm": 0.0019677975215017796, + "learning_rate": 3.502199594383963e-05, + "loss": 0.0205, + "step": 68240 + }, + { + "epoch": 14.019639278557115, + "grad_norm": 0.048961322754621506, + "learning_rate": 3.501898692279454e-05, + "loss": 0.0082, + "step": 68250 + }, + { + "epoch": 14.019693440935926, + "grad_norm": 0.29511070251464844, + "learning_rate": 3.501597790174945e-05, + "loss": 0.1139, + "step": 68260 + }, + { + "epoch": 14.019747603314737, + "grad_norm": 0.007372686639428139, + "learning_rate": 3.5012968880704354e-05, + "loss": 0.0811, + "step": 68270 + }, + { + "epoch": 14.019801765693549, + "grad_norm": 3.418614625930786, + "learning_rate": 3.500995985965926e-05, + "loss": 0.0768, + "step": 68280 + }, + { + "epoch": 14.019855928072362, + "grad_norm": 0.2084582895040512, + "learning_rate": 3.500695083861417e-05, + "loss": 0.0134, + "step": 68290 + }, + { + "epoch": 14.019910090451173, + "grad_norm": 0.008706502616405487, + "learning_rate": 3.500394181756907e-05, + "loss": 0.0372, + "step": 68300 + }, + { + "epoch": 14.019964252829984, + "grad_norm": 
0.17430023849010468, + "learning_rate": 3.5000932796523986e-05, + "loss": 0.0181, + "step": 68310 + }, + { + "epoch": 14.020018415208796, + "grad_norm": 0.03766495734453201, + "learning_rate": 3.4997923775478886e-05, + "loss": 0.0036, + "step": 68320 + }, + { + "epoch": 14.020072577587607, + "grad_norm": 0.02853299118578434, + "learning_rate": 3.499491475443379e-05, + "loss": 0.0044, + "step": 68330 + }, + { + "epoch": 14.02012673996642, + "grad_norm": 0.023075981065630913, + "learning_rate": 3.4991905733388705e-05, + "loss": 0.1025, + "step": 68340 + }, + { + "epoch": 14.020180902345231, + "grad_norm": 0.08907647430896759, + "learning_rate": 3.4988896712343604e-05, + "loss": 0.1247, + "step": 68350 + }, + { + "epoch": 14.020235064724043, + "grad_norm": 0.002144172554835677, + "learning_rate": 3.498588769129851e-05, + "loss": 0.0826, + "step": 68360 + }, + { + "epoch": 14.020289227102854, + "grad_norm": 0.010957133956253529, + "learning_rate": 3.4982878670253424e-05, + "loss": 0.0483, + "step": 68370 + }, + { + "epoch": 14.020343389481667, + "grad_norm": 0.2681814730167389, + "learning_rate": 3.497986964920833e-05, + "loss": 0.0769, + "step": 68380 + }, + { + "epoch": 14.020397551860478, + "grad_norm": 0.5451262593269348, + "learning_rate": 3.497686062816323e-05, + "loss": 0.027, + "step": 68390 + }, + { + "epoch": 14.02045171423929, + "grad_norm": 0.003455899190157652, + "learning_rate": 3.497385160711814e-05, + "loss": 0.0008, + "step": 68400 + }, + { + "epoch": 14.0205058766181, + "grad_norm": 1.7612054347991943, + "learning_rate": 3.497084258607305e-05, + "loss": 0.1095, + "step": 68410 + }, + { + "epoch": 14.020560038996912, + "grad_norm": 0.21069364249706268, + "learning_rate": 3.4967833565027955e-05, + "loss": 0.0786, + "step": 68420 + }, + { + "epoch": 14.020614201375725, + "grad_norm": 0.021057266741991043, + "learning_rate": 3.496482454398286e-05, + "loss": 0.0242, + "step": 68430 + }, + { + "epoch": 14.020668363754536, + "grad_norm": 0.008652894757688046, + "learning_rate": 3.496181552293777e-05, + "loss": 0.0139, + "step": 68440 + }, + { + "epoch": 14.020722526133348, + "grad_norm": 0.09231819957494736, + "learning_rate": 3.4958806501892674e-05, + "loss": 0.0484, + "step": 68450 + }, + { + "epoch": 14.020776688512159, + "grad_norm": 0.12012439221143723, + "learning_rate": 3.495579748084759e-05, + "loss": 0.0428, + "step": 68460 + }, + { + "epoch": 14.02083085089097, + "grad_norm": 0.08663599193096161, + "learning_rate": 3.4952788459802487e-05, + "loss": 0.0228, + "step": 68470 + }, + { + "epoch": 14.020885013269783, + "grad_norm": 0.006145337130874395, + "learning_rate": 3.49497794387574e-05, + "loss": 0.1654, + "step": 68480 + }, + { + "epoch": 14.020939175648595, + "grad_norm": 12.678372383117676, + "learning_rate": 3.4946770417712306e-05, + "loss": 0.0312, + "step": 68490 + }, + { + "epoch": 14.020993338027406, + "grad_norm": 0.028759056702256203, + "learning_rate": 3.4943761396667205e-05, + "loss": 0.1118, + "step": 68500 + }, + { + "epoch": 14.021047500406217, + "grad_norm": 0.027556488290429115, + "learning_rate": 3.494075237562212e-05, + "loss": 0.0011, + "step": 68510 + }, + { + "epoch": 14.02110166278503, + "grad_norm": 0.048209041357040405, + "learning_rate": 3.4937743354577025e-05, + "loss": 0.0475, + "step": 68520 + }, + { + "epoch": 14.021155825163842, + "grad_norm": 0.030687116086483, + "learning_rate": 3.493473433353193e-05, + "loss": 0.0214, + "step": 68530 + }, + { + "epoch": 14.021209987542653, + "grad_norm": 0.001911192201077938, + "learning_rate": 
3.493172531248684e-05, + "loss": 0.0215, + "step": 68540 + }, + { + "epoch": 14.021264149921464, + "grad_norm": 0.00425189733505249, + "learning_rate": 3.4928716291441744e-05, + "loss": 0.1007, + "step": 68550 + }, + { + "epoch": 14.021318312300275, + "grad_norm": 0.41269734501838684, + "learning_rate": 3.492570727039665e-05, + "loss": 0.1087, + "step": 68560 + }, + { + "epoch": 14.021372474679088, + "grad_norm": 0.15358395874500275, + "learning_rate": 3.492269824935156e-05, + "loss": 0.0398, + "step": 68570 + }, + { + "epoch": 14.0214266370579, + "grad_norm": 5.111866474151611, + "learning_rate": 3.491968922830646e-05, + "loss": 0.0457, + "step": 68580 + }, + { + "epoch": 14.021480799436711, + "grad_norm": 0.0025108272675424814, + "learning_rate": 3.491668020726137e-05, + "loss": 0.1403, + "step": 68590 + }, + { + "epoch": 14.021534961815522, + "grad_norm": 5.582846164703369, + "learning_rate": 3.491367118621628e-05, + "loss": 0.058, + "step": 68600 + }, + { + "epoch": 14.021589124194335, + "grad_norm": 0.003216997953131795, + "learning_rate": 3.491066216517119e-05, + "loss": 0.0486, + "step": 68610 + }, + { + "epoch": 14.021643286573147, + "grad_norm": 0.24108712375164032, + "learning_rate": 3.490765314412609e-05, + "loss": 0.1449, + "step": 68620 + }, + { + "epoch": 14.021697448951958, + "grad_norm": 2.801459312438965, + "learning_rate": 3.4904644123081e-05, + "loss": 0.096, + "step": 68630 + }, + { + "epoch": 14.02175161133077, + "grad_norm": 0.004916726145893335, + "learning_rate": 3.490163510203591e-05, + "loss": 0.0247, + "step": 68640 + }, + { + "epoch": 14.02180577370958, + "grad_norm": 0.27666598558425903, + "learning_rate": 3.4898626080990806e-05, + "loss": 0.0262, + "step": 68650 + }, + { + "epoch": 14.021859936088394, + "grad_norm": 0.004951654467731714, + "learning_rate": 3.489561705994572e-05, + "loss": 0.0462, + "step": 68660 + }, + { + "epoch": 14.021914098467205, + "grad_norm": 0.002369803609326482, + "learning_rate": 3.4892608038900626e-05, + "loss": 0.0562, + "step": 68670 + }, + { + "epoch": 14.021968260846016, + "grad_norm": 0.0370187982916832, + "learning_rate": 3.488959901785553e-05, + "loss": 0.0392, + "step": 68680 + }, + { + "epoch": 14.022022423224827, + "grad_norm": 0.02187984436750412, + "learning_rate": 3.488658999681044e-05, + "loss": 0.0088, + "step": 68690 + }, + { + "epoch": 14.02207658560364, + "grad_norm": 0.10110709816217422, + "learning_rate": 3.4883580975765344e-05, + "loss": 0.0502, + "step": 68700 + }, + { + "epoch": 14.022130747982452, + "grad_norm": 3.0085628032684326, + "learning_rate": 3.488057195472025e-05, + "loss": 0.1282, + "step": 68710 + }, + { + "epoch": 14.022184910361263, + "grad_norm": 41.74024963378906, + "learning_rate": 3.4877562933675164e-05, + "loss": 0.1366, + "step": 68720 + }, + { + "epoch": 14.022239072740074, + "grad_norm": 0.7859903573989868, + "learning_rate": 3.487455391263006e-05, + "loss": 0.0366, + "step": 68730 + }, + { + "epoch": 14.022293235118886, + "grad_norm": 0.0018198247998952866, + "learning_rate": 3.4871544891584976e-05, + "loss": 0.0965, + "step": 68740 + }, + { + "epoch": 14.022347397497699, + "grad_norm": 1.4363834857940674, + "learning_rate": 3.486853587053988e-05, + "loss": 0.0547, + "step": 68750 + }, + { + "epoch": 14.02240155987651, + "grad_norm": 0.1463879942893982, + "learning_rate": 3.486552684949479e-05, + "loss": 0.0846, + "step": 68760 + }, + { + "epoch": 14.022455722255321, + "grad_norm": 0.002197754569351673, + "learning_rate": 3.4862517828449695e-05, + "loss": 0.085, + "step": 68770 + }, + { 
+ "epoch": 14.022509884634133, + "grad_norm": 0.04051140695810318, + "learning_rate": 3.48595088074046e-05, + "loss": 0.0335, + "step": 68780 + }, + { + "epoch": 14.022564047012946, + "grad_norm": 0.035369038581848145, + "learning_rate": 3.485649978635951e-05, + "loss": 0.0462, + "step": 68790 + }, + { + "epoch": 14.022618209391757, + "grad_norm": 0.0069663506001234055, + "learning_rate": 3.4853490765314414e-05, + "loss": 0.0694, + "step": 68800 + }, + { + "epoch": 14.022672371770568, + "grad_norm": 0.5706526637077332, + "learning_rate": 3.485048174426932e-05, + "loss": 0.1088, + "step": 68810 + }, + { + "epoch": 14.02272653414938, + "grad_norm": 0.005168741103261709, + "learning_rate": 3.4847472723224226e-05, + "loss": 0.0063, + "step": 68820 + }, + { + "epoch": 14.02278069652819, + "grad_norm": 0.3972233533859253, + "learning_rate": 3.484446370217914e-05, + "loss": 0.0139, + "step": 68830 + }, + { + "epoch": 14.022834858907004, + "grad_norm": 0.0014851760352030396, + "learning_rate": 3.484145468113404e-05, + "loss": 0.0429, + "step": 68840 + }, + { + "epoch": 14.022889021285815, + "grad_norm": 0.003629469545558095, + "learning_rate": 3.4838445660088945e-05, + "loss": 0.0033, + "step": 68850 + }, + { + "epoch": 14.022943183664626, + "grad_norm": 0.001572354231029749, + "learning_rate": 3.483543663904386e-05, + "loss": 0.0179, + "step": 68860 + }, + { + "epoch": 14.022997346043438, + "grad_norm": 0.09681935608386993, + "learning_rate": 3.4832427617998765e-05, + "loss": 0.0928, + "step": 68870 + }, + { + "epoch": 14.02305150842225, + "grad_norm": 0.0013793730176985264, + "learning_rate": 3.4829418596953664e-05, + "loss": 0.0674, + "step": 68880 + }, + { + "epoch": 14.023105670801062, + "grad_norm": 0.006997950840741396, + "learning_rate": 3.482640957590858e-05, + "loss": 0.1757, + "step": 68890 + }, + { + "epoch": 14.023159833179873, + "grad_norm": 0.5907623767852783, + "learning_rate": 3.4823400554863483e-05, + "loss": 0.0193, + "step": 68900 + }, + { + "epoch": 14.023213995558685, + "grad_norm": 0.033875323832035065, + "learning_rate": 3.482039153381839e-05, + "loss": 0.0586, + "step": 68910 + }, + { + "epoch": 14.023268157937496, + "grad_norm": 0.06888419389724731, + "learning_rate": 3.4817382512773296e-05, + "loss": 0.0491, + "step": 68920 + }, + { + "epoch": 14.023322320316309, + "grad_norm": 0.20807655155658722, + "learning_rate": 3.48143734917282e-05, + "loss": 0.0565, + "step": 68930 + }, + { + "epoch": 14.02337648269512, + "grad_norm": 0.5712209343910217, + "learning_rate": 3.481136447068311e-05, + "loss": 0.0912, + "step": 68940 + }, + { + "epoch": 14.023430645073931, + "grad_norm": 0.017829718068242073, + "learning_rate": 3.4808355449638015e-05, + "loss": 0.1105, + "step": 68950 + }, + { + "epoch": 14.023484807452743, + "grad_norm": 3.974693536758423, + "learning_rate": 3.480534642859292e-05, + "loss": 0.0865, + "step": 68960 + }, + { + "epoch": 14.023538969831556, + "grad_norm": 0.2493187040090561, + "learning_rate": 3.480233740754783e-05, + "loss": 0.0407, + "step": 68970 + }, + { + "epoch": 14.023593132210367, + "grad_norm": 0.45687437057495117, + "learning_rate": 3.479932838650274e-05, + "loss": 0.0565, + "step": 68980 + }, + { + "epoch": 14.023647294589178, + "grad_norm": 3.437757730484009, + "learning_rate": 3.479631936545764e-05, + "loss": 0.088, + "step": 68990 + }, + { + "epoch": 14.02370145696799, + "grad_norm": 0.055615250021219254, + "learning_rate": 3.479331034441255e-05, + "loss": 0.0708, + "step": 69000 + }, + { + "epoch": 14.023755619346801, + "grad_norm": 
0.0017020885134115815, + "learning_rate": 3.479030132336746e-05, + "loss": 0.0236, + "step": 69010 + }, + { + "epoch": 14.023809781725614, + "grad_norm": 0.009672834537923336, + "learning_rate": 3.4787292302322366e-05, + "loss": 0.0629, + "step": 69020 + }, + { + "epoch": 14.023863944104425, + "grad_norm": 0.0016434973804280162, + "learning_rate": 3.478428328127727e-05, + "loss": 0.006, + "step": 69030 + }, + { + "epoch": 14.023918106483237, + "grad_norm": 0.166352316737175, + "learning_rate": 3.478127426023218e-05, + "loss": 0.1204, + "step": 69040 + }, + { + "epoch": 14.023972268862048, + "grad_norm": 9.554195404052734, + "learning_rate": 3.4778265239187084e-05, + "loss": 0.173, + "step": 69050 + }, + { + "epoch": 14.024026431240861, + "grad_norm": 0.5379524230957031, + "learning_rate": 3.477525621814199e-05, + "loss": 0.1762, + "step": 69060 + }, + { + "epoch": 14.024080593619672, + "grad_norm": 0.24113471806049347, + "learning_rate": 3.47722471970969e-05, + "loss": 0.0085, + "step": 69070 + }, + { + "epoch": 14.024134755998483, + "grad_norm": 0.018871573731303215, + "learning_rate": 3.47692381760518e-05, + "loss": 0.1696, + "step": 69080 + }, + { + "epoch": 14.024188918377295, + "grad_norm": 0.062358565628528595, + "learning_rate": 3.4766229155006716e-05, + "loss": 0.0617, + "step": 69090 + }, + { + "epoch": 14.024243080756106, + "grad_norm": 0.7916091084480286, + "learning_rate": 3.4763220133961616e-05, + "loss": 0.0287, + "step": 69100 + }, + { + "epoch": 14.024297243134919, + "grad_norm": 0.007092838175594807, + "learning_rate": 3.476021111291652e-05, + "loss": 0.041, + "step": 69110 + }, + { + "epoch": 14.02435140551373, + "grad_norm": 0.07457555085420609, + "learning_rate": 3.4757202091871435e-05, + "loss": 0.086, + "step": 69120 + }, + { + "epoch": 14.024405567892542, + "grad_norm": 0.36332258582115173, + "learning_rate": 3.475419307082634e-05, + "loss": 0.015, + "step": 69130 + }, + { + "epoch": 14.024459730271353, + "grad_norm": 0.9015933275222778, + "learning_rate": 3.475118404978124e-05, + "loss": 0.1101, + "step": 69140 + }, + { + "epoch": 14.024513892650166, + "grad_norm": 0.11149220913648605, + "learning_rate": 3.4748175028736154e-05, + "loss": 0.0637, + "step": 69150 + }, + { + "epoch": 14.024568055028977, + "grad_norm": 3.191620349884033, + "learning_rate": 3.474516600769106e-05, + "loss": 0.0871, + "step": 69160 + }, + { + "epoch": 14.024622217407789, + "grad_norm": 0.014172016642987728, + "learning_rate": 3.4742156986645966e-05, + "loss": 0.0201, + "step": 69170 + }, + { + "epoch": 14.0246763797866, + "grad_norm": 2.3997621536254883, + "learning_rate": 3.473914796560087e-05, + "loss": 0.0804, + "step": 69180 + }, + { + "epoch": 14.024730542165411, + "grad_norm": 0.08850657194852829, + "learning_rate": 3.473613894455578e-05, + "loss": 0.0539, + "step": 69190 + }, + { + "epoch": 14.024784704544224, + "grad_norm": 0.019223252311348915, + "learning_rate": 3.4733129923510685e-05, + "loss": 0.049, + "step": 69200 + }, + { + "epoch": 14.024838866923035, + "grad_norm": 0.08582470566034317, + "learning_rate": 3.47301209024656e-05, + "loss": 0.0297, + "step": 69210 + }, + { + "epoch": 14.024893029301847, + "grad_norm": 0.2669553756713867, + "learning_rate": 3.47271118814205e-05, + "loss": 0.0774, + "step": 69220 + }, + { + "epoch": 14.024947191680658, + "grad_norm": 0.16088314354419708, + "learning_rate": 3.4724102860375404e-05, + "loss": 0.0644, + "step": 69230 + }, + { + "epoch": 14.025001354059471, + "grad_norm": 0.0020327528472989798, + "learning_rate": 
3.472109383933032e-05, + "loss": 0.0801, + "step": 69240 + }, + { + "epoch": 14.025001354059471, + "eval_accuracy": 0.8399738732854344, + "eval_loss": 0.7813200950622559, + "eval_runtime": 115.552, + "eval_samples_per_second": 26.499, + "eval_steps_per_second": 3.315, + "step": 69240 + }, + { + "epoch": 15.000054162378811, + "grad_norm": 0.5637090802192688, + "learning_rate": 3.471808481828522e-05, + "loss": 0.0996, + "step": 69250 + }, + { + "epoch": 15.000108324757623, + "grad_norm": 0.666327953338623, + "learning_rate": 3.471507579724013e-05, + "loss": 0.0376, + "step": 69260 + }, + { + "epoch": 15.000162487136436, + "grad_norm": 4.665986061096191, + "learning_rate": 3.4712066776195036e-05, + "loss": 0.2176, + "step": 69270 + }, + { + "epoch": 15.000216649515247, + "grad_norm": 0.001993248239159584, + "learning_rate": 3.470905775514994e-05, + "loss": 0.0307, + "step": 69280 + }, + { + "epoch": 15.000270811894058, + "grad_norm": 0.01804165542125702, + "learning_rate": 3.470604873410485e-05, + "loss": 0.1115, + "step": 69290 + }, + { + "epoch": 15.00032497427287, + "grad_norm": 2.852015733718872, + "learning_rate": 3.4703039713059755e-05, + "loss": 0.0548, + "step": 69300 + }, + { + "epoch": 15.000379136651683, + "grad_norm": 0.12200698256492615, + "learning_rate": 3.470003069201466e-05, + "loss": 0.0744, + "step": 69310 + }, + { + "epoch": 15.000433299030494, + "grad_norm": 0.3559195101261139, + "learning_rate": 3.469702167096957e-05, + "loss": 0.1053, + "step": 69320 + }, + { + "epoch": 15.000487461409305, + "grad_norm": 0.03521881625056267, + "learning_rate": 3.4694012649924474e-05, + "loss": 0.1091, + "step": 69330 + }, + { + "epoch": 15.000541623788116, + "grad_norm": 0.008094675838947296, + "learning_rate": 3.469100362887938e-05, + "loss": 0.0211, + "step": 69340 + }, + { + "epoch": 15.000595786166928, + "grad_norm": 0.06621008366346359, + "learning_rate": 3.468799460783429e-05, + "loss": 0.003, + "step": 69350 + }, + { + "epoch": 15.00064994854574, + "grad_norm": 0.04904988780617714, + "learning_rate": 3.46849855867892e-05, + "loss": 0.0739, + "step": 69360 + }, + { + "epoch": 15.000704110924552, + "grad_norm": 0.05760660767555237, + "learning_rate": 3.46819765657441e-05, + "loss": 0.0065, + "step": 69370 + }, + { + "epoch": 15.000758273303363, + "grad_norm": 0.0025660330429673195, + "learning_rate": 3.467896754469901e-05, + "loss": 0.236, + "step": 69380 + }, + { + "epoch": 15.000812435682175, + "grad_norm": 0.009229572489857674, + "learning_rate": 3.467595852365392e-05, + "loss": 0.1049, + "step": 69390 + }, + { + "epoch": 15.000866598060988, + "grad_norm": 3.7157602310180664, + "learning_rate": 3.467294950260882e-05, + "loss": 0.0721, + "step": 69400 + }, + { + "epoch": 15.000920760439799, + "grad_norm": 0.001834418042562902, + "learning_rate": 3.466994048156373e-05, + "loss": 0.0034, + "step": 69410 + }, + { + "epoch": 15.00097492281861, + "grad_norm": 0.0017763464711606503, + "learning_rate": 3.466693146051864e-05, + "loss": 0.0657, + "step": 69420 + }, + { + "epoch": 15.001029085197422, + "grad_norm": 0.0018250008579343557, + "learning_rate": 3.466392243947354e-05, + "loss": 0.0384, + "step": 69430 + }, + { + "epoch": 15.001083247576233, + "grad_norm": 0.11172279715538025, + "learning_rate": 3.466091341842845e-05, + "loss": 0.0684, + "step": 69440 + }, + { + "epoch": 15.001137409955046, + "grad_norm": 1.9654494524002075, + "learning_rate": 3.4657904397383356e-05, + "loss": 0.1018, + "step": 69450 + }, + { + "epoch": 15.001191572333857, + "grad_norm": 0.019648071378469467, + 
"learning_rate": 3.465489537633826e-05, + "loss": 0.0951, + "step": 69460 + }, + { + "epoch": 15.001245734712668, + "grad_norm": 0.022592071443796158, + "learning_rate": 3.4651886355293175e-05, + "loss": 0.0355, + "step": 69470 + }, + { + "epoch": 15.00129989709148, + "grad_norm": 0.0018844325095415115, + "learning_rate": 3.4648877334248074e-05, + "loss": 0.094, + "step": 69480 + }, + { + "epoch": 15.001354059470293, + "grad_norm": 1.174870491027832, + "learning_rate": 3.464586831320298e-05, + "loss": 0.0691, + "step": 69490 + }, + { + "epoch": 15.001408221849104, + "grad_norm": 0.5177717208862305, + "learning_rate": 3.4642859292157894e-05, + "loss": 0.0052, + "step": 69500 + }, + { + "epoch": 15.001462384227915, + "grad_norm": 0.07258383184671402, + "learning_rate": 3.46398502711128e-05, + "loss": 0.0683, + "step": 69510 + }, + { + "epoch": 15.001516546606727, + "grad_norm": 0.15902844071388245, + "learning_rate": 3.4636841250067706e-05, + "loss": 0.0223, + "step": 69520 + }, + { + "epoch": 15.001570708985538, + "grad_norm": 0.0034717994276434183, + "learning_rate": 3.463383222902261e-05, + "loss": 0.0087, + "step": 69530 + }, + { + "epoch": 15.001624871364351, + "grad_norm": 1.3178619146347046, + "learning_rate": 3.463082320797752e-05, + "loss": 0.1036, + "step": 69540 + }, + { + "epoch": 15.001679033743162, + "grad_norm": 0.002647296991199255, + "learning_rate": 3.4627814186932425e-05, + "loss": 0.1049, + "step": 69550 + }, + { + "epoch": 15.001733196121974, + "grad_norm": 0.001961003290489316, + "learning_rate": 3.462480516588733e-05, + "loss": 0.0434, + "step": 69560 + }, + { + "epoch": 15.001787358500785, + "grad_norm": 3.8249974250793457, + "learning_rate": 3.462179614484224e-05, + "loss": 0.061, + "step": 69570 + }, + { + "epoch": 15.001841520879598, + "grad_norm": 0.001991609111428261, + "learning_rate": 3.4618787123797144e-05, + "loss": 0.0385, + "step": 69580 + }, + { + "epoch": 15.00189568325841, + "grad_norm": 0.026183973997831345, + "learning_rate": 3.461577810275205e-05, + "loss": 0.0117, + "step": 69590 + }, + { + "epoch": 15.00194984563722, + "grad_norm": 4.205644607543945, + "learning_rate": 3.4612769081706957e-05, + "loss": 0.1299, + "step": 69600 + }, + { + "epoch": 15.002004008016032, + "grad_norm": 0.0017903001280501485, + "learning_rate": 3.460976006066187e-05, + "loss": 0.0219, + "step": 69610 + }, + { + "epoch": 15.002058170394843, + "grad_norm": 0.0019643977284431458, + "learning_rate": 3.4606751039616776e-05, + "loss": 0.1099, + "step": 69620 + }, + { + "epoch": 15.002112332773656, + "grad_norm": 0.00214072922244668, + "learning_rate": 3.4603742018571675e-05, + "loss": 0.0947, + "step": 69630 + }, + { + "epoch": 15.002166495152467, + "grad_norm": 0.10705430805683136, + "learning_rate": 3.460073299752659e-05, + "loss": 0.1018, + "step": 69640 + }, + { + "epoch": 15.002220657531279, + "grad_norm": 0.0020021419040858746, + "learning_rate": 3.4597723976481495e-05, + "loss": 0.0424, + "step": 69650 + }, + { + "epoch": 15.00227481991009, + "grad_norm": 0.0023549438919872046, + "learning_rate": 3.45947149554364e-05, + "loss": 0.1134, + "step": 69660 + }, + { + "epoch": 15.002328982288903, + "grad_norm": 0.004046622663736343, + "learning_rate": 3.459170593439131e-05, + "loss": 0.0827, + "step": 69670 + }, + { + "epoch": 15.002383144667714, + "grad_norm": 0.004324848297983408, + "learning_rate": 3.4588696913346214e-05, + "loss": 0.0292, + "step": 69680 + }, + { + "epoch": 15.002437307046526, + "grad_norm": 0.009263569489121437, + "learning_rate": 3.458568789230112e-05, + 
"loss": 0.0465, + "step": 69690 + }, + { + "epoch": 15.002491469425337, + "grad_norm": 0.34540900588035583, + "learning_rate": 3.4582678871256026e-05, + "loss": 0.0605, + "step": 69700 + }, + { + "epoch": 15.002545631804148, + "grad_norm": 0.02924632653594017, + "learning_rate": 3.457966985021093e-05, + "loss": 0.0025, + "step": 69710 + }, + { + "epoch": 15.002599794182961, + "grad_norm": 11.119128227233887, + "learning_rate": 3.457666082916584e-05, + "loss": 0.0668, + "step": 69720 + }, + { + "epoch": 15.002653956561772, + "grad_norm": 0.33597537875175476, + "learning_rate": 3.457365180812075e-05, + "loss": 0.0654, + "step": 69730 + }, + { + "epoch": 15.002708118940584, + "grad_norm": 0.01254126988351345, + "learning_rate": 3.457064278707565e-05, + "loss": 0.0522, + "step": 69740 + }, + { + "epoch": 15.002762281319395, + "grad_norm": 0.002084740437567234, + "learning_rate": 3.456763376603056e-05, + "loss": 0.0072, + "step": 69750 + }, + { + "epoch": 15.002816443698208, + "grad_norm": 0.07970127463340759, + "learning_rate": 3.456462474498547e-05, + "loss": 0.054, + "step": 69760 + }, + { + "epoch": 15.00287060607702, + "grad_norm": 0.002148453611880541, + "learning_rate": 3.456161572394038e-05, + "loss": 0.0601, + "step": 69770 + }, + { + "epoch": 15.00292476845583, + "grad_norm": 0.1101592406630516, + "learning_rate": 3.455860670289528e-05, + "loss": 0.0321, + "step": 69780 + }, + { + "epoch": 15.002978930834642, + "grad_norm": 2.818976640701294, + "learning_rate": 3.455559768185019e-05, + "loss": 0.0688, + "step": 69790 + }, + { + "epoch": 15.003033093213453, + "grad_norm": 0.00883781909942627, + "learning_rate": 3.4552588660805096e-05, + "loss": 0.0133, + "step": 69800 + }, + { + "epoch": 15.003087255592266, + "grad_norm": 0.10251998156309128, + "learning_rate": 3.454957963976e-05, + "loss": 0.0112, + "step": 69810 + }, + { + "epoch": 15.003141417971078, + "grad_norm": 0.001959841465577483, + "learning_rate": 3.454657061871491e-05, + "loss": 0.0026, + "step": 69820 + }, + { + "epoch": 15.003195580349889, + "grad_norm": 0.0060943737626075745, + "learning_rate": 3.4543561597669814e-05, + "loss": 0.0453, + "step": 69830 + }, + { + "epoch": 15.0032497427287, + "grad_norm": 0.010919392108917236, + "learning_rate": 3.454055257662472e-05, + "loss": 0.0886, + "step": 69840 + }, + { + "epoch": 15.003303905107511, + "grad_norm": 1.2198024988174438, + "learning_rate": 3.453754355557963e-05, + "loss": 0.029, + "step": 69850 + }, + { + "epoch": 15.003358067486325, + "grad_norm": 2.866905689239502, + "learning_rate": 3.453453453453453e-05, + "loss": 0.1457, + "step": 69860 + }, + { + "epoch": 15.003412229865136, + "grad_norm": 0.017968080937862396, + "learning_rate": 3.4531525513489446e-05, + "loss": 0.0306, + "step": 69870 + }, + { + "epoch": 15.003466392243947, + "grad_norm": 0.17599612474441528, + "learning_rate": 3.452851649244435e-05, + "loss": 0.0757, + "step": 69880 + }, + { + "epoch": 15.003520554622758, + "grad_norm": 2.3338871002197266, + "learning_rate": 3.452550747139925e-05, + "loss": 0.0719, + "step": 69890 + }, + { + "epoch": 15.003574717001571, + "grad_norm": 0.002131524495780468, + "learning_rate": 3.4522498450354165e-05, + "loss": 0.0081, + "step": 69900 + }, + { + "epoch": 15.003628879380383, + "grad_norm": 0.0021801977418363094, + "learning_rate": 3.451948942930907e-05, + "loss": 0.0158, + "step": 69910 + }, + { + "epoch": 15.003683041759194, + "grad_norm": 0.0021577253937721252, + "learning_rate": 3.451648040826398e-05, + "loss": 0.0205, + "step": 69920 + }, + { + "epoch": 
15.003737204138005, + "grad_norm": 0.004616291727870703, + "learning_rate": 3.4513471387218884e-05, + "loss": 0.0391, + "step": 69930 + }, + { + "epoch": 15.003791366516817, + "grad_norm": 0.050730083137750626, + "learning_rate": 3.451046236617379e-05, + "loss": 0.0326, + "step": 69940 + }, + { + "epoch": 15.00384552889563, + "grad_norm": 0.05567997321486473, + "learning_rate": 3.4507453345128697e-05, + "loss": 0.0539, + "step": 69950 + }, + { + "epoch": 15.003899691274441, + "grad_norm": 0.02661282941699028, + "learning_rate": 3.450444432408361e-05, + "loss": 0.1362, + "step": 69960 + }, + { + "epoch": 15.003953853653252, + "grad_norm": 0.007511479780077934, + "learning_rate": 3.450143530303851e-05, + "loss": 0.0498, + "step": 69970 + }, + { + "epoch": 15.004008016032063, + "grad_norm": 26.967693328857422, + "learning_rate": 3.4498426281993415e-05, + "loss": 0.2006, + "step": 69980 + }, + { + "epoch": 15.004062178410877, + "grad_norm": 0.9001226425170898, + "learning_rate": 3.449541726094833e-05, + "loss": 0.0142, + "step": 69990 + }, + { + "epoch": 15.004116340789688, + "grad_norm": 0.1339036077260971, + "learning_rate": 3.449240823990323e-05, + "loss": 0.0038, + "step": 70000 + }, + { + "epoch": 15.004170503168499, + "grad_norm": 0.352229505777359, + "learning_rate": 3.4489399218858134e-05, + "loss": 0.0472, + "step": 70010 + }, + { + "epoch": 15.00422466554731, + "grad_norm": 1.5944770574569702, + "learning_rate": 3.448639019781305e-05, + "loss": 0.0782, + "step": 70020 + }, + { + "epoch": 15.004278827926122, + "grad_norm": 25.1457576751709, + "learning_rate": 3.4483381176767953e-05, + "loss": 0.089, + "step": 70030 + }, + { + "epoch": 15.004332990304935, + "grad_norm": 1.9397364854812622, + "learning_rate": 3.448037215572286e-05, + "loss": 0.1022, + "step": 70040 + }, + { + "epoch": 15.004387152683746, + "grad_norm": 0.029941555112600327, + "learning_rate": 3.4477363134677766e-05, + "loss": 0.0804, + "step": 70050 + }, + { + "epoch": 15.004441315062557, + "grad_norm": 4.209103584289551, + "learning_rate": 3.447435411363267e-05, + "loss": 0.013, + "step": 70060 + }, + { + "epoch": 15.004495477441369, + "grad_norm": 0.002184516517445445, + "learning_rate": 3.447134509258758e-05, + "loss": 0.0023, + "step": 70070 + }, + { + "epoch": 15.004549639820182, + "grad_norm": 0.03963516652584076, + "learning_rate": 3.4468336071542485e-05, + "loss": 0.0017, + "step": 70080 + }, + { + "epoch": 15.004603802198993, + "grad_norm": 0.0018289933213964105, + "learning_rate": 3.446532705049739e-05, + "loss": 0.0734, + "step": 70090 + }, + { + "epoch": 15.004657964577804, + "grad_norm": 0.973314106464386, + "learning_rate": 3.44623180294523e-05, + "loss": 0.0983, + "step": 70100 + }, + { + "epoch": 15.004712126956615, + "grad_norm": 0.0021822387352585793, + "learning_rate": 3.445930900840721e-05, + "loss": 0.1047, + "step": 70110 + }, + { + "epoch": 15.004766289335427, + "grad_norm": 3.6007187366485596, + "learning_rate": 3.445629998736211e-05, + "loss": 0.0758, + "step": 70120 + }, + { + "epoch": 15.00482045171424, + "grad_norm": 0.02220030128955841, + "learning_rate": 3.445329096631702e-05, + "loss": 0.0697, + "step": 70130 + }, + { + "epoch": 15.004874614093051, + "grad_norm": 0.12971001863479614, + "learning_rate": 3.445028194527193e-05, + "loss": 0.0059, + "step": 70140 + }, + { + "epoch": 15.004928776471862, + "grad_norm": 0.0018825283041223884, + "learning_rate": 3.444727292422683e-05, + "loss": 0.0377, + "step": 70150 + }, + { + "epoch": 15.004982938850674, + "grad_norm": 0.0018483345629647374, + 
"learning_rate": 3.444426390318174e-05, + "loss": 0.0867, + "step": 70160 + }, + { + "epoch": 15.005037101229487, + "grad_norm": 0.17981915175914764, + "learning_rate": 3.444125488213665e-05, + "loss": 0.036, + "step": 70170 + }, + { + "epoch": 15.005091263608298, + "grad_norm": 0.004177920985966921, + "learning_rate": 3.4438245861091554e-05, + "loss": 0.001, + "step": 70180 + }, + { + "epoch": 15.00514542598711, + "grad_norm": 0.0227790717035532, + "learning_rate": 3.443523684004646e-05, + "loss": 0.0433, + "step": 70190 + }, + { + "epoch": 15.00519958836592, + "grad_norm": 2.052938461303711, + "learning_rate": 3.443222781900137e-05, + "loss": 0.032, + "step": 70200 + }, + { + "epoch": 15.005253750744732, + "grad_norm": 0.0017181944567710161, + "learning_rate": 3.442921879795627e-05, + "loss": 0.0076, + "step": 70210 + }, + { + "epoch": 15.005307913123545, + "grad_norm": 0.07661570608615875, + "learning_rate": 3.4426209776911186e-05, + "loss": 0.0166, + "step": 70220 + }, + { + "epoch": 15.005362075502356, + "grad_norm": 0.006159782875329256, + "learning_rate": 3.4423200755866086e-05, + "loss": 0.0588, + "step": 70230 + }, + { + "epoch": 15.005416237881168, + "grad_norm": 1.2901806831359863, + "learning_rate": 3.442019173482099e-05, + "loss": 0.0049, + "step": 70240 + }, + { + "epoch": 15.005470400259979, + "grad_norm": 2.824479579925537, + "learning_rate": 3.4417182713775905e-05, + "loss": 0.0612, + "step": 70250 + }, + { + "epoch": 15.005524562638792, + "grad_norm": 0.0016529109561815858, + "learning_rate": 3.441417369273081e-05, + "loss": 0.0769, + "step": 70260 + }, + { + "epoch": 15.005578725017603, + "grad_norm": 0.0016743603628128767, + "learning_rate": 3.441116467168571e-05, + "loss": 0.0847, + "step": 70270 + }, + { + "epoch": 15.005632887396414, + "grad_norm": 0.008458712138235569, + "learning_rate": 3.4408155650640624e-05, + "loss": 0.0471, + "step": 70280 + }, + { + "epoch": 15.005687049775226, + "grad_norm": 0.38182270526885986, + "learning_rate": 3.440514662959553e-05, + "loss": 0.0213, + "step": 70290 + }, + { + "epoch": 15.005741212154037, + "grad_norm": 1.2626737356185913, + "learning_rate": 3.4402137608550436e-05, + "loss": 0.0028, + "step": 70300 + }, + { + "epoch": 15.00579537453285, + "grad_norm": 0.13953733444213867, + "learning_rate": 3.439912858750534e-05, + "loss": 0.0963, + "step": 70310 + }, + { + "epoch": 15.005849536911661, + "grad_norm": 0.0015704265097156167, + "learning_rate": 3.439611956646025e-05, + "loss": 0.0036, + "step": 70320 + }, + { + "epoch": 15.005903699290473, + "grad_norm": 3.7467222213745117, + "learning_rate": 3.4393110545415155e-05, + "loss": 0.0568, + "step": 70330 + }, + { + "epoch": 15.005957861669284, + "grad_norm": 0.03684587776660919, + "learning_rate": 3.439010152437006e-05, + "loss": 0.0119, + "step": 70340 + }, + { + "epoch": 15.006012024048097, + "grad_norm": 0.0017552608624100685, + "learning_rate": 3.438709250332497e-05, + "loss": 0.174, + "step": 70350 + }, + { + "epoch": 15.006066186426908, + "grad_norm": 0.2920331358909607, + "learning_rate": 3.4384083482279874e-05, + "loss": 0.0146, + "step": 70360 + }, + { + "epoch": 15.00612034880572, + "grad_norm": 0.05531211942434311, + "learning_rate": 3.438107446123479e-05, + "loss": 0.0557, + "step": 70370 + }, + { + "epoch": 15.00617451118453, + "grad_norm": 0.31307128071784973, + "learning_rate": 3.437806544018969e-05, + "loss": 0.02, + "step": 70380 + }, + { + "epoch": 15.006228673563342, + "grad_norm": 2.169706106185913, + "learning_rate": 3.43750564191446e-05, + "loss": 0.006, + 
"step": 70390 + }, + { + "epoch": 15.006282835942155, + "grad_norm": 0.006703090853989124, + "learning_rate": 3.4372047398099506e-05, + "loss": 0.0025, + "step": 70400 + }, + { + "epoch": 15.006336998320966, + "grad_norm": 0.003835588227957487, + "learning_rate": 3.436903837705441e-05, + "loss": 0.0058, + "step": 70410 + }, + { + "epoch": 15.006391160699778, + "grad_norm": 0.003725566202774644, + "learning_rate": 3.436602935600932e-05, + "loss": 0.0408, + "step": 70420 + }, + { + "epoch": 15.006445323078589, + "grad_norm": 6.097544193267822, + "learning_rate": 3.4363020334964225e-05, + "loss": 0.1512, + "step": 70430 + }, + { + "epoch": 15.006499485457402, + "grad_norm": 0.0367882214486599, + "learning_rate": 3.436001131391913e-05, + "loss": 0.0875, + "step": 70440 + }, + { + "epoch": 15.006553647836213, + "grad_norm": 0.02977539785206318, + "learning_rate": 3.435700229287404e-05, + "loss": 0.0475, + "step": 70450 + }, + { + "epoch": 15.006607810215025, + "grad_norm": 0.003341501113027334, + "learning_rate": 3.4353993271828944e-05, + "loss": 0.0428, + "step": 70460 + }, + { + "epoch": 15.006661972593836, + "grad_norm": 59.985050201416016, + "learning_rate": 3.435098425078385e-05, + "loss": 0.1977, + "step": 70470 + }, + { + "epoch": 15.006716134972647, + "grad_norm": 0.2173825353384018, + "learning_rate": 3.434797522973876e-05, + "loss": 0.0383, + "step": 70480 + }, + { + "epoch": 15.00677029735146, + "grad_norm": 0.22796398401260376, + "learning_rate": 3.434496620869366e-05, + "loss": 0.0405, + "step": 70490 + }, + { + "epoch": 15.006824459730272, + "grad_norm": 0.1624721884727478, + "learning_rate": 3.434195718764857e-05, + "loss": 0.1958, + "step": 70500 + }, + { + "epoch": 15.006878622109083, + "grad_norm": 17.229568481445312, + "learning_rate": 3.433894816660348e-05, + "loss": 0.1599, + "step": 70510 + }, + { + "epoch": 15.006932784487894, + "grad_norm": 0.057923778891563416, + "learning_rate": 3.433593914555839e-05, + "loss": 0.0948, + "step": 70520 + }, + { + "epoch": 15.006986946866707, + "grad_norm": 0.013269768096506596, + "learning_rate": 3.433293012451329e-05, + "loss": 0.0941, + "step": 70530 + }, + { + "epoch": 15.007041109245518, + "grad_norm": 2.1452529430389404, + "learning_rate": 3.43299211034682e-05, + "loss": 0.0284, + "step": 70540 + }, + { + "epoch": 15.00709527162433, + "grad_norm": 0.0029258979484438896, + "learning_rate": 3.432691208242311e-05, + "loss": 0.003, + "step": 70550 + }, + { + "epoch": 15.007149434003141, + "grad_norm": 3.0673530101776123, + "learning_rate": 3.432390306137801e-05, + "loss": 0.0521, + "step": 70560 + }, + { + "epoch": 15.007203596381952, + "grad_norm": 0.29904016852378845, + "learning_rate": 3.432089404033292e-05, + "loss": 0.1481, + "step": 70570 + }, + { + "epoch": 15.007257758760765, + "grad_norm": 0.13139939308166504, + "learning_rate": 3.4317885019287826e-05, + "loss": 0.007, + "step": 70580 + }, + { + "epoch": 15.007311921139577, + "grad_norm": 0.014689607545733452, + "learning_rate": 3.431487599824273e-05, + "loss": 0.0846, + "step": 70590 + }, + { + "epoch": 15.007366083518388, + "grad_norm": 0.008661016821861267, + "learning_rate": 3.431186697719764e-05, + "loss": 0.0576, + "step": 70600 + }, + { + "epoch": 15.0074202458972, + "grad_norm": 0.0026420503854751587, + "learning_rate": 3.4308857956152545e-05, + "loss": 0.0419, + "step": 70610 + }, + { + "epoch": 15.007474408276012, + "grad_norm": 0.003594073234125972, + "learning_rate": 3.430584893510745e-05, + "loss": 0.0182, + "step": 70620 + }, + { + "epoch": 15.007528570654824, + 
"grad_norm": 0.26156648993492126, + "learning_rate": 3.4302839914062364e-05, + "loss": 0.014, + "step": 70630 + }, + { + "epoch": 15.007582733033635, + "grad_norm": 0.0052996850572526455, + "learning_rate": 3.429983089301726e-05, + "loss": 0.0679, + "step": 70640 + }, + { + "epoch": 15.007636895412446, + "grad_norm": 0.04185013100504875, + "learning_rate": 3.4296821871972176e-05, + "loss": 0.004, + "step": 70650 + }, + { + "epoch": 15.007691057791257, + "grad_norm": 0.689498782157898, + "learning_rate": 3.429381285092708e-05, + "loss": 0.1147, + "step": 70660 + }, + { + "epoch": 15.00774522017007, + "grad_norm": 0.0020619085989892483, + "learning_rate": 3.429080382988199e-05, + "loss": 0.009, + "step": 70670 + }, + { + "epoch": 15.007799382548882, + "grad_norm": 4.398360729217529, + "learning_rate": 3.4287794808836895e-05, + "loss": 0.1397, + "step": 70680 + }, + { + "epoch": 15.007853544927693, + "grad_norm": 0.047910623252391815, + "learning_rate": 3.42847857877918e-05, + "loss": 0.049, + "step": 70690 + }, + { + "epoch": 15.007907707306504, + "grad_norm": 0.007693914230912924, + "learning_rate": 3.428177676674671e-05, + "loss": 0.1066, + "step": 70700 + }, + { + "epoch": 15.007961869685317, + "grad_norm": 0.019173165783286095, + "learning_rate": 3.427876774570162e-05, + "loss": 0.0044, + "step": 70710 + }, + { + "epoch": 15.008016032064129, + "grad_norm": 0.0756596177816391, + "learning_rate": 3.427575872465652e-05, + "loss": 0.0036, + "step": 70720 + }, + { + "epoch": 15.00807019444294, + "grad_norm": 0.13473008573055267, + "learning_rate": 3.4272749703611427e-05, + "loss": 0.0682, + "step": 70730 + }, + { + "epoch": 15.008124356821751, + "grad_norm": 0.005607891827821732, + "learning_rate": 3.426974068256634e-05, + "loss": 0.0094, + "step": 70740 + }, + { + "epoch": 15.008178519200563, + "grad_norm": 0.617938220500946, + "learning_rate": 3.426673166152124e-05, + "loss": 0.2203, + "step": 70750 + }, + { + "epoch": 15.008232681579376, + "grad_norm": 0.11348841339349747, + "learning_rate": 3.4263722640476145e-05, + "loss": 0.0466, + "step": 70760 + }, + { + "epoch": 15.008286843958187, + "grad_norm": 16.2073974609375, + "learning_rate": 3.426071361943106e-05, + "loss": 0.0526, + "step": 70770 + }, + { + "epoch": 15.008341006336998, + "grad_norm": 0.14325731992721558, + "learning_rate": 3.4257704598385965e-05, + "loss": 0.0896, + "step": 70780 + }, + { + "epoch": 15.00839516871581, + "grad_norm": 0.010348772630095482, + "learning_rate": 3.4254695577340864e-05, + "loss": 0.0024, + "step": 70790 + }, + { + "epoch": 15.008449331094623, + "grad_norm": 0.32086583971977234, + "learning_rate": 3.425168655629578e-05, + "loss": 0.0766, + "step": 70800 + }, + { + "epoch": 15.008503493473434, + "grad_norm": 0.25386837124824524, + "learning_rate": 3.4248677535250684e-05, + "loss": 0.0258, + "step": 70810 + }, + { + "epoch": 15.008557655852245, + "grad_norm": 0.30765506625175476, + "learning_rate": 3.424566851420559e-05, + "loss": 0.0165, + "step": 70820 + }, + { + "epoch": 15.008611818231056, + "grad_norm": 0.0024932336527854204, + "learning_rate": 3.4242659493160496e-05, + "loss": 0.0175, + "step": 70830 + }, + { + "epoch": 15.008665980609868, + "grad_norm": 0.00184445281047374, + "learning_rate": 3.42396504721154e-05, + "loss": 0.0114, + "step": 70840 + }, + { + "epoch": 15.00872014298868, + "grad_norm": 0.0019151481101289392, + "learning_rate": 3.423664145107031e-05, + "loss": 0.0059, + "step": 70850 + }, + { + "epoch": 15.008774305367492, + "grad_norm": 0.034915123134851456, + "learning_rate": 
3.423363243002522e-05, + "loss": 0.1646, + "step": 70860 + }, + { + "epoch": 15.008828467746303, + "grad_norm": 0.2590279281139374, + "learning_rate": 3.423062340898012e-05, + "loss": 0.0428, + "step": 70870 + }, + { + "epoch": 15.008882630125115, + "grad_norm": 1.3325506448745728, + "learning_rate": 3.422761438793503e-05, + "loss": 0.0206, + "step": 70880 + }, + { + "epoch": 15.008936792503928, + "grad_norm": 0.07977592945098877, + "learning_rate": 3.422460536688994e-05, + "loss": 0.0017, + "step": 70890 + }, + { + "epoch": 15.008990954882739, + "grad_norm": 0.05915572866797447, + "learning_rate": 3.422159634584484e-05, + "loss": 0.0018, + "step": 70900 + }, + { + "epoch": 15.00904511726155, + "grad_norm": 0.03891126066446304, + "learning_rate": 3.421858732479975e-05, + "loss": 0.121, + "step": 70910 + }, + { + "epoch": 15.009099279640362, + "grad_norm": 0.0376850962638855, + "learning_rate": 3.421557830375466e-05, + "loss": 0.0869, + "step": 70920 + }, + { + "epoch": 15.009153442019173, + "grad_norm": 0.09074627608060837, + "learning_rate": 3.4212569282709566e-05, + "loss": 0.0926, + "step": 70930 + }, + { + "epoch": 15.009207604397986, + "grad_norm": 0.5838919878005981, + "learning_rate": 3.420956026166447e-05, + "loss": 0.036, + "step": 70940 + }, + { + "epoch": 15.009261766776797, + "grad_norm": 0.033677250146865845, + "learning_rate": 3.420655124061938e-05, + "loss": 0.0573, + "step": 70950 + }, + { + "epoch": 15.009315929155608, + "grad_norm": 0.21255043148994446, + "learning_rate": 3.4203542219574284e-05, + "loss": 0.0447, + "step": 70960 + }, + { + "epoch": 15.00937009153442, + "grad_norm": 0.014043588191270828, + "learning_rate": 3.42005331985292e-05, + "loss": 0.1737, + "step": 70970 + }, + { + "epoch": 15.009424253913231, + "grad_norm": 0.003582345088943839, + "learning_rate": 3.41975241774841e-05, + "loss": 0.1226, + "step": 70980 + }, + { + "epoch": 15.009478416292044, + "grad_norm": 0.024352265521883965, + "learning_rate": 3.4194515156439e-05, + "loss": 0.0147, + "step": 70990 + }, + { + "epoch": 15.009532578670855, + "grad_norm": 0.02528487704694271, + "learning_rate": 3.4191506135393916e-05, + "loss": 0.0616, + "step": 71000 + }, + { + "epoch": 15.009586741049667, + "grad_norm": 0.019202692434191704, + "learning_rate": 3.418849711434882e-05, + "loss": 0.1116, + "step": 71010 + }, + { + "epoch": 15.009640903428478, + "grad_norm": 0.002892172895371914, + "learning_rate": 3.418548809330372e-05, + "loss": 0.0363, + "step": 71020 + }, + { + "epoch": 15.009695065807291, + "grad_norm": 0.003134009661152959, + "learning_rate": 3.4182479072258635e-05, + "loss": 0.0092, + "step": 71030 + }, + { + "epoch": 15.009749228186102, + "grad_norm": 0.04474450275301933, + "learning_rate": 3.417947005121354e-05, + "loss": 0.1099, + "step": 71040 + }, + { + "epoch": 15.009803390564914, + "grad_norm": 0.5831013917922974, + "learning_rate": 3.417646103016844e-05, + "loss": 0.0506, + "step": 71050 + }, + { + "epoch": 15.009857552943725, + "grad_norm": 1.3454536199569702, + "learning_rate": 3.4173452009123354e-05, + "loss": 0.0804, + "step": 71060 + }, + { + "epoch": 15.009911715322536, + "grad_norm": 0.009063800796866417, + "learning_rate": 3.417044298807826e-05, + "loss": 0.0348, + "step": 71070 + }, + { + "epoch": 15.00996587770135, + "grad_norm": 4.2052130699157715, + "learning_rate": 3.4167433967033167e-05, + "loss": 0.0313, + "step": 71080 + }, + { + "epoch": 15.01002004008016, + "grad_norm": 0.005957850255072117, + "learning_rate": 3.416442494598807e-05, + "loss": 0.0008, + "step": 71090 + 
}, + { + "epoch": 15.010074202458972, + "grad_norm": 1.4738335609436035, + "learning_rate": 3.416141592494298e-05, + "loss": 0.0798, + "step": 71100 + }, + { + "epoch": 15.010128364837783, + "grad_norm": 0.8796526789665222, + "learning_rate": 3.4158406903897885e-05, + "loss": 0.0579, + "step": 71110 + }, + { + "epoch": 15.010182527216596, + "grad_norm": 0.04770598188042641, + "learning_rate": 3.41553978828528e-05, + "loss": 0.0026, + "step": 71120 + }, + { + "epoch": 15.010236689595407, + "grad_norm": 9.393486976623535, + "learning_rate": 3.41523888618077e-05, + "loss": 0.0623, + "step": 71130 + }, + { + "epoch": 15.010290851974219, + "grad_norm": 0.8638762831687927, + "learning_rate": 3.4149379840762604e-05, + "loss": 0.08, + "step": 71140 + }, + { + "epoch": 15.01034501435303, + "grad_norm": 0.12819267809391022, + "learning_rate": 3.414637081971752e-05, + "loss": 0.068, + "step": 71150 + }, + { + "epoch": 15.010399176731841, + "grad_norm": 0.06535808742046356, + "learning_rate": 3.4143361798672423e-05, + "loss": 0.1749, + "step": 71160 + }, + { + "epoch": 15.010453339110654, + "grad_norm": 0.027271905913949013, + "learning_rate": 3.414035277762733e-05, + "loss": 0.1194, + "step": 71170 + }, + { + "epoch": 15.010507501489466, + "grad_norm": 0.00510099483653903, + "learning_rate": 3.4137343756582236e-05, + "loss": 0.1551, + "step": 71180 + }, + { + "epoch": 15.010561663868277, + "grad_norm": 0.14385975897312164, + "learning_rate": 3.413433473553714e-05, + "loss": 0.0131, + "step": 71190 + }, + { + "epoch": 15.010615826247088, + "grad_norm": 0.16250814497470856, + "learning_rate": 3.413132571449205e-05, + "loss": 0.0801, + "step": 71200 + }, + { + "epoch": 15.010669988625901, + "grad_norm": 0.008287589065730572, + "learning_rate": 3.4128316693446955e-05, + "loss": 0.1471, + "step": 71210 + }, + { + "epoch": 15.010724151004712, + "grad_norm": 0.6042200326919556, + "learning_rate": 3.412530767240186e-05, + "loss": 0.0662, + "step": 71220 + }, + { + "epoch": 15.010778313383524, + "grad_norm": 9.599783897399902, + "learning_rate": 3.4122298651356774e-05, + "loss": 0.1458, + "step": 71230 + }, + { + "epoch": 15.010832475762335, + "grad_norm": 2.3883275985717773, + "learning_rate": 3.4119289630311674e-05, + "loss": 0.018, + "step": 71240 + }, + { + "epoch": 15.010886638141146, + "grad_norm": 0.004546754527837038, + "learning_rate": 3.411628060926658e-05, + "loss": 0.0569, + "step": 71250 + }, + { + "epoch": 15.01094080051996, + "grad_norm": 0.20400723814964294, + "learning_rate": 3.411327158822149e-05, + "loss": 0.0405, + "step": 71260 + }, + { + "epoch": 15.01099496289877, + "grad_norm": 2.2069568634033203, + "learning_rate": 3.41102625671764e-05, + "loss": 0.0756, + "step": 71270 + }, + { + "epoch": 15.011049125277582, + "grad_norm": 0.003379541914910078, + "learning_rate": 3.41072535461313e-05, + "loss": 0.0296, + "step": 71280 + }, + { + "epoch": 15.011103287656393, + "grad_norm": 0.013297014869749546, + "learning_rate": 3.410424452508621e-05, + "loss": 0.0802, + "step": 71290 + }, + { + "epoch": 15.011157450035206, + "grad_norm": 0.655514121055603, + "learning_rate": 3.410123550404112e-05, + "loss": 0.0473, + "step": 71300 + }, + { + "epoch": 15.011211612414018, + "grad_norm": 0.0722699910402298, + "learning_rate": 3.4098226482996024e-05, + "loss": 0.0029, + "step": 71310 + }, + { + "epoch": 15.011265774792829, + "grad_norm": 3.9827208518981934, + "learning_rate": 3.409521746195093e-05, + "loss": 0.0883, + "step": 71320 + }, + { + "epoch": 15.01131993717164, + "grad_norm": 9.779836654663086, 
+ "learning_rate": 3.409220844090584e-05, + "loss": 0.0998, + "step": 71330 + }, + { + "epoch": 15.011374099550451, + "grad_norm": 2.186828851699829, + "learning_rate": 3.408919941986074e-05, + "loss": 0.1367, + "step": 71340 + }, + { + "epoch": 15.011428261929264, + "grad_norm": 0.24115534126758575, + "learning_rate": 3.408619039881565e-05, + "loss": 0.0909, + "step": 71350 + }, + { + "epoch": 15.011482424308076, + "grad_norm": 3.833831310272217, + "learning_rate": 3.4083181377770556e-05, + "loss": 0.1012, + "step": 71360 + }, + { + "epoch": 15.011536586686887, + "grad_norm": 0.014600790105760098, + "learning_rate": 3.408017235672546e-05, + "loss": 0.0791, + "step": 71370 + }, + { + "epoch": 15.011590749065698, + "grad_norm": 2.7535839080810547, + "learning_rate": 3.4077163335680375e-05, + "loss": 0.0658, + "step": 71380 + }, + { + "epoch": 15.011644911444511, + "grad_norm": 0.029175782576203346, + "learning_rate": 3.4074154314635275e-05, + "loss": 0.0284, + "step": 71390 + }, + { + "epoch": 15.011699073823323, + "grad_norm": 0.014887934550642967, + "learning_rate": 3.407114529359019e-05, + "loss": 0.0612, + "step": 71400 + }, + { + "epoch": 15.011753236202134, + "grad_norm": 15.996197700500488, + "learning_rate": 3.4068136272545094e-05, + "loss": 0.0982, + "step": 71410 + }, + { + "epoch": 15.011807398580945, + "grad_norm": 0.8452463150024414, + "learning_rate": 3.40651272515e-05, + "loss": 0.0755, + "step": 71420 + }, + { + "epoch": 15.011861560959757, + "grad_norm": 0.04489780217409134, + "learning_rate": 3.4062118230454906e-05, + "loss": 0.0546, + "step": 71430 + }, + { + "epoch": 15.01191572333857, + "grad_norm": 0.003305261256173253, + "learning_rate": 3.405910920940981e-05, + "loss": 0.0156, + "step": 71440 + }, + { + "epoch": 15.011969885717381, + "grad_norm": 0.15177324414253235, + "learning_rate": 3.405610018836472e-05, + "loss": 0.0122, + "step": 71450 + }, + { + "epoch": 15.012024048096192, + "grad_norm": 0.0022661201655864716, + "learning_rate": 3.4053091167319625e-05, + "loss": 0.02, + "step": 71460 + }, + { + "epoch": 15.012078210475003, + "grad_norm": 0.002370330272242427, + "learning_rate": 3.405008214627453e-05, + "loss": 0.0584, + "step": 71470 + }, + { + "epoch": 15.012132372853817, + "grad_norm": 0.0195284616202116, + "learning_rate": 3.404707312522944e-05, + "loss": 0.0515, + "step": 71480 + }, + { + "epoch": 15.012186535232628, + "grad_norm": 0.018179025501012802, + "learning_rate": 3.404406410418435e-05, + "loss": 0.005, + "step": 71490 + }, + { + "epoch": 15.012240697611439, + "grad_norm": 0.07490447908639908, + "learning_rate": 3.404105508313925e-05, + "loss": 0.0411, + "step": 71500 + }, + { + "epoch": 15.01229485999025, + "grad_norm": 0.01447906531393528, + "learning_rate": 3.403804606209416e-05, + "loss": 0.0123, + "step": 71510 + }, + { + "epoch": 15.012349022369062, + "grad_norm": 0.0737944096326828, + "learning_rate": 3.403503704104907e-05, + "loss": 0.1242, + "step": 71520 + }, + { + "epoch": 15.012403184747875, + "grad_norm": 0.04140742868185043, + "learning_rate": 3.4032028020003976e-05, + "loss": 0.0589, + "step": 71530 + }, + { + "epoch": 15.012457347126686, + "grad_norm": 0.7668213844299316, + "learning_rate": 3.4029018998958875e-05, + "loss": 0.1908, + "step": 71540 + }, + { + "epoch": 15.012511509505497, + "grad_norm": 0.007549087516963482, + "learning_rate": 3.402600997791379e-05, + "loss": 0.0637, + "step": 71550 + }, + { + "epoch": 15.012565671884309, + "grad_norm": 0.010603290051221848, + "learning_rate": 3.4023000956868695e-05, + "loss": 
0.1464, + "step": 71560 + }, + { + "epoch": 15.012619834263122, + "grad_norm": 0.004901548847556114, + "learning_rate": 3.40199919358236e-05, + "loss": 0.0989, + "step": 71570 + }, + { + "epoch": 15.012673996641933, + "grad_norm": 0.14880304038524628, + "learning_rate": 3.401698291477851e-05, + "loss": 0.0797, + "step": 71580 + }, + { + "epoch": 15.012728159020744, + "grad_norm": 0.5679674744606018, + "learning_rate": 3.4013973893733414e-05, + "loss": 0.1166, + "step": 71590 + }, + { + "epoch": 15.012782321399555, + "grad_norm": 0.047273457050323486, + "learning_rate": 3.401096487268832e-05, + "loss": 0.0797, + "step": 71600 + }, + { + "epoch": 15.012836483778367, + "grad_norm": 0.006528167985379696, + "learning_rate": 3.400795585164323e-05, + "loss": 0.0025, + "step": 71610 + }, + { + "epoch": 15.01289064615718, + "grad_norm": 1.2392610311508179, + "learning_rate": 3.400494683059813e-05, + "loss": 0.0293, + "step": 71620 + }, + { + "epoch": 15.012944808535991, + "grad_norm": 0.25305160880088806, + "learning_rate": 3.400193780955304e-05, + "loss": 0.1097, + "step": 71630 + }, + { + "epoch": 15.012998970914802, + "grad_norm": 0.05498197674751282, + "learning_rate": 3.399892878850795e-05, + "loss": 0.0915, + "step": 71640 + }, + { + "epoch": 15.013053133293614, + "grad_norm": 6.779036998748779, + "learning_rate": 3.399591976746285e-05, + "loss": 0.0822, + "step": 71650 + }, + { + "epoch": 15.013107295672427, + "grad_norm": 0.013174294494092464, + "learning_rate": 3.3992910746417764e-05, + "loss": 0.0304, + "step": 71660 + }, + { + "epoch": 15.013161458051238, + "grad_norm": 0.017694663256406784, + "learning_rate": 3.398990172537267e-05, + "loss": 0.0117, + "step": 71670 + }, + { + "epoch": 15.01321562043005, + "grad_norm": 0.008410582318902016, + "learning_rate": 3.398689270432758e-05, + "loss": 0.046, + "step": 71680 + }, + { + "epoch": 15.01326978280886, + "grad_norm": 0.1509094387292862, + "learning_rate": 3.398388368328248e-05, + "loss": 0.0504, + "step": 71690 + }, + { + "epoch": 15.013323945187672, + "grad_norm": 0.28792038559913635, + "learning_rate": 3.398087466223739e-05, + "loss": 0.0542, + "step": 71700 + }, + { + "epoch": 15.013378107566485, + "grad_norm": 0.002954831812530756, + "learning_rate": 3.3977865641192296e-05, + "loss": 0.0451, + "step": 71710 + }, + { + "epoch": 15.013432269945296, + "grad_norm": 0.15369953215122223, + "learning_rate": 3.39748566201472e-05, + "loss": 0.0664, + "step": 71720 + }, + { + "epoch": 15.013486432324108, + "grad_norm": 0.02746802568435669, + "learning_rate": 3.397184759910211e-05, + "loss": 0.0496, + "step": 71730 + }, + { + "epoch": 15.013540594702919, + "grad_norm": 0.2649454176425934, + "learning_rate": 3.3968838578057015e-05, + "loss": 0.0823, + "step": 71740 + }, + { + "epoch": 15.013594757081732, + "grad_norm": 1.27993905544281, + "learning_rate": 3.396582955701193e-05, + "loss": 0.0258, + "step": 71750 + }, + { + "epoch": 15.013648919460543, + "grad_norm": 0.02899673394858837, + "learning_rate": 3.3962820535966834e-05, + "loss": 0.1556, + "step": 71760 + }, + { + "epoch": 15.013703081839354, + "grad_norm": 0.08734390139579773, + "learning_rate": 3.395981151492173e-05, + "loss": 0.1673, + "step": 71770 + }, + { + "epoch": 15.013757244218166, + "grad_norm": 4.1885857582092285, + "learning_rate": 3.3956802493876646e-05, + "loss": 0.2911, + "step": 71780 + }, + { + "epoch": 15.013811406596977, + "grad_norm": 0.014903118833899498, + "learning_rate": 3.395379347283155e-05, + "loss": 0.0446, + "step": 71790 + }, + { + "epoch": 
15.01386556897579, + "grad_norm": 0.03778117895126343, + "learning_rate": 3.395078445178645e-05, + "loss": 0.037, + "step": 71800 + }, + { + "epoch": 15.013919731354601, + "grad_norm": 0.8511225581169128, + "learning_rate": 3.3947775430741365e-05, + "loss": 0.0543, + "step": 71810 + }, + { + "epoch": 15.013973893733413, + "grad_norm": 1.3110315799713135, + "learning_rate": 3.394476640969627e-05, + "loss": 0.0735, + "step": 71820 + }, + { + "epoch": 15.014028056112224, + "grad_norm": 0.3756229877471924, + "learning_rate": 3.394175738865118e-05, + "loss": 0.0502, + "step": 71830 + }, + { + "epoch": 15.014082218491037, + "grad_norm": 0.35796791315078735, + "learning_rate": 3.3938748367606084e-05, + "loss": 0.1016, + "step": 71840 + }, + { + "epoch": 15.014136380869848, + "grad_norm": 0.07223343849182129, + "learning_rate": 3.393573934656099e-05, + "loss": 0.0894, + "step": 71850 + }, + { + "epoch": 15.01419054324866, + "grad_norm": 0.13892576098442078, + "learning_rate": 3.3932730325515897e-05, + "loss": 0.0878, + "step": 71860 + }, + { + "epoch": 15.01424470562747, + "grad_norm": 0.013557703234255314, + "learning_rate": 3.392972130447081e-05, + "loss": 0.1227, + "step": 71870 + }, + { + "epoch": 15.014298868006282, + "grad_norm": 0.6968973875045776, + "learning_rate": 3.392671228342571e-05, + "loss": 0.1304, + "step": 71880 + }, + { + "epoch": 15.014353030385095, + "grad_norm": 1.5430923700332642, + "learning_rate": 3.3923703262380615e-05, + "loss": 0.0582, + "step": 71890 + }, + { + "epoch": 15.014407192763906, + "grad_norm": 0.4676024913787842, + "learning_rate": 3.392069424133553e-05, + "loss": 0.123, + "step": 71900 + }, + { + "epoch": 15.014461355142718, + "grad_norm": 0.007061735726892948, + "learning_rate": 3.3917685220290435e-05, + "loss": 0.0634, + "step": 71910 + }, + { + "epoch": 15.014515517521529, + "grad_norm": 0.006528194528073072, + "learning_rate": 3.391467619924534e-05, + "loss": 0.1083, + "step": 71920 + }, + { + "epoch": 15.014569679900342, + "grad_norm": 0.9865891933441162, + "learning_rate": 3.391166717820025e-05, + "loss": 0.0691, + "step": 71930 + }, + { + "epoch": 15.014623842279153, + "grad_norm": 0.01188150979578495, + "learning_rate": 3.3908658157155154e-05, + "loss": 0.0958, + "step": 71940 + }, + { + "epoch": 15.014678004657965, + "grad_norm": 0.018969515338540077, + "learning_rate": 3.390564913611006e-05, + "loss": 0.0504, + "step": 71950 + }, + { + "epoch": 15.014732167036776, + "grad_norm": 0.049076564610004425, + "learning_rate": 3.3902640115064966e-05, + "loss": 0.0629, + "step": 71960 + }, + { + "epoch": 15.014786329415587, + "grad_norm": 0.01559173222631216, + "learning_rate": 3.389963109401987e-05, + "loss": 0.053, + "step": 71970 + }, + { + "epoch": 15.0148404917944, + "grad_norm": 0.08326249569654465, + "learning_rate": 3.389662207297478e-05, + "loss": 0.1341, + "step": 71980 + }, + { + "epoch": 15.014894654173212, + "grad_norm": 0.5363284945487976, + "learning_rate": 3.3893613051929685e-05, + "loss": 0.0238, + "step": 71990 + }, + { + "epoch": 15.014948816552023, + "grad_norm": 0.0968957245349884, + "learning_rate": 3.389060403088459e-05, + "loss": 0.1102, + "step": 72000 + }, + { + "epoch": 15.015002978930834, + "grad_norm": 0.15366381406784058, + "learning_rate": 3.3887595009839504e-05, + "loss": 0.0035, + "step": 72010 + }, + { + "epoch": 15.015057141309647, + "grad_norm": 0.35364535450935364, + "learning_rate": 3.388458598879441e-05, + "loss": 0.0939, + "step": 72020 + }, + { + "epoch": 15.015111303688458, + "grad_norm": 0.4817971885204315, + 
"learning_rate": 3.388157696774931e-05, + "loss": 0.1036, + "step": 72030 + }, + { + "epoch": 15.01516546606727, + "grad_norm": 0.16083799302577972, + "learning_rate": 3.387856794670422e-05, + "loss": 0.0874, + "step": 72040 + }, + { + "epoch": 15.015219628446081, + "grad_norm": 0.567510724067688, + "learning_rate": 3.387555892565913e-05, + "loss": 0.118, + "step": 72050 + }, + { + "epoch": 15.015273790824892, + "grad_norm": 0.38090378046035767, + "learning_rate": 3.3872549904614036e-05, + "loss": 0.1416, + "step": 72060 + }, + { + "epoch": 15.015327953203705, + "grad_norm": 1.0824508666992188, + "learning_rate": 3.386954088356894e-05, + "loss": 0.0736, + "step": 72070 + }, + { + "epoch": 15.015382115582517, + "grad_norm": 0.12781670689582825, + "learning_rate": 3.386653186252385e-05, + "loss": 0.0413, + "step": 72080 + }, + { + "epoch": 15.015436277961328, + "grad_norm": 0.0049295988865196705, + "learning_rate": 3.3863522841478754e-05, + "loss": 0.0784, + "step": 72090 + }, + { + "epoch": 15.01549044034014, + "grad_norm": 0.7695152759552002, + "learning_rate": 3.386051382043366e-05, + "loss": 0.0295, + "step": 72100 + }, + { + "epoch": 15.01554460271895, + "grad_norm": 0.33443427085876465, + "learning_rate": 3.385750479938857e-05, + "loss": 0.0712, + "step": 72110 + }, + { + "epoch": 15.015598765097764, + "grad_norm": 0.053755369037389755, + "learning_rate": 3.385449577834347e-05, + "loss": 0.0745, + "step": 72120 + }, + { + "epoch": 15.015652927476575, + "grad_norm": 0.002832569647580385, + "learning_rate": 3.3851486757298386e-05, + "loss": 0.0063, + "step": 72130 + }, + { + "epoch": 15.015707089855386, + "grad_norm": 0.0025633624754846096, + "learning_rate": 3.3848477736253286e-05, + "loss": 0.0166, + "step": 72140 + }, + { + "epoch": 15.015761252234197, + "grad_norm": 0.0026019748765975237, + "learning_rate": 3.384546871520819e-05, + "loss": 0.1176, + "step": 72150 + }, + { + "epoch": 15.01581541461301, + "grad_norm": 0.1314394176006317, + "learning_rate": 3.3842459694163105e-05, + "loss": 0.0045, + "step": 72160 + }, + { + "epoch": 15.015869576991822, + "grad_norm": 0.0024861681740731, + "learning_rate": 3.383945067311801e-05, + "loss": 0.0992, + "step": 72170 + }, + { + "epoch": 15.015923739370633, + "grad_norm": 0.3074710965156555, + "learning_rate": 3.383644165207292e-05, + "loss": 0.1186, + "step": 72180 + }, + { + "epoch": 15.015977901749444, + "grad_norm": 0.05684536695480347, + "learning_rate": 3.3833432631027824e-05, + "loss": 0.0231, + "step": 72190 + }, + { + "epoch": 15.016032064128256, + "grad_norm": 0.20328545570373535, + "learning_rate": 3.383042360998273e-05, + "loss": 0.0189, + "step": 72200 + }, + { + "epoch": 15.016086226507069, + "grad_norm": 0.03860659524798393, + "learning_rate": 3.3827414588937637e-05, + "loss": 0.0732, + "step": 72210 + }, + { + "epoch": 15.01614038888588, + "grad_norm": 0.8126491904258728, + "learning_rate": 3.382440556789254e-05, + "loss": 0.0185, + "step": 72220 + }, + { + "epoch": 15.016194551264691, + "grad_norm": 0.14221911132335663, + "learning_rate": 3.382139654684745e-05, + "loss": 0.0323, + "step": 72230 + }, + { + "epoch": 15.016248713643503, + "grad_norm": 25.909116744995117, + "learning_rate": 3.3818387525802355e-05, + "loss": 0.1438, + "step": 72240 + }, + { + "epoch": 15.016302876022316, + "grad_norm": 12.700450897216797, + "learning_rate": 3.381537850475726e-05, + "loss": 0.0607, + "step": 72250 + }, + { + "epoch": 15.016357038401127, + "grad_norm": 0.23091532289981842, + "learning_rate": 3.381236948371217e-05, + "loss": 0.0599, 
+ "step": 72260 + }, + { + "epoch": 15.016411200779938, + "grad_norm": 0.045353710651397705, + "learning_rate": 3.380936046266708e-05, + "loss": 0.0608, + "step": 72270 + }, + { + "epoch": 15.01646536315875, + "grad_norm": 0.016983866691589355, + "learning_rate": 3.380635144162199e-05, + "loss": 0.009, + "step": 72280 + }, + { + "epoch": 15.01651952553756, + "grad_norm": 0.09396453201770782, + "learning_rate": 3.380334242057689e-05, + "loss": 0.1328, + "step": 72290 + }, + { + "epoch": 15.016573687916374, + "grad_norm": 0.06606892496347427, + "learning_rate": 3.38003333995318e-05, + "loss": 0.0697, + "step": 72300 + }, + { + "epoch": 15.016627850295185, + "grad_norm": 0.002166254911571741, + "learning_rate": 3.3797324378486706e-05, + "loss": 0.0169, + "step": 72310 + }, + { + "epoch": 15.016682012673996, + "grad_norm": 0.1984785795211792, + "learning_rate": 3.379431535744161e-05, + "loss": 0.1078, + "step": 72320 + }, + { + "epoch": 15.016736175052808, + "grad_norm": 0.15774181485176086, + "learning_rate": 3.379130633639652e-05, + "loss": 0.0061, + "step": 72330 + }, + { + "epoch": 15.01679033743162, + "grad_norm": 0.011888310313224792, + "learning_rate": 3.3788297315351425e-05, + "loss": 0.0365, + "step": 72340 + }, + { + "epoch": 15.016844499810432, + "grad_norm": 0.9272576570510864, + "learning_rate": 3.378528829430633e-05, + "loss": 0.0358, + "step": 72350 + }, + { + "epoch": 15.016898662189243, + "grad_norm": 0.0618932768702507, + "learning_rate": 3.3782279273261244e-05, + "loss": 0.0526, + "step": 72360 + }, + { + "epoch": 15.016952824568055, + "grad_norm": 0.0023703984916210175, + "learning_rate": 3.3779270252216144e-05, + "loss": 0.018, + "step": 72370 + }, + { + "epoch": 15.017006986946866, + "grad_norm": 0.6432313919067383, + "learning_rate": 3.377626123117105e-05, + "loss": 0.1811, + "step": 72380 + }, + { + "epoch": 15.017061149325679, + "grad_norm": 0.006356316618621349, + "learning_rate": 3.377325221012596e-05, + "loss": 0.0021, + "step": 72390 + }, + { + "epoch": 15.01711531170449, + "grad_norm": 0.002055054996162653, + "learning_rate": 3.377024318908086e-05, + "loss": 0.0188, + "step": 72400 + }, + { + "epoch": 15.017169474083301, + "grad_norm": 0.06688480824232101, + "learning_rate": 3.376723416803577e-05, + "loss": 0.1475, + "step": 72410 + }, + { + "epoch": 15.017223636462113, + "grad_norm": 2.4723458290100098, + "learning_rate": 3.376422514699068e-05, + "loss": 0.1447, + "step": 72420 + }, + { + "epoch": 15.017277798840926, + "grad_norm": 0.002707450417801738, + "learning_rate": 3.376121612594559e-05, + "loss": 0.0393, + "step": 72430 + }, + { + "epoch": 15.017331961219737, + "grad_norm": 2.3736767768859863, + "learning_rate": 3.3758207104900494e-05, + "loss": 0.0379, + "step": 72440 + }, + { + "epoch": 15.017386123598548, + "grad_norm": 0.0288473442196846, + "learning_rate": 3.37551980838554e-05, + "loss": 0.0202, + "step": 72450 + }, + { + "epoch": 15.01744028597736, + "grad_norm": 3.9124152660369873, + "learning_rate": 3.375218906281031e-05, + "loss": 0.1435, + "step": 72460 + }, + { + "epoch": 15.017494448356171, + "grad_norm": 0.9284312129020691, + "learning_rate": 3.374918004176521e-05, + "loss": 0.0072, + "step": 72470 + }, + { + "epoch": 15.017548610734984, + "grad_norm": 0.0031968154944479465, + "learning_rate": 3.374617102072012e-05, + "loss": 0.0549, + "step": 72480 + }, + { + "epoch": 15.017602773113795, + "grad_norm": 0.09534640610218048, + "learning_rate": 3.3743161999675026e-05, + "loss": 0.0882, + "step": 72490 + }, + { + "epoch": 15.017656935492607, + 
"grad_norm": 0.9540087580680847, + "learning_rate": 3.374015297862993e-05, + "loss": 0.0618, + "step": 72500 + }, + { + "epoch": 15.017711097871418, + "grad_norm": 0.22257080674171448, + "learning_rate": 3.3737143957584845e-05, + "loss": 0.0031, + "step": 72510 + }, + { + "epoch": 15.017765260250231, + "grad_norm": 0.051280297338962555, + "learning_rate": 3.3734134936539745e-05, + "loss": 0.0083, + "step": 72520 + }, + { + "epoch": 15.017819422629042, + "grad_norm": 1.8580098152160645, + "learning_rate": 3.373112591549466e-05, + "loss": 0.0229, + "step": 72530 + }, + { + "epoch": 15.017873585007854, + "grad_norm": 0.032642923295497894, + "learning_rate": 3.3728116894449564e-05, + "loss": 0.0813, + "step": 72540 + }, + { + "epoch": 15.017927747386665, + "grad_norm": 3.8009750843048096, + "learning_rate": 3.3725107873404463e-05, + "loss": 0.0708, + "step": 72550 + }, + { + "epoch": 15.017981909765476, + "grad_norm": 2.1426775455474854, + "learning_rate": 3.3722098852359376e-05, + "loss": 0.0833, + "step": 72560 + }, + { + "epoch": 15.01803607214429, + "grad_norm": 0.10119912028312683, + "learning_rate": 3.371908983131428e-05, + "loss": 0.0562, + "step": 72570 + }, + { + "epoch": 15.0180902345231, + "grad_norm": 0.0033504103776067495, + "learning_rate": 3.371608081026919e-05, + "loss": 0.0418, + "step": 72580 + }, + { + "epoch": 15.018144396901912, + "grad_norm": 0.0652846246957779, + "learning_rate": 3.3713071789224095e-05, + "loss": 0.0964, + "step": 72590 + }, + { + "epoch": 15.018198559280723, + "grad_norm": 0.07578431814908981, + "learning_rate": 3.3710062768179e-05, + "loss": 0.1115, + "step": 72600 + }, + { + "epoch": 15.018252721659536, + "grad_norm": 4.794895648956299, + "learning_rate": 3.370705374713391e-05, + "loss": 0.0455, + "step": 72610 + }, + { + "epoch": 15.018306884038347, + "grad_norm": 3.8375906944274902, + "learning_rate": 3.370404472608882e-05, + "loss": 0.1273, + "step": 72620 + }, + { + "epoch": 15.018361046417159, + "grad_norm": 0.1353122591972351, + "learning_rate": 3.370103570504372e-05, + "loss": 0.1723, + "step": 72630 + }, + { + "epoch": 15.01841520879597, + "grad_norm": 0.002613726304844022, + "learning_rate": 3.369802668399863e-05, + "loss": 0.0439, + "step": 72640 + }, + { + "epoch": 15.018469371174781, + "grad_norm": 0.002647852059453726, + "learning_rate": 3.369501766295354e-05, + "loss": 0.0295, + "step": 72650 + }, + { + "epoch": 15.018523533553594, + "grad_norm": 5.967343330383301, + "learning_rate": 3.3692008641908446e-05, + "loss": 0.065, + "step": 72660 + }, + { + "epoch": 15.018577695932406, + "grad_norm": 0.047681476920843124, + "learning_rate": 3.3688999620863346e-05, + "loss": 0.0318, + "step": 72670 + }, + { + "epoch": 15.018631858311217, + "grad_norm": 0.20455411076545715, + "learning_rate": 3.368599059981826e-05, + "loss": 0.0098, + "step": 72680 + }, + { + "epoch": 15.018686020690028, + "grad_norm": 4.295922756195068, + "learning_rate": 3.3682981578773165e-05, + "loss": 0.0717, + "step": 72690 + }, + { + "epoch": 15.018740183068841, + "grad_norm": 0.010175969451665878, + "learning_rate": 3.367997255772807e-05, + "loss": 0.0086, + "step": 72700 + }, + { + "epoch": 15.018794345447652, + "grad_norm": 0.0022753491066396236, + "learning_rate": 3.367696353668298e-05, + "loss": 0.0592, + "step": 72710 + }, + { + "epoch": 15.018848507826464, + "grad_norm": 0.002218463458120823, + "learning_rate": 3.3673954515637884e-05, + "loss": 0.1021, + "step": 72720 + }, + { + "epoch": 15.018902670205275, + "grad_norm": 0.3377673923969269, + "learning_rate": 
3.367094549459279e-05, + "loss": 0.1527, + "step": 72730 + }, + { + "epoch": 15.018956832584086, + "grad_norm": 0.09723151475191116, + "learning_rate": 3.3667936473547696e-05, + "loss": 0.0472, + "step": 72740 + }, + { + "epoch": 15.0190109949629, + "grad_norm": 0.018006905913352966, + "learning_rate": 3.36649274525026e-05, + "loss": 0.1032, + "step": 72750 + }, + { + "epoch": 15.01906515734171, + "grad_norm": 0.002988563384860754, + "learning_rate": 3.366191843145751e-05, + "loss": 0.0521, + "step": 72760 + }, + { + "epoch": 15.019119319720522, + "grad_norm": 0.2943161129951477, + "learning_rate": 3.365890941041242e-05, + "loss": 0.0871, + "step": 72770 + }, + { + "epoch": 15.019173482099333, + "grad_norm": 0.002641248283907771, + "learning_rate": 3.365590038936732e-05, + "loss": 0.0633, + "step": 72780 + }, + { + "epoch": 15.019227644478146, + "grad_norm": 0.003403626149520278, + "learning_rate": 3.3652891368322234e-05, + "loss": 0.0353, + "step": 72790 + }, + { + "epoch": 15.019281806856958, + "grad_norm": 0.00228478922508657, + "learning_rate": 3.364988234727714e-05, + "loss": 0.0298, + "step": 72800 + }, + { + "epoch": 15.019335969235769, + "grad_norm": 1.8097529411315918, + "learning_rate": 3.364687332623205e-05, + "loss": 0.0147, + "step": 72810 + }, + { + "epoch": 15.01939013161458, + "grad_norm": 0.002119947224855423, + "learning_rate": 3.364386430518695e-05, + "loss": 0.0162, + "step": 72820 + }, + { + "epoch": 15.019444293993391, + "grad_norm": 0.05316898226737976, + "learning_rate": 3.364085528414186e-05, + "loss": 0.035, + "step": 72830 + }, + { + "epoch": 15.019498456372204, + "grad_norm": 0.8665507435798645, + "learning_rate": 3.3637846263096766e-05, + "loss": 0.1154, + "step": 72840 + }, + { + "epoch": 15.019552618751016, + "grad_norm": 3.6902506351470947, + "learning_rate": 3.363483724205167e-05, + "loss": 0.0604, + "step": 72850 + }, + { + "epoch": 15.019606781129827, + "grad_norm": 0.3468223214149475, + "learning_rate": 3.363182822100658e-05, + "loss": 0.0051, + "step": 72860 + }, + { + "epoch": 15.019660943508638, + "grad_norm": 0.00989686418324709, + "learning_rate": 3.3628819199961485e-05, + "loss": 0.0088, + "step": 72870 + }, + { + "epoch": 15.019715105887451, + "grad_norm": 0.002252972684800625, + "learning_rate": 3.36258101789164e-05, + "loss": 0.021, + "step": 72880 + }, + { + "epoch": 15.019769268266263, + "grad_norm": 0.020976150408387184, + "learning_rate": 3.36228011578713e-05, + "loss": 0.0329, + "step": 72890 + }, + { + "epoch": 15.019823430645074, + "grad_norm": 1.0019776821136475, + "learning_rate": 3.36197921368262e-05, + "loss": 0.0065, + "step": 72900 + }, + { + "epoch": 15.019877593023885, + "grad_norm": 0.002974327653646469, + "learning_rate": 3.3616783115781116e-05, + "loss": 0.0022, + "step": 72910 + }, + { + "epoch": 15.019931755402697, + "grad_norm": 3.6721019744873047, + "learning_rate": 3.361377409473602e-05, + "loss": 0.0425, + "step": 72920 + }, + { + "epoch": 15.01998591778151, + "grad_norm": 0.0017920503159984946, + "learning_rate": 3.361076507369092e-05, + "loss": 0.0009, + "step": 72930 + }, + { + "epoch": 15.02004008016032, + "grad_norm": 0.0018386433366686106, + "learning_rate": 3.3607756052645835e-05, + "loss": 0.0839, + "step": 72940 + }, + { + "epoch": 15.020094242539132, + "grad_norm": 0.024795912206172943, + "learning_rate": 3.360474703160074e-05, + "loss": 0.1948, + "step": 72950 + }, + { + "epoch": 15.020148404917943, + "grad_norm": 1.0428746938705444, + "learning_rate": 3.360173801055565e-05, + "loss": 0.1166, + "step": 72960 + 
}, + { + "epoch": 15.020202567296757, + "grad_norm": 0.895595371723175, + "learning_rate": 3.3598728989510554e-05, + "loss": 0.0887, + "step": 72970 + }, + { + "epoch": 15.020256729675568, + "grad_norm": 0.012748701497912407, + "learning_rate": 3.359571996846546e-05, + "loss": 0.1696, + "step": 72980 + }, + { + "epoch": 15.020310892054379, + "grad_norm": 2.5216798782348633, + "learning_rate": 3.359271094742037e-05, + "loss": 0.0468, + "step": 72990 + }, + { + "epoch": 15.02036505443319, + "grad_norm": 3.7672505378723145, + "learning_rate": 3.358970192637527e-05, + "loss": 0.029, + "step": 73000 + }, + { + "epoch": 15.020419216812002, + "grad_norm": 0.073398657143116, + "learning_rate": 3.358669290533018e-05, + "loss": 0.1167, + "step": 73010 + }, + { + "epoch": 15.020473379190815, + "grad_norm": 0.05044649541378021, + "learning_rate": 3.3583683884285085e-05, + "loss": 0.0636, + "step": 73020 + }, + { + "epoch": 15.020527541569626, + "grad_norm": 0.009842402301728725, + "learning_rate": 3.358067486324e-05, + "loss": 0.0462, + "step": 73030 + }, + { + "epoch": 15.020581703948437, + "grad_norm": 0.09400221705436707, + "learning_rate": 3.35776658421949e-05, + "loss": 0.0997, + "step": 73040 + }, + { + "epoch": 15.020635866327249, + "grad_norm": 3.948456048965454, + "learning_rate": 3.357465682114981e-05, + "loss": 0.11, + "step": 73050 + }, + { + "epoch": 15.020690028706062, + "grad_norm": 0.014083602465689182, + "learning_rate": 3.357164780010472e-05, + "loss": 0.1052, + "step": 73060 + }, + { + "epoch": 15.020744191084873, + "grad_norm": 0.004981322214007378, + "learning_rate": 3.3568638779059624e-05, + "loss": 0.0243, + "step": 73070 + }, + { + "epoch": 15.020798353463684, + "grad_norm": 0.025223135948181152, + "learning_rate": 3.356562975801453e-05, + "loss": 0.0445, + "step": 73080 + }, + { + "epoch": 15.020852515842495, + "grad_norm": 0.029803209006786346, + "learning_rate": 3.3562620736969436e-05, + "loss": 0.0432, + "step": 73090 + }, + { + "epoch": 15.020906678221307, + "grad_norm": 0.005013824440538883, + "learning_rate": 3.355961171592434e-05, + "loss": 0.0113, + "step": 73100 + }, + { + "epoch": 15.02096084060012, + "grad_norm": 0.20892882347106934, + "learning_rate": 3.3556602694879255e-05, + "loss": 0.0481, + "step": 73110 + }, + { + "epoch": 15.021015002978931, + "grad_norm": 0.025563813745975494, + "learning_rate": 3.3553593673834155e-05, + "loss": 0.0014, + "step": 73120 + }, + { + "epoch": 15.021069165357742, + "grad_norm": 0.003071928396821022, + "learning_rate": 3.355058465278906e-05, + "loss": 0.0703, + "step": 73130 + }, + { + "epoch": 15.021123327736554, + "grad_norm": 0.5538197755813599, + "learning_rate": 3.3547575631743974e-05, + "loss": 0.0964, + "step": 73140 + }, + { + "epoch": 15.021177490115367, + "grad_norm": 0.0038625216111540794, + "learning_rate": 3.3544566610698874e-05, + "loss": 0.0184, + "step": 73150 + }, + { + "epoch": 15.021231652494178, + "grad_norm": 0.011525994166731834, + "learning_rate": 3.354155758965378e-05, + "loss": 0.0011, + "step": 73160 + }, + { + "epoch": 15.02128581487299, + "grad_norm": 0.031634747982025146, + "learning_rate": 3.353854856860869e-05, + "loss": 0.0082, + "step": 73170 + }, + { + "epoch": 15.0213399772518, + "grad_norm": 5.126675128936768, + "learning_rate": 3.35355395475636e-05, + "loss": 0.1059, + "step": 73180 + }, + { + "epoch": 15.021394139630612, + "grad_norm": 0.005237553734332323, + "learning_rate": 3.35325305265185e-05, + "loss": 0.2626, + "step": 73190 + }, + { + "epoch": 15.021448302009425, + "grad_norm": 
0.09907720983028412, + "learning_rate": 3.352952150547341e-05, + "loss": 0.0465, + "step": 73200 + }, + { + "epoch": 15.021502464388236, + "grad_norm": 0.07758275419473648, + "learning_rate": 3.352651248442832e-05, + "loss": 0.1323, + "step": 73210 + }, + { + "epoch": 15.021556626767048, + "grad_norm": 0.38095906376838684, + "learning_rate": 3.3523503463383224e-05, + "loss": 0.0436, + "step": 73220 + }, + { + "epoch": 15.021610789145859, + "grad_norm": 0.11900530010461807, + "learning_rate": 3.352049444233813e-05, + "loss": 0.0331, + "step": 73230 + }, + { + "epoch": 15.021664951524672, + "grad_norm": 0.003290861379355192, + "learning_rate": 3.351748542129304e-05, + "loss": 0.0452, + "step": 73240 + }, + { + "epoch": 15.021719113903483, + "grad_norm": 0.5209442377090454, + "learning_rate": 3.351447640024794e-05, + "loss": 0.0242, + "step": 73250 + }, + { + "epoch": 15.021773276282294, + "grad_norm": 0.07835653424263, + "learning_rate": 3.3511467379202856e-05, + "loss": 0.1478, + "step": 73260 + }, + { + "epoch": 15.021827438661106, + "grad_norm": 3.0428292751312256, + "learning_rate": 3.3508458358157756e-05, + "loss": 0.2411, + "step": 73270 + }, + { + "epoch": 15.021881601039917, + "grad_norm": 2.987621307373047, + "learning_rate": 3.350544933711266e-05, + "loss": 0.0408, + "step": 73280 + }, + { + "epoch": 15.02193576341873, + "grad_norm": 0.00491732731461525, + "learning_rate": 3.3502440316067575e-05, + "loss": 0.0076, + "step": 73290 + }, + { + "epoch": 15.021989925797541, + "grad_norm": 0.05213312804698944, + "learning_rate": 3.3499431295022475e-05, + "loss": 0.0071, + "step": 73300 + }, + { + "epoch": 15.022044088176353, + "grad_norm": 0.014658288098871708, + "learning_rate": 3.349642227397739e-05, + "loss": 0.0238, + "step": 73310 + }, + { + "epoch": 15.022098250555164, + "grad_norm": 1.5612844228744507, + "learning_rate": 3.3493413252932294e-05, + "loss": 0.1477, + "step": 73320 + }, + { + "epoch": 15.022152412933975, + "grad_norm": 0.030553551390767097, + "learning_rate": 3.34904042318872e-05, + "loss": 0.1173, + "step": 73330 + }, + { + "epoch": 15.022206575312788, + "grad_norm": 0.24545031785964966, + "learning_rate": 3.3487395210842107e-05, + "loss": 0.0345, + "step": 73340 + }, + { + "epoch": 15.0222607376916, + "grad_norm": 0.00739244231954217, + "learning_rate": 3.348438618979701e-05, + "loss": 0.0886, + "step": 73350 + }, + { + "epoch": 15.02231490007041, + "grad_norm": 0.02487366646528244, + "learning_rate": 3.348137716875192e-05, + "loss": 0.0426, + "step": 73360 + }, + { + "epoch": 15.022369062449222, + "grad_norm": 7.086108684539795, + "learning_rate": 3.347836814770683e-05, + "loss": 0.0806, + "step": 73370 + }, + { + "epoch": 15.022423224828035, + "grad_norm": 0.030396506190299988, + "learning_rate": 3.347535912666173e-05, + "loss": 0.0041, + "step": 73380 + }, + { + "epoch": 15.022477387206846, + "grad_norm": 0.28259599208831787, + "learning_rate": 3.347235010561664e-05, + "loss": 0.228, + "step": 73390 + }, + { + "epoch": 15.022531549585658, + "grad_norm": 0.046300407499074936, + "learning_rate": 3.346934108457155e-05, + "loss": 0.0105, + "step": 73400 + }, + { + "epoch": 15.022585711964469, + "grad_norm": 0.7364936470985413, + "learning_rate": 3.346633206352646e-05, + "loss": 0.0197, + "step": 73410 + }, + { + "epoch": 15.02263987434328, + "grad_norm": 2.7766716480255127, + "learning_rate": 3.346332304248136e-05, + "loss": 0.0845, + "step": 73420 + }, + { + "epoch": 15.022694036722093, + "grad_norm": 3.8828372955322266, + "learning_rate": 3.346031402143627e-05, + 
"loss": 0.1377, + "step": 73430 + }, + { + "epoch": 15.022748199100905, + "grad_norm": 0.1126168742775917, + "learning_rate": 3.3457305000391176e-05, + "loss": 0.1195, + "step": 73440 + }, + { + "epoch": 15.022802361479716, + "grad_norm": 0.1518690139055252, + "learning_rate": 3.3454295979346076e-05, + "loss": 0.0262, + "step": 73450 + }, + { + "epoch": 15.022856523858527, + "grad_norm": 1.5083307027816772, + "learning_rate": 3.345128695830099e-05, + "loss": 0.0428, + "step": 73460 + }, + { + "epoch": 15.02291068623734, + "grad_norm": 0.2660144567489624, + "learning_rate": 3.3448277937255895e-05, + "loss": 0.025, + "step": 73470 + }, + { + "epoch": 15.022964848616152, + "grad_norm": 0.06592007726430893, + "learning_rate": 3.34452689162108e-05, + "loss": 0.179, + "step": 73480 + }, + { + "epoch": 15.023019010994963, + "grad_norm": 0.05427199602127075, + "learning_rate": 3.344225989516571e-05, + "loss": 0.0112, + "step": 73490 + }, + { + "epoch": 15.023073173373774, + "grad_norm": 0.004282206762582064, + "learning_rate": 3.3439250874120614e-05, + "loss": 0.0019, + "step": 73500 + }, + { + "epoch": 15.023127335752585, + "grad_norm": 0.6998873353004456, + "learning_rate": 3.343624185307552e-05, + "loss": 0.0754, + "step": 73510 + }, + { + "epoch": 15.023181498131398, + "grad_norm": 2.9753456115722656, + "learning_rate": 3.343323283203043e-05, + "loss": 0.0911, + "step": 73520 + }, + { + "epoch": 15.02323566051021, + "grad_norm": 0.6191282272338867, + "learning_rate": 3.343022381098533e-05, + "loss": 0.0116, + "step": 73530 + }, + { + "epoch": 15.023289822889021, + "grad_norm": 0.04501878470182419, + "learning_rate": 3.342721478994024e-05, + "loss": 0.118, + "step": 73540 + }, + { + "epoch": 15.023343985267832, + "grad_norm": 2.7346973419189453, + "learning_rate": 3.342420576889515e-05, + "loss": 0.0991, + "step": 73550 + }, + { + "epoch": 15.023398147646645, + "grad_norm": 0.3248886466026306, + "learning_rate": 3.342119674785006e-05, + "loss": 0.0676, + "step": 73560 + }, + { + "epoch": 15.023452310025457, + "grad_norm": 0.010885676369071007, + "learning_rate": 3.3418187726804964e-05, + "loss": 0.0994, + "step": 73570 + }, + { + "epoch": 15.023506472404268, + "grad_norm": 6.0569963455200195, + "learning_rate": 3.341517870575987e-05, + "loss": 0.1567, + "step": 73580 + }, + { + "epoch": 15.02356063478308, + "grad_norm": 0.07089092582464218, + "learning_rate": 3.341216968471478e-05, + "loss": 0.0297, + "step": 73590 + }, + { + "epoch": 15.02361479716189, + "grad_norm": 0.01205423939973116, + "learning_rate": 3.340916066366968e-05, + "loss": 0.0885, + "step": 73600 + }, + { + "epoch": 15.023668959540704, + "grad_norm": 0.07808060944080353, + "learning_rate": 3.340615164262459e-05, + "loss": 0.0933, + "step": 73610 + }, + { + "epoch": 15.023723121919515, + "grad_norm": 0.12104486674070358, + "learning_rate": 3.3403142621579496e-05, + "loss": 0.0032, + "step": 73620 + }, + { + "epoch": 15.023777284298326, + "grad_norm": 0.09352324157953262, + "learning_rate": 3.340013360053441e-05, + "loss": 0.0467, + "step": 73630 + }, + { + "epoch": 15.023831446677137, + "grad_norm": 0.013289778493344784, + "learning_rate": 3.339712457948931e-05, + "loss": 0.0136, + "step": 73640 + }, + { + "epoch": 15.02388560905595, + "grad_norm": 3.92199444770813, + "learning_rate": 3.3394115558444215e-05, + "loss": 0.0495, + "step": 73650 + }, + { + "epoch": 15.023939771434762, + "grad_norm": 0.00964539498090744, + "learning_rate": 3.339110653739913e-05, + "loss": 0.097, + "step": 73660 + }, + { + "epoch": 15.023993933813573, 
+ "grad_norm": 0.023310426622629166, + "learning_rate": 3.3388097516354034e-05, + "loss": 0.0654, + "step": 73670 + }, + { + "epoch": 15.024048096192384, + "grad_norm": 0.01370641216635704, + "learning_rate": 3.3385088495308933e-05, + "loss": 0.0497, + "step": 73680 + }, + { + "epoch": 15.024102258571196, + "grad_norm": 0.06757131218910217, + "learning_rate": 3.3382079474263847e-05, + "loss": 0.0553, + "step": 73690 + }, + { + "epoch": 15.024156420950009, + "grad_norm": 1.7428768873214722, + "learning_rate": 3.337907045321875e-05, + "loss": 0.0906, + "step": 73700 + }, + { + "epoch": 15.02421058332882, + "grad_norm": 0.03588842228055, + "learning_rate": 3.337606143217366e-05, + "loss": 0.1174, + "step": 73710 + }, + { + "epoch": 15.024264745707631, + "grad_norm": 0.007792464457452297, + "learning_rate": 3.3373052411128565e-05, + "loss": 0.0385, + "step": 73720 + }, + { + "epoch": 15.024318908086443, + "grad_norm": 0.28760167956352234, + "learning_rate": 3.337004339008347e-05, + "loss": 0.1455, + "step": 73730 + }, + { + "epoch": 15.024373070465256, + "grad_norm": 6.499021053314209, + "learning_rate": 3.336703436903838e-05, + "loss": 0.0372, + "step": 73740 + }, + { + "epoch": 15.024427232844067, + "grad_norm": 1.0570619106292725, + "learning_rate": 3.3364025347993284e-05, + "loss": 0.0818, + "step": 73750 + }, + { + "epoch": 15.024481395222878, + "grad_norm": 1.0512007474899292, + "learning_rate": 3.336101632694819e-05, + "loss": 0.1178, + "step": 73760 + }, + { + "epoch": 15.02453555760169, + "grad_norm": 0.003418005770072341, + "learning_rate": 3.33580073059031e-05, + "loss": 0.0207, + "step": 73770 + }, + { + "epoch": 15.0245897199805, + "grad_norm": 0.25450852513313293, + "learning_rate": 3.335499828485801e-05, + "loss": 0.0049, + "step": 73780 + }, + { + "epoch": 15.024643882359314, + "grad_norm": 0.015287568792700768, + "learning_rate": 3.335198926381291e-05, + "loss": 0.2031, + "step": 73790 + }, + { + "epoch": 15.024698044738125, + "grad_norm": 1.3753069639205933, + "learning_rate": 3.3348980242767816e-05, + "loss": 0.0885, + "step": 73800 + }, + { + "epoch": 15.024752207116936, + "grad_norm": 0.003962323069572449, + "learning_rate": 3.334597122172273e-05, + "loss": 0.1058, + "step": 73810 + }, + { + "epoch": 15.024806369495748, + "grad_norm": 0.0034384634345769882, + "learning_rate": 3.3342962200677635e-05, + "loss": 0.0991, + "step": 73820 + }, + { + "epoch": 15.02486053187456, + "grad_norm": 0.6048163771629333, + "learning_rate": 3.333995317963254e-05, + "loss": 0.0771, + "step": 73830 + }, + { + "epoch": 15.024914694253372, + "grad_norm": 0.026853984221816063, + "learning_rate": 3.333694415858745e-05, + "loss": 0.0281, + "step": 73840 + }, + { + "epoch": 15.024968856632183, + "grad_norm": 1.6159194707870483, + "learning_rate": 3.3333935137542354e-05, + "loss": 0.0243, + "step": 73850 + }, + { + "epoch": 15.025001354059471, + "eval_accuracy": 0.8256041802743305, + "eval_loss": 0.9217148423194885, + "eval_runtime": 116.0187, + "eval_samples_per_second": 26.392, + "eval_steps_per_second": 3.301, + "step": 73856 + }, + { + "epoch": 16.000021664951525, + "grad_norm": 0.009322157129645348, + "learning_rate": 3.333092611649726e-05, + "loss": 0.0767, + "step": 73860 + }, + { + "epoch": 16.000075827330335, + "grad_norm": 0.07087778300046921, + "learning_rate": 3.3327917095452166e-05, + "loss": 0.0339, + "step": 73870 + }, + { + "epoch": 16.000129989709148, + "grad_norm": 0.051802389323711395, + "learning_rate": 3.332490807440707e-05, + "loss": 0.0283, + "step": 73880 + }, + { + "epoch": 
16.00018415208796, + "grad_norm": 1.0438640117645264, + "learning_rate": 3.3321899053361986e-05, + "loss": 0.0772, + "step": 73890 + }, + { + "epoch": 16.00023831446677, + "grad_norm": 0.02095668576657772, + "learning_rate": 3.3318890032316885e-05, + "loss": 0.0569, + "step": 73900 + }, + { + "epoch": 16.000292476845583, + "grad_norm": 2.4531984329223633, + "learning_rate": 3.331588101127179e-05, + "loss": 0.0708, + "step": 73910 + }, + { + "epoch": 16.000346639224396, + "grad_norm": 0.005715114995837212, + "learning_rate": 3.3312871990226704e-05, + "loss": 0.0018, + "step": 73920 + }, + { + "epoch": 16.000400801603206, + "grad_norm": 0.0026380715426057577, + "learning_rate": 3.330986296918161e-05, + "loss": 0.0428, + "step": 73930 + }, + { + "epoch": 16.00045496398202, + "grad_norm": 0.06750684231519699, + "learning_rate": 3.330685394813651e-05, + "loss": 0.0239, + "step": 73940 + }, + { + "epoch": 16.00050912636083, + "grad_norm": 0.20700672268867493, + "learning_rate": 3.330384492709142e-05, + "loss": 0.0686, + "step": 73950 + }, + { + "epoch": 16.00056328873964, + "grad_norm": 0.0039676823653280735, + "learning_rate": 3.330083590604633e-05, + "loss": 0.11, + "step": 73960 + }, + { + "epoch": 16.000617451118455, + "grad_norm": 0.011491945944726467, + "learning_rate": 3.3297826885001236e-05, + "loss": 0.014, + "step": 73970 + }, + { + "epoch": 16.000671613497264, + "grad_norm": 0.10161945223808289, + "learning_rate": 3.329481786395614e-05, + "loss": 0.0237, + "step": 73980 + }, + { + "epoch": 16.000725775876077, + "grad_norm": 0.8083288669586182, + "learning_rate": 3.329180884291105e-05, + "loss": 0.0255, + "step": 73990 + }, + { + "epoch": 16.000779938254887, + "grad_norm": 0.006734839174896479, + "learning_rate": 3.3288799821865955e-05, + "loss": 0.0587, + "step": 74000 + }, + { + "epoch": 16.0008341006337, + "grad_norm": 0.008691634051501751, + "learning_rate": 3.328579080082087e-05, + "loss": 0.0192, + "step": 74010 + }, + { + "epoch": 16.000888263012513, + "grad_norm": 0.03597143292427063, + "learning_rate": 3.328278177977577e-05, + "loss": 0.0118, + "step": 74020 + }, + { + "epoch": 16.000942425391322, + "grad_norm": 4.4219865798950195, + "learning_rate": 3.3279772758730673e-05, + "loss": 0.2151, + "step": 74030 + }, + { + "epoch": 16.000996587770135, + "grad_norm": 0.0029070593882352114, + "learning_rate": 3.3276763737685586e-05, + "loss": 0.103, + "step": 74040 + }, + { + "epoch": 16.001050750148945, + "grad_norm": 0.4089408218860626, + "learning_rate": 3.3273754716640486e-05, + "loss": 0.0526, + "step": 74050 + }, + { + "epoch": 16.001104912527758, + "grad_norm": 0.8659070730209351, + "learning_rate": 3.327074569559539e-05, + "loss": 0.0819, + "step": 74060 + }, + { + "epoch": 16.00115907490657, + "grad_norm": 0.4841543734073639, + "learning_rate": 3.3267736674550305e-05, + "loss": 0.0396, + "step": 74070 + }, + { + "epoch": 16.00121323728538, + "grad_norm": 2.599693775177002, + "learning_rate": 3.326472765350521e-05, + "loss": 0.0473, + "step": 74080 + }, + { + "epoch": 16.001267399664194, + "grad_norm": 0.06637229025363922, + "learning_rate": 3.326171863246012e-05, + "loss": 0.0017, + "step": 74090 + }, + { + "epoch": 16.001321562043003, + "grad_norm": 0.7755234241485596, + "learning_rate": 3.3258709611415024e-05, + "loss": 0.0343, + "step": 74100 + }, + { + "epoch": 16.001375724421816, + "grad_norm": 0.010902839712798595, + "learning_rate": 3.325570059036993e-05, + "loss": 0.0396, + "step": 74110 + }, + { + "epoch": 16.00142988680063, + "grad_norm": 0.008836762048304081, + 
"learning_rate": 3.325269156932484e-05, + "loss": 0.0548, + "step": 74120 + }, + { + "epoch": 16.00148404917944, + "grad_norm": 0.0638483464717865, + "learning_rate": 3.324968254827974e-05, + "loss": 0.0311, + "step": 74130 + }, + { + "epoch": 16.001538211558252, + "grad_norm": 0.005340253934264183, + "learning_rate": 3.324667352723465e-05, + "loss": 0.0122, + "step": 74140 + }, + { + "epoch": 16.001592373937065, + "grad_norm": 0.004943062551319599, + "learning_rate": 3.324366450618956e-05, + "loss": 0.0348, + "step": 74150 + }, + { + "epoch": 16.001646536315874, + "grad_norm": 0.020817669108510017, + "learning_rate": 3.324065548514447e-05, + "loss": 0.002, + "step": 74160 + }, + { + "epoch": 16.001700698694687, + "grad_norm": 0.002500735456123948, + "learning_rate": 3.323764646409937e-05, + "loss": 0.1064, + "step": 74170 + }, + { + "epoch": 16.001754861073497, + "grad_norm": 0.0035287172067910433, + "learning_rate": 3.323463744305428e-05, + "loss": 0.0689, + "step": 74180 + }, + { + "epoch": 16.00180902345231, + "grad_norm": 0.0026037839706987143, + "learning_rate": 3.323162842200919e-05, + "loss": 0.0921, + "step": 74190 + }, + { + "epoch": 16.001863185831123, + "grad_norm": 0.009473448619246483, + "learning_rate": 3.322861940096409e-05, + "loss": 0.0699, + "step": 74200 + }, + { + "epoch": 16.001917348209933, + "grad_norm": 0.5564672946929932, + "learning_rate": 3.3225610379919e-05, + "loss": 0.0073, + "step": 74210 + }, + { + "epoch": 16.001971510588746, + "grad_norm": 2.6863291263580322, + "learning_rate": 3.3222601358873906e-05, + "loss": 0.1428, + "step": 74220 + }, + { + "epoch": 16.002025672967555, + "grad_norm": 0.0031734383665025234, + "learning_rate": 3.321959233782881e-05, + "loss": 0.0036, + "step": 74230 + }, + { + "epoch": 16.00207983534637, + "grad_norm": 1.5647543668746948, + "learning_rate": 3.321658331678372e-05, + "loss": 0.0561, + "step": 74240 + }, + { + "epoch": 16.00213399772518, + "grad_norm": 0.01587357558310032, + "learning_rate": 3.3213574295738625e-05, + "loss": 0.0587, + "step": 74250 + }, + { + "epoch": 16.00218816010399, + "grad_norm": 0.48963937163352966, + "learning_rate": 3.321056527469353e-05, + "loss": 0.0696, + "step": 74260 + }, + { + "epoch": 16.002242322482804, + "grad_norm": 0.002955680014565587, + "learning_rate": 3.3207556253648444e-05, + "loss": 0.0697, + "step": 74270 + }, + { + "epoch": 16.002296484861613, + "grad_norm": 0.002844668924808502, + "learning_rate": 3.3204547232603344e-05, + "loss": 0.1227, + "step": 74280 + }, + { + "epoch": 16.002350647240426, + "grad_norm": 1.6899672746658325, + "learning_rate": 3.320153821155825e-05, + "loss": 0.0202, + "step": 74290 + }, + { + "epoch": 16.00240480961924, + "grad_norm": 0.0029352346900850534, + "learning_rate": 3.319852919051316e-05, + "loss": 0.0009, + "step": 74300 + }, + { + "epoch": 16.00245897199805, + "grad_norm": 0.0033835824579000473, + "learning_rate": 3.319552016946807e-05, + "loss": 0.0034, + "step": 74310 + }, + { + "epoch": 16.002513134376862, + "grad_norm": 0.033410344272851944, + "learning_rate": 3.319251114842297e-05, + "loss": 0.0159, + "step": 74320 + }, + { + "epoch": 16.002567296755675, + "grad_norm": 0.5700317621231079, + "learning_rate": 3.318950212737788e-05, + "loss": 0.0792, + "step": 74330 + }, + { + "epoch": 16.002621459134485, + "grad_norm": 1.2185370922088623, + "learning_rate": 3.318649310633279e-05, + "loss": 0.0465, + "step": 74340 + }, + { + "epoch": 16.002675621513298, + "grad_norm": 0.0066288188099861145, + "learning_rate": 3.3183484085287695e-05, + "loss": 
0.0532, + "step": 74350 + }, + { + "epoch": 16.002729783892107, + "grad_norm": 2.828512668609619, + "learning_rate": 3.31804750642426e-05, + "loss": 0.061, + "step": 74360 + }, + { + "epoch": 16.00278394627092, + "grad_norm": 7.031101703643799, + "learning_rate": 3.317746604319751e-05, + "loss": 0.112, + "step": 74370 + }, + { + "epoch": 16.002838108649733, + "grad_norm": 0.0025892616249620914, + "learning_rate": 3.317445702215241e-05, + "loss": 0.1595, + "step": 74380 + }, + { + "epoch": 16.002892271028543, + "grad_norm": 0.007086969446390867, + "learning_rate": 3.317144800110732e-05, + "loss": 0.1155, + "step": 74390 + }, + { + "epoch": 16.002946433407356, + "grad_norm": 0.010789754800498486, + "learning_rate": 3.3168438980062226e-05, + "loss": 0.1142, + "step": 74400 + }, + { + "epoch": 16.003000595786165, + "grad_norm": 0.035593196749687195, + "learning_rate": 3.316542995901714e-05, + "loss": 0.0524, + "step": 74410 + }, + { + "epoch": 16.00305475816498, + "grad_norm": 0.0033396442886441946, + "learning_rate": 3.3162420937972045e-05, + "loss": 0.131, + "step": 74420 + }, + { + "epoch": 16.00310892054379, + "grad_norm": 1.6220591068267822, + "learning_rate": 3.3159411916926945e-05, + "loss": 0.0481, + "step": 74430 + }, + { + "epoch": 16.0031630829226, + "grad_norm": 0.014965927228331566, + "learning_rate": 3.315640289588186e-05, + "loss": 0.0169, + "step": 74440 + }, + { + "epoch": 16.003217245301414, + "grad_norm": 0.9502319097518921, + "learning_rate": 3.3153393874836764e-05, + "loss": 0.0104, + "step": 74450 + }, + { + "epoch": 16.003271407680224, + "grad_norm": 10.472503662109375, + "learning_rate": 3.315038485379167e-05, + "loss": 0.0884, + "step": 74460 + }, + { + "epoch": 16.003325570059037, + "grad_norm": 0.045232515782117844, + "learning_rate": 3.3147375832746577e-05, + "loss": 0.1146, + "step": 74470 + }, + { + "epoch": 16.00337973243785, + "grad_norm": 0.6904150247573853, + "learning_rate": 3.314436681170148e-05, + "loss": 0.0457, + "step": 74480 + }, + { + "epoch": 16.00343389481666, + "grad_norm": 0.029647523537278175, + "learning_rate": 3.314135779065639e-05, + "loss": 0.0317, + "step": 74490 + }, + { + "epoch": 16.003488057195472, + "grad_norm": 0.09746021777391434, + "learning_rate": 3.3138348769611295e-05, + "loss": 0.0998, + "step": 74500 + }, + { + "epoch": 16.003542219574285, + "grad_norm": 3.8432533740997314, + "learning_rate": 3.31353397485662e-05, + "loss": 0.0906, + "step": 74510 + }, + { + "epoch": 16.003596381953095, + "grad_norm": 0.102183498442173, + "learning_rate": 3.313233072752111e-05, + "loss": 0.0096, + "step": 74520 + }, + { + "epoch": 16.003650544331908, + "grad_norm": 0.14829343557357788, + "learning_rate": 3.312932170647602e-05, + "loss": 0.0042, + "step": 74530 + }, + { + "epoch": 16.003704706710717, + "grad_norm": 0.13846513628959656, + "learning_rate": 3.312631268543092e-05, + "loss": 0.0179, + "step": 74540 + }, + { + "epoch": 16.00375886908953, + "grad_norm": 2.8254830837249756, + "learning_rate": 3.312330366438583e-05, + "loss": 0.0748, + "step": 74550 + }, + { + "epoch": 16.003813031468344, + "grad_norm": 2.0347847938537598, + "learning_rate": 3.312029464334074e-05, + "loss": 0.0645, + "step": 74560 + }, + { + "epoch": 16.003867193847153, + "grad_norm": 0.010921045206487179, + "learning_rate": 3.3117285622295646e-05, + "loss": 0.0332, + "step": 74570 + }, + { + "epoch": 16.003921356225966, + "grad_norm": 0.1850990653038025, + "learning_rate": 3.3114276601250546e-05, + "loss": 0.0368, + "step": 74580 + }, + { + "epoch": 16.003975518604776, + 
"grad_norm": 2.180389165878296, + "learning_rate": 3.311126758020546e-05, + "loss": 0.1238, + "step": 74590 + }, + { + "epoch": 16.00402968098359, + "grad_norm": 0.0024983868934214115, + "learning_rate": 3.3108258559160365e-05, + "loss": 0.0262, + "step": 74600 + }, + { + "epoch": 16.0040838433624, + "grad_norm": 0.17995399236679077, + "learning_rate": 3.310524953811527e-05, + "loss": 0.0283, + "step": 74610 + }, + { + "epoch": 16.00413800574121, + "grad_norm": 0.002499248832464218, + "learning_rate": 3.310224051707018e-05, + "loss": 0.1324, + "step": 74620 + }, + { + "epoch": 16.004192168120024, + "grad_norm": 0.03540632873773575, + "learning_rate": 3.3099231496025084e-05, + "loss": 0.0302, + "step": 74630 + }, + { + "epoch": 16.004246330498834, + "grad_norm": 0.0027766437269747257, + "learning_rate": 3.309622247497999e-05, + "loss": 0.0299, + "step": 74640 + }, + { + "epoch": 16.004300492877647, + "grad_norm": 4.255097389221191, + "learning_rate": 3.3093213453934896e-05, + "loss": 0.0462, + "step": 74650 + }, + { + "epoch": 16.00435465525646, + "grad_norm": 1.644007682800293, + "learning_rate": 3.30902044328898e-05, + "loss": 0.0028, + "step": 74660 + }, + { + "epoch": 16.00440881763527, + "grad_norm": 0.34651052951812744, + "learning_rate": 3.3087195411844716e-05, + "loss": 0.0244, + "step": 74670 + }, + { + "epoch": 16.004462980014083, + "grad_norm": 0.0051836189813911915, + "learning_rate": 3.308418639079962e-05, + "loss": 0.05, + "step": 74680 + }, + { + "epoch": 16.004517142392896, + "grad_norm": 0.002474364126101136, + "learning_rate": 3.308117736975452e-05, + "loss": 0.0451, + "step": 74690 + }, + { + "epoch": 16.004571304771705, + "grad_norm": 0.00238597160205245, + "learning_rate": 3.3078168348709434e-05, + "loss": 0.1475, + "step": 74700 + }, + { + "epoch": 16.004625467150518, + "grad_norm": 0.004115159623324871, + "learning_rate": 3.307515932766434e-05, + "loss": 0.0048, + "step": 74710 + }, + { + "epoch": 16.004679629529328, + "grad_norm": 6.077994346618652, + "learning_rate": 3.307215030661925e-05, + "loss": 0.1034, + "step": 74720 + }, + { + "epoch": 16.00473379190814, + "grad_norm": 3.091240882873535, + "learning_rate": 3.306914128557415e-05, + "loss": 0.0604, + "step": 74730 + }, + { + "epoch": 16.004787954286954, + "grad_norm": 0.8570476770401001, + "learning_rate": 3.306613226452906e-05, + "loss": 0.1221, + "step": 74740 + }, + { + "epoch": 16.004842116665763, + "grad_norm": 0.0024497201666235924, + "learning_rate": 3.3063123243483966e-05, + "loss": 0.0433, + "step": 74750 + }, + { + "epoch": 16.004896279044576, + "grad_norm": 0.0356748066842556, + "learning_rate": 3.306011422243888e-05, + "loss": 0.0041, + "step": 74760 + }, + { + "epoch": 16.004950441423386, + "grad_norm": 0.0023643323220312595, + "learning_rate": 3.305710520139378e-05, + "loss": 0.0234, + "step": 74770 + }, + { + "epoch": 16.0050046038022, + "grad_norm": 0.0023016626946628094, + "learning_rate": 3.3054096180348685e-05, + "loss": 0.1274, + "step": 74780 + }, + { + "epoch": 16.005058766181012, + "grad_norm": 0.1586655080318451, + "learning_rate": 3.30510871593036e-05, + "loss": 0.0591, + "step": 74790 + }, + { + "epoch": 16.00511292855982, + "grad_norm": 0.004769972059875727, + "learning_rate": 3.30480781382585e-05, + "loss": 0.0679, + "step": 74800 + }, + { + "epoch": 16.005167090938635, + "grad_norm": 0.12072424590587616, + "learning_rate": 3.3045069117213403e-05, + "loss": 0.0486, + "step": 74810 + }, + { + "epoch": 16.005221253317444, + "grad_norm": 0.00642429431900382, + "learning_rate": 
3.3042060096168317e-05, + "loss": 0.0421, + "step": 74820 + }, + { + "epoch": 16.005275415696257, + "grad_norm": 3.641709566116333, + "learning_rate": 3.303905107512322e-05, + "loss": 0.1124, + "step": 74830 + }, + { + "epoch": 16.00532957807507, + "grad_norm": 0.11088892072439194, + "learning_rate": 3.303604205407813e-05, + "loss": 0.0432, + "step": 74840 + }, + { + "epoch": 16.00538374045388, + "grad_norm": 99.08209228515625, + "learning_rate": 3.3033033033033035e-05, + "loss": 0.094, + "step": 74850 + }, + { + "epoch": 16.005437902832693, + "grad_norm": 0.017675191164016724, + "learning_rate": 3.303002401198794e-05, + "loss": 0.1232, + "step": 74860 + }, + { + "epoch": 16.005492065211506, + "grad_norm": 0.25255924463272095, + "learning_rate": 3.302701499094285e-05, + "loss": 0.0094, + "step": 74870 + }, + { + "epoch": 16.005546227590315, + "grad_norm": 0.22629746794700623, + "learning_rate": 3.3024005969897754e-05, + "loss": 0.0032, + "step": 74880 + }, + { + "epoch": 16.00560038996913, + "grad_norm": 0.053772736340761185, + "learning_rate": 3.302099694885266e-05, + "loss": 0.0817, + "step": 74890 + }, + { + "epoch": 16.005654552347938, + "grad_norm": 0.005721498746424913, + "learning_rate": 3.301798792780757e-05, + "loss": 0.029, + "step": 74900 + }, + { + "epoch": 16.00570871472675, + "grad_norm": 0.08918526768684387, + "learning_rate": 3.301497890676248e-05, + "loss": 0.002, + "step": 74910 + }, + { + "epoch": 16.005762877105564, + "grad_norm": 0.002161793177947402, + "learning_rate": 3.301196988571738e-05, + "loss": 0.0208, + "step": 74920 + }, + { + "epoch": 16.005817039484374, + "grad_norm": 0.25640708208084106, + "learning_rate": 3.300896086467229e-05, + "loss": 0.0892, + "step": 74930 + }, + { + "epoch": 16.005871201863187, + "grad_norm": 0.09533119201660156, + "learning_rate": 3.30059518436272e-05, + "loss": 0.0046, + "step": 74940 + }, + { + "epoch": 16.005925364241996, + "grad_norm": 2.645153760910034, + "learning_rate": 3.30029428225821e-05, + "loss": 0.0908, + "step": 74950 + }, + { + "epoch": 16.00597952662081, + "grad_norm": 6.095547676086426, + "learning_rate": 3.299993380153701e-05, + "loss": 0.1567, + "step": 74960 + }, + { + "epoch": 16.006033688999622, + "grad_norm": 0.026891475543379784, + "learning_rate": 3.299692478049192e-05, + "loss": 0.0719, + "step": 74970 + }, + { + "epoch": 16.00608785137843, + "grad_norm": 0.03202951326966286, + "learning_rate": 3.2993915759446824e-05, + "loss": 0.022, + "step": 74980 + }, + { + "epoch": 16.006142013757245, + "grad_norm": 0.0022544842213392258, + "learning_rate": 3.299090673840173e-05, + "loss": 0.0753, + "step": 74990 + }, + { + "epoch": 16.006196176136054, + "grad_norm": 5.324184894561768, + "learning_rate": 3.2987897717356636e-05, + "loss": 0.1024, + "step": 75000 + }, + { + "epoch": 16.006250338514867, + "grad_norm": 0.015370946377515793, + "learning_rate": 3.298488869631154e-05, + "loss": 0.0056, + "step": 75010 + }, + { + "epoch": 16.00630450089368, + "grad_norm": 0.06686758995056152, + "learning_rate": 3.2981879675266456e-05, + "loss": 0.033, + "step": 75020 + }, + { + "epoch": 16.00635866327249, + "grad_norm": 1.212396502494812, + "learning_rate": 3.2978870654221355e-05, + "loss": 0.1423, + "step": 75030 + }, + { + "epoch": 16.006412825651303, + "grad_norm": 0.005507103633135557, + "learning_rate": 3.297586163317626e-05, + "loss": 0.0505, + "step": 75040 + }, + { + "epoch": 16.006466988030116, + "grad_norm": 0.0764429122209549, + "learning_rate": 3.2972852612131174e-05, + "loss": 0.0049, + "step": 75050 + }, + { + 
"epoch": 16.006521150408926, + "grad_norm": 0.5971207618713379, + "learning_rate": 3.296984359108608e-05, + "loss": 0.0098, + "step": 75060 + }, + { + "epoch": 16.00657531278774, + "grad_norm": 0.023029936477541924, + "learning_rate": 3.296683457004098e-05, + "loss": 0.0509, + "step": 75070 + }, + { + "epoch": 16.006629475166548, + "grad_norm": 0.002084794221445918, + "learning_rate": 3.296382554899589e-05, + "loss": 0.012, + "step": 75080 + }, + { + "epoch": 16.00668363754536, + "grad_norm": 5.052000522613525, + "learning_rate": 3.29608165279508e-05, + "loss": 0.0763, + "step": 75090 + }, + { + "epoch": 16.006737799924174, + "grad_norm": 0.26755934953689575, + "learning_rate": 3.2957807506905706e-05, + "loss": 0.0019, + "step": 75100 + }, + { + "epoch": 16.006791962302984, + "grad_norm": 0.002000041538849473, + "learning_rate": 3.295479848586061e-05, + "loss": 0.0013, + "step": 75110 + }, + { + "epoch": 16.006846124681797, + "grad_norm": 0.0019914936274290085, + "learning_rate": 3.295178946481552e-05, + "loss": 0.1326, + "step": 75120 + }, + { + "epoch": 16.006900287060606, + "grad_norm": 0.037817031145095825, + "learning_rate": 3.2948780443770425e-05, + "loss": 0.0091, + "step": 75130 + }, + { + "epoch": 16.00695444943942, + "grad_norm": 3.931297779083252, + "learning_rate": 3.294577142272533e-05, + "loss": 0.0305, + "step": 75140 + }, + { + "epoch": 16.007008611818232, + "grad_norm": 6.809922695159912, + "learning_rate": 3.294276240168024e-05, + "loss": 0.0965, + "step": 75150 + }, + { + "epoch": 16.007062774197042, + "grad_norm": 0.056972287595272064, + "learning_rate": 3.2939753380635143e-05, + "loss": 0.0313, + "step": 75160 + }, + { + "epoch": 16.007116936575855, + "grad_norm": 0.12672968208789825, + "learning_rate": 3.2936744359590056e-05, + "loss": 0.0397, + "step": 75170 + }, + { + "epoch": 16.007171098954665, + "grad_norm": 0.05227481201291084, + "learning_rate": 3.2933735338544956e-05, + "loss": 0.109, + "step": 75180 + }, + { + "epoch": 16.007225261333478, + "grad_norm": 0.0862421989440918, + "learning_rate": 3.293072631749987e-05, + "loss": 0.05, + "step": 75190 + }, + { + "epoch": 16.00727942371229, + "grad_norm": 0.004120159428566694, + "learning_rate": 3.2927717296454775e-05, + "loss": 0.0354, + "step": 75200 + }, + { + "epoch": 16.0073335860911, + "grad_norm": 0.6181245446205139, + "learning_rate": 3.292470827540968e-05, + "loss": 0.0555, + "step": 75210 + }, + { + "epoch": 16.007387748469913, + "grad_norm": 3.703822612762451, + "learning_rate": 3.292169925436459e-05, + "loss": 0.1878, + "step": 75220 + }, + { + "epoch": 16.007441910848726, + "grad_norm": 0.8201450705528259, + "learning_rate": 3.2918690233319494e-05, + "loss": 0.0525, + "step": 75230 + }, + { + "epoch": 16.007496073227536, + "grad_norm": 0.017685046419501305, + "learning_rate": 3.29156812122744e-05, + "loss": 0.1383, + "step": 75240 + }, + { + "epoch": 16.00755023560635, + "grad_norm": 4.266755104064941, + "learning_rate": 3.291267219122931e-05, + "loss": 0.0441, + "step": 75250 + }, + { + "epoch": 16.00760439798516, + "grad_norm": 0.2676231861114502, + "learning_rate": 3.290966317018421e-05, + "loss": 0.0219, + "step": 75260 + }, + { + "epoch": 16.00765856036397, + "grad_norm": 0.3206234574317932, + "learning_rate": 3.290665414913912e-05, + "loss": 0.0481, + "step": 75270 + }, + { + "epoch": 16.007712722742784, + "grad_norm": 2.3064637184143066, + "learning_rate": 3.290364512809403e-05, + "loss": 0.0501, + "step": 75280 + }, + { + "epoch": 16.007766885121594, + "grad_norm": 0.020789336413145065, + 
"learning_rate": 3.290063610704893e-05, + "loss": 0.0771, + "step": 75290 + }, + { + "epoch": 16.007821047500407, + "grad_norm": 5.885796546936035, + "learning_rate": 3.289762708600384e-05, + "loss": 0.0498, + "step": 75300 + }, + { + "epoch": 16.007875209879217, + "grad_norm": 0.06773944944143295, + "learning_rate": 3.289461806495875e-05, + "loss": 0.0389, + "step": 75310 + }, + { + "epoch": 16.00792937225803, + "grad_norm": 0.027853142470121384, + "learning_rate": 3.289160904391366e-05, + "loss": 0.0053, + "step": 75320 + }, + { + "epoch": 16.007983534636843, + "grad_norm": 0.002152129542082548, + "learning_rate": 3.288860002286856e-05, + "loss": 0.0071, + "step": 75330 + }, + { + "epoch": 16.008037697015652, + "grad_norm": 0.42032262682914734, + "learning_rate": 3.288559100182347e-05, + "loss": 0.1166, + "step": 75340 + }, + { + "epoch": 16.008091859394465, + "grad_norm": 0.007733802776783705, + "learning_rate": 3.2882581980778376e-05, + "loss": 0.0558, + "step": 75350 + }, + { + "epoch": 16.008146021773275, + "grad_norm": 0.2663388252258301, + "learning_rate": 3.287957295973328e-05, + "loss": 0.0404, + "step": 75360 + }, + { + "epoch": 16.008200184152088, + "grad_norm": 0.9364127516746521, + "learning_rate": 3.287656393868819e-05, + "loss": 0.0852, + "step": 75370 + }, + { + "epoch": 16.0082543465309, + "grad_norm": 0.18634219467639923, + "learning_rate": 3.2873554917643095e-05, + "loss": 0.0582, + "step": 75380 + }, + { + "epoch": 16.00830850890971, + "grad_norm": 0.2993353307247162, + "learning_rate": 3.2870545896598e-05, + "loss": 0.0206, + "step": 75390 + }, + { + "epoch": 16.008362671288523, + "grad_norm": 0.059657592326402664, + "learning_rate": 3.286753687555291e-05, + "loss": 0.0294, + "step": 75400 + }, + { + "epoch": 16.008416833667333, + "grad_norm": 0.017624225467443466, + "learning_rate": 3.2864527854507814e-05, + "loss": 0.0256, + "step": 75410 + }, + { + "epoch": 16.008470996046146, + "grad_norm": 2.284641742706299, + "learning_rate": 3.286151883346272e-05, + "loss": 0.089, + "step": 75420 + }, + { + "epoch": 16.00852515842496, + "grad_norm": 0.007021688856184483, + "learning_rate": 3.285850981241763e-05, + "loss": 0.0801, + "step": 75430 + }, + { + "epoch": 16.00857932080377, + "grad_norm": 3.050292730331421, + "learning_rate": 3.285550079137253e-05, + "loss": 0.1045, + "step": 75440 + }, + { + "epoch": 16.00863348318258, + "grad_norm": 0.02564200386404991, + "learning_rate": 3.2852491770327446e-05, + "loss": 0.0464, + "step": 75450 + }, + { + "epoch": 16.008687645561395, + "grad_norm": 1.8266022205352783, + "learning_rate": 3.284948274928235e-05, + "loss": 0.023, + "step": 75460 + }, + { + "epoch": 16.008741807940204, + "grad_norm": 0.0023131368216127157, + "learning_rate": 3.284647372823726e-05, + "loss": 0.0467, + "step": 75470 + }, + { + "epoch": 16.008795970319017, + "grad_norm": 0.04809223860502243, + "learning_rate": 3.2843464707192165e-05, + "loss": 0.0411, + "step": 75480 + }, + { + "epoch": 16.008850132697827, + "grad_norm": 0.24364958703517914, + "learning_rate": 3.284045568614707e-05, + "loss": 0.0116, + "step": 75490 + }, + { + "epoch": 16.00890429507664, + "grad_norm": 0.0022465416695922613, + "learning_rate": 3.283744666510198e-05, + "loss": 0.0714, + "step": 75500 + }, + { + "epoch": 16.008958457455453, + "grad_norm": 1.4081840515136719, + "learning_rate": 3.283443764405689e-05, + "loss": 0.0377, + "step": 75510 + }, + { + "epoch": 16.009012619834262, + "grad_norm": 0.28389254212379456, + "learning_rate": 3.283142862301179e-05, + "loss": 0.0263, + 
"step": 75520 + }, + { + "epoch": 16.009066782213075, + "grad_norm": 0.16734501719474792, + "learning_rate": 3.2828419601966696e-05, + "loss": 0.0831, + "step": 75530 + }, + { + "epoch": 16.009120944591885, + "grad_norm": 0.005888177547603846, + "learning_rate": 3.282541058092161e-05, + "loss": 0.0226, + "step": 75540 + }, + { + "epoch": 16.009175106970698, + "grad_norm": 0.23861843347549438, + "learning_rate": 3.282240155987651e-05, + "loss": 0.0439, + "step": 75550 + }, + { + "epoch": 16.00922926934951, + "grad_norm": 0.009409421123564243, + "learning_rate": 3.2819392538831415e-05, + "loss": 0.0604, + "step": 75560 + }, + { + "epoch": 16.00928343172832, + "grad_norm": 0.026149814948439598, + "learning_rate": 3.281638351778633e-05, + "loss": 0.057, + "step": 75570 + }, + { + "epoch": 16.009337594107134, + "grad_norm": 0.4094344675540924, + "learning_rate": 3.2813374496741234e-05, + "loss": 0.0478, + "step": 75580 + }, + { + "epoch": 16.009391756485943, + "grad_norm": 0.3170933723449707, + "learning_rate": 3.2810365475696134e-05, + "loss": 0.11, + "step": 75590 + }, + { + "epoch": 16.009445918864756, + "grad_norm": 0.3848683536052704, + "learning_rate": 3.2807356454651047e-05, + "loss": 0.0796, + "step": 75600 + }, + { + "epoch": 16.00950008124357, + "grad_norm": 1.210123896598816, + "learning_rate": 3.280434743360595e-05, + "loss": 0.0649, + "step": 75610 + }, + { + "epoch": 16.00955424362238, + "grad_norm": 3.512017011642456, + "learning_rate": 3.280133841256086e-05, + "loss": 0.0302, + "step": 75620 + }, + { + "epoch": 16.009608406001192, + "grad_norm": 4.03743314743042, + "learning_rate": 3.2798329391515765e-05, + "loss": 0.0961, + "step": 75630 + }, + { + "epoch": 16.009662568380005, + "grad_norm": 0.008785332553088665, + "learning_rate": 3.279532037047067e-05, + "loss": 0.0193, + "step": 75640 + }, + { + "epoch": 16.009716730758814, + "grad_norm": 0.16445614397525787, + "learning_rate": 3.279231134942558e-05, + "loss": 0.0134, + "step": 75650 + }, + { + "epoch": 16.009770893137627, + "grad_norm": 0.002638831501826644, + "learning_rate": 3.278930232838049e-05, + "loss": 0.0657, + "step": 75660 + }, + { + "epoch": 16.009825055516437, + "grad_norm": 0.002447335748001933, + "learning_rate": 3.278629330733539e-05, + "loss": 0.018, + "step": 75670 + }, + { + "epoch": 16.00987921789525, + "grad_norm": 0.0049587516114115715, + "learning_rate": 3.27832842862903e-05, + "loss": 0.0126, + "step": 75680 + }, + { + "epoch": 16.009933380274063, + "grad_norm": 0.0022572914604097605, + "learning_rate": 3.278027526524521e-05, + "loss": 0.0469, + "step": 75690 + }, + { + "epoch": 16.009987542652873, + "grad_norm": 11.112726211547852, + "learning_rate": 3.277726624420011e-05, + "loss": 0.1169, + "step": 75700 + }, + { + "epoch": 16.010041705031686, + "grad_norm": 0.0021790231112390757, + "learning_rate": 3.277425722315502e-05, + "loss": 0.1077, + "step": 75710 + }, + { + "epoch": 16.010095867410495, + "grad_norm": 3.215791702270508, + "learning_rate": 3.277124820210993e-05, + "loss": 0.0575, + "step": 75720 + }, + { + "epoch": 16.01015002978931, + "grad_norm": 0.006071347277611494, + "learning_rate": 3.2768239181064835e-05, + "loss": 0.0925, + "step": 75730 + }, + { + "epoch": 16.01020419216812, + "grad_norm": 0.2651464641094208, + "learning_rate": 3.276523016001974e-05, + "loss": 0.0462, + "step": 75740 + }, + { + "epoch": 16.01025835454693, + "grad_norm": 0.0177262332290411, + "learning_rate": 3.276222113897465e-05, + "loss": 0.0211, + "step": 75750 + }, + { + "epoch": 16.010312516925744, + 
"grad_norm": 0.0022218134254217148, + "learning_rate": 3.2759212117929554e-05, + "loss": 0.044, + "step": 75760 + }, + { + "epoch": 16.010366679304553, + "grad_norm": 0.00990200787782669, + "learning_rate": 3.275620309688447e-05, + "loss": 0.1958, + "step": 75770 + }, + { + "epoch": 16.010420841683366, + "grad_norm": 0.4049496650695801, + "learning_rate": 3.2753194075839366e-05, + "loss": 0.0358, + "step": 75780 + }, + { + "epoch": 16.01047500406218, + "grad_norm": 0.030459566041827202, + "learning_rate": 3.275018505479427e-05, + "loss": 0.0488, + "step": 75790 + }, + { + "epoch": 16.01052916644099, + "grad_norm": 3.157158374786377, + "learning_rate": 3.2747176033749186e-05, + "loss": 0.0673, + "step": 75800 + }, + { + "epoch": 16.010583328819802, + "grad_norm": 0.6169695854187012, + "learning_rate": 3.274416701270409e-05, + "loss": 0.0831, + "step": 75810 + }, + { + "epoch": 16.010637491198615, + "grad_norm": 2.6556215286254883, + "learning_rate": 3.274115799165899e-05, + "loss": 0.0469, + "step": 75820 + }, + { + "epoch": 16.010691653577425, + "grad_norm": 0.01397367101162672, + "learning_rate": 3.2738148970613904e-05, + "loss": 0.0452, + "step": 75830 + }, + { + "epoch": 16.010745815956238, + "grad_norm": 0.002412628149613738, + "learning_rate": 3.273513994956881e-05, + "loss": 0.0927, + "step": 75840 + }, + { + "epoch": 16.010799978335047, + "grad_norm": 0.01950511336326599, + "learning_rate": 3.273213092852371e-05, + "loss": 0.0623, + "step": 75850 + }, + { + "epoch": 16.01085414071386, + "grad_norm": 0.02952899970114231, + "learning_rate": 3.272912190747862e-05, + "loss": 0.1006, + "step": 75860 + }, + { + "epoch": 16.010908303092673, + "grad_norm": 0.021173225715756416, + "learning_rate": 3.272611288643353e-05, + "loss": 0.1326, + "step": 75870 + }, + { + "epoch": 16.010962465471483, + "grad_norm": 0.10582701861858368, + "learning_rate": 3.2723103865388436e-05, + "loss": 0.0186, + "step": 75880 + }, + { + "epoch": 16.011016627850296, + "grad_norm": 0.5004958510398865, + "learning_rate": 3.272009484434334e-05, + "loss": 0.0123, + "step": 75890 + }, + { + "epoch": 16.011070790229105, + "grad_norm": 0.002109787194058299, + "learning_rate": 3.271708582329825e-05, + "loss": 0.0392, + "step": 75900 + }, + { + "epoch": 16.01112495260792, + "grad_norm": 0.049542512744665146, + "learning_rate": 3.2714076802253155e-05, + "loss": 0.0361, + "step": 75910 + }, + { + "epoch": 16.01117911498673, + "grad_norm": 0.20008836686611176, + "learning_rate": 3.271106778120807e-05, + "loss": 0.0169, + "step": 75920 + }, + { + "epoch": 16.01123327736554, + "grad_norm": 0.003245823783800006, + "learning_rate": 3.270805876016297e-05, + "loss": 0.1083, + "step": 75930 + }, + { + "epoch": 16.011287439744354, + "grad_norm": 0.29288607835769653, + "learning_rate": 3.2705049739117873e-05, + "loss": 0.0031, + "step": 75940 + }, + { + "epoch": 16.011341602123164, + "grad_norm": 0.3211303949356079, + "learning_rate": 3.2702040718072787e-05, + "loss": 0.1366, + "step": 75950 + }, + { + "epoch": 16.011395764501977, + "grad_norm": 0.002568483119830489, + "learning_rate": 3.269903169702769e-05, + "loss": 0.0385, + "step": 75960 + }, + { + "epoch": 16.01144992688079, + "grad_norm": 0.14800873398780823, + "learning_rate": 3.26960226759826e-05, + "loss": 0.0017, + "step": 75970 + }, + { + "epoch": 16.0115040892596, + "grad_norm": 3.490119695663452, + "learning_rate": 3.2693013654937505e-05, + "loss": 0.0995, + "step": 75980 + }, + { + "epoch": 16.011558251638412, + "grad_norm": 0.002580048516392708, + "learning_rate": 
3.269000463389241e-05, + "loss": 0.0463, + "step": 75990 + }, + { + "epoch": 16.011612414017225, + "grad_norm": 2.8975229263305664, + "learning_rate": 3.268699561284732e-05, + "loss": 0.0561, + "step": 76000 + }, + { + "epoch": 16.011666576396035, + "grad_norm": 0.0036919584963470697, + "learning_rate": 3.2683986591802224e-05, + "loss": 0.0477, + "step": 76010 + }, + { + "epoch": 16.011720738774848, + "grad_norm": 0.0671037957072258, + "learning_rate": 3.268097757075713e-05, + "loss": 0.0632, + "step": 76020 + }, + { + "epoch": 16.011774901153657, + "grad_norm": 0.0021219539921730757, + "learning_rate": 3.2677968549712044e-05, + "loss": 0.0121, + "step": 76030 + }, + { + "epoch": 16.01182906353247, + "grad_norm": 1.3509753942489624, + "learning_rate": 3.267495952866694e-05, + "loss": 0.0579, + "step": 76040 + }, + { + "epoch": 16.011883225911284, + "grad_norm": 3.1851890087127686, + "learning_rate": 3.267195050762185e-05, + "loss": 0.0587, + "step": 76050 + }, + { + "epoch": 16.011937388290093, + "grad_norm": 18.339845657348633, + "learning_rate": 3.266894148657676e-05, + "loss": 0.0847, + "step": 76060 + }, + { + "epoch": 16.011991550668906, + "grad_norm": 0.07915326952934265, + "learning_rate": 3.266593246553167e-05, + "loss": 0.0349, + "step": 76070 + }, + { + "epoch": 16.012045713047716, + "grad_norm": 0.3798791170120239, + "learning_rate": 3.266292344448657e-05, + "loss": 0.0168, + "step": 76080 + }, + { + "epoch": 16.01209987542653, + "grad_norm": 0.3857074975967407, + "learning_rate": 3.265991442344148e-05, + "loss": 0.0785, + "step": 76090 + }, + { + "epoch": 16.01215403780534, + "grad_norm": 0.07355647534132004, + "learning_rate": 3.265690540239639e-05, + "loss": 0.0497, + "step": 76100 + }, + { + "epoch": 16.01220820018415, + "grad_norm": 2.556128978729248, + "learning_rate": 3.2653896381351294e-05, + "loss": 0.1631, + "step": 76110 + }, + { + "epoch": 16.012262362562964, + "grad_norm": 0.10103797167539597, + "learning_rate": 3.26508873603062e-05, + "loss": 0.0487, + "step": 76120 + }, + { + "epoch": 16.012316524941774, + "grad_norm": 0.031070830300450325, + "learning_rate": 3.2647878339261106e-05, + "loss": 0.1648, + "step": 76130 + }, + { + "epoch": 16.012370687320587, + "grad_norm": 0.0033564374316483736, + "learning_rate": 3.264486931821601e-05, + "loss": 0.0545, + "step": 76140 + }, + { + "epoch": 16.0124248496994, + "grad_norm": 0.01517775934189558, + "learning_rate": 3.264186029717092e-05, + "loss": 0.0012, + "step": 76150 + }, + { + "epoch": 16.01247901207821, + "grad_norm": 0.004512593150138855, + "learning_rate": 3.2638851276125825e-05, + "loss": 0.0074, + "step": 76160 + }, + { + "epoch": 16.012533174457023, + "grad_norm": 0.66739422082901, + "learning_rate": 3.263584225508073e-05, + "loss": 0.0546, + "step": 76170 + }, + { + "epoch": 16.012587336835836, + "grad_norm": 0.004416407085955143, + "learning_rate": 3.2632833234035644e-05, + "loss": 0.0322, + "step": 76180 + }, + { + "epoch": 16.012641499214645, + "grad_norm": 0.006042455323040485, + "learning_rate": 3.2629824212990544e-05, + "loss": 0.0135, + "step": 76190 + }, + { + "epoch": 16.012695661593458, + "grad_norm": 0.005269916262477636, + "learning_rate": 3.262681519194545e-05, + "loss": 0.1135, + "step": 76200 + }, + { + "epoch": 16.012749823972268, + "grad_norm": 0.017103323712944984, + "learning_rate": 3.262380617090036e-05, + "loss": 0.0853, + "step": 76210 + }, + { + "epoch": 16.01280398635108, + "grad_norm": 5.39424467086792, + "learning_rate": 3.262079714985527e-05, + "loss": 0.0447, + "step": 76220 + }, + 
{ + "epoch": 16.012858148729894, + "grad_norm": 6.202615261077881, + "learning_rate": 3.2617788128810176e-05, + "loss": 0.0945, + "step": 76230 + }, + { + "epoch": 16.012912311108703, + "grad_norm": 0.007845084182918072, + "learning_rate": 3.261477910776508e-05, + "loss": 0.0261, + "step": 76240 + }, + { + "epoch": 16.012966473487516, + "grad_norm": 0.03384670242667198, + "learning_rate": 3.261177008671999e-05, + "loss": 0.1085, + "step": 76250 + }, + { + "epoch": 16.013020635866326, + "grad_norm": 0.02376280166208744, + "learning_rate": 3.2608761065674895e-05, + "loss": 0.1031, + "step": 76260 + }, + { + "epoch": 16.01307479824514, + "grad_norm": 0.13672593235969543, + "learning_rate": 3.26057520446298e-05, + "loss": 0.0039, + "step": 76270 + }, + { + "epoch": 16.013128960623952, + "grad_norm": 3.53570818901062, + "learning_rate": 3.260274302358471e-05, + "loss": 0.0747, + "step": 76280 + }, + { + "epoch": 16.01318312300276, + "grad_norm": 0.019399061799049377, + "learning_rate": 3.259973400253962e-05, + "loss": 0.1204, + "step": 76290 + }, + { + "epoch": 16.013237285381575, + "grad_norm": 0.021491510793566704, + "learning_rate": 3.259672498149452e-05, + "loss": 0.0872, + "step": 76300 + }, + { + "epoch": 16.013291447760384, + "grad_norm": 0.058992575854063034, + "learning_rate": 3.2593715960449426e-05, + "loss": 0.0428, + "step": 76310 + }, + { + "epoch": 16.013345610139197, + "grad_norm": 0.09369722753763199, + "learning_rate": 3.259070693940434e-05, + "loss": 0.0552, + "step": 76320 + }, + { + "epoch": 16.01339977251801, + "grad_norm": 0.17720644176006317, + "learning_rate": 3.2587697918359245e-05, + "loss": 0.0435, + "step": 76330 + }, + { + "epoch": 16.01345393489682, + "grad_norm": 0.2229992151260376, + "learning_rate": 3.2584688897314145e-05, + "loss": 0.0382, + "step": 76340 + }, + { + "epoch": 16.013508097275633, + "grad_norm": 0.0407106988132, + "learning_rate": 3.258167987626906e-05, + "loss": 0.0264, + "step": 76350 + }, + { + "epoch": 16.013562259654446, + "grad_norm": 0.568580687046051, + "learning_rate": 3.2578670855223964e-05, + "loss": 0.0878, + "step": 76360 + }, + { + "epoch": 16.013616422033255, + "grad_norm": 0.15677978098392487, + "learning_rate": 3.257566183417887e-05, + "loss": 0.0349, + "step": 76370 + }, + { + "epoch": 16.01367058441207, + "grad_norm": 0.04056776314973831, + "learning_rate": 3.257265281313378e-05, + "loss": 0.0677, + "step": 76380 + }, + { + "epoch": 16.013724746790878, + "grad_norm": 0.004493471700698137, + "learning_rate": 3.256964379208868e-05, + "loss": 0.0254, + "step": 76390 + }, + { + "epoch": 16.01377890916969, + "grad_norm": 0.005440738517791033, + "learning_rate": 3.256663477104359e-05, + "loss": 0.0031, + "step": 76400 + }, + { + "epoch": 16.013833071548504, + "grad_norm": 0.5844321250915527, + "learning_rate": 3.25636257499985e-05, + "loss": 0.0449, + "step": 76410 + }, + { + "epoch": 16.013887233927314, + "grad_norm": 0.09399474412202835, + "learning_rate": 3.25606167289534e-05, + "loss": 0.0916, + "step": 76420 + }, + { + "epoch": 16.013941396306127, + "grad_norm": 3.629431962966919, + "learning_rate": 3.255760770790831e-05, + "loss": 0.0253, + "step": 76430 + }, + { + "epoch": 16.013995558684936, + "grad_norm": 0.20570717751979828, + "learning_rate": 3.255459868686322e-05, + "loss": 0.0556, + "step": 76440 + }, + { + "epoch": 16.01404972106375, + "grad_norm": 5.063393592834473, + "learning_rate": 3.255158966581812e-05, + "loss": 0.1242, + "step": 76450 + }, + { + "epoch": 16.014103883442562, + "grad_norm": 0.08378060162067413, + 
"learning_rate": 3.254858064477303e-05, + "loss": 0.1639, + "step": 76460 + }, + { + "epoch": 16.01415804582137, + "grad_norm": 0.0027047288604080677, + "learning_rate": 3.254557162372794e-05, + "loss": 0.0134, + "step": 76470 + }, + { + "epoch": 16.014212208200185, + "grad_norm": 0.255596786737442, + "learning_rate": 3.2542562602682846e-05, + "loss": 0.0326, + "step": 76480 + }, + { + "epoch": 16.014266370578994, + "grad_norm": 0.0024515953846275806, + "learning_rate": 3.253955358163775e-05, + "loss": 0.0224, + "step": 76490 + }, + { + "epoch": 16.014320532957807, + "grad_norm": 0.005373482592403889, + "learning_rate": 3.253654456059266e-05, + "loss": 0.071, + "step": 76500 + }, + { + "epoch": 16.01437469533662, + "grad_norm": 0.13131171464920044, + "learning_rate": 3.2533535539547565e-05, + "loss": 0.1056, + "step": 76510 + }, + { + "epoch": 16.01442885771543, + "grad_norm": 0.12968167662620544, + "learning_rate": 3.253052651850247e-05, + "loss": 0.0684, + "step": 76520 + }, + { + "epoch": 16.014483020094243, + "grad_norm": 0.007246609311550856, + "learning_rate": 3.252751749745738e-05, + "loss": 0.1459, + "step": 76530 + }, + { + "epoch": 16.014537182473052, + "grad_norm": 0.053959187120199203, + "learning_rate": 3.2524508476412284e-05, + "loss": 0.1236, + "step": 76540 + }, + { + "epoch": 16.014591344851866, + "grad_norm": 0.004175730515271425, + "learning_rate": 3.25214994553672e-05, + "loss": 0.0556, + "step": 76550 + }, + { + "epoch": 16.01464550723068, + "grad_norm": 0.09556764364242554, + "learning_rate": 3.25184904343221e-05, + "loss": 0.1339, + "step": 76560 + }, + { + "epoch": 16.014699669609488, + "grad_norm": 1.0333799123764038, + "learning_rate": 3.2515481413277e-05, + "loss": 0.0648, + "step": 76570 + }, + { + "epoch": 16.0147538319883, + "grad_norm": 0.018843388184905052, + "learning_rate": 3.2512472392231916e-05, + "loss": 0.0327, + "step": 76580 + }, + { + "epoch": 16.014807994367114, + "grad_norm": 0.01752476394176483, + "learning_rate": 3.250946337118682e-05, + "loss": 0.0867, + "step": 76590 + }, + { + "epoch": 16.014862156745924, + "grad_norm": 0.721607506275177, + "learning_rate": 3.250645435014172e-05, + "loss": 0.0047, + "step": 76600 + }, + { + "epoch": 16.014916319124737, + "grad_norm": 0.004045701120048761, + "learning_rate": 3.2503445329096635e-05, + "loss": 0.1684, + "step": 76610 + }, + { + "epoch": 16.014970481503546, + "grad_norm": 0.17320837080478668, + "learning_rate": 3.250043630805154e-05, + "loss": 0.0224, + "step": 76620 + }, + { + "epoch": 16.01502464388236, + "grad_norm": 0.010445835068821907, + "learning_rate": 3.249742728700645e-05, + "loss": 0.0309, + "step": 76630 + }, + { + "epoch": 16.015078806261172, + "grad_norm": 0.17761893570423126, + "learning_rate": 3.249441826596135e-05, + "loss": 0.0595, + "step": 76640 + }, + { + "epoch": 16.015132968639982, + "grad_norm": 0.033209726214408875, + "learning_rate": 3.249140924491626e-05, + "loss": 0.0277, + "step": 76650 + }, + { + "epoch": 16.015187131018795, + "grad_norm": 0.1600228101015091, + "learning_rate": 3.2488400223871166e-05, + "loss": 0.1515, + "step": 76660 + }, + { + "epoch": 16.015241293397604, + "grad_norm": 0.007424378767609596, + "learning_rate": 3.248539120282608e-05, + "loss": 0.0892, + "step": 76670 + }, + { + "epoch": 16.015295455776418, + "grad_norm": 0.02592259831726551, + "learning_rate": 3.248238218178098e-05, + "loss": 0.0733, + "step": 76680 + }, + { + "epoch": 16.01534961815523, + "grad_norm": 4.225876331329346, + "learning_rate": 3.2479373160735885e-05, + "loss": 0.0503, + 
"step": 76690 + }, + { + "epoch": 16.01540378053404, + "grad_norm": 0.8843226432800293, + "learning_rate": 3.24763641396908e-05, + "loss": 0.1433, + "step": 76700 + }, + { + "epoch": 16.015457942912853, + "grad_norm": 2.5315864086151123, + "learning_rate": 3.2473355118645704e-05, + "loss": 0.1135, + "step": 76710 + }, + { + "epoch": 16.015512105291663, + "grad_norm": 0.004934502299875021, + "learning_rate": 3.2470346097600604e-05, + "loss": 0.0556, + "step": 76720 + }, + { + "epoch": 16.015566267670476, + "grad_norm": 0.014274384826421738, + "learning_rate": 3.246733707655552e-05, + "loss": 0.0063, + "step": 76730 + }, + { + "epoch": 16.01562043004929, + "grad_norm": 3.4407176971435547, + "learning_rate": 3.246432805551042e-05, + "loss": 0.0992, + "step": 76740 + }, + { + "epoch": 16.0156745924281, + "grad_norm": 0.002668722765520215, + "learning_rate": 3.246131903446533e-05, + "loss": 0.0661, + "step": 76750 + }, + { + "epoch": 16.01572875480691, + "grad_norm": 0.046296343207359314, + "learning_rate": 3.2458310013420235e-05, + "loss": 0.0779, + "step": 76760 + }, + { + "epoch": 16.015782917185724, + "grad_norm": 0.07786355167627335, + "learning_rate": 3.245530099237514e-05, + "loss": 0.05, + "step": 76770 + }, + { + "epoch": 16.015837079564534, + "grad_norm": 3.9971582889556885, + "learning_rate": 3.245229197133005e-05, + "loss": 0.0879, + "step": 76780 + }, + { + "epoch": 16.015891241943347, + "grad_norm": 2.667905569076538, + "learning_rate": 3.2449282950284954e-05, + "loss": 0.0457, + "step": 76790 + }, + { + "epoch": 16.015945404322157, + "grad_norm": 0.04537906497716904, + "learning_rate": 3.244627392923986e-05, + "loss": 0.004, + "step": 76800 + }, + { + "epoch": 16.01599956670097, + "grad_norm": 6.331320762634277, + "learning_rate": 3.2443264908194774e-05, + "loss": 0.1825, + "step": 76810 + }, + { + "epoch": 16.016053729079783, + "grad_norm": 0.040974803268909454, + "learning_rate": 3.244025588714968e-05, + "loss": 0.0076, + "step": 76820 + }, + { + "epoch": 16.016107891458592, + "grad_norm": 0.004847168456763029, + "learning_rate": 3.243724686610458e-05, + "loss": 0.1086, + "step": 76830 + }, + { + "epoch": 16.016162053837405, + "grad_norm": 0.009121083654463291, + "learning_rate": 3.243423784505949e-05, + "loss": 0.0119, + "step": 76840 + }, + { + "epoch": 16.016216216216215, + "grad_norm": 0.00615582475438714, + "learning_rate": 3.24312288240144e-05, + "loss": 0.1, + "step": 76850 + }, + { + "epoch": 16.016270378595028, + "grad_norm": 2.2833755016326904, + "learning_rate": 3.2428219802969305e-05, + "loss": 0.0681, + "step": 76860 + }, + { + "epoch": 16.01632454097384, + "grad_norm": 0.002375953597947955, + "learning_rate": 3.242521078192421e-05, + "loss": 0.0146, + "step": 76870 + }, + { + "epoch": 16.01637870335265, + "grad_norm": 0.002669175621122122, + "learning_rate": 3.242220176087912e-05, + "loss": 0.0794, + "step": 76880 + }, + { + "epoch": 16.016432865731463, + "grad_norm": 25.665212631225586, + "learning_rate": 3.2419192739834024e-05, + "loss": 0.0951, + "step": 76890 + }, + { + "epoch": 16.016487028110273, + "grad_norm": 0.002981205703690648, + "learning_rate": 3.241618371878893e-05, + "loss": 0.0548, + "step": 76900 + }, + { + "epoch": 16.016541190489086, + "grad_norm": 0.2651459276676178, + "learning_rate": 3.2413174697743836e-05, + "loss": 0.0161, + "step": 76910 + }, + { + "epoch": 16.0165953528679, + "grad_norm": 0.002732548164203763, + "learning_rate": 3.241016567669874e-05, + "loss": 0.0817, + "step": 76920 + }, + { + "epoch": 16.01664951524671, + "grad_norm": 
0.008503780700266361, + "learning_rate": 3.2407156655653656e-05, + "loss": 0.047, + "step": 76930 + }, + { + "epoch": 16.01670367762552, + "grad_norm": 0.7831262946128845, + "learning_rate": 3.2404147634608555e-05, + "loss": 0.0209, + "step": 76940 + }, + { + "epoch": 16.016757840004335, + "grad_norm": 0.0024039745330810547, + "learning_rate": 3.240113861356346e-05, + "loss": 0.0204, + "step": 76950 + }, + { + "epoch": 16.016812002383144, + "grad_norm": 2.593172788619995, + "learning_rate": 3.2398129592518374e-05, + "loss": 0.1493, + "step": 76960 + }, + { + "epoch": 16.016866164761957, + "grad_norm": 0.2902623116970062, + "learning_rate": 3.239512057147328e-05, + "loss": 0.0247, + "step": 76970 + }, + { + "epoch": 16.016920327140767, + "grad_norm": 0.4270835816860199, + "learning_rate": 3.239211155042818e-05, + "loss": 0.1046, + "step": 76980 + }, + { + "epoch": 16.01697448951958, + "grad_norm": 0.134367436170578, + "learning_rate": 3.238910252938309e-05, + "loss": 0.0998, + "step": 76990 + }, + { + "epoch": 16.017028651898393, + "grad_norm": 0.5204854607582092, + "learning_rate": 3.2386093508338e-05, + "loss": 0.1162, + "step": 77000 + }, + { + "epoch": 16.017082814277202, + "grad_norm": 0.047130584716796875, + "learning_rate": 3.2383084487292906e-05, + "loss": 0.0662, + "step": 77010 + }, + { + "epoch": 16.017136976656015, + "grad_norm": 2.8163068294525146, + "learning_rate": 3.238007546624781e-05, + "loss": 0.1086, + "step": 77020 + }, + { + "epoch": 16.017191139034825, + "grad_norm": 0.8842125535011292, + "learning_rate": 3.237706644520272e-05, + "loss": 0.0669, + "step": 77030 + }, + { + "epoch": 16.017245301413638, + "grad_norm": 2.928204298019409, + "learning_rate": 3.2374057424157625e-05, + "loss": 0.0976, + "step": 77040 + }, + { + "epoch": 16.01729946379245, + "grad_norm": 4.2861409187316895, + "learning_rate": 3.237104840311253e-05, + "loss": 0.1006, + "step": 77050 + }, + { + "epoch": 16.01735362617126, + "grad_norm": 0.0031771459616720676, + "learning_rate": 3.236803938206744e-05, + "loss": 0.1468, + "step": 77060 + }, + { + "epoch": 16.017407788550074, + "grad_norm": 1.8599016666412354, + "learning_rate": 3.236503036102235e-05, + "loss": 0.0133, + "step": 77070 + }, + { + "epoch": 16.017461950928883, + "grad_norm": 2.611198902130127, + "learning_rate": 3.2362021339977257e-05, + "loss": 0.1036, + "step": 77080 + }, + { + "epoch": 16.017516113307696, + "grad_norm": 0.1106795221567154, + "learning_rate": 3.2359012318932156e-05, + "loss": 0.0048, + "step": 77090 + }, + { + "epoch": 16.01757027568651, + "grad_norm": 2.8872101306915283, + "learning_rate": 3.235600329788707e-05, + "loss": 0.0645, + "step": 77100 + }, + { + "epoch": 16.01762443806532, + "grad_norm": 0.006009203847497702, + "learning_rate": 3.2352994276841975e-05, + "loss": 0.0039, + "step": 77110 + }, + { + "epoch": 16.017678600444132, + "grad_norm": 0.002934718271717429, + "learning_rate": 3.234998525579688e-05, + "loss": 0.0318, + "step": 77120 + }, + { + "epoch": 16.017732762822945, + "grad_norm": 1.923553228378296, + "learning_rate": 3.234697623475179e-05, + "loss": 0.162, + "step": 77130 + }, + { + "epoch": 16.017786925201754, + "grad_norm": 0.0030097293201833963, + "learning_rate": 3.2343967213706694e-05, + "loss": 0.0172, + "step": 77140 + }, + { + "epoch": 16.017841087580567, + "grad_norm": 0.27522221207618713, + "learning_rate": 3.23409581926616e-05, + "loss": 0.0496, + "step": 77150 + }, + { + "epoch": 16.017895249959377, + "grad_norm": 0.0028779690619558096, + "learning_rate": 3.2337949171616514e-05, + 
"loss": 0.0947, + "step": 77160 + }, + { + "epoch": 16.01794941233819, + "grad_norm": 9.774185180664062, + "learning_rate": 3.233494015057141e-05, + "loss": 0.0604, + "step": 77170 + }, + { + "epoch": 16.018003574717003, + "grad_norm": 0.012696709483861923, + "learning_rate": 3.233193112952632e-05, + "loss": 0.0317, + "step": 77180 + }, + { + "epoch": 16.018057737095813, + "grad_norm": 0.8278911113739014, + "learning_rate": 3.232892210848123e-05, + "loss": 0.0458, + "step": 77190 + }, + { + "epoch": 16.018111899474626, + "grad_norm": 0.06170130893588066, + "learning_rate": 3.232591308743613e-05, + "loss": 0.0054, + "step": 77200 + }, + { + "epoch": 16.018166061853435, + "grad_norm": 0.0024334483314305544, + "learning_rate": 3.232290406639104e-05, + "loss": 0.0674, + "step": 77210 + }, + { + "epoch": 16.01822022423225, + "grad_norm": 2.8530426025390625, + "learning_rate": 3.231989504534595e-05, + "loss": 0.0997, + "step": 77220 + }, + { + "epoch": 16.01827438661106, + "grad_norm": 0.17970941960811615, + "learning_rate": 3.231688602430086e-05, + "loss": 0.0128, + "step": 77230 + }, + { + "epoch": 16.01832854898987, + "grad_norm": 0.0025549561250954866, + "learning_rate": 3.231387700325576e-05, + "loss": 0.0904, + "step": 77240 + }, + { + "epoch": 16.018382711368684, + "grad_norm": 0.7435043454170227, + "learning_rate": 3.231086798221067e-05, + "loss": 0.0393, + "step": 77250 + }, + { + "epoch": 16.018436873747493, + "grad_norm": 0.0026244877371937037, + "learning_rate": 3.2307858961165576e-05, + "loss": 0.0176, + "step": 77260 + }, + { + "epoch": 16.018491036126306, + "grad_norm": 0.002245779847726226, + "learning_rate": 3.230484994012048e-05, + "loss": 0.0267, + "step": 77270 + }, + { + "epoch": 16.01854519850512, + "grad_norm": 0.010934954509139061, + "learning_rate": 3.230184091907539e-05, + "loss": 0.083, + "step": 77280 + }, + { + "epoch": 16.01859936088393, + "grad_norm": 0.027151301503181458, + "learning_rate": 3.2298831898030295e-05, + "loss": 0.0074, + "step": 77290 + }, + { + "epoch": 16.018653523262742, + "grad_norm": 0.0820474699139595, + "learning_rate": 3.22958228769852e-05, + "loss": 0.0242, + "step": 77300 + }, + { + "epoch": 16.018707685641555, + "grad_norm": 0.27087733149528503, + "learning_rate": 3.229281385594011e-05, + "loss": 0.0115, + "step": 77310 + }, + { + "epoch": 16.018761848020365, + "grad_norm": 0.04417116194963455, + "learning_rate": 3.2289804834895014e-05, + "loss": 0.0021, + "step": 77320 + }, + { + "epoch": 16.018816010399178, + "grad_norm": 0.18470697104930878, + "learning_rate": 3.228679581384993e-05, + "loss": 0.1675, + "step": 77330 + }, + { + "epoch": 16.018870172777987, + "grad_norm": 0.15550294518470764, + "learning_rate": 3.228378679280483e-05, + "loss": 0.1851, + "step": 77340 + }, + { + "epoch": 16.0189243351568, + "grad_norm": 0.002342688385397196, + "learning_rate": 3.228077777175973e-05, + "loss": 0.0579, + "step": 77350 + }, + { + "epoch": 16.018978497535613, + "grad_norm": 0.2166656255722046, + "learning_rate": 3.2277768750714646e-05, + "loss": 0.0405, + "step": 77360 + }, + { + "epoch": 16.019032659914423, + "grad_norm": 0.6817544102668762, + "learning_rate": 3.227475972966955e-05, + "loss": 0.0329, + "step": 77370 + }, + { + "epoch": 16.019086822293236, + "grad_norm": 0.0057806773111224174, + "learning_rate": 3.227175070862446e-05, + "loss": 0.0597, + "step": 77380 + }, + { + "epoch": 16.019140984672045, + "grad_norm": 0.0025486121885478497, + "learning_rate": 3.2268741687579365e-05, + "loss": 0.0861, + "step": 77390 + }, + { + "epoch": 
16.01919514705086, + "grad_norm": 1.6417375802993774, + "learning_rate": 3.226573266653427e-05, + "loss": 0.0889, + "step": 77400 + }, + { + "epoch": 16.01924930942967, + "grad_norm": 1.0730398893356323, + "learning_rate": 3.226272364548918e-05, + "loss": 0.0591, + "step": 77410 + }, + { + "epoch": 16.01930347180848, + "grad_norm": 0.0026338740717619658, + "learning_rate": 3.225971462444409e-05, + "loss": 0.0188, + "step": 77420 + }, + { + "epoch": 16.019357634187294, + "grad_norm": 0.5516472458839417, + "learning_rate": 3.225670560339899e-05, + "loss": 0.0441, + "step": 77430 + }, + { + "epoch": 16.019411796566104, + "grad_norm": 2.6175382137298584, + "learning_rate": 3.2253696582353896e-05, + "loss": 0.0321, + "step": 77440 + }, + { + "epoch": 16.019465958944917, + "grad_norm": 0.2282520830631256, + "learning_rate": 3.225068756130881e-05, + "loss": 0.0804, + "step": 77450 + }, + { + "epoch": 16.01952012132373, + "grad_norm": 0.028755825012922287, + "learning_rate": 3.224767854026371e-05, + "loss": 0.0538, + "step": 77460 + }, + { + "epoch": 16.01957428370254, + "grad_norm": 0.003152061952278018, + "learning_rate": 3.2244669519218615e-05, + "loss": 0.0033, + "step": 77470 + }, + { + "epoch": 16.019628446081352, + "grad_norm": 0.0024610343389213085, + "learning_rate": 3.224166049817353e-05, + "loss": 0.0104, + "step": 77480 + }, + { + "epoch": 16.019682608460165, + "grad_norm": 0.17403742671012878, + "learning_rate": 3.2238651477128434e-05, + "loss": 0.0684, + "step": 77490 + }, + { + "epoch": 16.019736770838975, + "grad_norm": 2.9451465606689453, + "learning_rate": 3.2235642456083334e-05, + "loss": 0.1148, + "step": 77500 + }, + { + "epoch": 16.019790933217788, + "grad_norm": 0.05455606430768967, + "learning_rate": 3.223263343503825e-05, + "loss": 0.0822, + "step": 77510 + }, + { + "epoch": 16.019845095596597, + "grad_norm": 3.026013135910034, + "learning_rate": 3.222962441399315e-05, + "loss": 0.053, + "step": 77520 + }, + { + "epoch": 16.01989925797541, + "grad_norm": 0.002464650897309184, + "learning_rate": 3.222661539294806e-05, + "loss": 0.1145, + "step": 77530 + }, + { + "epoch": 16.019953420354224, + "grad_norm": 31.550012588500977, + "learning_rate": 3.2223606371902966e-05, + "loss": 0.0373, + "step": 77540 + }, + { + "epoch": 16.020007582733033, + "grad_norm": 0.20418304204940796, + "learning_rate": 3.222059735085787e-05, + "loss": 0.0722, + "step": 77550 + }, + { + "epoch": 16.020061745111846, + "grad_norm": 0.0025869812816381454, + "learning_rate": 3.221758832981278e-05, + "loss": 0.036, + "step": 77560 + }, + { + "epoch": 16.020115907490656, + "grad_norm": 0.006446062121540308, + "learning_rate": 3.221457930876769e-05, + "loss": 0.0176, + "step": 77570 + }, + { + "epoch": 16.02017006986947, + "grad_norm": 0.10801566392183304, + "learning_rate": 3.221157028772259e-05, + "loss": 0.0326, + "step": 77580 + }, + { + "epoch": 16.02022423224828, + "grad_norm": 0.23172740638256073, + "learning_rate": 3.2208561266677504e-05, + "loss": 0.1191, + "step": 77590 + }, + { + "epoch": 16.02027839462709, + "grad_norm": 0.0033111341763287783, + "learning_rate": 3.220555224563241e-05, + "loss": 0.0262, + "step": 77600 + }, + { + "epoch": 16.020332557005904, + "grad_norm": 0.002369845286011696, + "learning_rate": 3.220254322458731e-05, + "loss": 0.0427, + "step": 77610 + }, + { + "epoch": 16.020386719384714, + "grad_norm": 0.10150100290775299, + "learning_rate": 3.219953420354222e-05, + "loss": 0.0218, + "step": 77620 + }, + { + "epoch": 16.020440881763527, + "grad_norm": 0.0023141461424529552, + 
"learning_rate": 3.219652518249713e-05, + "loss": 0.0522, + "step": 77630 + }, + { + "epoch": 16.02049504414234, + "grad_norm": 0.0022458559833467007, + "learning_rate": 3.2193516161452035e-05, + "loss": 0.0066, + "step": 77640 + }, + { + "epoch": 16.02054920652115, + "grad_norm": 3.52828311920166, + "learning_rate": 3.219050714040694e-05, + "loss": 0.1206, + "step": 77650 + }, + { + "epoch": 16.020603368899963, + "grad_norm": 2.2873406410217285, + "learning_rate": 3.218749811936185e-05, + "loss": 0.0119, + "step": 77660 + }, + { + "epoch": 16.020657531278772, + "grad_norm": 0.777873158454895, + "learning_rate": 3.2184489098316754e-05, + "loss": 0.1245, + "step": 77670 + }, + { + "epoch": 16.020711693657585, + "grad_norm": 0.002342251129448414, + "learning_rate": 3.218148007727167e-05, + "loss": 0.033, + "step": 77680 + }, + { + "epoch": 16.020765856036398, + "grad_norm": 0.05376071110367775, + "learning_rate": 3.2178471056226566e-05, + "loss": 0.0315, + "step": 77690 + }, + { + "epoch": 16.020820018415208, + "grad_norm": 0.22862064838409424, + "learning_rate": 3.217546203518147e-05, + "loss": 0.0813, + "step": 77700 + }, + { + "epoch": 16.02087418079402, + "grad_norm": 3.589158773422241, + "learning_rate": 3.2172453014136386e-05, + "loss": 0.1046, + "step": 77710 + }, + { + "epoch": 16.020928343172834, + "grad_norm": 0.06442321091890335, + "learning_rate": 3.216944399309129e-05, + "loss": 0.0814, + "step": 77720 + }, + { + "epoch": 16.020982505551643, + "grad_norm": 1.7614476680755615, + "learning_rate": 3.216643497204619e-05, + "loss": 0.0391, + "step": 77730 + }, + { + "epoch": 16.021036667930456, + "grad_norm": 0.007949411869049072, + "learning_rate": 3.2163425951001105e-05, + "loss": 0.019, + "step": 77740 + }, + { + "epoch": 16.021090830309266, + "grad_norm": 0.3651138246059418, + "learning_rate": 3.216041692995601e-05, + "loss": 0.0261, + "step": 77750 + }, + { + "epoch": 16.02114499268808, + "grad_norm": 0.5087392926216125, + "learning_rate": 3.215740790891091e-05, + "loss": 0.029, + "step": 77760 + }, + { + "epoch": 16.021199155066892, + "grad_norm": 0.3261844515800476, + "learning_rate": 3.2154398887865823e-05, + "loss": 0.036, + "step": 77770 + }, + { + "epoch": 16.0212533174457, + "grad_norm": 0.9975797533988953, + "learning_rate": 3.215138986682073e-05, + "loss": 0.0418, + "step": 77780 + }, + { + "epoch": 16.021307479824515, + "grad_norm": 0.02921379543840885, + "learning_rate": 3.2148380845775636e-05, + "loss": 0.0571, + "step": 77790 + }, + { + "epoch": 16.021361642203324, + "grad_norm": 0.06729515641927719, + "learning_rate": 3.214537182473054e-05, + "loss": 0.0067, + "step": 77800 + }, + { + "epoch": 16.021415804582137, + "grad_norm": 0.03035420924425125, + "learning_rate": 3.214236280368545e-05, + "loss": 0.0322, + "step": 77810 + }, + { + "epoch": 16.02146996696095, + "grad_norm": 2.1002230644226074, + "learning_rate": 3.2139353782640355e-05, + "loss": 0.0471, + "step": 77820 + }, + { + "epoch": 16.02152412933976, + "grad_norm": 2.2027652263641357, + "learning_rate": 3.213634476159527e-05, + "loss": 0.1563, + "step": 77830 + }, + { + "epoch": 16.021578291718573, + "grad_norm": 0.00211180723272264, + "learning_rate": 3.213333574055017e-05, + "loss": 0.0065, + "step": 77840 + }, + { + "epoch": 16.021632454097382, + "grad_norm": 0.004417083691805601, + "learning_rate": 3.213032671950508e-05, + "loss": 0.0669, + "step": 77850 + }, + { + "epoch": 16.021686616476195, + "grad_norm": 0.005625599529594183, + "learning_rate": 3.212731769845999e-05, + "loss": 0.1357, + "step": 
77860 + }, + { + "epoch": 16.02174077885501, + "grad_norm": 0.1367115080356598, + "learning_rate": 3.212430867741489e-05, + "loss": 0.0731, + "step": 77870 + }, + { + "epoch": 16.021794941233818, + "grad_norm": 0.015557024627923965, + "learning_rate": 3.21212996563698e-05, + "loss": 0.0602, + "step": 77880 + }, + { + "epoch": 16.02184910361263, + "grad_norm": 0.16167080402374268, + "learning_rate": 3.2118290635324705e-05, + "loss": 0.0319, + "step": 77890 + }, + { + "epoch": 16.021903265991444, + "grad_norm": 0.0029888562858104706, + "learning_rate": 3.211528161427961e-05, + "loss": 0.0143, + "step": 77900 + }, + { + "epoch": 16.021957428370253, + "grad_norm": 1.479004979133606, + "learning_rate": 3.211227259323452e-05, + "loss": 0.048, + "step": 77910 + }, + { + "epoch": 16.022011590749067, + "grad_norm": 1.8470914363861084, + "learning_rate": 3.2109263572189424e-05, + "loss": 0.0698, + "step": 77920 + }, + { + "epoch": 16.022065753127876, + "grad_norm": 0.0036659543402493, + "learning_rate": 3.210625455114433e-05, + "loss": 0.1314, + "step": 77930 + }, + { + "epoch": 16.02211991550669, + "grad_norm": 0.1193469986319542, + "learning_rate": 3.2103245530099244e-05, + "loss": 0.0593, + "step": 77940 + }, + { + "epoch": 16.022174077885502, + "grad_norm": 0.0025697434321045876, + "learning_rate": 3.210023650905414e-05, + "loss": 0.0537, + "step": 77950 + }, + { + "epoch": 16.02222824026431, + "grad_norm": 1.9301555156707764, + "learning_rate": 3.209722748800905e-05, + "loss": 0.0628, + "step": 77960 + }, + { + "epoch": 16.022282402643125, + "grad_norm": 0.02373787946999073, + "learning_rate": 3.209421846696396e-05, + "loss": 0.0175, + "step": 77970 + }, + { + "epoch": 16.022336565021934, + "grad_norm": 0.002391077345237136, + "learning_rate": 3.209120944591887e-05, + "loss": 0.0808, + "step": 77980 + }, + { + "epoch": 16.022390727400747, + "grad_norm": 4.406161785125732, + "learning_rate": 3.208820042487377e-05, + "loss": 0.0445, + "step": 77990 + }, + { + "epoch": 16.02244488977956, + "grad_norm": 0.09000173211097717, + "learning_rate": 3.208519140382868e-05, + "loss": 0.1256, + "step": 78000 + }, + { + "epoch": 16.02249905215837, + "grad_norm": 1.8106886148452759, + "learning_rate": 3.208218238278359e-05, + "loss": 0.115, + "step": 78010 + }, + { + "epoch": 16.022553214537183, + "grad_norm": 0.0686754435300827, + "learning_rate": 3.2079173361738494e-05, + "loss": 0.0125, + "step": 78020 + }, + { + "epoch": 16.022607376915992, + "grad_norm": 2.3114609718322754, + "learning_rate": 3.20761643406934e-05, + "loss": 0.0386, + "step": 78030 + }, + { + "epoch": 16.022661539294806, + "grad_norm": 0.0024686784017831087, + "learning_rate": 3.2073155319648306e-05, + "loss": 0.0647, + "step": 78040 + }, + { + "epoch": 16.02271570167362, + "grad_norm": 0.8339136242866516, + "learning_rate": 3.207014629860321e-05, + "loss": 0.1422, + "step": 78050 + }, + { + "epoch": 16.022769864052428, + "grad_norm": 0.12472575902938843, + "learning_rate": 3.206713727755812e-05, + "loss": 0.0167, + "step": 78060 + }, + { + "epoch": 16.02282402643124, + "grad_norm": 0.06803996860980988, + "learning_rate": 3.2064128256513025e-05, + "loss": 0.123, + "step": 78070 + }, + { + "epoch": 16.022878188810054, + "grad_norm": 0.43061360716819763, + "learning_rate": 3.206111923546793e-05, + "loss": 0.0267, + "step": 78080 + }, + { + "epoch": 16.022932351188864, + "grad_norm": 0.0052734725177288055, + "learning_rate": 3.2058110214422845e-05, + "loss": 0.0027, + "step": 78090 + }, + { + "epoch": 16.022986513567677, + "grad_norm": 
0.08491503447294235, + "learning_rate": 3.2055101193377744e-05, + "loss": 0.0333, + "step": 78100 + }, + { + "epoch": 16.023040675946486, + "grad_norm": 0.0022498006001114845, + "learning_rate": 3.205209217233266e-05, + "loss": 0.0175, + "step": 78110 + }, + { + "epoch": 16.0230948383253, + "grad_norm": 4.481640338897705, + "learning_rate": 3.204908315128756e-05, + "loss": 0.181, + "step": 78120 + }, + { + "epoch": 16.023149000704112, + "grad_norm": 0.00228111888282001, + "learning_rate": 3.204607413024247e-05, + "loss": 0.0548, + "step": 78130 + }, + { + "epoch": 16.023203163082922, + "grad_norm": 0.5780420899391174, + "learning_rate": 3.2043065109197376e-05, + "loss": 0.0592, + "step": 78140 + }, + { + "epoch": 16.023257325461735, + "grad_norm": 1.4838062524795532, + "learning_rate": 3.204005608815228e-05, + "loss": 0.1015, + "step": 78150 + }, + { + "epoch": 16.023311487840544, + "grad_norm": 0.09153851866722107, + "learning_rate": 3.203704706710719e-05, + "loss": 0.0011, + "step": 78160 + }, + { + "epoch": 16.023365650219358, + "grad_norm": 2.1295104026794434, + "learning_rate": 3.20340380460621e-05, + "loss": 0.0274, + "step": 78170 + }, + { + "epoch": 16.02341981259817, + "grad_norm": 0.08352194726467133, + "learning_rate": 3.2031029025017e-05, + "loss": 0.1044, + "step": 78180 + }, + { + "epoch": 16.02347397497698, + "grad_norm": 0.005999655928462744, + "learning_rate": 3.202802000397191e-05, + "loss": 0.0037, + "step": 78190 + }, + { + "epoch": 16.023528137355793, + "grad_norm": 4.383961200714111, + "learning_rate": 3.202501098292682e-05, + "loss": 0.1173, + "step": 78200 + }, + { + "epoch": 16.023582299734603, + "grad_norm": 2.1743431091308594, + "learning_rate": 3.202200196188172e-05, + "loss": 0.1515, + "step": 78210 + }, + { + "epoch": 16.023636462113416, + "grad_norm": 0.005710158031433821, + "learning_rate": 3.2018992940836626e-05, + "loss": 0.009, + "step": 78220 + }, + { + "epoch": 16.02369062449223, + "grad_norm": 0.014916585758328438, + "learning_rate": 3.201598391979154e-05, + "loss": 0.0795, + "step": 78230 + }, + { + "epoch": 16.02374478687104, + "grad_norm": 0.023256348446011543, + "learning_rate": 3.2012974898746445e-05, + "loss": 0.0331, + "step": 78240 + }, + { + "epoch": 16.02379894924985, + "grad_norm": 0.06007348746061325, + "learning_rate": 3.2009965877701345e-05, + "loss": 0.083, + "step": 78250 + }, + { + "epoch": 16.023853111628664, + "grad_norm": 0.00360553408972919, + "learning_rate": 3.200695685665626e-05, + "loss": 0.1108, + "step": 78260 + }, + { + "epoch": 16.023907274007474, + "grad_norm": 0.008537056855857372, + "learning_rate": 3.2003947835611164e-05, + "loss": 0.0236, + "step": 78270 + }, + { + "epoch": 16.023961436386287, + "grad_norm": 0.5757003426551819, + "learning_rate": 3.200093881456607e-05, + "loss": 0.0251, + "step": 78280 + }, + { + "epoch": 16.024015598765097, + "grad_norm": 28.739452362060547, + "learning_rate": 3.199792979352098e-05, + "loss": 0.2179, + "step": 78290 + }, + { + "epoch": 16.02406976114391, + "grad_norm": 0.3076370060443878, + "learning_rate": 3.199492077247588e-05, + "loss": 0.0527, + "step": 78300 + }, + { + "epoch": 16.024123923522723, + "grad_norm": 0.36270713806152344, + "learning_rate": 3.199191175143079e-05, + "loss": 0.0613, + "step": 78310 + }, + { + "epoch": 16.024178085901532, + "grad_norm": 3.7394611835479736, + "learning_rate": 3.19889027303857e-05, + "loss": 0.0482, + "step": 78320 + }, + { + "epoch": 16.024232248280345, + "grad_norm": 0.01112488005310297, + "learning_rate": 3.19858937093406e-05, + "loss": 
0.0662, + "step": 78330 + }, + { + "epoch": 16.024286410659155, + "grad_norm": 0.032521266490221024, + "learning_rate": 3.198288468829551e-05, + "loss": 0.0885, + "step": 78340 + }, + { + "epoch": 16.024340573037968, + "grad_norm": 0.015073385089635849, + "learning_rate": 3.197987566725042e-05, + "loss": 0.0331, + "step": 78350 + }, + { + "epoch": 16.02439473541678, + "grad_norm": 0.014434810727834702, + "learning_rate": 3.197686664620532e-05, + "loss": 0.1277, + "step": 78360 + }, + { + "epoch": 16.02444889779559, + "grad_norm": 0.12785311043262482, + "learning_rate": 3.1973857625160234e-05, + "loss": 0.0048, + "step": 78370 + }, + { + "epoch": 16.024503060174403, + "grad_norm": 0.005204802379012108, + "learning_rate": 3.197084860411514e-05, + "loss": 0.0282, + "step": 78380 + }, + { + "epoch": 16.024557222553213, + "grad_norm": 0.025242634117603302, + "learning_rate": 3.1967839583070046e-05, + "loss": 0.0453, + "step": 78390 + }, + { + "epoch": 16.024611384932026, + "grad_norm": 0.008601161651313305, + "learning_rate": 3.196483056202495e-05, + "loss": 0.0962, + "step": 78400 + }, + { + "epoch": 16.02466554731084, + "grad_norm": 0.009851180016994476, + "learning_rate": 3.196182154097986e-05, + "loss": 0.0506, + "step": 78410 + }, + { + "epoch": 16.02471970968965, + "grad_norm": 0.0258024949580431, + "learning_rate": 3.1958812519934765e-05, + "loss": 0.0189, + "step": 78420 + }, + { + "epoch": 16.02477387206846, + "grad_norm": 0.007879314944148064, + "learning_rate": 3.195580349888968e-05, + "loss": 0.0592, + "step": 78430 + }, + { + "epoch": 16.024828034447275, + "grad_norm": 0.0408296100795269, + "learning_rate": 3.195279447784458e-05, + "loss": 0.0005, + "step": 78440 + }, + { + "epoch": 16.024882196826084, + "grad_norm": 1.2638980150222778, + "learning_rate": 3.1949785456799484e-05, + "loss": 0.0119, + "step": 78450 + }, + { + "epoch": 16.024936359204897, + "grad_norm": 0.006155128590762615, + "learning_rate": 3.19467764357544e-05, + "loss": 0.127, + "step": 78460 + }, + { + "epoch": 16.024990521583707, + "grad_norm": 0.05352199450135231, + "learning_rate": 3.19437674147093e-05, + "loss": 0.1769, + "step": 78470 + }, + { + "epoch": 16.02500135405947, + "eval_accuracy": 0.8380143696930111, + "eval_loss": 0.792042076587677, + "eval_runtime": 117.1011, + "eval_samples_per_second": 26.148, + "eval_steps_per_second": 3.271, + "step": 78472 + }, + { + "epoch": 17.00004332990305, + "grad_norm": 3.8471481800079346, + "learning_rate": 3.19407583936642e-05, + "loss": 0.1374, + "step": 78480 + }, + { + "epoch": 17.00009749228186, + "grad_norm": 0.02588082291185856, + "learning_rate": 3.1937749372619116e-05, + "loss": 0.0776, + "step": 78490 + }, + { + "epoch": 17.000151654660673, + "grad_norm": 0.014929995872080326, + "learning_rate": 3.193474035157402e-05, + "loss": 0.0219, + "step": 78500 + }, + { + "epoch": 17.000205817039486, + "grad_norm": 0.18777622282505035, + "learning_rate": 3.193173133052892e-05, + "loss": 0.0863, + "step": 78510 + }, + { + "epoch": 17.000259979418296, + "grad_norm": 0.042777761816978455, + "learning_rate": 3.1928722309483835e-05, + "loss": 0.0254, + "step": 78520 + }, + { + "epoch": 17.00031414179711, + "grad_norm": 0.05397890880703926, + "learning_rate": 3.192571328843874e-05, + "loss": 0.0288, + "step": 78530 + }, + { + "epoch": 17.000368304175918, + "grad_norm": 0.03566763922572136, + "learning_rate": 3.192270426739365e-05, + "loss": 0.0059, + "step": 78540 + }, + { + "epoch": 17.00042246655473, + "grad_norm": 0.010758453980088234, + "learning_rate": 
3.1919695246348553e-05, + "loss": 0.0268, + "step": 78550 + }, + { + "epoch": 17.000476628933544, + "grad_norm": 0.1229625791311264, + "learning_rate": 3.191668622530346e-05, + "loss": 0.0051, + "step": 78560 + }, + { + "epoch": 17.000530791312354, + "grad_norm": 10.598708152770996, + "learning_rate": 3.1913677204258366e-05, + "loss": 0.0868, + "step": 78570 + }, + { + "epoch": 17.000584953691167, + "grad_norm": 0.029689310118556023, + "learning_rate": 3.191066818321328e-05, + "loss": 0.1297, + "step": 78580 + }, + { + "epoch": 17.000639116069976, + "grad_norm": 0.03338216617703438, + "learning_rate": 3.190765916216818e-05, + "loss": 0.0131, + "step": 78590 + }, + { + "epoch": 17.00069327844879, + "grad_norm": 3.8384201526641846, + "learning_rate": 3.1904650141123085e-05, + "loss": 0.0797, + "step": 78600 + }, + { + "epoch": 17.000747440827602, + "grad_norm": 0.005107664503157139, + "learning_rate": 3.1901641120078e-05, + "loss": 0.0376, + "step": 78610 + }, + { + "epoch": 17.000801603206412, + "grad_norm": 0.0033561682794243097, + "learning_rate": 3.1898632099032904e-05, + "loss": 0.0886, + "step": 78620 + }, + { + "epoch": 17.000855765585225, + "grad_norm": 0.003770121606066823, + "learning_rate": 3.189562307798781e-05, + "loss": 0.1127, + "step": 78630 + }, + { + "epoch": 17.000909927964035, + "grad_norm": 0.028029469773173332, + "learning_rate": 3.189261405694272e-05, + "loss": 0.0125, + "step": 78640 + }, + { + "epoch": 17.000964090342848, + "grad_norm": 0.4232163727283478, + "learning_rate": 3.188960503589762e-05, + "loss": 0.0303, + "step": 78650 + }, + { + "epoch": 17.00101825272166, + "grad_norm": 0.4412841200828552, + "learning_rate": 3.188659601485253e-05, + "loss": 0.0914, + "step": 78660 + }, + { + "epoch": 17.00107241510047, + "grad_norm": 4.845357894897461, + "learning_rate": 3.1883586993807436e-05, + "loss": 0.2001, + "step": 78670 + }, + { + "epoch": 17.001126577479283, + "grad_norm": 2.3324241638183594, + "learning_rate": 3.188057797276234e-05, + "loss": 0.188, + "step": 78680 + }, + { + "epoch": 17.001180739858096, + "grad_norm": 0.04678063094615936, + "learning_rate": 3.1877568951717255e-05, + "loss": 0.0481, + "step": 78690 + }, + { + "epoch": 17.001234902236906, + "grad_norm": 4.055030822753906, + "learning_rate": 3.1874559930672154e-05, + "loss": 0.0833, + "step": 78700 + }, + { + "epoch": 17.00128906461572, + "grad_norm": 0.0573616661131382, + "learning_rate": 3.187155090962706e-05, + "loss": 0.0196, + "step": 78710 + }, + { + "epoch": 17.00134322699453, + "grad_norm": 13.591536521911621, + "learning_rate": 3.1868541888581974e-05, + "loss": 0.1115, + "step": 78720 + }, + { + "epoch": 17.00139738937334, + "grad_norm": 0.25314104557037354, + "learning_rate": 3.186553286753688e-05, + "loss": 0.0475, + "step": 78730 + }, + { + "epoch": 17.001451551752155, + "grad_norm": 0.010968795046210289, + "learning_rate": 3.186252384649178e-05, + "loss": 0.0316, + "step": 78740 + }, + { + "epoch": 17.001505714130964, + "grad_norm": 4.183740139007568, + "learning_rate": 3.185951482544669e-05, + "loss": 0.1321, + "step": 78750 + }, + { + "epoch": 17.001559876509777, + "grad_norm": 0.06550271064043045, + "learning_rate": 3.18565058044016e-05, + "loss": 0.0062, + "step": 78760 + }, + { + "epoch": 17.001614038888587, + "grad_norm": 0.06460154056549072, + "learning_rate": 3.1853496783356505e-05, + "loss": 0.0509, + "step": 78770 + }, + { + "epoch": 17.0016682012674, + "grad_norm": 0.007842997089028358, + "learning_rate": 3.185048776231141e-05, + "loss": 0.1054, + "step": 78780 + }, + { + 
"epoch": 17.001722363646213, + "grad_norm": 0.00602624611929059, + "learning_rate": 3.184747874126632e-05, + "loss": 0.0588, + "step": 78790 + }, + { + "epoch": 17.001776526025022, + "grad_norm": 0.07578565925359726, + "learning_rate": 3.1844469720221224e-05, + "loss": 0.0268, + "step": 78800 + }, + { + "epoch": 17.001830688403835, + "grad_norm": 4.69975471496582, + "learning_rate": 3.184146069917613e-05, + "loss": 0.0781, + "step": 78810 + }, + { + "epoch": 17.001884850782645, + "grad_norm": 0.7949283719062805, + "learning_rate": 3.1838451678131036e-05, + "loss": 0.0579, + "step": 78820 + }, + { + "epoch": 17.001939013161458, + "grad_norm": 6.79856014251709, + "learning_rate": 3.183544265708594e-05, + "loss": 0.0949, + "step": 78830 + }, + { + "epoch": 17.00199317554027, + "grad_norm": 0.03133796900510788, + "learning_rate": 3.1832433636040856e-05, + "loss": 0.0549, + "step": 78840 + }, + { + "epoch": 17.00204733791908, + "grad_norm": 0.10713871568441391, + "learning_rate": 3.1829424614995755e-05, + "loss": 0.0506, + "step": 78850 + }, + { + "epoch": 17.002101500297893, + "grad_norm": 0.26120525598526, + "learning_rate": 3.182641559395066e-05, + "loss": 0.0319, + "step": 78860 + }, + { + "epoch": 17.002155662676707, + "grad_norm": 0.005759132094681263, + "learning_rate": 3.1823406572905575e-05, + "loss": 0.0444, + "step": 78870 + }, + { + "epoch": 17.002209825055516, + "grad_norm": 0.003834006143733859, + "learning_rate": 3.182039755186048e-05, + "loss": 0.0967, + "step": 78880 + }, + { + "epoch": 17.00226398743433, + "grad_norm": 0.03100603073835373, + "learning_rate": 3.181738853081539e-05, + "loss": 0.0056, + "step": 78890 + }, + { + "epoch": 17.00231814981314, + "grad_norm": 0.032688483595848083, + "learning_rate": 3.1814379509770293e-05, + "loss": 0.0045, + "step": 78900 + }, + { + "epoch": 17.00237231219195, + "grad_norm": 1.8076096773147583, + "learning_rate": 3.18113704887252e-05, + "loss": 0.0794, + "step": 78910 + }, + { + "epoch": 17.002426474570765, + "grad_norm": 0.013742816634476185, + "learning_rate": 3.1808361467680106e-05, + "loss": 0.0608, + "step": 78920 + }, + { + "epoch": 17.002480636949574, + "grad_norm": 0.00421794131398201, + "learning_rate": 3.180535244663501e-05, + "loss": 0.0472, + "step": 78930 + }, + { + "epoch": 17.002534799328387, + "grad_norm": 0.5876071453094482, + "learning_rate": 3.180234342558992e-05, + "loss": 0.0678, + "step": 78940 + }, + { + "epoch": 17.002588961707197, + "grad_norm": 0.0829569399356842, + "learning_rate": 3.179933440454483e-05, + "loss": 0.0691, + "step": 78950 + }, + { + "epoch": 17.00264312408601, + "grad_norm": 0.04085131362080574, + "learning_rate": 3.179632538349973e-05, + "loss": 0.0108, + "step": 78960 + }, + { + "epoch": 17.002697286464823, + "grad_norm": 0.003087271237745881, + "learning_rate": 3.179331636245464e-05, + "loss": 0.0558, + "step": 78970 + }, + { + "epoch": 17.002751448843632, + "grad_norm": 0.12451189756393433, + "learning_rate": 3.179030734140955e-05, + "loss": 0.0167, + "step": 78980 + }, + { + "epoch": 17.002805611222445, + "grad_norm": 0.05482170730829239, + "learning_rate": 3.178729832036446e-05, + "loss": 0.0623, + "step": 78990 + }, + { + "epoch": 17.002859773601255, + "grad_norm": 2.5968384742736816, + "learning_rate": 3.1784289299319356e-05, + "loss": 0.0357, + "step": 79000 + }, + { + "epoch": 17.002913935980068, + "grad_norm": 0.035526763647794724, + "learning_rate": 3.178128027827427e-05, + "loss": 0.0619, + "step": 79010 + }, + { + "epoch": 17.00296809835888, + "grad_norm": 2.3385801315307617, + 
"learning_rate": 3.1778271257229175e-05, + "loss": 0.0785, + "step": 79020 + }, + { + "epoch": 17.00302226073769, + "grad_norm": 2.380733013153076, + "learning_rate": 3.177526223618408e-05, + "loss": 0.015, + "step": 79030 + }, + { + "epoch": 17.003076423116504, + "grad_norm": 0.38333526253700256, + "learning_rate": 3.177225321513899e-05, + "loss": 0.0437, + "step": 79040 + }, + { + "epoch": 17.003130585495313, + "grad_norm": 0.3378598988056183, + "learning_rate": 3.1769244194093894e-05, + "loss": 0.0623, + "step": 79050 + }, + { + "epoch": 17.003184747874126, + "grad_norm": 0.002629409311339259, + "learning_rate": 3.17662351730488e-05, + "loss": 0.0025, + "step": 79060 + }, + { + "epoch": 17.00323891025294, + "grad_norm": 1.7412015199661255, + "learning_rate": 3.1763226152003714e-05, + "loss": 0.059, + "step": 79070 + }, + { + "epoch": 17.00329307263175, + "grad_norm": 0.005883910693228245, + "learning_rate": 3.176021713095861e-05, + "loss": 0.1213, + "step": 79080 + }, + { + "epoch": 17.003347235010562, + "grad_norm": 0.003141030203551054, + "learning_rate": 3.175720810991352e-05, + "loss": 0.1202, + "step": 79090 + }, + { + "epoch": 17.003401397389375, + "grad_norm": 0.00805387832224369, + "learning_rate": 3.175419908886843e-05, + "loss": 0.0981, + "step": 79100 + }, + { + "epoch": 17.003455559768184, + "grad_norm": 0.012353452853858471, + "learning_rate": 3.175119006782333e-05, + "loss": 0.1697, + "step": 79110 + }, + { + "epoch": 17.003509722146998, + "grad_norm": 0.03578873723745346, + "learning_rate": 3.174818104677824e-05, + "loss": 0.0497, + "step": 79120 + }, + { + "epoch": 17.003563884525807, + "grad_norm": 0.004669608548283577, + "learning_rate": 3.174517202573315e-05, + "loss": 0.0043, + "step": 79130 + }, + { + "epoch": 17.00361804690462, + "grad_norm": 0.3552914261817932, + "learning_rate": 3.174216300468806e-05, + "loss": 0.017, + "step": 79140 + }, + { + "epoch": 17.003672209283433, + "grad_norm": 0.0033941338770091534, + "learning_rate": 3.1739153983642964e-05, + "loss": 0.0241, + "step": 79150 + }, + { + "epoch": 17.003726371662243, + "grad_norm": 0.010064552538096905, + "learning_rate": 3.173614496259787e-05, + "loss": 0.0785, + "step": 79160 + }, + { + "epoch": 17.003780534041056, + "grad_norm": 0.08757494390010834, + "learning_rate": 3.1733135941552776e-05, + "loss": 0.0068, + "step": 79170 + }, + { + "epoch": 17.003834696419865, + "grad_norm": 0.0056792027316987514, + "learning_rate": 3.173012692050768e-05, + "loss": 0.0793, + "step": 79180 + }, + { + "epoch": 17.00388885879868, + "grad_norm": 3.941094398498535, + "learning_rate": 3.172711789946259e-05, + "loss": 0.1105, + "step": 79190 + }, + { + "epoch": 17.00394302117749, + "grad_norm": 0.3853471875190735, + "learning_rate": 3.1724108878417495e-05, + "loss": 0.0315, + "step": 79200 + }, + { + "epoch": 17.0039971835563, + "grad_norm": 0.14166326820850372, + "learning_rate": 3.172109985737241e-05, + "loss": 0.0496, + "step": 79210 + }, + { + "epoch": 17.004051345935114, + "grad_norm": 2.893752336502075, + "learning_rate": 3.1718090836327315e-05, + "loss": 0.1421, + "step": 79220 + }, + { + "epoch": 17.004105508313923, + "grad_norm": 0.34060701727867126, + "learning_rate": 3.1715081815282214e-05, + "loss": 0.0521, + "step": 79230 + }, + { + "epoch": 17.004159670692736, + "grad_norm": 0.004210758954286575, + "learning_rate": 3.171207279423713e-05, + "loss": 0.0037, + "step": 79240 + }, + { + "epoch": 17.00421383307155, + "grad_norm": 0.22968260943889618, + "learning_rate": 3.170906377319203e-05, + "loss": 0.0645, + 
"step": 79250 + }, + { + "epoch": 17.00426799545036, + "grad_norm": 0.013533082790672779, + "learning_rate": 3.170605475214693e-05, + "loss": 0.0735, + "step": 79260 + }, + { + "epoch": 17.004322157829172, + "grad_norm": 0.030730728060007095, + "learning_rate": 3.1703045731101846e-05, + "loss": 0.0528, + "step": 79270 + }, + { + "epoch": 17.004376320207985, + "grad_norm": 0.0826261043548584, + "learning_rate": 3.170003671005675e-05, + "loss": 0.0402, + "step": 79280 + }, + { + "epoch": 17.004430482586795, + "grad_norm": 0.004895021207630634, + "learning_rate": 3.169702768901166e-05, + "loss": 0.0549, + "step": 79290 + }, + { + "epoch": 17.004484644965608, + "grad_norm": 0.04312998801469803, + "learning_rate": 3.1694018667966565e-05, + "loss": 0.0459, + "step": 79300 + }, + { + "epoch": 17.004538807344417, + "grad_norm": 0.5580698251724243, + "learning_rate": 3.169100964692147e-05, + "loss": 0.0164, + "step": 79310 + }, + { + "epoch": 17.00459296972323, + "grad_norm": 0.06515784561634064, + "learning_rate": 3.168800062587638e-05, + "loss": 0.0049, + "step": 79320 + }, + { + "epoch": 17.004647132102043, + "grad_norm": 0.003456731094047427, + "learning_rate": 3.168499160483129e-05, + "loss": 0.0266, + "step": 79330 + }, + { + "epoch": 17.004701294480853, + "grad_norm": 0.0039411261677742004, + "learning_rate": 3.168198258378619e-05, + "loss": 0.0352, + "step": 79340 + }, + { + "epoch": 17.004755456859666, + "grad_norm": 0.07694245874881744, + "learning_rate": 3.1678973562741096e-05, + "loss": 0.0169, + "step": 79350 + }, + { + "epoch": 17.004809619238475, + "grad_norm": 0.0030374350026249886, + "learning_rate": 3.167596454169601e-05, + "loss": 0.0598, + "step": 79360 + }, + { + "epoch": 17.00486378161729, + "grad_norm": 0.0028399378061294556, + "learning_rate": 3.1672955520650915e-05, + "loss": 0.0005, + "step": 79370 + }, + { + "epoch": 17.0049179439961, + "grad_norm": 22.544225692749023, + "learning_rate": 3.1669946499605815e-05, + "loss": 0.0856, + "step": 79380 + }, + { + "epoch": 17.00497210637491, + "grad_norm": 0.6785149574279785, + "learning_rate": 3.166693747856073e-05, + "loss": 0.096, + "step": 79390 + }, + { + "epoch": 17.005026268753724, + "grad_norm": 0.2125740647315979, + "learning_rate": 3.1663928457515634e-05, + "loss": 0.0123, + "step": 79400 + }, + { + "epoch": 17.005080431132534, + "grad_norm": 0.002773706568405032, + "learning_rate": 3.166091943647054e-05, + "loss": 0.0653, + "step": 79410 + }, + { + "epoch": 17.005134593511347, + "grad_norm": 1.552103042602539, + "learning_rate": 3.165791041542545e-05, + "loss": 0.0455, + "step": 79420 + }, + { + "epoch": 17.00518875589016, + "grad_norm": 0.05703600496053696, + "learning_rate": 3.165490139438035e-05, + "loss": 0.0101, + "step": 79430 + }, + { + "epoch": 17.00524291826897, + "grad_norm": 2.954023599624634, + "learning_rate": 3.165189237333526e-05, + "loss": 0.1184, + "step": 79440 + }, + { + "epoch": 17.005297080647782, + "grad_norm": 0.06670890003442764, + "learning_rate": 3.1648883352290166e-05, + "loss": 0.0644, + "step": 79450 + }, + { + "epoch": 17.005351243026595, + "grad_norm": 0.005623277276754379, + "learning_rate": 3.164587433124507e-05, + "loss": 0.0758, + "step": 79460 + }, + { + "epoch": 17.005405405405405, + "grad_norm": 1.1799075603485107, + "learning_rate": 3.1642865310199985e-05, + "loss": 0.0617, + "step": 79470 + }, + { + "epoch": 17.005459567784218, + "grad_norm": 24.176189422607422, + "learning_rate": 3.163985628915489e-05, + "loss": 0.0357, + "step": 79480 + }, + { + "epoch": 17.005513730163027, + 
"grad_norm": 0.07041531801223755, + "learning_rate": 3.163684726810979e-05, + "loss": 0.1268, + "step": 79490 + }, + { + "epoch": 17.00556789254184, + "grad_norm": 1.594749927520752, + "learning_rate": 3.1633838247064704e-05, + "loss": 0.0382, + "step": 79500 + }, + { + "epoch": 17.005622054920654, + "grad_norm": 0.027822362259030342, + "learning_rate": 3.163082922601961e-05, + "loss": 0.056, + "step": 79510 + }, + { + "epoch": 17.005676217299463, + "grad_norm": 1.028234839439392, + "learning_rate": 3.1627820204974516e-05, + "loss": 0.1247, + "step": 79520 + }, + { + "epoch": 17.005730379678276, + "grad_norm": 0.03273740038275719, + "learning_rate": 3.162481118392942e-05, + "loss": 0.0268, + "step": 79530 + }, + { + "epoch": 17.005784542057086, + "grad_norm": 0.013492719270288944, + "learning_rate": 3.162180216288433e-05, + "loss": 0.0025, + "step": 79540 + }, + { + "epoch": 17.0058387044359, + "grad_norm": 0.061437029391527176, + "learning_rate": 3.1618793141839235e-05, + "loss": 0.0568, + "step": 79550 + }, + { + "epoch": 17.005892866814712, + "grad_norm": 1.833286166191101, + "learning_rate": 3.161578412079414e-05, + "loss": 0.0815, + "step": 79560 + }, + { + "epoch": 17.00594702919352, + "grad_norm": 0.0036590818781405687, + "learning_rate": 3.161277509974905e-05, + "loss": 0.0026, + "step": 79570 + }, + { + "epoch": 17.006001191572334, + "grad_norm": 0.862637996673584, + "learning_rate": 3.1609766078703954e-05, + "loss": 0.006, + "step": 79580 + }, + { + "epoch": 17.006055353951144, + "grad_norm": 0.034654684364795685, + "learning_rate": 3.160675705765887e-05, + "loss": 0.0669, + "step": 79590 + }, + { + "epoch": 17.006109516329957, + "grad_norm": 2.463503122329712, + "learning_rate": 3.1603748036613767e-05, + "loss": 0.1131, + "step": 79600 + }, + { + "epoch": 17.00616367870877, + "grad_norm": 0.8279139995574951, + "learning_rate": 3.160073901556867e-05, + "loss": 0.1568, + "step": 79610 + }, + { + "epoch": 17.00621784108758, + "grad_norm": 1.2893774509429932, + "learning_rate": 3.1597729994523586e-05, + "loss": 0.0871, + "step": 79620 + }, + { + "epoch": 17.006272003466393, + "grad_norm": 0.003709252458065748, + "learning_rate": 3.159472097347849e-05, + "loss": 0.1255, + "step": 79630 + }, + { + "epoch": 17.006326165845206, + "grad_norm": 0.08878220617771149, + "learning_rate": 3.159171195243339e-05, + "loss": 0.0017, + "step": 79640 + }, + { + "epoch": 17.006380328224015, + "grad_norm": 0.1592489331960678, + "learning_rate": 3.1588702931388305e-05, + "loss": 0.0625, + "step": 79650 + }, + { + "epoch": 17.006434490602828, + "grad_norm": 0.003974318504333496, + "learning_rate": 3.158569391034321e-05, + "loss": 0.0825, + "step": 79660 + }, + { + "epoch": 17.006488652981638, + "grad_norm": 0.003748390357941389, + "learning_rate": 3.158268488929812e-05, + "loss": 0.0213, + "step": 79670 + }, + { + "epoch": 17.00654281536045, + "grad_norm": 0.5965576171875, + "learning_rate": 3.1579675868253023e-05, + "loss": 0.0308, + "step": 79680 + }, + { + "epoch": 17.006596977739264, + "grad_norm": 1.193915605545044, + "learning_rate": 3.157666684720793e-05, + "loss": 0.0241, + "step": 79690 + }, + { + "epoch": 17.006651140118073, + "grad_norm": 0.12906219065189362, + "learning_rate": 3.1573657826162836e-05, + "loss": 0.0519, + "step": 79700 + }, + { + "epoch": 17.006705302496886, + "grad_norm": 0.0030719346832484007, + "learning_rate": 3.157064880511774e-05, + "loss": 0.053, + "step": 79710 + }, + { + "epoch": 17.006759464875696, + "grad_norm": 0.11150621622800827, + "learning_rate": 
3.156763978407265e-05, + "loss": 0.0111, + "step": 79720 + }, + { + "epoch": 17.00681362725451, + "grad_norm": 0.00592014892026782, + "learning_rate": 3.156463076302756e-05, + "loss": 0.1185, + "step": 79730 + }, + { + "epoch": 17.006867789633322, + "grad_norm": 0.0035410274285823107, + "learning_rate": 3.156162174198247e-05, + "loss": 0.0816, + "step": 79740 + }, + { + "epoch": 17.00692195201213, + "grad_norm": 0.004044313449412584, + "learning_rate": 3.155861272093737e-05, + "loss": 0.0026, + "step": 79750 + }, + { + "epoch": 17.006976114390945, + "grad_norm": 0.058433614671230316, + "learning_rate": 3.155560369989228e-05, + "loss": 0.0843, + "step": 79760 + }, + { + "epoch": 17.007030276769754, + "grad_norm": 0.0039308988489210606, + "learning_rate": 3.155259467884719e-05, + "loss": 0.0008, + "step": 79770 + }, + { + "epoch": 17.007084439148567, + "grad_norm": 0.6496865153312683, + "learning_rate": 3.154958565780209e-05, + "loss": 0.0573, + "step": 79780 + }, + { + "epoch": 17.00713860152738, + "grad_norm": 0.0036397273652255535, + "learning_rate": 3.1546576636757e-05, + "loss": 0.0033, + "step": 79790 + }, + { + "epoch": 17.00719276390619, + "grad_norm": 0.0033300002105534077, + "learning_rate": 3.1543567615711906e-05, + "loss": 0.0596, + "step": 79800 + }, + { + "epoch": 17.007246926285003, + "grad_norm": 0.04647617042064667, + "learning_rate": 3.154055859466681e-05, + "loss": 0.0399, + "step": 79810 + }, + { + "epoch": 17.007301088663816, + "grad_norm": 2.945808172225952, + "learning_rate": 3.1537549573621725e-05, + "loss": 0.108, + "step": 79820 + }, + { + "epoch": 17.007355251042625, + "grad_norm": 0.1102890819311142, + "learning_rate": 3.1534540552576624e-05, + "loss": 0.106, + "step": 79830 + }, + { + "epoch": 17.00740941342144, + "grad_norm": 0.06666342169046402, + "learning_rate": 3.153153153153153e-05, + "loss": 0.0042, + "step": 79840 + }, + { + "epoch": 17.007463575800248, + "grad_norm": 0.1363012045621872, + "learning_rate": 3.1528522510486444e-05, + "loss": 0.0065, + "step": 79850 + }, + { + "epoch": 17.00751773817906, + "grad_norm": 0.40778765082359314, + "learning_rate": 3.152551348944134e-05, + "loss": 0.1181, + "step": 79860 + }, + { + "epoch": 17.007571900557874, + "grad_norm": 0.010136300697922707, + "learning_rate": 3.152250446839625e-05, + "loss": 0.0493, + "step": 79870 + }, + { + "epoch": 17.007626062936684, + "grad_norm": 0.007602451369166374, + "learning_rate": 3.151949544735116e-05, + "loss": 0.3197, + "step": 79880 + }, + { + "epoch": 17.007680225315497, + "grad_norm": 0.5789742469787598, + "learning_rate": 3.151648642630607e-05, + "loss": 0.0123, + "step": 79890 + }, + { + "epoch": 17.007734387694306, + "grad_norm": 0.012702579610049725, + "learning_rate": 3.151347740526097e-05, + "loss": 0.003, + "step": 79900 + }, + { + "epoch": 17.00778855007312, + "grad_norm": 0.04820021241903305, + "learning_rate": 3.151046838421588e-05, + "loss": 0.0029, + "step": 79910 + }, + { + "epoch": 17.007842712451932, + "grad_norm": 0.005732425022870302, + "learning_rate": 3.150745936317079e-05, + "loss": 0.1173, + "step": 79920 + }, + { + "epoch": 17.00789687483074, + "grad_norm": 0.1282171756029129, + "learning_rate": 3.1504450342125694e-05, + "loss": 0.0397, + "step": 79930 + }, + { + "epoch": 17.007951037209555, + "grad_norm": 0.0030776297207921743, + "learning_rate": 3.15014413210806e-05, + "loss": 0.0309, + "step": 79940 + }, + { + "epoch": 17.008005199588364, + "grad_norm": 0.02071268856525421, + "learning_rate": 3.1498432300035506e-05, + "loss": 0.0074, + "step": 79950 
+ }, + { + "epoch": 17.008059361967177, + "grad_norm": 0.002656232099980116, + "learning_rate": 3.149542327899041e-05, + "loss": 0.0029, + "step": 79960 + }, + { + "epoch": 17.00811352434599, + "grad_norm": 0.0024770519230514765, + "learning_rate": 3.1492414257945326e-05, + "loss": 0.0125, + "step": 79970 + }, + { + "epoch": 17.0081676867248, + "grad_norm": 2.1932849884033203, + "learning_rate": 3.1489405236900225e-05, + "loss": 0.0526, + "step": 79980 + }, + { + "epoch": 17.008221849103613, + "grad_norm": 0.04902574047446251, + "learning_rate": 3.148639621585514e-05, + "loss": 0.108, + "step": 79990 + }, + { + "epoch": 17.008276011482426, + "grad_norm": 0.09089544415473938, + "learning_rate": 3.1483387194810045e-05, + "loss": 0.0064, + "step": 80000 + }, + { + "epoch": 17.008330173861236, + "grad_norm": 0.13761106133460999, + "learning_rate": 3.1480378173764944e-05, + "loss": 0.075, + "step": 80010 + }, + { + "epoch": 17.00838433624005, + "grad_norm": 0.003694425104185939, + "learning_rate": 3.147736915271986e-05, + "loss": 0.1103, + "step": 80020 + }, + { + "epoch": 17.008438498618858, + "grad_norm": 1.0693542957305908, + "learning_rate": 3.1474360131674763e-05, + "loss": 0.0029, + "step": 80030 + }, + { + "epoch": 17.00849266099767, + "grad_norm": 0.10430699586868286, + "learning_rate": 3.147135111062967e-05, + "loss": 0.003, + "step": 80040 + }, + { + "epoch": 17.008546823376484, + "grad_norm": 1.4460246562957764, + "learning_rate": 3.1468342089584576e-05, + "loss": 0.0449, + "step": 80050 + }, + { + "epoch": 17.008600985755294, + "grad_norm": 0.10495376586914062, + "learning_rate": 3.146533306853948e-05, + "loss": 0.07, + "step": 80060 + }, + { + "epoch": 17.008655148134107, + "grad_norm": 0.00579857686534524, + "learning_rate": 3.146232404749439e-05, + "loss": 0.0227, + "step": 80070 + }, + { + "epoch": 17.008709310512916, + "grad_norm": 0.0025820627342909575, + "learning_rate": 3.14593150264493e-05, + "loss": 0.1024, + "step": 80080 + }, + { + "epoch": 17.00876347289173, + "grad_norm": 0.21210691332817078, + "learning_rate": 3.14563060054042e-05, + "loss": 0.0102, + "step": 80090 + }, + { + "epoch": 17.008817635270542, + "grad_norm": 0.0026698037981987, + "learning_rate": 3.145329698435911e-05, + "loss": 0.057, + "step": 80100 + }, + { + "epoch": 17.008871797649352, + "grad_norm": 0.0025882828049361706, + "learning_rate": 3.145028796331402e-05, + "loss": 0.0359, + "step": 80110 + }, + { + "epoch": 17.008925960028165, + "grad_norm": 0.42188993096351624, + "learning_rate": 3.144727894226893e-05, + "loss": 0.0539, + "step": 80120 + }, + { + "epoch": 17.008980122406975, + "grad_norm": 0.14205272495746613, + "learning_rate": 3.1444269921223826e-05, + "loss": 0.0079, + "step": 80130 + }, + { + "epoch": 17.009034284785788, + "grad_norm": 0.06725367903709412, + "learning_rate": 3.144126090017874e-05, + "loss": 0.001, + "step": 80140 + }, + { + "epoch": 17.0090884471646, + "grad_norm": 0.0022304817102849483, + "learning_rate": 3.1438251879133646e-05, + "loss": 0.0143, + "step": 80150 + }, + { + "epoch": 17.00914260954341, + "grad_norm": 0.049066320061683655, + "learning_rate": 3.1435242858088545e-05, + "loss": 0.021, + "step": 80160 + }, + { + "epoch": 17.009196771922223, + "grad_norm": 0.0022316896356642246, + "learning_rate": 3.143223383704346e-05, + "loss": 0.1354, + "step": 80170 + }, + { + "epoch": 17.009250934301033, + "grad_norm": 0.39032477140426636, + "learning_rate": 3.1429224815998364e-05, + "loss": 0.0655, + "step": 80180 + }, + { + "epoch": 17.009305096679846, + "grad_norm": 
0.04689328372478485, + "learning_rate": 3.142621579495327e-05, + "loss": 0.1022, + "step": 80190 + }, + { + "epoch": 17.00935925905866, + "grad_norm": 0.0051420824602246284, + "learning_rate": 3.142320677390818e-05, + "loss": 0.0412, + "step": 80200 + }, + { + "epoch": 17.00941342143747, + "grad_norm": 0.0023351318668574095, + "learning_rate": 3.142019775286308e-05, + "loss": 0.0584, + "step": 80210 + }, + { + "epoch": 17.00946758381628, + "grad_norm": 0.062164779752492905, + "learning_rate": 3.141718873181799e-05, + "loss": 0.0883, + "step": 80220 + }, + { + "epoch": 17.009521746195094, + "grad_norm": 0.002343448344618082, + "learning_rate": 3.14141797107729e-05, + "loss": 0.0032, + "step": 80230 + }, + { + "epoch": 17.009575908573904, + "grad_norm": 0.121719129383564, + "learning_rate": 3.14111706897278e-05, + "loss": 0.0009, + "step": 80240 + }, + { + "epoch": 17.009630070952717, + "grad_norm": 0.9510889649391174, + "learning_rate": 3.1408161668682715e-05, + "loss": 0.1139, + "step": 80250 + }, + { + "epoch": 17.009684233331527, + "grad_norm": 0.2347315549850464, + "learning_rate": 3.140515264763762e-05, + "loss": 0.0686, + "step": 80260 + }, + { + "epoch": 17.00973839571034, + "grad_norm": 0.6910862922668457, + "learning_rate": 3.140214362659253e-05, + "loss": 0.0218, + "step": 80270 + }, + { + "epoch": 17.009792558089153, + "grad_norm": 1.8659451007843018, + "learning_rate": 3.1399134605547434e-05, + "loss": 0.1267, + "step": 80280 + }, + { + "epoch": 17.009846720467962, + "grad_norm": 2.4139461517333984, + "learning_rate": 3.139612558450234e-05, + "loss": 0.1337, + "step": 80290 + }, + { + "epoch": 17.009900882846775, + "grad_norm": 3.332608938217163, + "learning_rate": 3.1393116563457246e-05, + "loss": 0.0754, + "step": 80300 + }, + { + "epoch": 17.009955045225585, + "grad_norm": 0.002963629085570574, + "learning_rate": 3.139010754241215e-05, + "loss": 0.0762, + "step": 80310 + }, + { + "epoch": 17.010009207604398, + "grad_norm": 0.6341912746429443, + "learning_rate": 3.138709852136706e-05, + "loss": 0.0479, + "step": 80320 + }, + { + "epoch": 17.01006336998321, + "grad_norm": 0.002996433060616255, + "learning_rate": 3.1384089500321965e-05, + "loss": 0.1486, + "step": 80330 + }, + { + "epoch": 17.01011753236202, + "grad_norm": 0.09359271079301834, + "learning_rate": 3.138108047927688e-05, + "loss": 0.0496, + "step": 80340 + }, + { + "epoch": 17.010171694740833, + "grad_norm": 0.6051295399665833, + "learning_rate": 3.137807145823178e-05, + "loss": 0.0293, + "step": 80350 + }, + { + "epoch": 17.010225857119643, + "grad_norm": 0.04835955798625946, + "learning_rate": 3.1375062437186684e-05, + "loss": 0.081, + "step": 80360 + }, + { + "epoch": 17.010280019498456, + "grad_norm": 0.0030102403834462166, + "learning_rate": 3.13720534161416e-05, + "loss": 0.1012, + "step": 80370 + }, + { + "epoch": 17.01033418187727, + "grad_norm": 0.34785470366477966, + "learning_rate": 3.13690443950965e-05, + "loss": 0.0266, + "step": 80380 + }, + { + "epoch": 17.01038834425608, + "grad_norm": 0.2996721863746643, + "learning_rate": 3.13660353740514e-05, + "loss": 0.1616, + "step": 80390 + }, + { + "epoch": 17.01044250663489, + "grad_norm": 2.18086576461792, + "learning_rate": 3.1363026353006316e-05, + "loss": 0.0703, + "step": 80400 + }, + { + "epoch": 17.010496669013705, + "grad_norm": 0.0934981033205986, + "learning_rate": 3.136001733196122e-05, + "loss": 0.0524, + "step": 80410 + }, + { + "epoch": 17.010550831392514, + "grad_norm": 0.36713099479675293, + "learning_rate": 3.135700831091613e-05, + "loss": 
0.0739, + "step": 80420 + }, + { + "epoch": 17.010604993771327, + "grad_norm": 0.6447151899337769, + "learning_rate": 3.1353999289871035e-05, + "loss": 0.0064, + "step": 80430 + }, + { + "epoch": 17.010659156150137, + "grad_norm": 2.0834832191467285, + "learning_rate": 3.135099026882594e-05, + "loss": 0.0261, + "step": 80440 + }, + { + "epoch": 17.01071331852895, + "grad_norm": 0.00575697747990489, + "learning_rate": 3.134798124778085e-05, + "loss": 0.1477, + "step": 80450 + }, + { + "epoch": 17.010767480907763, + "grad_norm": 2.9022552967071533, + "learning_rate": 3.1344972226735754e-05, + "loss": 0.0982, + "step": 80460 + }, + { + "epoch": 17.010821643286572, + "grad_norm": 2.644228219985962, + "learning_rate": 3.134196320569066e-05, + "loss": 0.1809, + "step": 80470 + }, + { + "epoch": 17.010875805665385, + "grad_norm": 1.2452754974365234, + "learning_rate": 3.1338954184645566e-05, + "loss": 0.0945, + "step": 80480 + }, + { + "epoch": 17.010929968044195, + "grad_norm": 0.009227215312421322, + "learning_rate": 3.133594516360048e-05, + "loss": 0.0838, + "step": 80490 + }, + { + "epoch": 17.010984130423008, + "grad_norm": 0.11211829632520676, + "learning_rate": 3.133293614255538e-05, + "loss": 0.0468, + "step": 80500 + }, + { + "epoch": 17.01103829280182, + "grad_norm": 0.5094943046569824, + "learning_rate": 3.132992712151029e-05, + "loss": 0.1299, + "step": 80510 + }, + { + "epoch": 17.01109245518063, + "grad_norm": 0.06843312829732895, + "learning_rate": 3.13269181004652e-05, + "loss": 0.0501, + "step": 80520 + }, + { + "epoch": 17.011146617559444, + "grad_norm": 0.030051499605178833, + "learning_rate": 3.1323909079420104e-05, + "loss": 0.0921, + "step": 80530 + }, + { + "epoch": 17.011200779938253, + "grad_norm": 0.054309993982315063, + "learning_rate": 3.132090005837501e-05, + "loss": 0.0201, + "step": 80540 + }, + { + "epoch": 17.011254942317066, + "grad_norm": 0.035278912633657455, + "learning_rate": 3.131789103732992e-05, + "loss": 0.0256, + "step": 80550 + }, + { + "epoch": 17.01130910469588, + "grad_norm": 0.047698915004730225, + "learning_rate": 3.131488201628482e-05, + "loss": 0.0661, + "step": 80560 + }, + { + "epoch": 17.01136326707469, + "grad_norm": 1.0215471982955933, + "learning_rate": 3.131187299523973e-05, + "loss": 0.0537, + "step": 80570 + }, + { + "epoch": 17.011417429453502, + "grad_norm": 0.012123003602027893, + "learning_rate": 3.1308863974194636e-05, + "loss": 0.027, + "step": 80580 + }, + { + "epoch": 17.011471591832315, + "grad_norm": 0.0124366981908679, + "learning_rate": 3.130585495314954e-05, + "loss": 0.0566, + "step": 80590 + }, + { + "epoch": 17.011525754211124, + "grad_norm": 0.7764852046966553, + "learning_rate": 3.1302845932104455e-05, + "loss": 0.2078, + "step": 80600 + }, + { + "epoch": 17.011579916589938, + "grad_norm": 1.6436822414398193, + "learning_rate": 3.1299836911059354e-05, + "loss": 0.0469, + "step": 80610 + }, + { + "epoch": 17.011634078968747, + "grad_norm": 3.7439701557159424, + "learning_rate": 3.129682789001426e-05, + "loss": 0.0512, + "step": 80620 + }, + { + "epoch": 17.01168824134756, + "grad_norm": 0.3805181682109833, + "learning_rate": 3.1293818868969174e-05, + "loss": 0.0716, + "step": 80630 + }, + { + "epoch": 17.011742403726373, + "grad_norm": 0.003842518199235201, + "learning_rate": 3.129080984792408e-05, + "loss": 0.0005, + "step": 80640 + }, + { + "epoch": 17.011796566105183, + "grad_norm": 0.03808274492621422, + "learning_rate": 3.128780082687898e-05, + "loss": 0.0479, + "step": 80650 + }, + { + "epoch": 17.011850728483996, 
+ "grad_norm": 0.13060277700424194, + "learning_rate": 3.128479180583389e-05, + "loss": 0.0603, + "step": 80660 + }, + { + "epoch": 17.011904890862805, + "grad_norm": 1.9427756071090698, + "learning_rate": 3.12817827847888e-05, + "loss": 0.021, + "step": 80670 + }, + { + "epoch": 17.01195905324162, + "grad_norm": 0.003415146376937628, + "learning_rate": 3.1278773763743705e-05, + "loss": 0.0748, + "step": 80680 + }, + { + "epoch": 17.01201321562043, + "grad_norm": 0.04189033806324005, + "learning_rate": 3.127576474269861e-05, + "loss": 0.0315, + "step": 80690 + }, + { + "epoch": 17.01206737799924, + "grad_norm": 0.09165455400943756, + "learning_rate": 3.127275572165352e-05, + "loss": 0.1391, + "step": 80700 + }, + { + "epoch": 17.012121540378054, + "grad_norm": 0.005660169757902622, + "learning_rate": 3.1269746700608424e-05, + "loss": 0.0032, + "step": 80710 + }, + { + "epoch": 17.012175702756863, + "grad_norm": 2.215351104736328, + "learning_rate": 3.126673767956334e-05, + "loss": 0.027, + "step": 80720 + }, + { + "epoch": 17.012229865135676, + "grad_norm": 0.04313770681619644, + "learning_rate": 3.1263728658518237e-05, + "loss": 0.1133, + "step": 80730 + }, + { + "epoch": 17.01228402751449, + "grad_norm": 106.89547729492188, + "learning_rate": 3.126071963747314e-05, + "loss": 0.0698, + "step": 80740 + }, + { + "epoch": 17.0123381898933, + "grad_norm": 0.009171550162136555, + "learning_rate": 3.1257710616428056e-05, + "loss": 0.0556, + "step": 80750 + }, + { + "epoch": 17.012392352272112, + "grad_norm": 5.986160755157471, + "learning_rate": 3.1254701595382955e-05, + "loss": 0.1159, + "step": 80760 + }, + { + "epoch": 17.012446514650925, + "grad_norm": 0.004833173472434282, + "learning_rate": 3.125169257433787e-05, + "loss": 0.0496, + "step": 80770 + }, + { + "epoch": 17.012500677029735, + "grad_norm": 0.0058579775504767895, + "learning_rate": 3.1248683553292775e-05, + "loss": 0.0474, + "step": 80780 + }, + { + "epoch": 17.012554839408548, + "grad_norm": 0.869175374507904, + "learning_rate": 3.124567453224768e-05, + "loss": 0.0724, + "step": 80790 + }, + { + "epoch": 17.012609001787357, + "grad_norm": 0.33608105778694153, + "learning_rate": 3.124266551120259e-05, + "loss": 0.1343, + "step": 80800 + }, + { + "epoch": 17.01266316416617, + "grad_norm": 0.023254243656992912, + "learning_rate": 3.1239656490157494e-05, + "loss": 0.0609, + "step": 80810 + }, + { + "epoch": 17.012717326544983, + "grad_norm": 4.958517074584961, + "learning_rate": 3.12366474691124e-05, + "loss": 0.1124, + "step": 80820 + }, + { + "epoch": 17.012771488923793, + "grad_norm": 0.007615399081259966, + "learning_rate": 3.1233638448067306e-05, + "loss": 0.0479, + "step": 80830 + }, + { + "epoch": 17.012825651302606, + "grad_norm": 0.27668625116348267, + "learning_rate": 3.123062942702221e-05, + "loss": 0.0199, + "step": 80840 + }, + { + "epoch": 17.012879813681415, + "grad_norm": 0.09542283415794373, + "learning_rate": 3.122762040597712e-05, + "loss": 0.0195, + "step": 80850 + }, + { + "epoch": 17.01293397606023, + "grad_norm": 0.5854291915893555, + "learning_rate": 3.122461138493203e-05, + "loss": 0.0078, + "step": 80860 + }, + { + "epoch": 17.01298813843904, + "grad_norm": 0.0741918757557869, + "learning_rate": 3.122160236388694e-05, + "loss": 0.1972, + "step": 80870 + }, + { + "epoch": 17.01304230081785, + "grad_norm": 0.03503486141562462, + "learning_rate": 3.121859334284184e-05, + "loss": 0.0201, + "step": 80880 + }, + { + "epoch": 17.013096463196664, + "grad_norm": 0.5289972424507141, + "learning_rate": 
3.121558432179675e-05, + "loss": 0.0588, + "step": 80890 + }, + { + "epoch": 17.013150625575474, + "grad_norm": 0.010761972516775131, + "learning_rate": 3.121257530075166e-05, + "loss": 0.0834, + "step": 80900 + }, + { + "epoch": 17.013204787954287, + "grad_norm": 0.7130253314971924, + "learning_rate": 3.1209566279706556e-05, + "loss": 0.0033, + "step": 80910 + }, + { + "epoch": 17.0132589503331, + "grad_norm": 9.844873428344727, + "learning_rate": 3.120655725866147e-05, + "loss": 0.1641, + "step": 80920 + }, + { + "epoch": 17.01331311271191, + "grad_norm": 0.0812835618853569, + "learning_rate": 3.1203548237616376e-05, + "loss": 0.095, + "step": 80930 + }, + { + "epoch": 17.013367275090722, + "grad_norm": 0.022374767810106277, + "learning_rate": 3.120053921657128e-05, + "loss": 0.036, + "step": 80940 + }, + { + "epoch": 17.013421437469535, + "grad_norm": 0.06846451014280319, + "learning_rate": 3.119753019552619e-05, + "loss": 0.1258, + "step": 80950 + }, + { + "epoch": 17.013475599848345, + "grad_norm": 0.06414218246936798, + "learning_rate": 3.1194521174481094e-05, + "loss": 0.0253, + "step": 80960 + }, + { + "epoch": 17.013529762227158, + "grad_norm": 0.0055621652863919735, + "learning_rate": 3.1191512153436e-05, + "loss": 0.1162, + "step": 80970 + }, + { + "epoch": 17.013583924605967, + "grad_norm": 0.04548288881778717, + "learning_rate": 3.1188503132390914e-05, + "loss": 0.0618, + "step": 80980 + }, + { + "epoch": 17.01363808698478, + "grad_norm": 0.5583672523498535, + "learning_rate": 3.118549411134581e-05, + "loss": 0.017, + "step": 80990 + }, + { + "epoch": 17.013692249363594, + "grad_norm": 0.09022771567106247, + "learning_rate": 3.118248509030072e-05, + "loss": 0.1072, + "step": 81000 + }, + { + "epoch": 17.013746411742403, + "grad_norm": 2.0994443893432617, + "learning_rate": 3.117947606925563e-05, + "loss": 0.0311, + "step": 81010 + }, + { + "epoch": 17.013800574121216, + "grad_norm": 0.10017501562833786, + "learning_rate": 3.117646704821054e-05, + "loss": 0.0754, + "step": 81020 + }, + { + "epoch": 17.013854736500026, + "grad_norm": 3.6716108322143555, + "learning_rate": 3.1173458027165445e-05, + "loss": 0.1132, + "step": 81030 + }, + { + "epoch": 17.01390889887884, + "grad_norm": 0.004911716561764479, + "learning_rate": 3.117044900612035e-05, + "loss": 0.0626, + "step": 81040 + }, + { + "epoch": 17.013963061257652, + "grad_norm": 0.0874355137348175, + "learning_rate": 3.116743998507526e-05, + "loss": 0.1044, + "step": 81050 + }, + { + "epoch": 17.01401722363646, + "grad_norm": 2.555818557739258, + "learning_rate": 3.1164430964030164e-05, + "loss": 0.0655, + "step": 81060 + }, + { + "epoch": 17.014071386015274, + "grad_norm": 0.9474825859069824, + "learning_rate": 3.116142194298507e-05, + "loss": 0.0213, + "step": 81070 + }, + { + "epoch": 17.014125548394084, + "grad_norm": 0.5504528284072876, + "learning_rate": 3.1158412921939976e-05, + "loss": 0.0631, + "step": 81080 + }, + { + "epoch": 17.014179710772897, + "grad_norm": 0.04142036288976669, + "learning_rate": 3.115540390089489e-05, + "loss": 0.035, + "step": 81090 + }, + { + "epoch": 17.01423387315171, + "grad_norm": 0.45334964990615845, + "learning_rate": 3.115239487984979e-05, + "loss": 0.0488, + "step": 81100 + }, + { + "epoch": 17.01428803553052, + "grad_norm": 0.013404322788119316, + "learning_rate": 3.1149385858804695e-05, + "loss": 0.0675, + "step": 81110 + }, + { + "epoch": 17.014342197909333, + "grad_norm": 0.016258681192994118, + "learning_rate": 3.114637683775961e-05, + "loss": 0.0699, + "step": 81120 + }, + { + 
"epoch": 17.014396360288146, + "grad_norm": 0.06128128618001938, + "learning_rate": 3.1143367816714515e-05, + "loss": 0.0077, + "step": 81130 + }, + { + "epoch": 17.014450522666955, + "grad_norm": 0.3704070448875427, + "learning_rate": 3.1140358795669414e-05, + "loss": 0.0212, + "step": 81140 + }, + { + "epoch": 17.014504685045768, + "grad_norm": 0.39111560583114624, + "learning_rate": 3.113734977462433e-05, + "loss": 0.0234, + "step": 81150 + }, + { + "epoch": 17.014558847424578, + "grad_norm": 0.0030654226429760456, + "learning_rate": 3.1134340753579233e-05, + "loss": 0.03, + "step": 81160 + }, + { + "epoch": 17.01461300980339, + "grad_norm": 0.003190582152456045, + "learning_rate": 3.113133173253414e-05, + "loss": 0.008, + "step": 81170 + }, + { + "epoch": 17.014667172182204, + "grad_norm": 0.6419220566749573, + "learning_rate": 3.1128322711489046e-05, + "loss": 0.0423, + "step": 81180 + }, + { + "epoch": 17.014721334561013, + "grad_norm": 0.0029912434983998537, + "learning_rate": 3.112531369044395e-05, + "loss": 0.1663, + "step": 81190 + }, + { + "epoch": 17.014775496939826, + "grad_norm": 1.0293205976486206, + "learning_rate": 3.112230466939886e-05, + "loss": 0.0287, + "step": 81200 + }, + { + "epoch": 17.014829659318636, + "grad_norm": 0.006327813025563955, + "learning_rate": 3.1119295648353765e-05, + "loss": 0.2766, + "step": 81210 + }, + { + "epoch": 17.01488382169745, + "grad_norm": 0.15399350225925446, + "learning_rate": 3.111628662730867e-05, + "loss": 0.1054, + "step": 81220 + }, + { + "epoch": 17.014937984076262, + "grad_norm": 0.0071227638982236385, + "learning_rate": 3.111327760626358e-05, + "loss": 0.0394, + "step": 81230 + }, + { + "epoch": 17.01499214645507, + "grad_norm": 0.8394711017608643, + "learning_rate": 3.111026858521849e-05, + "loss": 0.0288, + "step": 81240 + }, + { + "epoch": 17.015046308833885, + "grad_norm": 1.0355321168899536, + "learning_rate": 3.110725956417339e-05, + "loss": 0.0943, + "step": 81250 + }, + { + "epoch": 17.015100471212694, + "grad_norm": 0.008053823374211788, + "learning_rate": 3.1104250543128296e-05, + "loss": 0.0685, + "step": 81260 + }, + { + "epoch": 17.015154633591507, + "grad_norm": 0.08352383971214294, + "learning_rate": 3.110124152208321e-05, + "loss": 0.0417, + "step": 81270 + }, + { + "epoch": 17.01520879597032, + "grad_norm": 0.005721885245293379, + "learning_rate": 3.1098232501038116e-05, + "loss": 0.0499, + "step": 81280 + }, + { + "epoch": 17.01526295834913, + "grad_norm": 0.03935566172003746, + "learning_rate": 3.109522347999302e-05, + "loss": 0.1281, + "step": 81290 + }, + { + "epoch": 17.015317120727943, + "grad_norm": 0.014596889726817608, + "learning_rate": 3.109221445894793e-05, + "loss": 0.0086, + "step": 81300 + }, + { + "epoch": 17.015371283106752, + "grad_norm": 0.5049822926521301, + "learning_rate": 3.1089205437902834e-05, + "loss": 0.0312, + "step": 81310 + }, + { + "epoch": 17.015425445485565, + "grad_norm": 0.0741894468665123, + "learning_rate": 3.108619641685774e-05, + "loss": 0.0783, + "step": 81320 + }, + { + "epoch": 17.01547960786438, + "grad_norm": 0.0060574207454919815, + "learning_rate": 3.108318739581265e-05, + "loss": 0.136, + "step": 81330 + }, + { + "epoch": 17.015533770243188, + "grad_norm": 2.7759125232696533, + "learning_rate": 3.108017837476755e-05, + "loss": 0.0432, + "step": 81340 + }, + { + "epoch": 17.015587932622, + "grad_norm": 0.010713867843151093, + "learning_rate": 3.1077169353722466e-05, + "loss": 0.0579, + "step": 81350 + }, + { + "epoch": 17.015642095000814, + "grad_norm": 
1.048979640007019, + "learning_rate": 3.1074160332677366e-05, + "loss": 0.0391, + "step": 81360 + }, + { + "epoch": 17.015696257379624, + "grad_norm": 0.013397743925452232, + "learning_rate": 3.107115131163227e-05, + "loss": 0.1374, + "step": 81370 + }, + { + "epoch": 17.015750419758437, + "grad_norm": 0.17578552663326263, + "learning_rate": 3.1068142290587185e-05, + "loss": 0.0242, + "step": 81380 + }, + { + "epoch": 17.015804582137246, + "grad_norm": 0.38504961133003235, + "learning_rate": 3.106513326954209e-05, + "loss": 0.0432, + "step": 81390 + }, + { + "epoch": 17.01585874451606, + "grad_norm": 2.1076817512512207, + "learning_rate": 3.106212424849699e-05, + "loss": 0.0487, + "step": 81400 + }, + { + "epoch": 17.015912906894872, + "grad_norm": 2.0335564613342285, + "learning_rate": 3.1059115227451904e-05, + "loss": 0.0778, + "step": 81410 + }, + { + "epoch": 17.01596706927368, + "grad_norm": 0.007714620791375637, + "learning_rate": 3.105610620640681e-05, + "loss": 0.03, + "step": 81420 + }, + { + "epoch": 17.016021231652495, + "grad_norm": 0.01289957482367754, + "learning_rate": 3.1053097185361716e-05, + "loss": 0.0074, + "step": 81430 + }, + { + "epoch": 17.016075394031304, + "grad_norm": 2.069234609603882, + "learning_rate": 3.105008816431662e-05, + "loss": 0.0676, + "step": 81440 + }, + { + "epoch": 17.016129556410117, + "grad_norm": 0.008447413332760334, + "learning_rate": 3.104707914327153e-05, + "loss": 0.0073, + "step": 81450 + }, + { + "epoch": 17.01618371878893, + "grad_norm": 0.00934838131070137, + "learning_rate": 3.1044070122226435e-05, + "loss": 0.0346, + "step": 81460 + }, + { + "epoch": 17.01623788116774, + "grad_norm": 0.013470943085849285, + "learning_rate": 3.104106110118135e-05, + "loss": 0.2554, + "step": 81470 + }, + { + "epoch": 17.016292043546553, + "grad_norm": 0.8088827729225159, + "learning_rate": 3.103805208013625e-05, + "loss": 0.2179, + "step": 81480 + }, + { + "epoch": 17.016346205925363, + "grad_norm": 0.7485466003417969, + "learning_rate": 3.1035043059091154e-05, + "loss": 0.034, + "step": 81490 + }, + { + "epoch": 17.016400368304176, + "grad_norm": 0.029126720502972603, + "learning_rate": 3.103203403804607e-05, + "loss": 0.0344, + "step": 81500 + }, + { + "epoch": 17.01645453068299, + "grad_norm": 0.010241818614304066, + "learning_rate": 3.102902501700097e-05, + "loss": 0.0282, + "step": 81510 + }, + { + "epoch": 17.016508693061798, + "grad_norm": 0.9548880457878113, + "learning_rate": 3.102601599595587e-05, + "loss": 0.0127, + "step": 81520 + }, + { + "epoch": 17.01656285544061, + "grad_norm": 0.0483139231801033, + "learning_rate": 3.1023006974910786e-05, + "loss": 0.0062, + "step": 81530 + }, + { + "epoch": 17.016617017819424, + "grad_norm": 3.755737543106079, + "learning_rate": 3.101999795386569e-05, + "loss": 0.0661, + "step": 81540 + }, + { + "epoch": 17.016671180198234, + "grad_norm": 0.636866569519043, + "learning_rate": 3.10169889328206e-05, + "loss": 0.0062, + "step": 81550 + }, + { + "epoch": 17.016725342577047, + "grad_norm": 9.258989334106445, + "learning_rate": 3.1013979911775505e-05, + "loss": 0.1138, + "step": 81560 + }, + { + "epoch": 17.016779504955856, + "grad_norm": 0.21852274239063263, + "learning_rate": 3.101097089073041e-05, + "loss": 0.0493, + "step": 81570 + }, + { + "epoch": 17.01683366733467, + "grad_norm": 0.059196289628744125, + "learning_rate": 3.100796186968532e-05, + "loss": 0.0044, + "step": 81580 + }, + { + "epoch": 17.016887829713482, + "grad_norm": 0.41051027178764343, + "learning_rate": 3.1004952848640224e-05, + 
"loss": 0.1433, + "step": 81590 + }, + { + "epoch": 17.016941992092292, + "grad_norm": 0.055062949657440186, + "learning_rate": 3.100194382759513e-05, + "loss": 0.0445, + "step": 81600 + }, + { + "epoch": 17.016996154471105, + "grad_norm": 12.159394264221191, + "learning_rate": 3.099893480655004e-05, + "loss": 0.1545, + "step": 81610 + }, + { + "epoch": 17.017050316849915, + "grad_norm": 1.0264277458190918, + "learning_rate": 3.099592578550495e-05, + "loss": 0.0254, + "step": 81620 + }, + { + "epoch": 17.017104479228728, + "grad_norm": 0.013593213632702827, + "learning_rate": 3.099291676445985e-05, + "loss": 0.037, + "step": 81630 + }, + { + "epoch": 17.01715864160754, + "grad_norm": 0.03399431332945824, + "learning_rate": 3.098990774341476e-05, + "loss": 0.043, + "step": 81640 + }, + { + "epoch": 17.01721280398635, + "grad_norm": 0.005226735956966877, + "learning_rate": 3.098689872236967e-05, + "loss": 0.0248, + "step": 81650 + }, + { + "epoch": 17.017266966365163, + "grad_norm": 4.007198333740234, + "learning_rate": 3.098388970132457e-05, + "loss": 0.0873, + "step": 81660 + }, + { + "epoch": 17.017321128743973, + "grad_norm": 0.005805707536637783, + "learning_rate": 3.098088068027948e-05, + "loss": 0.0158, + "step": 81670 + }, + { + "epoch": 17.017375291122786, + "grad_norm": 0.005747640039771795, + "learning_rate": 3.097787165923439e-05, + "loss": 0.0325, + "step": 81680 + }, + { + "epoch": 17.0174294535016, + "grad_norm": 0.003260598750784993, + "learning_rate": 3.097486263818929e-05, + "loss": 0.0269, + "step": 81690 + }, + { + "epoch": 17.01748361588041, + "grad_norm": 0.005242534447461367, + "learning_rate": 3.09718536171442e-05, + "loss": 0.0497, + "step": 81700 + }, + { + "epoch": 17.01753777825922, + "grad_norm": 2.880596876144409, + "learning_rate": 3.0968844596099106e-05, + "loss": 0.1662, + "step": 81710 + }, + { + "epoch": 17.017591940638034, + "grad_norm": 0.20455396175384521, + "learning_rate": 3.096583557505401e-05, + "loss": 0.0095, + "step": 81720 + }, + { + "epoch": 17.017646103016844, + "grad_norm": 0.004328593611717224, + "learning_rate": 3.0962826554008925e-05, + "loss": 0.0026, + "step": 81730 + }, + { + "epoch": 17.017700265395657, + "grad_norm": 18.429176330566406, + "learning_rate": 3.0959817532963824e-05, + "loss": 0.1546, + "step": 81740 + }, + { + "epoch": 17.017754427774467, + "grad_norm": 0.009120097383856773, + "learning_rate": 3.095680851191873e-05, + "loss": 0.0131, + "step": 81750 + }, + { + "epoch": 17.01780859015328, + "grad_norm": 0.005861148238182068, + "learning_rate": 3.0953799490873644e-05, + "loss": 0.0118, + "step": 81760 + }, + { + "epoch": 17.017862752532093, + "grad_norm": 2.6026313304901123, + "learning_rate": 3.095079046982855e-05, + "loss": 0.1189, + "step": 81770 + }, + { + "epoch": 17.017916914910902, + "grad_norm": 0.028293557465076447, + "learning_rate": 3.094778144878345e-05, + "loss": 0.0745, + "step": 81780 + }, + { + "epoch": 17.017971077289715, + "grad_norm": 0.025339173153042793, + "learning_rate": 3.094477242773836e-05, + "loss": 0.0576, + "step": 81790 + }, + { + "epoch": 17.018025239668525, + "grad_norm": 0.005553723778575659, + "learning_rate": 3.094176340669327e-05, + "loss": 0.0848, + "step": 81800 + }, + { + "epoch": 17.018079402047338, + "grad_norm": 1.69377601146698, + "learning_rate": 3.0938754385648175e-05, + "loss": 0.0216, + "step": 81810 + }, + { + "epoch": 17.01813356442615, + "grad_norm": 0.0036481122951954603, + "learning_rate": 3.093574536460308e-05, + "loss": 0.0006, + "step": 81820 + }, + { + "epoch": 
17.01818772680496, + "grad_norm": 0.003381181275472045, + "learning_rate": 3.093273634355799e-05, + "loss": 0.0259, + "step": 81830 + }, + { + "epoch": 17.018241889183773, + "grad_norm": 0.0033805870916694403, + "learning_rate": 3.0929727322512894e-05, + "loss": 0.0189, + "step": 81840 + }, + { + "epoch": 17.018296051562583, + "grad_norm": 0.015798380598425865, + "learning_rate": 3.09267183014678e-05, + "loss": 0.0312, + "step": 81850 + }, + { + "epoch": 17.018350213941396, + "grad_norm": 0.32045748829841614, + "learning_rate": 3.0923709280422707e-05, + "loss": 0.1001, + "step": 81860 + }, + { + "epoch": 17.01840437632021, + "grad_norm": 0.5415897965431213, + "learning_rate": 3.092070025937762e-05, + "loss": 0.1467, + "step": 81870 + }, + { + "epoch": 17.01845853869902, + "grad_norm": 0.007884296588599682, + "learning_rate": 3.0917691238332526e-05, + "loss": 0.0383, + "step": 81880 + }, + { + "epoch": 17.01851270107783, + "grad_norm": 0.0830337181687355, + "learning_rate": 3.0914682217287425e-05, + "loss": 0.0175, + "step": 81890 + }, + { + "epoch": 17.018566863456645, + "grad_norm": 0.05148888751864433, + "learning_rate": 3.091167319624234e-05, + "loss": 0.0206, + "step": 81900 + }, + { + "epoch": 17.018621025835454, + "grad_norm": 4.0070295333862305, + "learning_rate": 3.0908664175197245e-05, + "loss": 0.1088, + "step": 81910 + }, + { + "epoch": 17.018675188214267, + "grad_norm": 0.10164815932512283, + "learning_rate": 3.090565515415215e-05, + "loss": 0.0207, + "step": 81920 + }, + { + "epoch": 17.018729350593077, + "grad_norm": 0.029014797881245613, + "learning_rate": 3.090264613310706e-05, + "loss": 0.0696, + "step": 81930 + }, + { + "epoch": 17.01878351297189, + "grad_norm": 0.0041742632165551186, + "learning_rate": 3.0899637112061964e-05, + "loss": 0.0593, + "step": 81940 + }, + { + "epoch": 17.018837675350703, + "grad_norm": 0.03328093886375427, + "learning_rate": 3.089662809101687e-05, + "loss": 0.0389, + "step": 81950 + }, + { + "epoch": 17.018891837729512, + "grad_norm": 0.00841626338660717, + "learning_rate": 3.0893619069971776e-05, + "loss": 0.043, + "step": 81960 + }, + { + "epoch": 17.018946000108325, + "grad_norm": 0.003823489649221301, + "learning_rate": 3.089061004892668e-05, + "loss": 0.009, + "step": 81970 + }, + { + "epoch": 17.019000162487135, + "grad_norm": 0.007909516803920269, + "learning_rate": 3.088760102788159e-05, + "loss": 0.0344, + "step": 81980 + }, + { + "epoch": 17.019054324865948, + "grad_norm": 0.07476577162742615, + "learning_rate": 3.08845920068365e-05, + "loss": 0.077, + "step": 81990 + }, + { + "epoch": 17.01910848724476, + "grad_norm": 0.21630528569221497, + "learning_rate": 3.08815829857914e-05, + "loss": 0.03, + "step": 82000 + }, + { + "epoch": 17.01916264962357, + "grad_norm": 0.07405272871255875, + "learning_rate": 3.087857396474631e-05, + "loss": 0.0175, + "step": 82010 + }, + { + "epoch": 17.019216812002384, + "grad_norm": 3.6085407733917236, + "learning_rate": 3.087556494370122e-05, + "loss": 0.0651, + "step": 82020 + }, + { + "epoch": 17.019270974381193, + "grad_norm": 0.005322073120623827, + "learning_rate": 3.087255592265613e-05, + "loss": 0.0449, + "step": 82030 + }, + { + "epoch": 17.019325136760006, + "grad_norm": 0.0028236359357833862, + "learning_rate": 3.0869546901611026e-05, + "loss": 0.1019, + "step": 82040 + }, + { + "epoch": 17.01937929913882, + "grad_norm": 1.2780455350875854, + "learning_rate": 3.086653788056594e-05, + "loss": 0.1086, + "step": 82050 + }, + { + "epoch": 17.01943346151763, + "grad_norm": 0.06973475217819214, + 
"learning_rate": 3.0863528859520846e-05, + "loss": 0.0116, + "step": 82060 + }, + { + "epoch": 17.019487623896442, + "grad_norm": 0.011453086510300636, + "learning_rate": 3.086051983847575e-05, + "loss": 0.064, + "step": 82070 + }, + { + "epoch": 17.019541786275255, + "grad_norm": 0.10121775418519974, + "learning_rate": 3.085751081743066e-05, + "loss": 0.0098, + "step": 82080 + }, + { + "epoch": 17.019595948654064, + "grad_norm": 0.06700122356414795, + "learning_rate": 3.0854501796385564e-05, + "loss": 0.0405, + "step": 82090 + }, + { + "epoch": 17.019650111032878, + "grad_norm": 0.020954787731170654, + "learning_rate": 3.085149277534047e-05, + "loss": 0.0225, + "step": 82100 + }, + { + "epoch": 17.019704273411687, + "grad_norm": 0.028530379757285118, + "learning_rate": 3.084848375429538e-05, + "loss": 0.0071, + "step": 82110 + }, + { + "epoch": 17.0197584357905, + "grad_norm": 0.0069800978526473045, + "learning_rate": 3.084547473325028e-05, + "loss": 0.0051, + "step": 82120 + }, + { + "epoch": 17.019812598169313, + "grad_norm": 0.00949507299810648, + "learning_rate": 3.0842465712205196e-05, + "loss": 0.0811, + "step": 82130 + }, + { + "epoch": 17.019866760548123, + "grad_norm": 0.0023741424083709717, + "learning_rate": 3.08394566911601e-05, + "loss": 0.1209, + "step": 82140 + }, + { + "epoch": 17.019920922926936, + "grad_norm": 2.473177909851074, + "learning_rate": 3.0836447670115e-05, + "loss": 0.0492, + "step": 82150 + }, + { + "epoch": 17.019975085305745, + "grad_norm": 0.030246036127209663, + "learning_rate": 3.0833438649069915e-05, + "loss": 0.0542, + "step": 82160 + }, + { + "epoch": 17.02002924768456, + "grad_norm": 0.28598788380622864, + "learning_rate": 3.083042962802482e-05, + "loss": 0.0063, + "step": 82170 + }, + { + "epoch": 17.02008341006337, + "grad_norm": 3.7010436058044434, + "learning_rate": 3.082742060697973e-05, + "loss": 0.0728, + "step": 82180 + }, + { + "epoch": 17.02013757244218, + "grad_norm": 2.763502597808838, + "learning_rate": 3.0824411585934634e-05, + "loss": 0.0578, + "step": 82190 + }, + { + "epoch": 17.020191734820994, + "grad_norm": 0.3242050111293793, + "learning_rate": 3.082140256488954e-05, + "loss": 0.083, + "step": 82200 + }, + { + "epoch": 17.020245897199803, + "grad_norm": 0.010185373947024345, + "learning_rate": 3.0818393543844446e-05, + "loss": 0.0548, + "step": 82210 + }, + { + "epoch": 17.020300059578616, + "grad_norm": 3.987861394882202, + "learning_rate": 3.081538452279936e-05, + "loss": 0.0511, + "step": 82220 + }, + { + "epoch": 17.02035422195743, + "grad_norm": 2.3493099212646484, + "learning_rate": 3.081237550175426e-05, + "loss": 0.0619, + "step": 82230 + }, + { + "epoch": 17.02040838433624, + "grad_norm": 2.257988452911377, + "learning_rate": 3.0809366480709165e-05, + "loss": 0.1165, + "step": 82240 + }, + { + "epoch": 17.020462546715052, + "grad_norm": 0.016960084438323975, + "learning_rate": 3.080635745966408e-05, + "loss": 0.0908, + "step": 82250 + }, + { + "epoch": 17.020516709093865, + "grad_norm": 16.94743537902832, + "learning_rate": 3.080334843861898e-05, + "loss": 0.0429, + "step": 82260 + }, + { + "epoch": 17.020570871472675, + "grad_norm": 0.003956854809075594, + "learning_rate": 3.0800339417573884e-05, + "loss": 0.0088, + "step": 82270 + }, + { + "epoch": 17.020625033851488, + "grad_norm": 2.519094467163086, + "learning_rate": 3.07973303965288e-05, + "loss": 0.041, + "step": 82280 + }, + { + "epoch": 17.020679196230297, + "grad_norm": 0.012109656818211079, + "learning_rate": 3.0794321375483703e-05, + "loss": 0.0365, + 
"step": 82290 + }, + { + "epoch": 17.02073335860911, + "grad_norm": 0.06716471165418625, + "learning_rate": 3.07913123544386e-05, + "loss": 0.047, + "step": 82300 + }, + { + "epoch": 17.020787520987923, + "grad_norm": 0.33013200759887695, + "learning_rate": 3.0788303333393516e-05, + "loss": 0.077, + "step": 82310 + }, + { + "epoch": 17.020841683366733, + "grad_norm": 0.0031613775063306093, + "learning_rate": 3.078529431234842e-05, + "loss": 0.0648, + "step": 82320 + }, + { + "epoch": 17.020895845745546, + "grad_norm": 0.3848833441734314, + "learning_rate": 3.078228529130333e-05, + "loss": 0.0938, + "step": 82330 + }, + { + "epoch": 17.020950008124355, + "grad_norm": 0.008558560162782669, + "learning_rate": 3.0779276270258235e-05, + "loss": 0.0363, + "step": 82340 + }, + { + "epoch": 17.02100417050317, + "grad_norm": 0.05526991933584213, + "learning_rate": 3.077626724921314e-05, + "loss": 0.0388, + "step": 82350 + }, + { + "epoch": 17.02105833288198, + "grad_norm": 0.0036756049375981092, + "learning_rate": 3.077325822816805e-05, + "loss": 0.0253, + "step": 82360 + }, + { + "epoch": 17.02111249526079, + "grad_norm": 0.004907859954982996, + "learning_rate": 3.077024920712296e-05, + "loss": 0.0009, + "step": 82370 + }, + { + "epoch": 17.021166657639604, + "grad_norm": 0.003285503014922142, + "learning_rate": 3.076724018607786e-05, + "loss": 0.0609, + "step": 82380 + }, + { + "epoch": 17.021220820018414, + "grad_norm": 0.003032003529369831, + "learning_rate": 3.076423116503277e-05, + "loss": 0.0165, + "step": 82390 + }, + { + "epoch": 17.021274982397227, + "grad_norm": 0.5256637930870056, + "learning_rate": 3.076122214398768e-05, + "loss": 0.0549, + "step": 82400 + }, + { + "epoch": 17.02132914477604, + "grad_norm": 0.5385025143623352, + "learning_rate": 3.075821312294258e-05, + "loss": 0.0211, + "step": 82410 + }, + { + "epoch": 17.02138330715485, + "grad_norm": 0.004742652177810669, + "learning_rate": 3.075520410189749e-05, + "loss": 0.0092, + "step": 82420 + }, + { + "epoch": 17.021437469533662, + "grad_norm": 0.18193495273590088, + "learning_rate": 3.07521950808524e-05, + "loss": 0.1268, + "step": 82430 + }, + { + "epoch": 17.021491631912472, + "grad_norm": 0.48475152254104614, + "learning_rate": 3.0749186059807304e-05, + "loss": 0.0073, + "step": 82440 + }, + { + "epoch": 17.021545794291285, + "grad_norm": 0.005240994039922953, + "learning_rate": 3.074617703876221e-05, + "loss": 0.0792, + "step": 82450 + }, + { + "epoch": 17.021599956670098, + "grad_norm": 0.011313110589981079, + "learning_rate": 3.074316801771712e-05, + "loss": 0.0354, + "step": 82460 + }, + { + "epoch": 17.021654119048907, + "grad_norm": 0.0025337333790957928, + "learning_rate": 3.074015899667202e-05, + "loss": 0.0004, + "step": 82470 + }, + { + "epoch": 17.02170828142772, + "grad_norm": 0.0024467839393764734, + "learning_rate": 3.0737149975626936e-05, + "loss": 0.0037, + "step": 82480 + }, + { + "epoch": 17.021762443806534, + "grad_norm": 0.0025226064026355743, + "learning_rate": 3.0734140954581836e-05, + "loss": 0.0178, + "step": 82490 + }, + { + "epoch": 17.021816606185343, + "grad_norm": 2.099498748779297, + "learning_rate": 3.073113193353674e-05, + "loss": 0.2028, + "step": 82500 + }, + { + "epoch": 17.021870768564156, + "grad_norm": 0.003373922547325492, + "learning_rate": 3.0728122912491655e-05, + "loss": 0.0234, + "step": 82510 + }, + { + "epoch": 17.021924930942966, + "grad_norm": 0.0039010702166706324, + "learning_rate": 3.072511389144656e-05, + "loss": 0.0386, + "step": 82520 + }, + { + "epoch": 
17.02197909332178, + "grad_norm": 2.4617886543273926, + "learning_rate": 3.072210487040146e-05, + "loss": 0.1052, + "step": 82530 + }, + { + "epoch": 17.022033255700592, + "grad_norm": 0.002919976133853197, + "learning_rate": 3.0719095849356374e-05, + "loss": 0.0152, + "step": 82540 + }, + { + "epoch": 17.0220874180794, + "grad_norm": 0.008532845415174961, + "learning_rate": 3.071608682831128e-05, + "loss": 0.2567, + "step": 82550 + }, + { + "epoch": 17.022141580458214, + "grad_norm": 0.006004451774060726, + "learning_rate": 3.071307780726618e-05, + "loss": 0.0264, + "step": 82560 + }, + { + "epoch": 17.022195742837024, + "grad_norm": 6.2566704750061035, + "learning_rate": 3.071006878622109e-05, + "loss": 0.1552, + "step": 82570 + }, + { + "epoch": 17.022249905215837, + "grad_norm": 0.007092671934515238, + "learning_rate": 3.0707059765176e-05, + "loss": 0.0632, + "step": 82580 + }, + { + "epoch": 17.02230406759465, + "grad_norm": 0.006793722044676542, + "learning_rate": 3.0704050744130905e-05, + "loss": 0.0535, + "step": 82590 + }, + { + "epoch": 17.02235822997346, + "grad_norm": 0.0038306498900055885, + "learning_rate": 3.070104172308581e-05, + "loss": 0.0063, + "step": 82600 + }, + { + "epoch": 17.022412392352273, + "grad_norm": 0.23141807317733765, + "learning_rate": 3.069803270204072e-05, + "loss": 0.0325, + "step": 82610 + }, + { + "epoch": 17.022466554731082, + "grad_norm": 0.006729941815137863, + "learning_rate": 3.0695023680995624e-05, + "loss": 0.0293, + "step": 82620 + }, + { + "epoch": 17.022520717109895, + "grad_norm": 0.008146404288709164, + "learning_rate": 3.069201465995054e-05, + "loss": 0.0715, + "step": 82630 + }, + { + "epoch": 17.022574879488708, + "grad_norm": 0.10057543963193893, + "learning_rate": 3.068900563890544e-05, + "loss": 0.0776, + "step": 82640 + }, + { + "epoch": 17.022629041867518, + "grad_norm": 0.004305498208850622, + "learning_rate": 3.068599661786035e-05, + "loss": 0.0711, + "step": 82650 + }, + { + "epoch": 17.02268320424633, + "grad_norm": 0.14938074350357056, + "learning_rate": 3.0682987596815256e-05, + "loss": 0.0438, + "step": 82660 + }, + { + "epoch": 17.022737366625144, + "grad_norm": 7.615922451019287, + "learning_rate": 3.067997857577016e-05, + "loss": 0.0263, + "step": 82670 + }, + { + "epoch": 17.022791529003953, + "grad_norm": 0.00959621462970972, + "learning_rate": 3.067696955472507e-05, + "loss": 0.0209, + "step": 82680 + }, + { + "epoch": 17.022845691382766, + "grad_norm": 0.010619011707603931, + "learning_rate": 3.0673960533679975e-05, + "loss": 0.0168, + "step": 82690 + }, + { + "epoch": 17.022899853761576, + "grad_norm": 4.30894136428833, + "learning_rate": 3.067095151263488e-05, + "loss": 0.1168, + "step": 82700 + }, + { + "epoch": 17.02295401614039, + "grad_norm": 1.6389023065567017, + "learning_rate": 3.066794249158979e-05, + "loss": 0.0526, + "step": 82710 + }, + { + "epoch": 17.023008178519202, + "grad_norm": 0.0032970490865409374, + "learning_rate": 3.0664933470544694e-05, + "loss": 0.0051, + "step": 82720 + }, + { + "epoch": 17.02306234089801, + "grad_norm": 0.030320297926664352, + "learning_rate": 3.06619244494996e-05, + "loss": 0.025, + "step": 82730 + }, + { + "epoch": 17.023116503276825, + "grad_norm": 0.0030131633393466473, + "learning_rate": 3.065891542845451e-05, + "loss": 0.0064, + "step": 82740 + }, + { + "epoch": 17.023170665655634, + "grad_norm": 0.0036079739220440388, + "learning_rate": 3.065590640740941e-05, + "loss": 0.1614, + "step": 82750 + }, + { + "epoch": 17.023224828034447, + "grad_norm": 0.5452288389205933, 
+ "learning_rate": 3.065289738636432e-05, + "loss": 0.082, + "step": 82760 + }, + { + "epoch": 17.02327899041326, + "grad_norm": 0.019313760101795197, + "learning_rate": 3.064988836531923e-05, + "loss": 0.1166, + "step": 82770 + }, + { + "epoch": 17.02333315279207, + "grad_norm": 2.482476234436035, + "learning_rate": 3.064687934427414e-05, + "loss": 0.1579, + "step": 82780 + }, + { + "epoch": 17.023387315170883, + "grad_norm": 0.1785402148962021, + "learning_rate": 3.064387032322904e-05, + "loss": 0.0367, + "step": 82790 + }, + { + "epoch": 17.023441477549692, + "grad_norm": 0.006552781909704208, + "learning_rate": 3.064086130218395e-05, + "loss": 0.0875, + "step": 82800 + }, + { + "epoch": 17.023495639928505, + "grad_norm": 8.096405982971191, + "learning_rate": 3.063785228113886e-05, + "loss": 0.1502, + "step": 82810 + }, + { + "epoch": 17.02354980230732, + "grad_norm": 0.41529637575149536, + "learning_rate": 3.063484326009376e-05, + "loss": 0.0222, + "step": 82820 + }, + { + "epoch": 17.023603964686128, + "grad_norm": 0.7362390756607056, + "learning_rate": 3.063183423904867e-05, + "loss": 0.0382, + "step": 82830 + }, + { + "epoch": 17.02365812706494, + "grad_norm": 0.2050061821937561, + "learning_rate": 3.0628825218003576e-05, + "loss": 0.0625, + "step": 82840 + }, + { + "epoch": 17.023712289443754, + "grad_norm": 0.07728075981140137, + "learning_rate": 3.062581619695848e-05, + "loss": 0.0375, + "step": 82850 + }, + { + "epoch": 17.023766451822564, + "grad_norm": 4.213122844696045, + "learning_rate": 3.062280717591339e-05, + "loss": 0.0419, + "step": 82860 + }, + { + "epoch": 17.023820614201377, + "grad_norm": 4.290411949157715, + "learning_rate": 3.0619798154868295e-05, + "loss": 0.0804, + "step": 82870 + }, + { + "epoch": 17.023874776580186, + "grad_norm": 0.0714799240231514, + "learning_rate": 3.06167891338232e-05, + "loss": 0.009, + "step": 82880 + }, + { + "epoch": 17.023928938959, + "grad_norm": 0.12148834764957428, + "learning_rate": 3.0613780112778114e-05, + "loss": 0.004, + "step": 82890 + }, + { + "epoch": 17.023983101337812, + "grad_norm": 6.577834606170654, + "learning_rate": 3.061077109173301e-05, + "loss": 0.0742, + "step": 82900 + }, + { + "epoch": 17.02403726371662, + "grad_norm": 0.2525550127029419, + "learning_rate": 3.0607762070687926e-05, + "loss": 0.0667, + "step": 82910 + }, + { + "epoch": 17.024091426095435, + "grad_norm": 0.048489317297935486, + "learning_rate": 3.060475304964283e-05, + "loss": 0.073, + "step": 82920 + }, + { + "epoch": 17.024145588474244, + "grad_norm": 0.018960315734148026, + "learning_rate": 3.060174402859774e-05, + "loss": 0.0155, + "step": 82930 + }, + { + "epoch": 17.024199750853057, + "grad_norm": 0.003443404333665967, + "learning_rate": 3.0598735007552645e-05, + "loss": 0.0145, + "step": 82940 + }, + { + "epoch": 17.02425391323187, + "grad_norm": 2.714634895324707, + "learning_rate": 3.059572598650755e-05, + "loss": 0.0978, + "step": 82950 + }, + { + "epoch": 17.02430807561068, + "grad_norm": 0.026839857921004295, + "learning_rate": 3.059271696546246e-05, + "loss": 0.0266, + "step": 82960 + }, + { + "epoch": 17.024362237989493, + "grad_norm": 0.05449331924319267, + "learning_rate": 3.0589707944417364e-05, + "loss": 0.0841, + "step": 82970 + }, + { + "epoch": 17.024416400368303, + "grad_norm": 0.00315367104485631, + "learning_rate": 3.058669892337227e-05, + "loss": 0.0406, + "step": 82980 + }, + { + "epoch": 17.024470562747116, + "grad_norm": 0.008357955142855644, + "learning_rate": 3.0583689902327177e-05, + "loss": 0.0106, + "step": 82990 
+ }, + { + "epoch": 17.02452472512593, + "grad_norm": 0.003322295844554901, + "learning_rate": 3.058068088128209e-05, + "loss": 0.0184, + "step": 83000 + }, + { + "epoch": 17.024578887504738, + "grad_norm": 3.9870293140411377, + "learning_rate": 3.057767186023699e-05, + "loss": 0.0264, + "step": 83010 + }, + { + "epoch": 17.02463304988355, + "grad_norm": 0.02579578198492527, + "learning_rate": 3.0574662839191895e-05, + "loss": 0.0265, + "step": 83020 + }, + { + "epoch": 17.024687212262364, + "grad_norm": 0.003107338212430477, + "learning_rate": 3.057165381814681e-05, + "loss": 0.1278, + "step": 83030 + }, + { + "epoch": 17.024741374641174, + "grad_norm": 0.0033622002229094505, + "learning_rate": 3.0568644797101715e-05, + "loss": 0.0638, + "step": 83040 + }, + { + "epoch": 17.024795537019987, + "grad_norm": 0.38594403862953186, + "learning_rate": 3.0565635776056614e-05, + "loss": 0.0562, + "step": 83050 + }, + { + "epoch": 17.024849699398796, + "grad_norm": 2.5284457206726074, + "learning_rate": 3.056262675501153e-05, + "loss": 0.0269, + "step": 83060 + }, + { + "epoch": 17.02490386177761, + "grad_norm": 1.64149010181427, + "learning_rate": 3.0559617733966434e-05, + "loss": 0.0684, + "step": 83070 + }, + { + "epoch": 17.024958024156422, + "grad_norm": 0.004132198169827461, + "learning_rate": 3.055660871292134e-05, + "loss": 0.0027, + "step": 83080 + }, + { + "epoch": 17.02500135405947, + "eval_accuracy": 0.8291966035271064, + "eval_loss": 0.9413933157920837, + "eval_runtime": 117.1015, + "eval_samples_per_second": 26.148, + "eval_steps_per_second": 3.271, + "step": 83088 + }, + { + "epoch": 18.000010832475763, + "grad_norm": 0.012402115389704704, + "learning_rate": 3.0553599691876246e-05, + "loss": 0.0494, + "step": 83090 + }, + { + "epoch": 18.000064994854576, + "grad_norm": 0.0032356835436075926, + "learning_rate": 3.055059067083115e-05, + "loss": 0.2467, + "step": 83100 + }, + { + "epoch": 18.000119157233385, + "grad_norm": 0.5657600164413452, + "learning_rate": 3.054758164978606e-05, + "loss": 0.0179, + "step": 83110 + }, + { + "epoch": 18.0001733196122, + "grad_norm": 0.045265715569257736, + "learning_rate": 3.054457262874097e-05, + "loss": 0.0125, + "step": 83120 + }, + { + "epoch": 18.000227481991008, + "grad_norm": 0.0057475077919662, + "learning_rate": 3.054156360769587e-05, + "loss": 0.0034, + "step": 83130 + }, + { + "epoch": 18.00028164436982, + "grad_norm": 0.003220506012439728, + "learning_rate": 3.053855458665078e-05, + "loss": 0.0111, + "step": 83140 + }, + { + "epoch": 18.000335806748634, + "grad_norm": 0.004388886038213968, + "learning_rate": 3.053554556560569e-05, + "loss": 0.0308, + "step": 83150 + }, + { + "epoch": 18.000389969127443, + "grad_norm": 0.0657278522849083, + "learning_rate": 3.053253654456059e-05, + "loss": 0.0159, + "step": 83160 + }, + { + "epoch": 18.000444131506256, + "grad_norm": 2.2161388397216797, + "learning_rate": 3.05295275235155e-05, + "loss": 0.0256, + "step": 83170 + }, + { + "epoch": 18.000498293885066, + "grad_norm": 0.8488688468933105, + "learning_rate": 3.052651850247041e-05, + "loss": 0.1111, + "step": 83180 + }, + { + "epoch": 18.00055245626388, + "grad_norm": 30.52864646911621, + "learning_rate": 3.0523509481425316e-05, + "loss": 0.0787, + "step": 83190 + }, + { + "epoch": 18.000606618642692, + "grad_norm": 0.0032243607565760612, + "learning_rate": 3.052050046038022e-05, + "loss": 0.0017, + "step": 83200 + }, + { + "epoch": 18.0006607810215, + "grad_norm": 0.018880408257246017, + "learning_rate": 3.0517491439335128e-05, + "loss": 0.0172, 
+ "step": 83210 + }, + { + "epoch": 18.000714943400315, + "grad_norm": 0.013193146325647831, + "learning_rate": 3.0514482418290034e-05, + "loss": 0.0234, + "step": 83220 + }, + { + "epoch": 18.000769105779124, + "grad_norm": 0.004326340276747942, + "learning_rate": 3.0511473397244944e-05, + "loss": 0.0038, + "step": 83230 + }, + { + "epoch": 18.000823268157937, + "grad_norm": 0.01883275993168354, + "learning_rate": 3.0508464376199847e-05, + "loss": 0.072, + "step": 83240 + }, + { + "epoch": 18.00087743053675, + "grad_norm": 0.4874444603919983, + "learning_rate": 3.0505455355154757e-05, + "loss": 0.0332, + "step": 83250 + }, + { + "epoch": 18.00093159291556, + "grad_norm": 1.4804768562316895, + "learning_rate": 3.0502446334109663e-05, + "loss": 0.0099, + "step": 83260 + }, + { + "epoch": 18.000985755294373, + "grad_norm": 0.004488931503146887, + "learning_rate": 3.0499437313064573e-05, + "loss": 0.1398, + "step": 83270 + }, + { + "epoch": 18.001039917673186, + "grad_norm": 0.04818711057305336, + "learning_rate": 3.0496428292019475e-05, + "loss": 0.0869, + "step": 83280 + }, + { + "epoch": 18.001094080051995, + "grad_norm": 0.029649704694747925, + "learning_rate": 3.0493419270974382e-05, + "loss": 0.0704, + "step": 83290 + }, + { + "epoch": 18.00114824243081, + "grad_norm": 0.03841274604201317, + "learning_rate": 3.049041024992929e-05, + "loss": 0.1045, + "step": 83300 + }, + { + "epoch": 18.001202404809618, + "grad_norm": 0.004463192541152239, + "learning_rate": 3.0487401228884194e-05, + "loss": 0.1079, + "step": 83310 + }, + { + "epoch": 18.00125656718843, + "grad_norm": 0.30392447113990784, + "learning_rate": 3.04843922078391e-05, + "loss": 0.0036, + "step": 83320 + }, + { + "epoch": 18.001310729567244, + "grad_norm": 0.006497963331639767, + "learning_rate": 3.048138318679401e-05, + "loss": 0.0202, + "step": 83330 + }, + { + "epoch": 18.001364891946054, + "grad_norm": 0.004066881258040667, + "learning_rate": 3.047837416574892e-05, + "loss": 0.0015, + "step": 83340 + }, + { + "epoch": 18.001419054324867, + "grad_norm": 0.008174370042979717, + "learning_rate": 3.0475365144703823e-05, + "loss": 0.0402, + "step": 83350 + }, + { + "epoch": 18.001473216703676, + "grad_norm": 0.010284753516316414, + "learning_rate": 3.047235612365873e-05, + "loss": 0.0971, + "step": 83360 + }, + { + "epoch": 18.00152737908249, + "grad_norm": 0.008110263384878635, + "learning_rate": 3.046934710261364e-05, + "loss": 0.0746, + "step": 83370 + }, + { + "epoch": 18.001581541461302, + "grad_norm": 0.02082757279276848, + "learning_rate": 3.0466338081568545e-05, + "loss": 0.0917, + "step": 83380 + }, + { + "epoch": 18.001635703840112, + "grad_norm": 0.18012209236621857, + "learning_rate": 3.0463329060523448e-05, + "loss": 0.013, + "step": 83390 + }, + { + "epoch": 18.001689866218925, + "grad_norm": 0.0409679040312767, + "learning_rate": 3.0460320039478358e-05, + "loss": 0.0487, + "step": 83400 + }, + { + "epoch": 18.001744028597734, + "grad_norm": 0.09382478147745132, + "learning_rate": 3.0457311018433264e-05, + "loss": 0.0423, + "step": 83410 + }, + { + "epoch": 18.001798190976547, + "grad_norm": 0.5317050218582153, + "learning_rate": 3.0454301997388173e-05, + "loss": 0.1383, + "step": 83420 + }, + { + "epoch": 18.00185235335536, + "grad_norm": 7.2817063331604, + "learning_rate": 3.0451292976343076e-05, + "loss": 0.0522, + "step": 83430 + }, + { + "epoch": 18.00190651573417, + "grad_norm": 0.6537160277366638, + "learning_rate": 3.0448283955297986e-05, + "loss": 0.0695, + "step": 83440 + }, + { + "epoch": 
18.001960678112983, + "grad_norm": 18.56237030029297, + "learning_rate": 3.0445274934252892e-05, + "loss": 0.0582, + "step": 83450 + }, + { + "epoch": 18.002014840491796, + "grad_norm": 0.02808946929872036, + "learning_rate": 3.0442265913207795e-05, + "loss": 0.0026, + "step": 83460 + }, + { + "epoch": 18.002069002870606, + "grad_norm": 0.04322090744972229, + "learning_rate": 3.0439256892162705e-05, + "loss": 0.1687, + "step": 83470 + }, + { + "epoch": 18.00212316524942, + "grad_norm": 0.004183062352240086, + "learning_rate": 3.043624787111761e-05, + "loss": 0.0135, + "step": 83480 + }, + { + "epoch": 18.002177327628228, + "grad_norm": 0.00506970239803195, + "learning_rate": 3.043323885007252e-05, + "loss": 0.0253, + "step": 83490 + }, + { + "epoch": 18.00223149000704, + "grad_norm": 0.003559002885594964, + "learning_rate": 3.0430229829027424e-05, + "loss": 0.1033, + "step": 83500 + }, + { + "epoch": 18.002285652385854, + "grad_norm": 0.004722402896732092, + "learning_rate": 3.0427220807982333e-05, + "loss": 0.0475, + "step": 83510 + }, + { + "epoch": 18.002339814764664, + "grad_norm": 3.3057398796081543, + "learning_rate": 3.042421178693724e-05, + "loss": 0.0436, + "step": 83520 + }, + { + "epoch": 18.002393977143477, + "grad_norm": 0.3171602785587311, + "learning_rate": 3.042120276589215e-05, + "loss": 0.071, + "step": 83530 + }, + { + "epoch": 18.002448139522286, + "grad_norm": 0.16502486169338226, + "learning_rate": 3.0418193744847052e-05, + "loss": 0.0118, + "step": 83540 + }, + { + "epoch": 18.0025023019011, + "grad_norm": 0.042114950716495514, + "learning_rate": 3.041518472380196e-05, + "loss": 0.0467, + "step": 83550 + }, + { + "epoch": 18.002556464279913, + "grad_norm": 0.0041323136538267136, + "learning_rate": 3.0412175702756868e-05, + "loss": 0.1291, + "step": 83560 + }, + { + "epoch": 18.002610626658722, + "grad_norm": 0.004899246152490377, + "learning_rate": 3.0409166681711774e-05, + "loss": 0.029, + "step": 83570 + }, + { + "epoch": 18.002664789037535, + "grad_norm": 0.04742032289505005, + "learning_rate": 3.0406157660666677e-05, + "loss": 0.1119, + "step": 83580 + }, + { + "epoch": 18.002718951416345, + "grad_norm": 0.005130028817802668, + "learning_rate": 3.0403148639621587e-05, + "loss": 0.1193, + "step": 83590 + }, + { + "epoch": 18.002773113795158, + "grad_norm": 1.3778676986694336, + "learning_rate": 3.0400139618576497e-05, + "loss": 0.0403, + "step": 83600 + }, + { + "epoch": 18.00282727617397, + "grad_norm": 0.0089153703302145, + "learning_rate": 3.03971305975314e-05, + "loss": 0.1442, + "step": 83610 + }, + { + "epoch": 18.00288143855278, + "grad_norm": 0.021688953042030334, + "learning_rate": 3.0394121576486306e-05, + "loss": 0.0183, + "step": 83620 + }, + { + "epoch": 18.002935600931593, + "grad_norm": 0.007933493703603745, + "learning_rate": 3.0391112555441215e-05, + "loss": 0.0139, + "step": 83630 + }, + { + "epoch": 18.002989763310406, + "grad_norm": 0.008190508000552654, + "learning_rate": 3.0388103534396122e-05, + "loss": 0.0414, + "step": 83640 + }, + { + "epoch": 18.003043925689216, + "grad_norm": 0.06468312442302704, + "learning_rate": 3.0385094513351025e-05, + "loss": 0.0017, + "step": 83650 + }, + { + "epoch": 18.00309808806803, + "grad_norm": 0.7153671383857727, + "learning_rate": 3.0382085492305934e-05, + "loss": 0.0465, + "step": 83660 + }, + { + "epoch": 18.00315225044684, + "grad_norm": 0.9253516793251038, + "learning_rate": 3.0379076471260844e-05, + "loss": 0.0316, + "step": 83670 + }, + { + "epoch": 18.00320641282565, + "grad_norm": 
0.00948075857013464, + "learning_rate": 3.037606745021575e-05, + "loss": 0.085, + "step": 83680 + }, + { + "epoch": 18.003260575204465, + "grad_norm": 0.013646922074258327, + "learning_rate": 3.0373058429170653e-05, + "loss": 0.0927, + "step": 83690 + }, + { + "epoch": 18.003314737583274, + "grad_norm": 0.02702776901423931, + "learning_rate": 3.0370049408125563e-05, + "loss": 0.0463, + "step": 83700 + }, + { + "epoch": 18.003368899962087, + "grad_norm": 0.0071902526542544365, + "learning_rate": 3.036704038708047e-05, + "loss": 0.0721, + "step": 83710 + }, + { + "epoch": 18.003423062340897, + "grad_norm": 0.050222791731357574, + "learning_rate": 3.036403136603538e-05, + "loss": 0.1098, + "step": 83720 + }, + { + "epoch": 18.00347722471971, + "grad_norm": 0.006100944243371487, + "learning_rate": 3.036102234499028e-05, + "loss": 0.1031, + "step": 83730 + }, + { + "epoch": 18.003531387098523, + "grad_norm": 0.0990576297044754, + "learning_rate": 3.0358013323945188e-05, + "loss": 0.1077, + "step": 83740 + }, + { + "epoch": 18.003585549477332, + "grad_norm": 1.754343867301941, + "learning_rate": 3.0355004302900097e-05, + "loss": 0.0763, + "step": 83750 + }, + { + "epoch": 18.003639711856145, + "grad_norm": 0.4051898419857025, + "learning_rate": 3.0351995281855e-05, + "loss": 0.0687, + "step": 83760 + }, + { + "epoch": 18.003693874234955, + "grad_norm": 0.012271502055227757, + "learning_rate": 3.034898626080991e-05, + "loss": 0.1346, + "step": 83770 + }, + { + "epoch": 18.003748036613768, + "grad_norm": 0.07905948162078857, + "learning_rate": 3.0345977239764816e-05, + "loss": 0.038, + "step": 83780 + }, + { + "epoch": 18.00380219899258, + "grad_norm": 1.8577827215194702, + "learning_rate": 3.0342968218719726e-05, + "loss": 0.0513, + "step": 83790 + }, + { + "epoch": 18.00385636137139, + "grad_norm": 0.36005792021751404, + "learning_rate": 3.033995919767463e-05, + "loss": 0.0628, + "step": 83800 + }, + { + "epoch": 18.003910523750204, + "grad_norm": 0.016782645136117935, + "learning_rate": 3.0336950176629535e-05, + "loss": 0.0102, + "step": 83810 + }, + { + "epoch": 18.003964686129013, + "grad_norm": 1.8451688289642334, + "learning_rate": 3.0333941155584445e-05, + "loss": 0.0695, + "step": 83820 + }, + { + "epoch": 18.004018848507826, + "grad_norm": 0.1273113489151001, + "learning_rate": 3.033093213453935e-05, + "loss": 0.0051, + "step": 83830 + }, + { + "epoch": 18.00407301088664, + "grad_norm": 0.0053852638229727745, + "learning_rate": 3.0327923113494254e-05, + "loss": 0.0022, + "step": 83840 + }, + { + "epoch": 18.00412717326545, + "grad_norm": 0.004974712617695332, + "learning_rate": 3.0324914092449164e-05, + "loss": 0.0547, + "step": 83850 + }, + { + "epoch": 18.00418133564426, + "grad_norm": 0.005309861619025469, + "learning_rate": 3.0321905071404073e-05, + "loss": 0.0198, + "step": 83860 + }, + { + "epoch": 18.004235498023075, + "grad_norm": 0.18100881576538086, + "learning_rate": 3.031889605035898e-05, + "loss": 0.0638, + "step": 83870 + }, + { + "epoch": 18.004289660401884, + "grad_norm": 1.7727916240692139, + "learning_rate": 3.0315887029313882e-05, + "loss": 0.0237, + "step": 83880 + }, + { + "epoch": 18.004343822780697, + "grad_norm": 0.1262628734111786, + "learning_rate": 3.0312878008268792e-05, + "loss": 0.074, + "step": 83890 + }, + { + "epoch": 18.004397985159507, + "grad_norm": 1.1704022884368896, + "learning_rate": 3.03098689872237e-05, + "loss": 0.0833, + "step": 83900 + }, + { + "epoch": 18.00445214753832, + "grad_norm": 0.005934743210673332, + "learning_rate": 
3.03068599661786e-05, + "loss": 0.0204, + "step": 83910 + }, + { + "epoch": 18.004506309917133, + "grad_norm": 0.005279392935335636, + "learning_rate": 3.030385094513351e-05, + "loss": 0.0033, + "step": 83920 + }, + { + "epoch": 18.004560472295942, + "grad_norm": 0.016770683228969574, + "learning_rate": 3.030084192408842e-05, + "loss": 0.163, + "step": 83930 + }, + { + "epoch": 18.004614634674756, + "grad_norm": 0.006376330275088549, + "learning_rate": 3.0297832903043327e-05, + "loss": 0.0129, + "step": 83940 + }, + { + "epoch": 18.004668797053565, + "grad_norm": 1.9398974180221558, + "learning_rate": 3.029482388199823e-05, + "loss": 0.0454, + "step": 83950 + }, + { + "epoch": 18.004722959432378, + "grad_norm": 0.48712730407714844, + "learning_rate": 3.029181486095314e-05, + "loss": 0.1479, + "step": 83960 + }, + { + "epoch": 18.00477712181119, + "grad_norm": 0.3834538757801056, + "learning_rate": 3.0288805839908046e-05, + "loss": 0.0255, + "step": 83970 + }, + { + "epoch": 18.00483128419, + "grad_norm": 0.0037678389344364405, + "learning_rate": 3.0285796818862955e-05, + "loss": 0.0407, + "step": 83980 + }, + { + "epoch": 18.004885446568814, + "grad_norm": 0.008990883827209473, + "learning_rate": 3.0282787797817858e-05, + "loss": 0.0282, + "step": 83990 + }, + { + "epoch": 18.004939608947623, + "grad_norm": 1.7221921682357788, + "learning_rate": 3.0279778776772765e-05, + "loss": 0.0282, + "step": 84000 + }, + { + "epoch": 18.004993771326436, + "grad_norm": 0.5077130198478699, + "learning_rate": 3.0276769755727674e-05, + "loss": 0.0606, + "step": 84010 + }, + { + "epoch": 18.00504793370525, + "grad_norm": 0.004111565183848143, + "learning_rate": 3.0273760734682584e-05, + "loss": 0.0009, + "step": 84020 + }, + { + "epoch": 18.00510209608406, + "grad_norm": 0.002945445477962494, + "learning_rate": 3.0270751713637487e-05, + "loss": 0.1582, + "step": 84030 + }, + { + "epoch": 18.005156258462872, + "grad_norm": 0.032123010605573654, + "learning_rate": 3.0267742692592393e-05, + "loss": 0.0281, + "step": 84040 + }, + { + "epoch": 18.005210420841685, + "grad_norm": 0.029956279322504997, + "learning_rate": 3.0264733671547303e-05, + "loss": 0.0256, + "step": 84050 + }, + { + "epoch": 18.005264583220495, + "grad_norm": 0.005594940856099129, + "learning_rate": 3.0261724650502206e-05, + "loss": 0.0191, + "step": 84060 + }, + { + "epoch": 18.005318745599308, + "grad_norm": 0.4621908962726593, + "learning_rate": 3.0258715629457112e-05, + "loss": 0.021, + "step": 84070 + }, + { + "epoch": 18.005372907978117, + "grad_norm": 0.0031613658647984266, + "learning_rate": 3.025570660841202e-05, + "loss": 0.0729, + "step": 84080 + }, + { + "epoch": 18.00542707035693, + "grad_norm": 0.0030368026345968246, + "learning_rate": 3.0252697587366928e-05, + "loss": 0.0898, + "step": 84090 + }, + { + "epoch": 18.005481232735743, + "grad_norm": 0.06072474271059036, + "learning_rate": 3.024968856632183e-05, + "loss": 0.0065, + "step": 84100 + }, + { + "epoch": 18.005535395114553, + "grad_norm": 0.5097464323043823, + "learning_rate": 3.024667954527674e-05, + "loss": 0.0353, + "step": 84110 + }, + { + "epoch": 18.005589557493366, + "grad_norm": 0.387661874294281, + "learning_rate": 3.024367052423165e-05, + "loss": 0.0318, + "step": 84120 + }, + { + "epoch": 18.005643719872175, + "grad_norm": 0.037506088614463806, + "learning_rate": 3.0240661503186556e-05, + "loss": 0.1671, + "step": 84130 + }, + { + "epoch": 18.00569788225099, + "grad_norm": 4.206666946411133, + "learning_rate": 3.023765248214146e-05, + "loss": 0.0678, + "step": 
84140 + }, + { + "epoch": 18.0057520446298, + "grad_norm": 0.004361710045486689, + "learning_rate": 3.023464346109637e-05, + "loss": 0.0823, + "step": 84150 + }, + { + "epoch": 18.00580620700861, + "grad_norm": 0.02128193899989128, + "learning_rate": 3.0231634440051275e-05, + "loss": 0.0106, + "step": 84160 + }, + { + "epoch": 18.005860369387424, + "grad_norm": 0.3308612108230591, + "learning_rate": 3.0228625419006185e-05, + "loss": 0.013, + "step": 84170 + }, + { + "epoch": 18.005914531766233, + "grad_norm": 0.00390449701808393, + "learning_rate": 3.0225616397961088e-05, + "loss": 0.0334, + "step": 84180 + }, + { + "epoch": 18.005968694145047, + "grad_norm": 0.06645824015140533, + "learning_rate": 3.0222607376915997e-05, + "loss": 0.0005, + "step": 84190 + }, + { + "epoch": 18.00602285652386, + "grad_norm": 0.006427777465432882, + "learning_rate": 3.0219598355870904e-05, + "loss": 0.0597, + "step": 84200 + }, + { + "epoch": 18.00607701890267, + "grad_norm": 2.1761250495910645, + "learning_rate": 3.0216589334825806e-05, + "loss": 0.0544, + "step": 84210 + }, + { + "epoch": 18.006131181281482, + "grad_norm": 0.01939866505563259, + "learning_rate": 3.0213580313780716e-05, + "loss": 0.0894, + "step": 84220 + }, + { + "epoch": 18.006185343660295, + "grad_norm": 0.003732192562893033, + "learning_rate": 3.0210571292735622e-05, + "loss": 0.0738, + "step": 84230 + }, + { + "epoch": 18.006239506039105, + "grad_norm": 0.9151702523231506, + "learning_rate": 3.0207562271690532e-05, + "loss": 0.0135, + "step": 84240 + }, + { + "epoch": 18.006293668417918, + "grad_norm": 0.0036934996023774147, + "learning_rate": 3.0204553250645435e-05, + "loss": 0.0628, + "step": 84250 + }, + { + "epoch": 18.006347830796727, + "grad_norm": 0.009111727587878704, + "learning_rate": 3.020154422960034e-05, + "loss": 0.0685, + "step": 84260 + }, + { + "epoch": 18.00640199317554, + "grad_norm": 1.192131519317627, + "learning_rate": 3.019853520855525e-05, + "loss": 0.05, + "step": 84270 + }, + { + "epoch": 18.006456155554353, + "grad_norm": 0.03894825279712677, + "learning_rate": 3.019552618751016e-05, + "loss": 0.0264, + "step": 84280 + }, + { + "epoch": 18.006510317933163, + "grad_norm": 0.29860439896583557, + "learning_rate": 3.0192517166465063e-05, + "loss": 0.003, + "step": 84290 + }, + { + "epoch": 18.006564480311976, + "grad_norm": 5.857994079589844, + "learning_rate": 3.018950814541997e-05, + "loss": 0.255, + "step": 84300 + }, + { + "epoch": 18.006618642690785, + "grad_norm": 1.5958999395370483, + "learning_rate": 3.018649912437488e-05, + "loss": 0.1284, + "step": 84310 + }, + { + "epoch": 18.0066728050696, + "grad_norm": 0.021587321534752846, + "learning_rate": 3.0183490103329786e-05, + "loss": 0.0571, + "step": 84320 + }, + { + "epoch": 18.00672696744841, + "grad_norm": 0.007211804855614901, + "learning_rate": 3.018048108228469e-05, + "loss": 0.0504, + "step": 84330 + }, + { + "epoch": 18.00678112982722, + "grad_norm": 0.037680916488170624, + "learning_rate": 3.0177472061239598e-05, + "loss": 0.0187, + "step": 84340 + }, + { + "epoch": 18.006835292206034, + "grad_norm": 0.012363355606794357, + "learning_rate": 3.0174463040194504e-05, + "loss": 0.0165, + "step": 84350 + }, + { + "epoch": 18.006889454584844, + "grad_norm": 0.6275243759155273, + "learning_rate": 3.0171454019149407e-05, + "loss": 0.0156, + "step": 84360 + }, + { + "epoch": 18.006943616963657, + "grad_norm": 0.007159887347370386, + "learning_rate": 3.0168444998104317e-05, + "loss": 0.0229, + "step": 84370 + }, + { + "epoch": 18.00699777934247, + 
"grad_norm": 0.02356140688061714, + "learning_rate": 3.0165435977059227e-05, + "loss": 0.1327, + "step": 84380 + }, + { + "epoch": 18.00705194172128, + "grad_norm": 0.02900024875998497, + "learning_rate": 3.0162426956014133e-05, + "loss": 0.0714, + "step": 84390 + }, + { + "epoch": 18.007106104100092, + "grad_norm": 0.2796241044998169, + "learning_rate": 3.0159417934969036e-05, + "loss": 0.0694, + "step": 84400 + }, + { + "epoch": 18.007160266478905, + "grad_norm": 0.010139483958482742, + "learning_rate": 3.0156408913923945e-05, + "loss": 0.0034, + "step": 84410 + }, + { + "epoch": 18.007214428857715, + "grad_norm": 0.045028142631053925, + "learning_rate": 3.0153399892878852e-05, + "loss": 0.0694, + "step": 84420 + }, + { + "epoch": 18.007268591236528, + "grad_norm": 0.0548754520714283, + "learning_rate": 3.015039087183376e-05, + "loss": 0.0421, + "step": 84430 + }, + { + "epoch": 18.007322753615338, + "grad_norm": 0.02787497267127037, + "learning_rate": 3.0147381850788664e-05, + "loss": 0.0255, + "step": 84440 + }, + { + "epoch": 18.00737691599415, + "grad_norm": 6.560012340545654, + "learning_rate": 3.0144372829743574e-05, + "loss": 0.0736, + "step": 84450 + }, + { + "epoch": 18.007431078372964, + "grad_norm": 0.03778275474905968, + "learning_rate": 3.014136380869848e-05, + "loss": 0.1083, + "step": 84460 + }, + { + "epoch": 18.007485240751773, + "grad_norm": 0.003622845048084855, + "learning_rate": 3.013835478765339e-05, + "loss": 0.0483, + "step": 84470 + }, + { + "epoch": 18.007539403130586, + "grad_norm": 1.4898051023483276, + "learning_rate": 3.0135345766608293e-05, + "loss": 0.063, + "step": 84480 + }, + { + "epoch": 18.007593565509396, + "grad_norm": 0.4190404415130615, + "learning_rate": 3.01323367455632e-05, + "loss": 0.0458, + "step": 84490 + }, + { + "epoch": 18.00764772788821, + "grad_norm": 0.2315373569726944, + "learning_rate": 3.012932772451811e-05, + "loss": 0.004, + "step": 84500 + }, + { + "epoch": 18.007701890267022, + "grad_norm": 0.03511878475546837, + "learning_rate": 3.012631870347301e-05, + "loss": 0.0517, + "step": 84510 + }, + { + "epoch": 18.00775605264583, + "grad_norm": 0.0730569139122963, + "learning_rate": 3.0123309682427918e-05, + "loss": 0.0076, + "step": 84520 + }, + { + "epoch": 18.007810215024644, + "grad_norm": 0.014580407179892063, + "learning_rate": 3.0120300661382828e-05, + "loss": 0.058, + "step": 84530 + }, + { + "epoch": 18.007864377403454, + "grad_norm": 0.018189644441008568, + "learning_rate": 3.0117291640337737e-05, + "loss": 0.0624, + "step": 84540 + }, + { + "epoch": 18.007918539782267, + "grad_norm": 0.004361574072390795, + "learning_rate": 3.011428261929264e-05, + "loss": 0.0801, + "step": 84550 + }, + { + "epoch": 18.00797270216108, + "grad_norm": 0.004275106359273195, + "learning_rate": 3.0111273598247546e-05, + "loss": 0.0452, + "step": 84560 + }, + { + "epoch": 18.00802686453989, + "grad_norm": 0.002720784628763795, + "learning_rate": 3.0108264577202456e-05, + "loss": 0.0033, + "step": 84570 + }, + { + "epoch": 18.008081026918703, + "grad_norm": 2.346174955368042, + "learning_rate": 3.0105255556157362e-05, + "loss": 0.0399, + "step": 84580 + }, + { + "epoch": 18.008135189297516, + "grad_norm": 4.447365760803223, + "learning_rate": 3.0102246535112265e-05, + "loss": 0.0805, + "step": 84590 + }, + { + "epoch": 18.008189351676325, + "grad_norm": 0.005568996071815491, + "learning_rate": 3.0099237514067175e-05, + "loss": 0.0097, + "step": 84600 + }, + { + "epoch": 18.00824351405514, + "grad_norm": 0.5795577764511108, + "learning_rate": 
3.009622849302208e-05, + "loss": 0.0183, + "step": 84610 + }, + { + "epoch": 18.008297676433948, + "grad_norm": 0.49374791979789734, + "learning_rate": 3.009321947197699e-05, + "loss": 0.0329, + "step": 84620 + }, + { + "epoch": 18.00835183881276, + "grad_norm": 1.3205294609069824, + "learning_rate": 3.0090210450931894e-05, + "loss": 0.0733, + "step": 84630 + }, + { + "epoch": 18.008406001191574, + "grad_norm": 0.0029420279897749424, + "learning_rate": 3.0087201429886803e-05, + "loss": 0.1237, + "step": 84640 + }, + { + "epoch": 18.008460163570383, + "grad_norm": 0.0023197231348603964, + "learning_rate": 3.008419240884171e-05, + "loss": 0.0081, + "step": 84650 + }, + { + "epoch": 18.008514325949196, + "grad_norm": 0.00841154158115387, + "learning_rate": 3.0081183387796613e-05, + "loss": 0.006, + "step": 84660 + }, + { + "epoch": 18.008568488328006, + "grad_norm": 0.0021520776208490133, + "learning_rate": 3.0078174366751522e-05, + "loss": 0.01, + "step": 84670 + }, + { + "epoch": 18.00862265070682, + "grad_norm": 0.05737295001745224, + "learning_rate": 3.007516534570643e-05, + "loss": 0.0924, + "step": 84680 + }, + { + "epoch": 18.008676813085632, + "grad_norm": 0.02091067098081112, + "learning_rate": 3.0072156324661338e-05, + "loss": 0.0605, + "step": 84690 + }, + { + "epoch": 18.00873097546444, + "grad_norm": 0.0046985638327896595, + "learning_rate": 3.006914730361624e-05, + "loss": 0.0197, + "step": 84700 + }, + { + "epoch": 18.008785137843255, + "grad_norm": 0.038174573332071304, + "learning_rate": 3.006613828257115e-05, + "loss": 0.001, + "step": 84710 + }, + { + "epoch": 18.008839300222064, + "grad_norm": 0.1739526093006134, + "learning_rate": 3.0063129261526057e-05, + "loss": 0.1191, + "step": 84720 + }, + { + "epoch": 18.008893462600877, + "grad_norm": 8.706696510314941, + "learning_rate": 3.0060120240480967e-05, + "loss": 0.1272, + "step": 84730 + }, + { + "epoch": 18.00894762497969, + "grad_norm": 0.0038021658547222614, + "learning_rate": 3.005711121943587e-05, + "loss": 0.0708, + "step": 84740 + }, + { + "epoch": 18.0090017873585, + "grad_norm": 0.03016846999526024, + "learning_rate": 3.0054102198390776e-05, + "loss": 0.121, + "step": 84750 + }, + { + "epoch": 18.009055949737313, + "grad_norm": 0.01706627756357193, + "learning_rate": 3.0051093177345685e-05, + "loss": 0.0958, + "step": 84760 + }, + { + "epoch": 18.009110112116126, + "grad_norm": 0.29685908555984497, + "learning_rate": 3.0048084156300592e-05, + "loss": 0.0333, + "step": 84770 + }, + { + "epoch": 18.009164274494935, + "grad_norm": 0.017768753692507744, + "learning_rate": 3.0045075135255495e-05, + "loss": 0.0346, + "step": 84780 + }, + { + "epoch": 18.00921843687375, + "grad_norm": 0.023344583809375763, + "learning_rate": 3.0042066114210404e-05, + "loss": 0.0236, + "step": 84790 + }, + { + "epoch": 18.009272599252558, + "grad_norm": 0.04603537172079086, + "learning_rate": 3.0039057093165314e-05, + "loss": 0.0041, + "step": 84800 + }, + { + "epoch": 18.00932676163137, + "grad_norm": 0.03243299573659897, + "learning_rate": 3.0036048072120217e-05, + "loss": 0.0644, + "step": 84810 + }, + { + "epoch": 18.009380924010184, + "grad_norm": 0.0023723982740193605, + "learning_rate": 3.0033039051075123e-05, + "loss": 0.0119, + "step": 84820 + }, + { + "epoch": 18.009435086388994, + "grad_norm": 1.943078875541687, + "learning_rate": 3.0030030030030033e-05, + "loss": 0.2178, + "step": 84830 + }, + { + "epoch": 18.009489248767807, + "grad_norm": 0.40767672657966614, + "learning_rate": 3.002702100898494e-05, + "loss": 0.0234, + 
"step": 84840 + }, + { + "epoch": 18.009543411146616, + "grad_norm": 0.0025926416274160147, + "learning_rate": 3.0024011987939842e-05, + "loss": 0.0065, + "step": 84850 + }, + { + "epoch": 18.00959757352543, + "grad_norm": 0.0026119661051779985, + "learning_rate": 3.002100296689475e-05, + "loss": 0.0246, + "step": 84860 + }, + { + "epoch": 18.009651735904242, + "grad_norm": 0.3102646768093109, + "learning_rate": 3.0017993945849658e-05, + "loss": 0.0022, + "step": 84870 + }, + { + "epoch": 18.009705898283052, + "grad_norm": 0.003222233848646283, + "learning_rate": 3.0014984924804568e-05, + "loss": 0.0649, + "step": 84880 + }, + { + "epoch": 18.009760060661865, + "grad_norm": 0.0024259104393422604, + "learning_rate": 3.001197590375947e-05, + "loss": 0.0049, + "step": 84890 + }, + { + "epoch": 18.009814223040674, + "grad_norm": 0.013145947828888893, + "learning_rate": 3.000896688271438e-05, + "loss": 0.0027, + "step": 84900 + }, + { + "epoch": 18.009868385419487, + "grad_norm": 3.984689474105835, + "learning_rate": 3.0005957861669286e-05, + "loss": 0.0894, + "step": 84910 + }, + { + "epoch": 18.0099225477983, + "grad_norm": 0.00923980213701725, + "learning_rate": 3.0002948840624196e-05, + "loss": 0.0011, + "step": 84920 + }, + { + "epoch": 18.00997671017711, + "grad_norm": 0.002573007019236684, + "learning_rate": 2.99999398195791e-05, + "loss": 0.0776, + "step": 84930 + }, + { + "epoch": 18.010030872555923, + "grad_norm": 3.5921010971069336, + "learning_rate": 2.9996930798534005e-05, + "loss": 0.0552, + "step": 84940 + }, + { + "epoch": 18.010085034934733, + "grad_norm": 4.7312235832214355, + "learning_rate": 2.9993921777488915e-05, + "loss": 0.0349, + "step": 84950 + }, + { + "epoch": 18.010139197313546, + "grad_norm": 2.038822650909424, + "learning_rate": 2.9990912756443818e-05, + "loss": 0.0478, + "step": 84960 + }, + { + "epoch": 18.01019335969236, + "grad_norm": 0.08880063891410828, + "learning_rate": 2.9987903735398727e-05, + "loss": 0.0195, + "step": 84970 + }, + { + "epoch": 18.010247522071168, + "grad_norm": 0.05814759060740471, + "learning_rate": 2.9984894714353634e-05, + "loss": 0.0123, + "step": 84980 + }, + { + "epoch": 18.01030168444998, + "grad_norm": 0.6709592342376709, + "learning_rate": 2.9981885693308543e-05, + "loss": 0.0676, + "step": 84990 + }, + { + "epoch": 18.010355846828794, + "grad_norm": 0.03155878186225891, + "learning_rate": 2.9978876672263446e-05, + "loss": 0.056, + "step": 85000 + }, + { + "epoch": 18.010410009207604, + "grad_norm": 0.2785189151763916, + "learning_rate": 2.9975867651218352e-05, + "loss": 0.0033, + "step": 85010 + }, + { + "epoch": 18.010464171586417, + "grad_norm": 0.030018135905265808, + "learning_rate": 2.9972858630173262e-05, + "loss": 0.0284, + "step": 85020 + }, + { + "epoch": 18.010518333965226, + "grad_norm": 0.0021084612235426903, + "learning_rate": 2.996984960912817e-05, + "loss": 0.0089, + "step": 85030 + }, + { + "epoch": 18.01057249634404, + "grad_norm": 0.0051675960421562195, + "learning_rate": 2.996684058808307e-05, + "loss": 0.0453, + "step": 85040 + }, + { + "epoch": 18.010626658722853, + "grad_norm": 0.014508464373648167, + "learning_rate": 2.996383156703798e-05, + "loss": 0.0347, + "step": 85050 + }, + { + "epoch": 18.010680821101662, + "grad_norm": 0.17373226583003998, + "learning_rate": 2.996082254599289e-05, + "loss": 0.0564, + "step": 85060 + }, + { + "epoch": 18.010734983480475, + "grad_norm": 0.0021932823583483696, + "learning_rate": 2.9957813524947797e-05, + "loss": 0.0669, + "step": 85070 + }, + { + "epoch": 
18.010789145859285, + "grad_norm": 0.0024943617172539234, + "learning_rate": 2.99548045039027e-05, + "loss": 0.0963, + "step": 85080 + }, + { + "epoch": 18.010843308238098, + "grad_norm": 9.035041809082031, + "learning_rate": 2.995179548285761e-05, + "loss": 0.2004, + "step": 85090 + }, + { + "epoch": 18.01089747061691, + "grad_norm": 0.10927310585975647, + "learning_rate": 2.9948786461812516e-05, + "loss": 0.098, + "step": 85100 + }, + { + "epoch": 18.01095163299572, + "grad_norm": 0.4712069034576416, + "learning_rate": 2.994577744076742e-05, + "loss": 0.0174, + "step": 85110 + }, + { + "epoch": 18.011005795374533, + "grad_norm": 0.0029552436899393797, + "learning_rate": 2.9942768419722328e-05, + "loss": 0.0775, + "step": 85120 + }, + { + "epoch": 18.011059957753343, + "grad_norm": 0.0029875708278268576, + "learning_rate": 2.9939759398677235e-05, + "loss": 0.1042, + "step": 85130 + }, + { + "epoch": 18.011114120132156, + "grad_norm": 0.02923324704170227, + "learning_rate": 2.9936750377632144e-05, + "loss": 0.07, + "step": 85140 + }, + { + "epoch": 18.01116828251097, + "grad_norm": 6.090675354003906, + "learning_rate": 2.9933741356587047e-05, + "loss": 0.0299, + "step": 85150 + }, + { + "epoch": 18.01122244488978, + "grad_norm": 0.34694892168045044, + "learning_rate": 2.9930732335541957e-05, + "loss": 0.0086, + "step": 85160 + }, + { + "epoch": 18.01127660726859, + "grad_norm": 0.4528650939464569, + "learning_rate": 2.9927723314496863e-05, + "loss": 0.046, + "step": 85170 + }, + { + "epoch": 18.011330769647405, + "grad_norm": 0.027434952557086945, + "learning_rate": 2.9924714293451773e-05, + "loss": 0.0352, + "step": 85180 + }, + { + "epoch": 18.011384932026214, + "grad_norm": 0.0041155689395964146, + "learning_rate": 2.9921705272406676e-05, + "loss": 0.0018, + "step": 85190 + }, + { + "epoch": 18.011439094405027, + "grad_norm": 0.03116792067885399, + "learning_rate": 2.9918696251361582e-05, + "loss": 0.0852, + "step": 85200 + }, + { + "epoch": 18.011493256783837, + "grad_norm": 0.03521978110074997, + "learning_rate": 2.991568723031649e-05, + "loss": 0.1164, + "step": 85210 + }, + { + "epoch": 18.01154741916265, + "grad_norm": 0.9789474010467529, + "learning_rate": 2.99126782092714e-05, + "loss": 0.0118, + "step": 85220 + }, + { + "epoch": 18.011601581541463, + "grad_norm": 0.016001256182789803, + "learning_rate": 2.9909669188226304e-05, + "loss": 0.1288, + "step": 85230 + }, + { + "epoch": 18.011655743920272, + "grad_norm": 5.154709339141846, + "learning_rate": 2.990666016718121e-05, + "loss": 0.0552, + "step": 85240 + }, + { + "epoch": 18.011709906299085, + "grad_norm": 0.0028060576878488064, + "learning_rate": 2.990365114613612e-05, + "loss": 0.0097, + "step": 85250 + }, + { + "epoch": 18.011764068677895, + "grad_norm": 0.5005328059196472, + "learning_rate": 2.9900642125091023e-05, + "loss": 0.0294, + "step": 85260 + }, + { + "epoch": 18.011818231056708, + "grad_norm": 0.005662641488015652, + "learning_rate": 2.989763310404593e-05, + "loss": 0.0194, + "step": 85270 + }, + { + "epoch": 18.01187239343552, + "grad_norm": 0.33646807074546814, + "learning_rate": 2.989462408300084e-05, + "loss": 0.041, + "step": 85280 + }, + { + "epoch": 18.01192655581433, + "grad_norm": 0.02881225198507309, + "learning_rate": 2.9891615061955745e-05, + "loss": 0.1632, + "step": 85290 + }, + { + "epoch": 18.011980718193144, + "grad_norm": 4.306267738342285, + "learning_rate": 2.9888606040910648e-05, + "loss": 0.052, + "step": 85300 + }, + { + "epoch": 18.012034880571953, + "grad_norm": 0.07004830986261368, + 
"learning_rate": 2.9885597019865558e-05, + "loss": 0.0172, + "step": 85310 + }, + { + "epoch": 18.012089042950766, + "grad_norm": 0.7135646343231201, + "learning_rate": 2.9882587998820467e-05, + "loss": 0.1439, + "step": 85320 + }, + { + "epoch": 18.01214320532958, + "grad_norm": 0.021406080573797226, + "learning_rate": 2.9879578977775374e-05, + "loss": 0.0603, + "step": 85330 + }, + { + "epoch": 18.01219736770839, + "grad_norm": 2.9345085620880127, + "learning_rate": 2.9876569956730276e-05, + "loss": 0.1442, + "step": 85340 + }, + { + "epoch": 18.0122515300872, + "grad_norm": 0.33712857961654663, + "learning_rate": 2.9873560935685186e-05, + "loss": 0.0765, + "step": 85350 + }, + { + "epoch": 18.012305692466015, + "grad_norm": 3.621126413345337, + "learning_rate": 2.9870551914640092e-05, + "loss": 0.097, + "step": 85360 + }, + { + "epoch": 18.012359854844824, + "grad_norm": 0.035187818109989166, + "learning_rate": 2.9867542893595002e-05, + "loss": 0.0832, + "step": 85370 + }, + { + "epoch": 18.012414017223637, + "grad_norm": 0.030368724837899208, + "learning_rate": 2.9864533872549905e-05, + "loss": 0.0095, + "step": 85380 + }, + { + "epoch": 18.012468179602447, + "grad_norm": 0.6743516325950623, + "learning_rate": 2.9861524851504815e-05, + "loss": 0.0476, + "step": 85390 + }, + { + "epoch": 18.01252234198126, + "grad_norm": 0.014918413013219833, + "learning_rate": 2.985851583045972e-05, + "loss": 0.0035, + "step": 85400 + }, + { + "epoch": 18.012576504360073, + "grad_norm": 0.0036783029790967703, + "learning_rate": 2.9855506809414624e-05, + "loss": 0.0505, + "step": 85410 + }, + { + "epoch": 18.012630666738882, + "grad_norm": 0.03820263594388962, + "learning_rate": 2.9852497788369533e-05, + "loss": 0.0803, + "step": 85420 + }, + { + "epoch": 18.012684829117696, + "grad_norm": 1.3831815719604492, + "learning_rate": 2.984948876732444e-05, + "loss": 0.0273, + "step": 85430 + }, + { + "epoch": 18.012738991496505, + "grad_norm": 1.9572582244873047, + "learning_rate": 2.984647974627935e-05, + "loss": 0.0452, + "step": 85440 + }, + { + "epoch": 18.012793153875318, + "grad_norm": 0.007714897394180298, + "learning_rate": 2.9843470725234252e-05, + "loss": 0.0925, + "step": 85450 + }, + { + "epoch": 18.01284731625413, + "grad_norm": 0.02182268537580967, + "learning_rate": 2.984046170418916e-05, + "loss": 0.0241, + "step": 85460 + }, + { + "epoch": 18.01290147863294, + "grad_norm": 0.1186097040772438, + "learning_rate": 2.9837452683144068e-05, + "loss": 0.1454, + "step": 85470 + }, + { + "epoch": 18.012955641011754, + "grad_norm": 1.1849571466445923, + "learning_rate": 2.9834443662098978e-05, + "loss": 0.1894, + "step": 85480 + }, + { + "epoch": 18.013009803390563, + "grad_norm": 0.018674734979867935, + "learning_rate": 2.983143464105388e-05, + "loss": 0.0882, + "step": 85490 + }, + { + "epoch": 18.013063965769376, + "grad_norm": 0.2028966248035431, + "learning_rate": 2.9828425620008787e-05, + "loss": 0.0978, + "step": 85500 + }, + { + "epoch": 18.01311812814819, + "grad_norm": 0.004257789347320795, + "learning_rate": 2.9825416598963697e-05, + "loss": 0.0245, + "step": 85510 + }, + { + "epoch": 18.013172290527, + "grad_norm": 0.012982035987079144, + "learning_rate": 2.9822407577918603e-05, + "loss": 0.0658, + "step": 85520 + }, + { + "epoch": 18.013226452905812, + "grad_norm": 0.007616781163960695, + "learning_rate": 2.9819398556873506e-05, + "loss": 0.0078, + "step": 85530 + }, + { + "epoch": 18.013280615284625, + "grad_norm": 0.14533527195453644, + "learning_rate": 2.9816389535828416e-05, + "loss": 
0.0403, + "step": 85540 + }, + { + "epoch": 18.013334777663434, + "grad_norm": 0.21640439331531525, + "learning_rate": 2.9813380514783322e-05, + "loss": 0.0201, + "step": 85550 + }, + { + "epoch": 18.013388940042248, + "grad_norm": 0.9871035218238831, + "learning_rate": 2.9810371493738225e-05, + "loss": 0.0116, + "step": 85560 + }, + { + "epoch": 18.013443102421057, + "grad_norm": 0.8962903618812561, + "learning_rate": 2.9807362472693134e-05, + "loss": 0.0392, + "step": 85570 + }, + { + "epoch": 18.01349726479987, + "grad_norm": 0.0658143162727356, + "learning_rate": 2.9804353451648044e-05, + "loss": 0.0709, + "step": 85580 + }, + { + "epoch": 18.013551427178683, + "grad_norm": 0.11079297959804535, + "learning_rate": 2.980134443060295e-05, + "loss": 0.0887, + "step": 85590 + }, + { + "epoch": 18.013605589557493, + "grad_norm": 0.02086169458925724, + "learning_rate": 2.9798335409557853e-05, + "loss": 0.0873, + "step": 85600 + }, + { + "epoch": 18.013659751936306, + "grad_norm": 0.003013269742950797, + "learning_rate": 2.9795326388512763e-05, + "loss": 0.0091, + "step": 85610 + }, + { + "epoch": 18.013713914315115, + "grad_norm": 0.6525566577911377, + "learning_rate": 2.979231736746767e-05, + "loss": 0.056, + "step": 85620 + }, + { + "epoch": 18.01376807669393, + "grad_norm": 0.0032188964542001486, + "learning_rate": 2.978930834642258e-05, + "loss": 0.0477, + "step": 85630 + }, + { + "epoch": 18.01382223907274, + "grad_norm": 3.14029860496521, + "learning_rate": 2.978629932537748e-05, + "loss": 0.0638, + "step": 85640 + }, + { + "epoch": 18.01387640145155, + "grad_norm": 0.10068132728338242, + "learning_rate": 2.978329030433239e-05, + "loss": 0.0377, + "step": 85650 + }, + { + "epoch": 18.013930563830364, + "grad_norm": 0.30688348412513733, + "learning_rate": 2.9780281283287298e-05, + "loss": 0.0022, + "step": 85660 + }, + { + "epoch": 18.013984726209173, + "grad_norm": 5.353458881378174, + "learning_rate": 2.9777272262242207e-05, + "loss": 0.0925, + "step": 85670 + }, + { + "epoch": 18.014038888587987, + "grad_norm": 0.09207334369421005, + "learning_rate": 2.977426324119711e-05, + "loss": 0.0023, + "step": 85680 + }, + { + "epoch": 18.0140930509668, + "grad_norm": 0.14609678089618683, + "learning_rate": 2.9771254220152016e-05, + "loss": 0.0834, + "step": 85690 + }, + { + "epoch": 18.01414721334561, + "grad_norm": 0.006684349849820137, + "learning_rate": 2.9768245199106926e-05, + "loss": 0.0031, + "step": 85700 + }, + { + "epoch": 18.014201375724422, + "grad_norm": 0.00769747793674469, + "learning_rate": 2.976523617806183e-05, + "loss": 0.013, + "step": 85710 + }, + { + "epoch": 18.014255538103235, + "grad_norm": 0.07141028344631195, + "learning_rate": 2.9762227157016735e-05, + "loss": 0.0848, + "step": 85720 + }, + { + "epoch": 18.014309700482045, + "grad_norm": 0.009110284969210625, + "learning_rate": 2.9759218135971645e-05, + "loss": 0.0531, + "step": 85730 + }, + { + "epoch": 18.014363862860858, + "grad_norm": 0.002824460156261921, + "learning_rate": 2.9756209114926555e-05, + "loss": 0.0938, + "step": 85740 + }, + { + "epoch": 18.014418025239667, + "grad_norm": 1.505582571029663, + "learning_rate": 2.9753200093881457e-05, + "loss": 0.0622, + "step": 85750 + }, + { + "epoch": 18.01447218761848, + "grad_norm": 0.19124896824359894, + "learning_rate": 2.9750191072836364e-05, + "loss": 0.0293, + "step": 85760 + }, + { + "epoch": 18.014526349997293, + "grad_norm": 0.07002335041761398, + "learning_rate": 2.9747182051791273e-05, + "loss": 0.0986, + "step": 85770 + }, + { + "epoch": 
18.014580512376103, + "grad_norm": 0.0027643984649330378, + "learning_rate": 2.974417303074618e-05, + "loss": 0.0257, + "step": 85780 + }, + { + "epoch": 18.014634674754916, + "grad_norm": 0.0028041922487318516, + "learning_rate": 2.9741164009701083e-05, + "loss": 0.0582, + "step": 85790 + }, + { + "epoch": 18.014688837133725, + "grad_norm": 0.0027535257395356894, + "learning_rate": 2.9738154988655992e-05, + "loss": 0.1098, + "step": 85800 + }, + { + "epoch": 18.01474299951254, + "grad_norm": 1.058811902999878, + "learning_rate": 2.97351459676109e-05, + "loss": 0.1111, + "step": 85810 + }, + { + "epoch": 18.01479716189135, + "grad_norm": 0.002876501064747572, + "learning_rate": 2.9732136946565808e-05, + "loss": 0.0177, + "step": 85820 + }, + { + "epoch": 18.01485132427016, + "grad_norm": 4.899694442749023, + "learning_rate": 2.972912792552071e-05, + "loss": 0.0619, + "step": 85830 + }, + { + "epoch": 18.014905486648974, + "grad_norm": 0.3254466652870178, + "learning_rate": 2.972611890447562e-05, + "loss": 0.0693, + "step": 85840 + }, + { + "epoch": 18.014959649027784, + "grad_norm": 0.9936700463294983, + "learning_rate": 2.9723109883430527e-05, + "loss": 0.0836, + "step": 85850 + }, + { + "epoch": 18.015013811406597, + "grad_norm": 0.005114758852869272, + "learning_rate": 2.972010086238543e-05, + "loss": 0.0712, + "step": 85860 + }, + { + "epoch": 18.01506797378541, + "grad_norm": 25.39361572265625, + "learning_rate": 2.971709184134034e-05, + "loss": 0.1525, + "step": 85870 + }, + { + "epoch": 18.01512213616422, + "grad_norm": 2.735680103302002, + "learning_rate": 2.9714082820295246e-05, + "loss": 0.1125, + "step": 85880 + }, + { + "epoch": 18.015176298543032, + "grad_norm": 0.06717649102210999, + "learning_rate": 2.9711073799250155e-05, + "loss": 0.0153, + "step": 85890 + }, + { + "epoch": 18.015230460921845, + "grad_norm": 0.07634993642568588, + "learning_rate": 2.970806477820506e-05, + "loss": 0.0648, + "step": 85900 + }, + { + "epoch": 18.015284623300655, + "grad_norm": 0.14171019196510315, + "learning_rate": 2.9705055757159968e-05, + "loss": 0.0529, + "step": 85910 + }, + { + "epoch": 18.015338785679468, + "grad_norm": 0.4140682816505432, + "learning_rate": 2.9702046736114874e-05, + "loss": 0.0319, + "step": 85920 + }, + { + "epoch": 18.015392948058278, + "grad_norm": 0.9866162538528442, + "learning_rate": 2.9699037715069784e-05, + "loss": 0.0534, + "step": 85930 + }, + { + "epoch": 18.01544711043709, + "grad_norm": 0.22971200942993164, + "learning_rate": 2.9696028694024687e-05, + "loss": 0.1156, + "step": 85940 + }, + { + "epoch": 18.015501272815904, + "grad_norm": 3.3502161502838135, + "learning_rate": 2.9693019672979593e-05, + "loss": 0.0678, + "step": 85950 + }, + { + "epoch": 18.015555435194713, + "grad_norm": 0.9415259957313538, + "learning_rate": 2.9690010651934503e-05, + "loss": 0.1132, + "step": 85960 + }, + { + "epoch": 18.015609597573526, + "grad_norm": 0.013985752128064632, + "learning_rate": 2.968700163088941e-05, + "loss": 0.1894, + "step": 85970 + }, + { + "epoch": 18.015663759952336, + "grad_norm": 0.016884053125977516, + "learning_rate": 2.9683992609844312e-05, + "loss": 0.048, + "step": 85980 + }, + { + "epoch": 18.01571792233115, + "grad_norm": 1.5423552989959717, + "learning_rate": 2.968098358879922e-05, + "loss": 0.066, + "step": 85990 + }, + { + "epoch": 18.015772084709962, + "grad_norm": 0.003130836645141244, + "learning_rate": 2.967797456775413e-05, + "loss": 0.0262, + "step": 86000 + }, + { + "epoch": 18.01582624708877, + "grad_norm": 0.003064035205170512, + 
"learning_rate": 2.9674965546709034e-05, + "loss": 0.0248, + "step": 86010 + }, + { + "epoch": 18.015880409467584, + "grad_norm": 0.18007692694664001, + "learning_rate": 2.967195652566394e-05, + "loss": 0.0212, + "step": 86020 + }, + { + "epoch": 18.015934571846394, + "grad_norm": 0.06835996359586716, + "learning_rate": 2.966894750461885e-05, + "loss": 0.067, + "step": 86030 + }, + { + "epoch": 18.015988734225207, + "grad_norm": 0.052542831748723984, + "learning_rate": 2.9665938483573756e-05, + "loss": 0.024, + "step": 86040 + }, + { + "epoch": 18.01604289660402, + "grad_norm": 0.8959372639656067, + "learning_rate": 2.966292946252866e-05, + "loss": 0.1109, + "step": 86050 + }, + { + "epoch": 18.01609705898283, + "grad_norm": 0.018604086712002754, + "learning_rate": 2.965992044148357e-05, + "loss": 0.0306, + "step": 86060 + }, + { + "epoch": 18.016151221361643, + "grad_norm": 5.9212751388549805, + "learning_rate": 2.9656911420438475e-05, + "loss": 0.0783, + "step": 86070 + }, + { + "epoch": 18.016205383740452, + "grad_norm": 0.0031403121538460255, + "learning_rate": 2.9653902399393385e-05, + "loss": 0.01, + "step": 86080 + }, + { + "epoch": 18.016259546119265, + "grad_norm": 0.13097454607486725, + "learning_rate": 2.9650893378348288e-05, + "loss": 0.0056, + "step": 86090 + }, + { + "epoch": 18.01631370849808, + "grad_norm": 0.0039672404527664185, + "learning_rate": 2.9647884357303197e-05, + "loss": 0.1831, + "step": 86100 + }, + { + "epoch": 18.016367870876888, + "grad_norm": 0.004398446064442396, + "learning_rate": 2.9644875336258104e-05, + "loss": 0.094, + "step": 86110 + }, + { + "epoch": 18.0164220332557, + "grad_norm": 1.4576258659362793, + "learning_rate": 2.9641866315213013e-05, + "loss": 0.0838, + "step": 86120 + }, + { + "epoch": 18.016476195634514, + "grad_norm": 2.012178421020508, + "learning_rate": 2.9638857294167916e-05, + "loss": 0.0197, + "step": 86130 + }, + { + "epoch": 18.016530358013323, + "grad_norm": 0.1889590173959732, + "learning_rate": 2.9635848273122822e-05, + "loss": 0.0668, + "step": 86140 + }, + { + "epoch": 18.016584520392136, + "grad_norm": 0.4245285987854004, + "learning_rate": 2.9632839252077732e-05, + "loss": 0.0222, + "step": 86150 + }, + { + "epoch": 18.016638682770946, + "grad_norm": 0.06380559504032135, + "learning_rate": 2.9629830231032635e-05, + "loss": 0.0426, + "step": 86160 + }, + { + "epoch": 18.01669284514976, + "grad_norm": 0.28482168912887573, + "learning_rate": 2.9626821209987545e-05, + "loss": 0.0545, + "step": 86170 + }, + { + "epoch": 18.016747007528572, + "grad_norm": 0.05503354221582413, + "learning_rate": 2.962381218894245e-05, + "loss": 0.0484, + "step": 86180 + }, + { + "epoch": 18.01680116990738, + "grad_norm": 0.2721269130706787, + "learning_rate": 2.962080316789736e-05, + "loss": 0.0194, + "step": 86190 + }, + { + "epoch": 18.016855332286195, + "grad_norm": 0.00483819330111146, + "learning_rate": 2.9617794146852264e-05, + "loss": 0.0764, + "step": 86200 + }, + { + "epoch": 18.016909494665004, + "grad_norm": 0.08291971683502197, + "learning_rate": 2.961478512580717e-05, + "loss": 0.1173, + "step": 86210 + }, + { + "epoch": 18.016963657043817, + "grad_norm": 0.16859441995620728, + "learning_rate": 2.961177610476208e-05, + "loss": 0.0515, + "step": 86220 + }, + { + "epoch": 18.01701781942263, + "grad_norm": 3.3248698711395264, + "learning_rate": 2.9608767083716986e-05, + "loss": 0.1042, + "step": 86230 + }, + { + "epoch": 18.01707198180144, + "grad_norm": 0.39443376660346985, + "learning_rate": 2.960575806267189e-05, + "loss": 0.0651, + 
"step": 86240 + }, + { + "epoch": 18.017126144180253, + "grad_norm": 0.002417511772364378, + "learning_rate": 2.9602749041626798e-05, + "loss": 0.0084, + "step": 86250 + }, + { + "epoch": 18.017180306559062, + "grad_norm": 0.010298975743353367, + "learning_rate": 2.9599740020581708e-05, + "loss": 0.0494, + "step": 86260 + }, + { + "epoch": 18.017234468937875, + "grad_norm": 0.0025671781040728092, + "learning_rate": 2.9596730999536614e-05, + "loss": 0.068, + "step": 86270 + }, + { + "epoch": 18.01728863131669, + "grad_norm": 0.053131937980651855, + "learning_rate": 2.9593721978491517e-05, + "loss": 0.0416, + "step": 86280 + }, + { + "epoch": 18.017342793695498, + "grad_norm": 2.0311119556427, + "learning_rate": 2.9590712957446427e-05, + "loss": 0.0544, + "step": 86290 + }, + { + "epoch": 18.01739695607431, + "grad_norm": 0.11902770400047302, + "learning_rate": 2.9587703936401333e-05, + "loss": 0.0371, + "step": 86300 + }, + { + "epoch": 18.017451118453124, + "grad_norm": 0.14720386266708374, + "learning_rate": 2.9584694915356236e-05, + "loss": 0.0435, + "step": 86310 + }, + { + "epoch": 18.017505280831934, + "grad_norm": 3.0616679191589355, + "learning_rate": 2.9581685894311146e-05, + "loss": 0.0528, + "step": 86320 + }, + { + "epoch": 18.017559443210747, + "grad_norm": 0.19106434285640717, + "learning_rate": 2.9578676873266052e-05, + "loss": 0.0193, + "step": 86330 + }, + { + "epoch": 18.017613605589556, + "grad_norm": 0.0028486456722021103, + "learning_rate": 2.957566785222096e-05, + "loss": 0.0799, + "step": 86340 + }, + { + "epoch": 18.01766776796837, + "grad_norm": 0.14042653143405914, + "learning_rate": 2.9572658831175864e-05, + "loss": 0.0318, + "step": 86350 + }, + { + "epoch": 18.017721930347182, + "grad_norm": 0.002220758004114032, + "learning_rate": 2.9569649810130774e-05, + "loss": 0.0385, + "step": 86360 + }, + { + "epoch": 18.017776092725992, + "grad_norm": 0.34505221247673035, + "learning_rate": 2.956664078908568e-05, + "loss": 0.0438, + "step": 86370 + }, + { + "epoch": 18.017830255104805, + "grad_norm": 0.006100699305534363, + "learning_rate": 2.956363176804059e-05, + "loss": 0.0744, + "step": 86380 + }, + { + "epoch": 18.017884417483614, + "grad_norm": 0.002478728536516428, + "learning_rate": 2.9560622746995493e-05, + "loss": 0.0745, + "step": 86390 + }, + { + "epoch": 18.017938579862427, + "grad_norm": 0.08880461007356644, + "learning_rate": 2.95576137259504e-05, + "loss": 0.0039, + "step": 86400 + }, + { + "epoch": 18.01799274224124, + "grad_norm": 0.3308444023132324, + "learning_rate": 2.955460470490531e-05, + "loss": 0.1104, + "step": 86410 + }, + { + "epoch": 18.01804690462005, + "grad_norm": 0.22418014705181122, + "learning_rate": 2.955159568386022e-05, + "loss": 0.0103, + "step": 86420 + }, + { + "epoch": 18.018101066998863, + "grad_norm": 0.1239955946803093, + "learning_rate": 2.954858666281512e-05, + "loss": 0.15, + "step": 86430 + }, + { + "epoch": 18.018155229377673, + "grad_norm": 0.4092569351196289, + "learning_rate": 2.9545577641770028e-05, + "loss": 0.0048, + "step": 86440 + }, + { + "epoch": 18.018209391756486, + "grad_norm": 0.0020924857817590237, + "learning_rate": 2.9542568620724937e-05, + "loss": 0.0093, + "step": 86450 + }, + { + "epoch": 18.0182635541353, + "grad_norm": 0.02145824208855629, + "learning_rate": 2.953955959967984e-05, + "loss": 0.1012, + "step": 86460 + }, + { + "epoch": 18.018317716514108, + "grad_norm": 1.997469425201416, + "learning_rate": 2.9536550578634746e-05, + "loss": 0.0264, + "step": 86470 + }, + { + "epoch": 18.01837187889292, 
+ "grad_norm": 2.723400592803955, + "learning_rate": 2.9533541557589656e-05, + "loss": 0.1137, + "step": 86480 + }, + { + "epoch": 18.018426041271734, + "grad_norm": 2.1285042762756348, + "learning_rate": 2.9530532536544562e-05, + "loss": 0.0271, + "step": 86490 + }, + { + "epoch": 18.018480203650544, + "grad_norm": 0.058699868619441986, + "learning_rate": 2.9527523515499465e-05, + "loss": 0.1143, + "step": 86500 + }, + { + "epoch": 18.018534366029357, + "grad_norm": 0.13454416394233704, + "learning_rate": 2.9524514494454375e-05, + "loss": 0.0945, + "step": 86510 + }, + { + "epoch": 18.018588528408166, + "grad_norm": 0.00583616690710187, + "learning_rate": 2.9521505473409285e-05, + "loss": 0.002, + "step": 86520 + }, + { + "epoch": 18.01864269078698, + "grad_norm": 0.33494460582733154, + "learning_rate": 2.951849645236419e-05, + "loss": 0.0187, + "step": 86530 + }, + { + "epoch": 18.018696853165793, + "grad_norm": 0.5169563889503479, + "learning_rate": 2.9515487431319094e-05, + "loss": 0.0317, + "step": 86540 + }, + { + "epoch": 18.018751015544602, + "grad_norm": 0.00863406341522932, + "learning_rate": 2.9512478410274003e-05, + "loss": 0.0199, + "step": 86550 + }, + { + "epoch": 18.018805177923415, + "grad_norm": 6.783618927001953, + "learning_rate": 2.950946938922891e-05, + "loss": 0.0481, + "step": 86560 + }, + { + "epoch": 18.018859340302225, + "grad_norm": 0.37707388401031494, + "learning_rate": 2.950646036818382e-05, + "loss": 0.0768, + "step": 86570 + }, + { + "epoch": 18.018913502681038, + "grad_norm": 0.6797799468040466, + "learning_rate": 2.9503451347138722e-05, + "loss": 0.0152, + "step": 86580 + }, + { + "epoch": 18.01896766505985, + "grad_norm": 0.0065122428350150585, + "learning_rate": 2.950044232609363e-05, + "loss": 0.0954, + "step": 86590 + }, + { + "epoch": 18.01902182743866, + "grad_norm": 0.18867479264736176, + "learning_rate": 2.9497433305048538e-05, + "loss": 0.0515, + "step": 86600 + }, + { + "epoch": 18.019075989817473, + "grad_norm": 0.006106742192059755, + "learning_rate": 2.949442428400344e-05, + "loss": 0.0068, + "step": 86610 + }, + { + "epoch": 18.019130152196283, + "grad_norm": 0.23644806444644928, + "learning_rate": 2.949141526295835e-05, + "loss": 0.0992, + "step": 86620 + }, + { + "epoch": 18.019184314575096, + "grad_norm": 0.4418918788433075, + "learning_rate": 2.9488406241913257e-05, + "loss": 0.0635, + "step": 86630 + }, + { + "epoch": 18.01923847695391, + "grad_norm": 0.004747314378619194, + "learning_rate": 2.9485397220868167e-05, + "loss": 0.017, + "step": 86640 + }, + { + "epoch": 18.01929263933272, + "grad_norm": 0.02597779408097267, + "learning_rate": 2.948238819982307e-05, + "loss": 0.1109, + "step": 86650 + }, + { + "epoch": 18.01934680171153, + "grad_norm": 1.4509104490280151, + "learning_rate": 2.9479379178777976e-05, + "loss": 0.0205, + "step": 86660 + }, + { + "epoch": 18.019400964090345, + "grad_norm": 0.0029954772908240557, + "learning_rate": 2.9476370157732886e-05, + "loss": 0.0722, + "step": 86670 + }, + { + "epoch": 18.019455126469154, + "grad_norm": 0.0624871551990509, + "learning_rate": 2.9473361136687795e-05, + "loss": 0.0032, + "step": 86680 + }, + { + "epoch": 18.019509288847967, + "grad_norm": 6.5551371574401855, + "learning_rate": 2.9470352115642698e-05, + "loss": 0.0914, + "step": 86690 + }, + { + "epoch": 18.019563451226777, + "grad_norm": 2.957751750946045, + "learning_rate": 2.9467343094597604e-05, + "loss": 0.045, + "step": 86700 + }, + { + "epoch": 18.01961761360559, + "grad_norm": 1.2063781023025513, + "learning_rate": 
2.9464334073552514e-05, + "loss": 0.0132, + "step": 86710 + }, + { + "epoch": 18.019671775984403, + "grad_norm": 0.053215377032756805, + "learning_rate": 2.946132505250742e-05, + "loss": 0.0084, + "step": 86720 + }, + { + "epoch": 18.019725938363212, + "grad_norm": 0.34414178133010864, + "learning_rate": 2.9458316031462323e-05, + "loss": 0.048, + "step": 86730 + }, + { + "epoch": 18.019780100742025, + "grad_norm": 0.023510310798883438, + "learning_rate": 2.9455307010417233e-05, + "loss": 0.0246, + "step": 86740 + }, + { + "epoch": 18.019834263120835, + "grad_norm": 0.0035664939787238836, + "learning_rate": 2.945229798937214e-05, + "loss": 0.0069, + "step": 86750 + }, + { + "epoch": 18.019888425499648, + "grad_norm": 0.0020805636886507273, + "learning_rate": 2.9449288968327042e-05, + "loss": 0.036, + "step": 86760 + }, + { + "epoch": 18.01994258787846, + "grad_norm": 0.001933057326823473, + "learning_rate": 2.944627994728195e-05, + "loss": 0.1401, + "step": 86770 + }, + { + "epoch": 18.01999675025727, + "grad_norm": 0.0018346032593399286, + "learning_rate": 2.944327092623686e-05, + "loss": 0.0744, + "step": 86780 + }, + { + "epoch": 18.020050912636083, + "grad_norm": 0.26622092723846436, + "learning_rate": 2.9440261905191768e-05, + "loss": 0.089, + "step": 86790 + }, + { + "epoch": 18.020105075014893, + "grad_norm": 3.391558885574341, + "learning_rate": 2.943725288414667e-05, + "loss": 0.0861, + "step": 86800 + }, + { + "epoch": 18.020159237393706, + "grad_norm": 0.18119780719280243, + "learning_rate": 2.943424386310158e-05, + "loss": 0.0044, + "step": 86810 + }, + { + "epoch": 18.02021339977252, + "grad_norm": 0.0838126614689827, + "learning_rate": 2.9431234842056486e-05, + "loss": 0.0116, + "step": 86820 + }, + { + "epoch": 18.02026756215133, + "grad_norm": 0.006702871527522802, + "learning_rate": 2.9428225821011396e-05, + "loss": 0.0635, + "step": 86830 + }, + { + "epoch": 18.02032172453014, + "grad_norm": 3.7372934818267822, + "learning_rate": 2.94252167999663e-05, + "loss": 0.1248, + "step": 86840 + }, + { + "epoch": 18.020375886908955, + "grad_norm": 0.07860977202653885, + "learning_rate": 2.9422207778921205e-05, + "loss": 0.0462, + "step": 86850 + }, + { + "epoch": 18.020430049287764, + "grad_norm": 0.06619870662689209, + "learning_rate": 2.9419198757876115e-05, + "loss": 0.0051, + "step": 86860 + }, + { + "epoch": 18.020484211666577, + "grad_norm": 0.6461491584777832, + "learning_rate": 2.9416189736831025e-05, + "loss": 0.0961, + "step": 86870 + }, + { + "epoch": 18.020538374045387, + "grad_norm": 0.025083297863602638, + "learning_rate": 2.9413180715785927e-05, + "loss": 0.0029, + "step": 86880 + }, + { + "epoch": 18.0205925364242, + "grad_norm": 0.00670983549207449, + "learning_rate": 2.9410171694740834e-05, + "loss": 0.117, + "step": 86890 + }, + { + "epoch": 18.020646698803013, + "grad_norm": 0.004198089707642794, + "learning_rate": 2.9407162673695743e-05, + "loss": 0.1017, + "step": 86900 + }, + { + "epoch": 18.020700861181822, + "grad_norm": 0.16363215446472168, + "learning_rate": 2.9404153652650646e-05, + "loss": 0.0742, + "step": 86910 + }, + { + "epoch": 18.020755023560636, + "grad_norm": 0.09815327078104019, + "learning_rate": 2.9401144631605553e-05, + "loss": 0.0457, + "step": 86920 + }, + { + "epoch": 18.020809185939445, + "grad_norm": 0.16567695140838623, + "learning_rate": 2.9398135610560462e-05, + "loss": 0.0979, + "step": 86930 + }, + { + "epoch": 18.020863348318258, + "grad_norm": 0.01868806779384613, + "learning_rate": 2.9395126589515372e-05, + "loss": 0.1071, + 
"step": 86940 + }, + { + "epoch": 18.02091751069707, + "grad_norm": 0.03657110407948494, + "learning_rate": 2.9392117568470275e-05, + "loss": 0.0708, + "step": 86950 + }, + { + "epoch": 18.02097167307588, + "grad_norm": 0.007882621139287949, + "learning_rate": 2.938910854742518e-05, + "loss": 0.0347, + "step": 86960 + }, + { + "epoch": 18.021025835454694, + "grad_norm": 0.14331687986850739, + "learning_rate": 2.938609952638009e-05, + "loss": 0.0651, + "step": 86970 + }, + { + "epoch": 18.021079997833503, + "grad_norm": 0.6334721446037292, + "learning_rate": 2.9383090505334997e-05, + "loss": 0.0473, + "step": 86980 + }, + { + "epoch": 18.021134160212316, + "grad_norm": 0.1897280365228653, + "learning_rate": 2.93800814842899e-05, + "loss": 0.0045, + "step": 86990 + }, + { + "epoch": 18.02118832259113, + "grad_norm": 0.004155343398451805, + "learning_rate": 2.937707246324481e-05, + "loss": 0.0698, + "step": 87000 + }, + { + "epoch": 18.02124248496994, + "grad_norm": 0.004006965085864067, + "learning_rate": 2.9374063442199716e-05, + "loss": 0.0352, + "step": 87010 + }, + { + "epoch": 18.021296647348752, + "grad_norm": 0.3444781005382538, + "learning_rate": 2.9371054421154625e-05, + "loss": 0.0299, + "step": 87020 + }, + { + "epoch": 18.021350809727565, + "grad_norm": 0.0030681483913213015, + "learning_rate": 2.936804540010953e-05, + "loss": 0.0852, + "step": 87030 + }, + { + "epoch": 18.021404972106374, + "grad_norm": 2.5242269039154053, + "learning_rate": 2.9365036379064438e-05, + "loss": 0.0637, + "step": 87040 + }, + { + "epoch": 18.021459134485188, + "grad_norm": 0.02127186581492424, + "learning_rate": 2.9362027358019344e-05, + "loss": 0.0883, + "step": 87050 + }, + { + "epoch": 18.021513296863997, + "grad_norm": 0.11180540919303894, + "learning_rate": 2.9359018336974247e-05, + "loss": 0.1179, + "step": 87060 + }, + { + "epoch": 18.02156745924281, + "grad_norm": 0.3433002531528473, + "learning_rate": 2.9356009315929157e-05, + "loss": 0.0261, + "step": 87070 + }, + { + "epoch": 18.021621621621623, + "grad_norm": 0.003981668036431074, + "learning_rate": 2.9353000294884063e-05, + "loss": 0.0123, + "step": 87080 + }, + { + "epoch": 18.021675784000433, + "grad_norm": 0.03910257667303085, + "learning_rate": 2.9349991273838973e-05, + "loss": 0.0166, + "step": 87090 + }, + { + "epoch": 18.021729946379246, + "grad_norm": 0.003218522295355797, + "learning_rate": 2.9346982252793876e-05, + "loss": 0.0255, + "step": 87100 + }, + { + "epoch": 18.021784108758055, + "grad_norm": 0.005573266185820103, + "learning_rate": 2.9343973231748785e-05, + "loss": 0.0042, + "step": 87110 + }, + { + "epoch": 18.02183827113687, + "grad_norm": 0.0019760970026254654, + "learning_rate": 2.934096421070369e-05, + "loss": 0.0054, + "step": 87120 + }, + { + "epoch": 18.02189243351568, + "grad_norm": 0.0020616701804101467, + "learning_rate": 2.93379551896586e-05, + "loss": 0.0516, + "step": 87130 + }, + { + "epoch": 18.02194659589449, + "grad_norm": 3.5873608589172363, + "learning_rate": 2.9334946168613504e-05, + "loss": 0.0102, + "step": 87140 + }, + { + "epoch": 18.022000758273304, + "grad_norm": 3.5329413414001465, + "learning_rate": 2.933193714756841e-05, + "loss": 0.1617, + "step": 87150 + }, + { + "epoch": 18.022054920652113, + "grad_norm": 0.004613831639289856, + "learning_rate": 2.932892812652332e-05, + "loss": 0.0024, + "step": 87160 + }, + { + "epoch": 18.022109083030927, + "grad_norm": 0.005332572385668755, + "learning_rate": 2.9325919105478226e-05, + "loss": 0.0154, + "step": 87170 + }, + { + "epoch": 
18.02216324540974, + "grad_norm": 0.04578351974487305, + "learning_rate": 2.932291008443313e-05, + "loss": 0.0061, + "step": 87180 + }, + { + "epoch": 18.02221740778855, + "grad_norm": 0.09131412953138351, + "learning_rate": 2.931990106338804e-05, + "loss": 0.0009, + "step": 87190 + }, + { + "epoch": 18.022271570167362, + "grad_norm": 0.08199245482683182, + "learning_rate": 2.931689204234295e-05, + "loss": 0.0135, + "step": 87200 + }, + { + "epoch": 18.02232573254617, + "grad_norm": 0.032237570732831955, + "learning_rate": 2.931388302129785e-05, + "loss": 0.0682, + "step": 87210 + }, + { + "epoch": 18.022379894924985, + "grad_norm": 0.02039683610200882, + "learning_rate": 2.9310874000252758e-05, + "loss": 0.0331, + "step": 87220 + }, + { + "epoch": 18.022434057303798, + "grad_norm": 0.0024438640102744102, + "learning_rate": 2.9307864979207667e-05, + "loss": 0.0525, + "step": 87230 + }, + { + "epoch": 18.022488219682607, + "grad_norm": 6.81507682800293, + "learning_rate": 2.9304855958162574e-05, + "loss": 0.1155, + "step": 87240 + }, + { + "epoch": 18.02254238206142, + "grad_norm": 5.798891067504883, + "learning_rate": 2.9301846937117477e-05, + "loss": 0.0873, + "step": 87250 + }, + { + "epoch": 18.022596544440233, + "grad_norm": 0.00312765222042799, + "learning_rate": 2.9298837916072386e-05, + "loss": 0.0033, + "step": 87260 + }, + { + "epoch": 18.022650706819043, + "grad_norm": 3.769282102584839, + "learning_rate": 2.9295828895027293e-05, + "loss": 0.049, + "step": 87270 + }, + { + "epoch": 18.022704869197856, + "grad_norm": 0.009279238060116768, + "learning_rate": 2.9292819873982202e-05, + "loss": 0.0387, + "step": 87280 + }, + { + "epoch": 18.022759031576665, + "grad_norm": 0.03148602321743965, + "learning_rate": 2.9289810852937105e-05, + "loss": 0.0049, + "step": 87290 + }, + { + "epoch": 18.02281319395548, + "grad_norm": 0.020282676443457603, + "learning_rate": 2.9286801831892015e-05, + "loss": 0.025, + "step": 87300 + }, + { + "epoch": 18.02286735633429, + "grad_norm": 2.7401657104492188, + "learning_rate": 2.928379281084692e-05, + "loss": 0.0802, + "step": 87310 + }, + { + "epoch": 18.0229215187131, + "grad_norm": 3.258364200592041, + "learning_rate": 2.928078378980183e-05, + "loss": 0.0388, + "step": 87320 + }, + { + "epoch": 18.022975681091914, + "grad_norm": 0.005077834706753492, + "learning_rate": 2.9277774768756734e-05, + "loss": 0.0012, + "step": 87330 + }, + { + "epoch": 18.023029843470724, + "grad_norm": 0.19989337027072906, + "learning_rate": 2.927476574771164e-05, + "loss": 0.0315, + "step": 87340 + }, + { + "epoch": 18.023084005849537, + "grad_norm": 0.20178286731243134, + "learning_rate": 2.927175672666655e-05, + "loss": 0.0365, + "step": 87350 + }, + { + "epoch": 18.02313816822835, + "grad_norm": 0.00312993535771966, + "learning_rate": 2.9268747705621452e-05, + "loss": 0.0153, + "step": 87360 + }, + { + "epoch": 18.02319233060716, + "grad_norm": 1.705958366394043, + "learning_rate": 2.9265738684576362e-05, + "loss": 0.059, + "step": 87370 + }, + { + "epoch": 18.023246492985972, + "grad_norm": 0.01946505531668663, + "learning_rate": 2.926272966353127e-05, + "loss": 0.0781, + "step": 87380 + }, + { + "epoch": 18.023300655364782, + "grad_norm": 0.2407418042421341, + "learning_rate": 2.9259720642486178e-05, + "loss": 0.0153, + "step": 87390 + }, + { + "epoch": 18.023354817743595, + "grad_norm": 1.7518657445907593, + "learning_rate": 2.925671162144108e-05, + "loss": 0.0691, + "step": 87400 + }, + { + "epoch": 18.023408980122408, + "grad_norm": 1.0420727729797363, + 
"learning_rate": 2.9253702600395987e-05, + "loss": 0.1258, + "step": 87410 + }, + { + "epoch": 18.023463142501218, + "grad_norm": 0.30973413586616516, + "learning_rate": 2.9250693579350897e-05, + "loss": 0.0212, + "step": 87420 + }, + { + "epoch": 18.02351730488003, + "grad_norm": 0.00299259927123785, + "learning_rate": 2.9247684558305803e-05, + "loss": 0.0413, + "step": 87430 + }, + { + "epoch": 18.023571467258844, + "grad_norm": 7.99776554107666, + "learning_rate": 2.9244675537260706e-05, + "loss": 0.1217, + "step": 87440 + }, + { + "epoch": 18.023625629637653, + "grad_norm": 0.06115112453699112, + "learning_rate": 2.9241666516215616e-05, + "loss": 0.0658, + "step": 87450 + }, + { + "epoch": 18.023679792016466, + "grad_norm": 0.06243852153420448, + "learning_rate": 2.9238657495170525e-05, + "loss": 0.0061, + "step": 87460 + }, + { + "epoch": 18.023733954395276, + "grad_norm": 2.17751145362854, + "learning_rate": 2.923564847412543e-05, + "loss": 0.1194, + "step": 87470 + }, + { + "epoch": 18.02378811677409, + "grad_norm": 0.024746907874941826, + "learning_rate": 2.9232639453080334e-05, + "loss": 0.0458, + "step": 87480 + }, + { + "epoch": 18.023842279152902, + "grad_norm": 2.3831703662872314, + "learning_rate": 2.9229630432035244e-05, + "loss": 0.0443, + "step": 87490 + }, + { + "epoch": 18.02389644153171, + "grad_norm": 0.0024179420433938503, + "learning_rate": 2.922662141099015e-05, + "loss": 0.0933, + "step": 87500 + }, + { + "epoch": 18.023950603910524, + "grad_norm": 0.33358389139175415, + "learning_rate": 2.9223612389945053e-05, + "loss": 0.0364, + "step": 87510 + }, + { + "epoch": 18.024004766289334, + "grad_norm": 0.0022506127133965492, + "learning_rate": 2.9220603368899963e-05, + "loss": 0.0042, + "step": 87520 + }, + { + "epoch": 18.024058928668147, + "grad_norm": 0.00687961932271719, + "learning_rate": 2.921759434785487e-05, + "loss": 0.0726, + "step": 87530 + }, + { + "epoch": 18.02411309104696, + "grad_norm": 0.021147815510630608, + "learning_rate": 2.921458532680978e-05, + "loss": 0.0082, + "step": 87540 + }, + { + "epoch": 18.02416725342577, + "grad_norm": 0.002296063117682934, + "learning_rate": 2.9211576305764682e-05, + "loss": 0.0677, + "step": 87550 + }, + { + "epoch": 18.024221415804583, + "grad_norm": 0.0024338173680007458, + "learning_rate": 2.920856728471959e-05, + "loss": 0.0841, + "step": 87560 + }, + { + "epoch": 18.024275578183392, + "grad_norm": 0.0027020627167075872, + "learning_rate": 2.9205558263674498e-05, + "loss": 0.0501, + "step": 87570 + }, + { + "epoch": 18.024329740562205, + "grad_norm": 0.8830876350402832, + "learning_rate": 2.9202549242629407e-05, + "loss": 0.0935, + "step": 87580 + }, + { + "epoch": 18.024383902941018, + "grad_norm": 0.006124474108219147, + "learning_rate": 2.919954022158431e-05, + "loss": 0.0164, + "step": 87590 + }, + { + "epoch": 18.024438065319828, + "grad_norm": 0.003790593473240733, + "learning_rate": 2.9196531200539217e-05, + "loss": 0.06, + "step": 87600 + }, + { + "epoch": 18.02449222769864, + "grad_norm": 2.8333892822265625, + "learning_rate": 2.9193522179494126e-05, + "loss": 0.1538, + "step": 87610 + }, + { + "epoch": 18.024546390077454, + "grad_norm": 0.25009867548942566, + "learning_rate": 2.9190513158449036e-05, + "loss": 0.0496, + "step": 87620 + }, + { + "epoch": 18.024600552456263, + "grad_norm": 0.015683475881814957, + "learning_rate": 2.918750413740394e-05, + "loss": 0.0427, + "step": 87630 + }, + { + "epoch": 18.024654714835076, + "grad_norm": 1.8206616640090942, + "learning_rate": 2.9184495116358845e-05, + 
"loss": 0.0522, + "step": 87640 + }, + { + "epoch": 18.024708877213886, + "grad_norm": 0.10339830070734024, + "learning_rate": 2.9181486095313755e-05, + "loss": 0.0486, + "step": 87650 + }, + { + "epoch": 18.0247630395927, + "grad_norm": 0.00333504774607718, + "learning_rate": 2.9178477074268658e-05, + "loss": 0.0465, + "step": 87660 + }, + { + "epoch": 18.024817201971512, + "grad_norm": 0.0044760811142623425, + "learning_rate": 2.9175468053223564e-05, + "loss": 0.0705, + "step": 87670 + }, + { + "epoch": 18.02487136435032, + "grad_norm": 4.499983310699463, + "learning_rate": 2.9172459032178473e-05, + "loss": 0.1521, + "step": 87680 + }, + { + "epoch": 18.024925526729135, + "grad_norm": 0.0041562290862202644, + "learning_rate": 2.916945001113338e-05, + "loss": 0.0426, + "step": 87690 + }, + { + "epoch": 18.024979689107944, + "grad_norm": 0.3828393220901489, + "learning_rate": 2.9166440990088283e-05, + "loss": 0.0902, + "step": 87700 + }, + { + "epoch": 18.02500135405947, + "eval_accuracy": 0.8288700195950359, + "eval_loss": 0.8596038222312927, + "eval_runtime": 117.5078, + "eval_samples_per_second": 26.058, + "eval_steps_per_second": 3.259, + "step": 87704 + }, + { + "epoch": 19.000032497427288, + "grad_norm": 0.1093871220946312, + "learning_rate": 2.9163431969043192e-05, + "loss": 0.0075, + "step": 87710 + }, + { + "epoch": 19.000086659806097, + "grad_norm": 0.0040130275301635265, + "learning_rate": 2.9160422947998102e-05, + "loss": 0.0459, + "step": 87720 + }, + { + "epoch": 19.00014082218491, + "grad_norm": 0.8983754515647888, + "learning_rate": 2.9157413926953008e-05, + "loss": 0.0178, + "step": 87730 + }, + { + "epoch": 19.000194984563723, + "grad_norm": 0.0036346695851534605, + "learning_rate": 2.915440490590791e-05, + "loss": 0.114, + "step": 87740 + }, + { + "epoch": 19.000249146942533, + "grad_norm": 0.0043432521633803844, + "learning_rate": 2.915139588486282e-05, + "loss": 0.03, + "step": 87750 + }, + { + "epoch": 19.000303309321346, + "grad_norm": 8.523094177246094, + "learning_rate": 2.9148386863817727e-05, + "loss": 0.0698, + "step": 87760 + }, + { + "epoch": 19.000357471700156, + "grad_norm": 0.103764109313488, + "learning_rate": 2.9145377842772637e-05, + "loss": 0.0459, + "step": 87770 + }, + { + "epoch": 19.00041163407897, + "grad_norm": 0.05036584660410881, + "learning_rate": 2.914236882172754e-05, + "loss": 0.1593, + "step": 87780 + }, + { + "epoch": 19.00046579645778, + "grad_norm": 0.006154886446893215, + "learning_rate": 2.9139359800682446e-05, + "loss": 0.0073, + "step": 87790 + }, + { + "epoch": 19.00051995883659, + "grad_norm": 0.0038058036006987095, + "learning_rate": 2.9136350779637356e-05, + "loss": 0.002, + "step": 87800 + }, + { + "epoch": 19.000574121215404, + "grad_norm": 0.004460057709366083, + "learning_rate": 2.913334175859226e-05, + "loss": 0.0432, + "step": 87810 + }, + { + "epoch": 19.000628283594214, + "grad_norm": 11.176277160644531, + "learning_rate": 2.9130332737547168e-05, + "loss": 0.3102, + "step": 87820 + }, + { + "epoch": 19.000682445973027, + "grad_norm": 0.026548314839601517, + "learning_rate": 2.9127323716502074e-05, + "loss": 0.2102, + "step": 87830 + }, + { + "epoch": 19.00073660835184, + "grad_norm": 0.026170050725340843, + "learning_rate": 2.9124314695456984e-05, + "loss": 0.1297, + "step": 87840 + }, + { + "epoch": 19.00079077073065, + "grad_norm": 0.0035375512670725584, + "learning_rate": 2.9121305674411887e-05, + "loss": 0.0032, + "step": 87850 + }, + { + "epoch": 19.000844933109462, + "grad_norm": 0.03256034478545189, + 
"learning_rate": 2.9118296653366793e-05, + "loss": 0.0518, + "step": 87860 + }, + { + "epoch": 19.000899095488275, + "grad_norm": 0.005484146066009998, + "learning_rate": 2.9115287632321703e-05, + "loss": 0.0291, + "step": 87870 + }, + { + "epoch": 19.000953257867085, + "grad_norm": 0.23271793127059937, + "learning_rate": 2.9112278611276613e-05, + "loss": 0.1032, + "step": 87880 + }, + { + "epoch": 19.001007420245898, + "grad_norm": 0.06344333291053772, + "learning_rate": 2.9109269590231515e-05, + "loss": 0.0248, + "step": 87890 + }, + { + "epoch": 19.001061582624708, + "grad_norm": 0.1406884640455246, + "learning_rate": 2.910626056918642e-05, + "loss": 0.0628, + "step": 87900 + }, + { + "epoch": 19.00111574500352, + "grad_norm": 0.0074730850756168365, + "learning_rate": 2.910325154814133e-05, + "loss": 0.0007, + "step": 87910 + }, + { + "epoch": 19.001169907382334, + "grad_norm": 0.022770173847675323, + "learning_rate": 2.9100242527096238e-05, + "loss": 0.0739, + "step": 87920 + }, + { + "epoch": 19.001224069761143, + "grad_norm": 0.003385682124644518, + "learning_rate": 2.909723350605114e-05, + "loss": 0.052, + "step": 87930 + }, + { + "epoch": 19.001278232139956, + "grad_norm": 0.003279475960880518, + "learning_rate": 2.909422448500605e-05, + "loss": 0.0136, + "step": 87940 + }, + { + "epoch": 19.001332394518766, + "grad_norm": 0.002816633088514209, + "learning_rate": 2.9091215463960956e-05, + "loss": 0.0282, + "step": 87950 + }, + { + "epoch": 19.00138655689758, + "grad_norm": 1.0302685499191284, + "learning_rate": 2.908820644291586e-05, + "loss": 0.015, + "step": 87960 + }, + { + "epoch": 19.001440719276392, + "grad_norm": 2.479722261428833, + "learning_rate": 2.908519742187077e-05, + "loss": 0.0472, + "step": 87970 + }, + { + "epoch": 19.0014948816552, + "grad_norm": 0.5361430048942566, + "learning_rate": 2.908218840082568e-05, + "loss": 0.0484, + "step": 87980 + }, + { + "epoch": 19.001549044034014, + "grad_norm": 0.0034951847046613693, + "learning_rate": 2.9079179379780585e-05, + "loss": 0.0394, + "step": 87990 + }, + { + "epoch": 19.001603206412824, + "grad_norm": 0.14930975437164307, + "learning_rate": 2.9076170358735488e-05, + "loss": 0.0626, + "step": 88000 + }, + { + "epoch": 19.001657368791637, + "grad_norm": 0.0077971783466637135, + "learning_rate": 2.9073161337690397e-05, + "loss": 0.0892, + "step": 88010 + }, + { + "epoch": 19.00171153117045, + "grad_norm": 0.3033340275287628, + "learning_rate": 2.9070152316645304e-05, + "loss": 0.1152, + "step": 88020 + }, + { + "epoch": 19.00176569354926, + "grad_norm": 0.16032597422599792, + "learning_rate": 2.9067143295600213e-05, + "loss": 0.0063, + "step": 88030 + }, + { + "epoch": 19.001819855928073, + "grad_norm": 0.0033163621556013823, + "learning_rate": 2.9064134274555116e-05, + "loss": 0.0773, + "step": 88040 + }, + { + "epoch": 19.001874018306886, + "grad_norm": 5.1819868087768555, + "learning_rate": 2.9061125253510023e-05, + "loss": 0.0724, + "step": 88050 + }, + { + "epoch": 19.001928180685695, + "grad_norm": 0.0030408466700464487, + "learning_rate": 2.9058116232464932e-05, + "loss": 0.0364, + "step": 88060 + }, + { + "epoch": 19.00198234306451, + "grad_norm": 0.08907619118690491, + "learning_rate": 2.9055107211419842e-05, + "loss": 0.0099, + "step": 88070 + }, + { + "epoch": 19.002036505443318, + "grad_norm": 0.007746007759124041, + "learning_rate": 2.9052098190374745e-05, + "loss": 0.0463, + "step": 88080 + }, + { + "epoch": 19.00209066782213, + "grad_norm": 0.009440966881811619, + "learning_rate": 2.904908916932965e-05, + 
"loss": 0.0401, + "step": 88090 + }, + { + "epoch": 19.002144830200944, + "grad_norm": 0.9649967551231384, + "learning_rate": 2.904608014828456e-05, + "loss": 0.0747, + "step": 88100 + }, + { + "epoch": 19.002198992579753, + "grad_norm": 1.582030177116394, + "learning_rate": 2.9043071127239464e-05, + "loss": 0.1073, + "step": 88110 + }, + { + "epoch": 19.002253154958566, + "grad_norm": 0.0031101300846785307, + "learning_rate": 2.904006210619437e-05, + "loss": 0.005, + "step": 88120 + }, + { + "epoch": 19.002307317337376, + "grad_norm": 0.005293321795761585, + "learning_rate": 2.903705308514928e-05, + "loss": 0.0129, + "step": 88130 + }, + { + "epoch": 19.00236147971619, + "grad_norm": 2.4556143283843994, + "learning_rate": 2.903404406410419e-05, + "loss": 0.1237, + "step": 88140 + }, + { + "epoch": 19.002415642095002, + "grad_norm": 0.002907360438257456, + "learning_rate": 2.9031035043059092e-05, + "loss": 0.0537, + "step": 88150 + }, + { + "epoch": 19.00246980447381, + "grad_norm": 17.653621673583984, + "learning_rate": 2.9028026022014e-05, + "loss": 0.0614, + "step": 88160 + }, + { + "epoch": 19.002523966852625, + "grad_norm": 0.7045848369598389, + "learning_rate": 2.9025017000968908e-05, + "loss": 0.0248, + "step": 88170 + }, + { + "epoch": 19.002578129231434, + "grad_norm": 19.537979125976562, + "learning_rate": 2.9022007979923814e-05, + "loss": 0.0715, + "step": 88180 + }, + { + "epoch": 19.002632291610247, + "grad_norm": 0.007100452668964863, + "learning_rate": 2.9018998958878717e-05, + "loss": 0.0147, + "step": 88190 + }, + { + "epoch": 19.00268645398906, + "grad_norm": 0.020506693050265312, + "learning_rate": 2.9015989937833627e-05, + "loss": 0.058, + "step": 88200 + }, + { + "epoch": 19.00274061636787, + "grad_norm": 0.0023181161377578974, + "learning_rate": 2.9012980916788533e-05, + "loss": 0.0516, + "step": 88210 + }, + { + "epoch": 19.002794778746683, + "grad_norm": 0.002496536122635007, + "learning_rate": 2.9009971895743443e-05, + "loss": 0.0549, + "step": 88220 + }, + { + "epoch": 19.002848941125496, + "grad_norm": 0.7877981066703796, + "learning_rate": 2.9006962874698346e-05, + "loss": 0.0693, + "step": 88230 + }, + { + "epoch": 19.002903103504305, + "grad_norm": 0.481158047914505, + "learning_rate": 2.9003953853653255e-05, + "loss": 0.0455, + "step": 88240 + }, + { + "epoch": 19.00295726588312, + "grad_norm": 0.0026120017282664776, + "learning_rate": 2.900094483260816e-05, + "loss": 0.0539, + "step": 88250 + }, + { + "epoch": 19.003011428261928, + "grad_norm": 0.0025183330290019512, + "learning_rate": 2.8997935811563065e-05, + "loss": 0.053, + "step": 88260 + }, + { + "epoch": 19.00306559064074, + "grad_norm": 0.0026878714561462402, + "learning_rate": 2.8994926790517974e-05, + "loss": 0.0031, + "step": 88270 + }, + { + "epoch": 19.003119753019554, + "grad_norm": 0.007150441873818636, + "learning_rate": 2.899191776947288e-05, + "loss": 0.0358, + "step": 88280 + }, + { + "epoch": 19.003173915398364, + "grad_norm": 0.014116455800831318, + "learning_rate": 2.898890874842779e-05, + "loss": 0.0348, + "step": 88290 + }, + { + "epoch": 19.003228077777177, + "grad_norm": 0.0037146913819015026, + "learning_rate": 2.8985899727382693e-05, + "loss": 0.0035, + "step": 88300 + }, + { + "epoch": 19.003282240155986, + "grad_norm": 18.06199836730957, + "learning_rate": 2.89828907063376e-05, + "loss": 0.0081, + "step": 88310 + }, + { + "epoch": 19.0033364025348, + "grad_norm": 0.01584681123495102, + "learning_rate": 2.897988168529251e-05, + "loss": 0.1382, + "step": 88320 + }, + { + "epoch": 
19.003390564913612, + "grad_norm": 0.12190460413694382, + "learning_rate": 2.897687266424742e-05, + "loss": 0.0405, + "step": 88330 + }, + { + "epoch": 19.003444727292422, + "grad_norm": 0.14856167137622833, + "learning_rate": 2.897386364320232e-05, + "loss": 0.0486, + "step": 88340 + }, + { + "epoch": 19.003498889671235, + "grad_norm": 0.002756764879450202, + "learning_rate": 2.8970854622157228e-05, + "loss": 0.0183, + "step": 88350 + }, + { + "epoch": 19.003553052050044, + "grad_norm": 5.320080280303955, + "learning_rate": 2.8967845601112137e-05, + "loss": 0.0207, + "step": 88360 + }, + { + "epoch": 19.003607214428857, + "grad_norm": 6.4978861808776855, + "learning_rate": 2.8964836580067044e-05, + "loss": 0.1551, + "step": 88370 + }, + { + "epoch": 19.00366137680767, + "grad_norm": 0.012113097123801708, + "learning_rate": 2.8961827559021947e-05, + "loss": 0.0867, + "step": 88380 + }, + { + "epoch": 19.00371553918648, + "grad_norm": 0.003996224142611027, + "learning_rate": 2.8958818537976856e-05, + "loss": 0.0275, + "step": 88390 + }, + { + "epoch": 19.003769701565293, + "grad_norm": 1.835909366607666, + "learning_rate": 2.8955809516931766e-05, + "loss": 0.0208, + "step": 88400 + }, + { + "epoch": 19.003823863944106, + "grad_norm": 0.5727839469909668, + "learning_rate": 2.895280049588667e-05, + "loss": 0.1668, + "step": 88410 + }, + { + "epoch": 19.003878026322916, + "grad_norm": 0.2925862669944763, + "learning_rate": 2.8949791474841575e-05, + "loss": 0.0294, + "step": 88420 + }, + { + "epoch": 19.00393218870173, + "grad_norm": 0.031195899471640587, + "learning_rate": 2.8946782453796485e-05, + "loss": 0.0458, + "step": 88430 + }, + { + "epoch": 19.00398635108054, + "grad_norm": 0.33788612484931946, + "learning_rate": 2.894377343275139e-05, + "loss": 0.1537, + "step": 88440 + }, + { + "epoch": 19.00404051345935, + "grad_norm": 3.842193365097046, + "learning_rate": 2.8940764411706294e-05, + "loss": 0.1064, + "step": 88450 + }, + { + "epoch": 19.004094675838164, + "grad_norm": 1.7102422714233398, + "learning_rate": 2.8937755390661204e-05, + "loss": 0.2003, + "step": 88460 + }, + { + "epoch": 19.004148838216974, + "grad_norm": 0.005099811591207981, + "learning_rate": 2.893474636961611e-05, + "loss": 0.0137, + "step": 88470 + }, + { + "epoch": 19.004203000595787, + "grad_norm": 0.006738348864018917, + "learning_rate": 2.893173734857102e-05, + "loss": 0.1003, + "step": 88480 + }, + { + "epoch": 19.004257162974596, + "grad_norm": 3.8367919921875, + "learning_rate": 2.8928728327525922e-05, + "loss": 0.0644, + "step": 88490 + }, + { + "epoch": 19.00431132535341, + "grad_norm": 0.06763330847024918, + "learning_rate": 2.8925719306480832e-05, + "loss": 0.1002, + "step": 88500 + }, + { + "epoch": 19.004365487732223, + "grad_norm": 0.010570351034402847, + "learning_rate": 2.892271028543574e-05, + "loss": 0.023, + "step": 88510 + }, + { + "epoch": 19.004419650111032, + "grad_norm": 0.12728838622570038, + "learning_rate": 2.8919701264390648e-05, + "loss": 0.0372, + "step": 88520 + }, + { + "epoch": 19.004473812489845, + "grad_norm": 0.010492351837456226, + "learning_rate": 2.891669224334555e-05, + "loss": 0.0015, + "step": 88530 + }, + { + "epoch": 19.004527974868655, + "grad_norm": 0.0036690221168100834, + "learning_rate": 2.8913683222300457e-05, + "loss": 0.0814, + "step": 88540 + }, + { + "epoch": 19.004582137247468, + "grad_norm": 0.34478259086608887, + "learning_rate": 2.8910674201255367e-05, + "loss": 0.0566, + "step": 88550 + }, + { + "epoch": 19.00463629962628, + "grad_norm": 
0.0031079610344022512, + "learning_rate": 2.890766518021027e-05, + "loss": 0.0133, + "step": 88560 + }, + { + "epoch": 19.00469046200509, + "grad_norm": 0.01052310224622488, + "learning_rate": 2.8904656159165176e-05, + "loss": 0.054, + "step": 88570 + }, + { + "epoch": 19.004744624383903, + "grad_norm": 0.002915700664743781, + "learning_rate": 2.8901647138120086e-05, + "loss": 0.0416, + "step": 88580 + }, + { + "epoch": 19.004798786762716, + "grad_norm": 0.90144282579422, + "learning_rate": 2.8898638117074995e-05, + "loss": 0.1043, + "step": 88590 + }, + { + "epoch": 19.004852949141526, + "grad_norm": 0.4430139660835266, + "learning_rate": 2.8895629096029898e-05, + "loss": 0.0849, + "step": 88600 + }, + { + "epoch": 19.00490711152034, + "grad_norm": 0.03895202651619911, + "learning_rate": 2.8892620074984804e-05, + "loss": 0.0727, + "step": 88610 + }, + { + "epoch": 19.00496127389915, + "grad_norm": 0.12488018721342087, + "learning_rate": 2.8889611053939714e-05, + "loss": 0.0467, + "step": 88620 + }, + { + "epoch": 19.00501543627796, + "grad_norm": 0.5889776349067688, + "learning_rate": 2.888660203289462e-05, + "loss": 0.046, + "step": 88630 + }, + { + "epoch": 19.005069598656775, + "grad_norm": 0.004482754971832037, + "learning_rate": 2.8883593011849523e-05, + "loss": 0.0663, + "step": 88640 + }, + { + "epoch": 19.005123761035584, + "grad_norm": 1.6325160264968872, + "learning_rate": 2.8880583990804433e-05, + "loss": 0.1232, + "step": 88650 + }, + { + "epoch": 19.005177923414397, + "grad_norm": 0.006025868933647871, + "learning_rate": 2.8877574969759343e-05, + "loss": 0.1218, + "step": 88660 + }, + { + "epoch": 19.005232085793207, + "grad_norm": 0.10723893344402313, + "learning_rate": 2.8874565948714245e-05, + "loss": 0.0081, + "step": 88670 + }, + { + "epoch": 19.00528624817202, + "grad_norm": 0.009307939559221268, + "learning_rate": 2.8871556927669152e-05, + "loss": 0.0425, + "step": 88680 + }, + { + "epoch": 19.005340410550833, + "grad_norm": 1.7577540874481201, + "learning_rate": 2.886854790662406e-05, + "loss": 0.0601, + "step": 88690 + }, + { + "epoch": 19.005394572929642, + "grad_norm": 0.07674834132194519, + "learning_rate": 2.8865538885578968e-05, + "loss": 0.0709, + "step": 88700 + }, + { + "epoch": 19.005448735308455, + "grad_norm": 0.05609602481126785, + "learning_rate": 2.886252986453387e-05, + "loss": 0.0306, + "step": 88710 + }, + { + "epoch": 19.005502897687265, + "grad_norm": 0.013587960042059422, + "learning_rate": 2.885952084348878e-05, + "loss": 0.0159, + "step": 88720 + }, + { + "epoch": 19.005557060066078, + "grad_norm": 0.014681215398013592, + "learning_rate": 2.8856511822443687e-05, + "loss": 0.0166, + "step": 88730 + }, + { + "epoch": 19.00561122244489, + "grad_norm": 4.736775875091553, + "learning_rate": 2.8853502801398596e-05, + "loss": 0.0886, + "step": 88740 + }, + { + "epoch": 19.0056653848237, + "grad_norm": 0.0033060878049582243, + "learning_rate": 2.88504937803535e-05, + "loss": 0.0198, + "step": 88750 + }, + { + "epoch": 19.005719547202514, + "grad_norm": 0.034790147095918655, + "learning_rate": 2.884748475930841e-05, + "loss": 0.1439, + "step": 88760 + }, + { + "epoch": 19.005773709581323, + "grad_norm": 0.008669151924550533, + "learning_rate": 2.8844475738263315e-05, + "loss": 0.0046, + "step": 88770 + }, + { + "epoch": 19.005827871960136, + "grad_norm": 0.0883779227733612, + "learning_rate": 2.8841466717218225e-05, + "loss": 0.0547, + "step": 88780 + }, + { + "epoch": 19.00588203433895, + "grad_norm": 0.006427382584661245, + "learning_rate": 
2.8838457696173128e-05, + "loss": 0.0265, + "step": 88790 + }, + { + "epoch": 19.00593619671776, + "grad_norm": 0.00393795408308506, + "learning_rate": 2.8835448675128034e-05, + "loss": 0.0488, + "step": 88800 + }, + { + "epoch": 19.00599035909657, + "grad_norm": 0.18714463710784912, + "learning_rate": 2.8832439654082944e-05, + "loss": 0.106, + "step": 88810 + }, + { + "epoch": 19.006044521475385, + "grad_norm": 0.002441688207909465, + "learning_rate": 2.8829430633037846e-05, + "loss": 0.0257, + "step": 88820 + }, + { + "epoch": 19.006098683854194, + "grad_norm": 0.13608898222446442, + "learning_rate": 2.8826421611992756e-05, + "loss": 0.0629, + "step": 88830 + }, + { + "epoch": 19.006152846233007, + "grad_norm": 0.0598527193069458, + "learning_rate": 2.8823412590947662e-05, + "loss": 0.0616, + "step": 88840 + }, + { + "epoch": 19.006207008611817, + "grad_norm": 0.06171725317835808, + "learning_rate": 2.8820403569902572e-05, + "loss": 0.0351, + "step": 88850 + }, + { + "epoch": 19.00626117099063, + "grad_norm": 0.002526783151552081, + "learning_rate": 2.8817394548857475e-05, + "loss": 0.0343, + "step": 88860 + }, + { + "epoch": 19.006315333369443, + "grad_norm": 3.3975703716278076, + "learning_rate": 2.881438552781238e-05, + "loss": 0.0559, + "step": 88870 + }, + { + "epoch": 19.006369495748253, + "grad_norm": 0.09648863226175308, + "learning_rate": 2.881137650676729e-05, + "loss": 0.0692, + "step": 88880 + }, + { + "epoch": 19.006423658127066, + "grad_norm": 9.571417808532715, + "learning_rate": 2.8808367485722197e-05, + "loss": 0.1274, + "step": 88890 + }, + { + "epoch": 19.006477820505875, + "grad_norm": 0.44731196761131287, + "learning_rate": 2.88053584646771e-05, + "loss": 0.0545, + "step": 88900 + }, + { + "epoch": 19.006531982884688, + "grad_norm": 0.9643800854682922, + "learning_rate": 2.880234944363201e-05, + "loss": 0.0094, + "step": 88910 + }, + { + "epoch": 19.0065861452635, + "grad_norm": 0.0039345091208815575, + "learning_rate": 2.879934042258692e-05, + "loss": 0.0578, + "step": 88920 + }, + { + "epoch": 19.00664030764231, + "grad_norm": 0.002643777523189783, + "learning_rate": 2.8796331401541826e-05, + "loss": 0.0029, + "step": 88930 + }, + { + "epoch": 19.006694470021124, + "grad_norm": 3.74566912651062, + "learning_rate": 2.879332238049673e-05, + "loss": 0.2543, + "step": 88940 + }, + { + "epoch": 19.006748632399933, + "grad_norm": 1.3260761499404907, + "learning_rate": 2.8790313359451638e-05, + "loss": 0.0257, + "step": 88950 + }, + { + "epoch": 19.006802794778746, + "grad_norm": 2.213311195373535, + "learning_rate": 2.8787304338406544e-05, + "loss": 0.0088, + "step": 88960 + }, + { + "epoch": 19.00685695715756, + "grad_norm": 0.007540809456259012, + "learning_rate": 2.8784295317361447e-05, + "loss": 0.0411, + "step": 88970 + }, + { + "epoch": 19.00691111953637, + "grad_norm": 2.6436307430267334, + "learning_rate": 2.8781286296316357e-05, + "loss": 0.1042, + "step": 88980 + }, + { + "epoch": 19.006965281915182, + "grad_norm": 0.9928401112556458, + "learning_rate": 2.8778277275271263e-05, + "loss": 0.0557, + "step": 88990 + }, + { + "epoch": 19.007019444293995, + "grad_norm": 1.8672139644622803, + "learning_rate": 2.8775268254226173e-05, + "loss": 0.0688, + "step": 89000 + }, + { + "epoch": 19.007073606672805, + "grad_norm": 0.0035422525834292173, + "learning_rate": 2.8772259233181076e-05, + "loss": 0.0846, + "step": 89010 + }, + { + "epoch": 19.007127769051618, + "grad_norm": 0.6405699849128723, + "learning_rate": 2.8769250212135985e-05, + "loss": 0.0091, + "step": 89020 
+ }, + { + "epoch": 19.007181931430427, + "grad_norm": 0.0024361375253647566, + "learning_rate": 2.8766241191090892e-05, + "loss": 0.0213, + "step": 89030 + }, + { + "epoch": 19.00723609380924, + "grad_norm": 0.0039278543554246426, + "learning_rate": 2.87632321700458e-05, + "loss": 0.1383, + "step": 89040 + }, + { + "epoch": 19.007290256188053, + "grad_norm": 0.6144641041755676, + "learning_rate": 2.8760223149000704e-05, + "loss": 0.0286, + "step": 89050 + }, + { + "epoch": 19.007344418566863, + "grad_norm": 0.002848779782652855, + "learning_rate": 2.875721412795561e-05, + "loss": 0.0103, + "step": 89060 + }, + { + "epoch": 19.007398580945676, + "grad_norm": 3.4644412994384766, + "learning_rate": 2.875420510691052e-05, + "loss": 0.0676, + "step": 89070 + }, + { + "epoch": 19.007452743324485, + "grad_norm": 0.1878487765789032, + "learning_rate": 2.875119608586543e-05, + "loss": 0.0835, + "step": 89080 + }, + { + "epoch": 19.0075069057033, + "grad_norm": 0.05094999447464943, + "learning_rate": 2.8748187064820333e-05, + "loss": 0.1279, + "step": 89090 + }, + { + "epoch": 19.00756106808211, + "grad_norm": 1.3699132204055786, + "learning_rate": 2.874517804377524e-05, + "loss": 0.069, + "step": 89100 + }, + { + "epoch": 19.00761523046092, + "grad_norm": 0.07603837549686432, + "learning_rate": 2.874216902273015e-05, + "loss": 0.0202, + "step": 89110 + }, + { + "epoch": 19.007669392839734, + "grad_norm": 0.25643816590309143, + "learning_rate": 2.873916000168505e-05, + "loss": 0.0382, + "step": 89120 + }, + { + "epoch": 19.007723555218544, + "grad_norm": 0.09651478379964828, + "learning_rate": 2.8736150980639958e-05, + "loss": 0.009, + "step": 89130 + }, + { + "epoch": 19.007777717597357, + "grad_norm": 0.07127656787633896, + "learning_rate": 2.8733141959594868e-05, + "loss": 0.0676, + "step": 89140 + }, + { + "epoch": 19.00783187997617, + "grad_norm": 0.005863231141120195, + "learning_rate": 2.8730132938549774e-05, + "loss": 0.1104, + "step": 89150 + }, + { + "epoch": 19.00788604235498, + "grad_norm": 0.0025448380038142204, + "learning_rate": 2.8727123917504677e-05, + "loss": 0.0217, + "step": 89160 + }, + { + "epoch": 19.007940204733792, + "grad_norm": 0.036535799503326416, + "learning_rate": 2.8724114896459586e-05, + "loss": 0.055, + "step": 89170 + }, + { + "epoch": 19.007994367112605, + "grad_norm": 0.01136635523289442, + "learning_rate": 2.8721105875414496e-05, + "loss": 0.0536, + "step": 89180 + }, + { + "epoch": 19.008048529491415, + "grad_norm": 0.004708195570856333, + "learning_rate": 2.8718096854369402e-05, + "loss": 0.0433, + "step": 89190 + }, + { + "epoch": 19.008102691870228, + "grad_norm": 0.005660677794367075, + "learning_rate": 2.8715087833324305e-05, + "loss": 0.009, + "step": 89200 + }, + { + "epoch": 19.008156854249037, + "grad_norm": 0.005728621035814285, + "learning_rate": 2.8712078812279215e-05, + "loss": 0.0079, + "step": 89210 + }, + { + "epoch": 19.00821101662785, + "grad_norm": 0.004217789974063635, + "learning_rate": 2.870906979123412e-05, + "loss": 0.0299, + "step": 89220 + }, + { + "epoch": 19.008265179006663, + "grad_norm": 0.07609688490629196, + "learning_rate": 2.870606077018903e-05, + "loss": 0.1599, + "step": 89230 + }, + { + "epoch": 19.008319341385473, + "grad_norm": 0.7996937036514282, + "learning_rate": 2.8703051749143934e-05, + "loss": 0.0046, + "step": 89240 + }, + { + "epoch": 19.008373503764286, + "grad_norm": 0.07647044956684113, + "learning_rate": 2.870004272809884e-05, + "loss": 0.0252, + "step": 89250 + }, + { + "epoch": 19.008427666143096, + 
"grad_norm": 0.002725480357185006, + "learning_rate": 2.869703370705375e-05, + "loss": 0.0491, + "step": 89260 + }, + { + "epoch": 19.00848182852191, + "grad_norm": 1.2505360841751099, + "learning_rate": 2.8694024686008652e-05, + "loss": 0.1115, + "step": 89270 + }, + { + "epoch": 19.00853599090072, + "grad_norm": 0.88019198179245, + "learning_rate": 2.8691015664963562e-05, + "loss": 0.0269, + "step": 89280 + }, + { + "epoch": 19.00859015327953, + "grad_norm": 0.03252559155225754, + "learning_rate": 2.868800664391847e-05, + "loss": 0.0423, + "step": 89290 + }, + { + "epoch": 19.008644315658344, + "grad_norm": 0.0055719646625220776, + "learning_rate": 2.8684997622873378e-05, + "loss": 0.0655, + "step": 89300 + }, + { + "epoch": 19.008698478037154, + "grad_norm": 0.07902281731367111, + "learning_rate": 2.868198860182828e-05, + "loss": 0.1613, + "step": 89310 + }, + { + "epoch": 19.008752640415967, + "grad_norm": 0.0024833923671394587, + "learning_rate": 2.8678979580783187e-05, + "loss": 0.0011, + "step": 89320 + }, + { + "epoch": 19.00880680279478, + "grad_norm": 0.0037650475278496742, + "learning_rate": 2.8675970559738097e-05, + "loss": 0.0553, + "step": 89330 + }, + { + "epoch": 19.00886096517359, + "grad_norm": 0.027741998434066772, + "learning_rate": 2.8672961538693007e-05, + "loss": 0.0033, + "step": 89340 + }, + { + "epoch": 19.008915127552402, + "grad_norm": 0.757101833820343, + "learning_rate": 2.866995251764791e-05, + "loss": 0.0876, + "step": 89350 + }, + { + "epoch": 19.008969289931215, + "grad_norm": 0.8436241745948792, + "learning_rate": 2.8666943496602816e-05, + "loss": 0.0608, + "step": 89360 + }, + { + "epoch": 19.009023452310025, + "grad_norm": 0.6704066395759583, + "learning_rate": 2.8663934475557725e-05, + "loss": 0.0637, + "step": 89370 + }, + { + "epoch": 19.009077614688838, + "grad_norm": 0.002346466761082411, + "learning_rate": 2.866092545451263e-05, + "loss": 0.0069, + "step": 89380 + }, + { + "epoch": 19.009131777067648, + "grad_norm": 0.09418988972902298, + "learning_rate": 2.8657916433467535e-05, + "loss": 0.0832, + "step": 89390 + }, + { + "epoch": 19.00918593944646, + "grad_norm": 0.0023289374075829983, + "learning_rate": 2.8654907412422444e-05, + "loss": 0.0076, + "step": 89400 + }, + { + "epoch": 19.009240101825274, + "grad_norm": 0.010522122494876385, + "learning_rate": 2.865189839137735e-05, + "loss": 0.0638, + "step": 89410 + }, + { + "epoch": 19.009294264204083, + "grad_norm": 0.002211586106568575, + "learning_rate": 2.8648889370332253e-05, + "loss": 0.0162, + "step": 89420 + }, + { + "epoch": 19.009348426582896, + "grad_norm": 0.0021351398900151253, + "learning_rate": 2.8645880349287163e-05, + "loss": 0.0504, + "step": 89430 + }, + { + "epoch": 19.009402588961706, + "grad_norm": 3.3857262134552, + "learning_rate": 2.8642871328242073e-05, + "loss": 0.0554, + "step": 89440 + }, + { + "epoch": 19.00945675134052, + "grad_norm": 0.03635110706090927, + "learning_rate": 2.863986230719698e-05, + "loss": 0.0235, + "step": 89450 + }, + { + "epoch": 19.009510913719332, + "grad_norm": 0.0021344854030758142, + "learning_rate": 2.8636853286151882e-05, + "loss": 0.0138, + "step": 89460 + }, + { + "epoch": 19.00956507609814, + "grad_norm": 3.263249397277832, + "learning_rate": 2.863384426510679e-05, + "loss": 0.0243, + "step": 89470 + }, + { + "epoch": 19.009619238476954, + "grad_norm": 11.611098289489746, + "learning_rate": 2.8630835244061698e-05, + "loss": 0.1046, + "step": 89480 + }, + { + "epoch": 19.009673400855764, + "grad_norm": 1.647548794746399, + 
"learning_rate": 2.8627826223016607e-05, + "loss": 0.007, + "step": 89490 + }, + { + "epoch": 19.009727563234577, + "grad_norm": 1.3663337230682373, + "learning_rate": 2.862481720197151e-05, + "loss": 0.0216, + "step": 89500 + }, + { + "epoch": 19.00978172561339, + "grad_norm": 0.0019492004066705704, + "learning_rate": 2.8621808180926417e-05, + "loss": 0.0048, + "step": 89510 + }, + { + "epoch": 19.0098358879922, + "grad_norm": 0.046884387731552124, + "learning_rate": 2.8618799159881326e-05, + "loss": 0.0015, + "step": 89520 + }, + { + "epoch": 19.009890050371013, + "grad_norm": 0.002362202387303114, + "learning_rate": 2.8615790138836236e-05, + "loss": 0.0137, + "step": 89530 + }, + { + "epoch": 19.009944212749826, + "grad_norm": 0.001758052152581513, + "learning_rate": 2.861278111779114e-05, + "loss": 0.0305, + "step": 89540 + }, + { + "epoch": 19.009998375128635, + "grad_norm": 0.8875904679298401, + "learning_rate": 2.8609772096746045e-05, + "loss": 0.13, + "step": 89550 + }, + { + "epoch": 19.01005253750745, + "grad_norm": 0.36171796917915344, + "learning_rate": 2.8606763075700955e-05, + "loss": 0.0079, + "step": 89560 + }, + { + "epoch": 19.010106699886258, + "grad_norm": 0.02218085527420044, + "learning_rate": 2.8603754054655858e-05, + "loss": 0.004, + "step": 89570 + }, + { + "epoch": 19.01016086226507, + "grad_norm": 0.018342357128858566, + "learning_rate": 2.8600745033610764e-05, + "loss": 0.0325, + "step": 89580 + }, + { + "epoch": 19.010215024643884, + "grad_norm": 0.42554646730422974, + "learning_rate": 2.8597736012565674e-05, + "loss": 0.0316, + "step": 89590 + }, + { + "epoch": 19.010269187022693, + "grad_norm": 4.890683174133301, + "learning_rate": 2.8594726991520583e-05, + "loss": 0.0503, + "step": 89600 + }, + { + "epoch": 19.010323349401506, + "grad_norm": 0.04615088179707527, + "learning_rate": 2.8591717970475486e-05, + "loss": 0.0701, + "step": 89610 + }, + { + "epoch": 19.010377511780316, + "grad_norm": 0.004082201048731804, + "learning_rate": 2.8588708949430392e-05, + "loss": 0.0571, + "step": 89620 + }, + { + "epoch": 19.01043167415913, + "grad_norm": 0.003645175602287054, + "learning_rate": 2.8585699928385302e-05, + "loss": 0.0683, + "step": 89630 + }, + { + "epoch": 19.010485836537942, + "grad_norm": 0.016529066488146782, + "learning_rate": 2.858269090734021e-05, + "loss": 0.0062, + "step": 89640 + }, + { + "epoch": 19.01053999891675, + "grad_norm": 0.2836458086967468, + "learning_rate": 2.857968188629511e-05, + "loss": 0.0667, + "step": 89650 + }, + { + "epoch": 19.010594161295565, + "grad_norm": 0.00370216672308743, + "learning_rate": 2.857667286525002e-05, + "loss": 0.0033, + "step": 89660 + }, + { + "epoch": 19.010648323674374, + "grad_norm": 0.16570550203323364, + "learning_rate": 2.8573663844204927e-05, + "loss": 0.3227, + "step": 89670 + }, + { + "epoch": 19.010702486053187, + "grad_norm": 0.8348485827445984, + "learning_rate": 2.8570654823159837e-05, + "loss": 0.1166, + "step": 89680 + }, + { + "epoch": 19.010756648432, + "grad_norm": 0.026349348947405815, + "learning_rate": 2.856764580211474e-05, + "loss": 0.0034, + "step": 89690 + }, + { + "epoch": 19.01081081081081, + "grad_norm": 0.05952895060181618, + "learning_rate": 2.856463678106965e-05, + "loss": 0.0726, + "step": 89700 + }, + { + "epoch": 19.010864973189623, + "grad_norm": 0.013049258850514889, + "learning_rate": 2.8561627760024556e-05, + "loss": 0.0186, + "step": 89710 + }, + { + "epoch": 19.010919135568436, + "grad_norm": 0.004109680652618408, + "learning_rate": 2.855861873897946e-05, + "loss": 
0.0647, + "step": 89720 + }, + { + "epoch": 19.010973297947245, + "grad_norm": 1.0402534008026123, + "learning_rate": 2.8555609717934368e-05, + "loss": 0.0727, + "step": 89730 + }, + { + "epoch": 19.01102746032606, + "grad_norm": 0.008073715493083, + "learning_rate": 2.8552600696889274e-05, + "loss": 0.0014, + "step": 89740 + }, + { + "epoch": 19.011081622704868, + "grad_norm": 0.6937558650970459, + "learning_rate": 2.8549591675844184e-05, + "loss": 0.104, + "step": 89750 + }, + { + "epoch": 19.01113578508368, + "grad_norm": 0.029639892280101776, + "learning_rate": 2.8546582654799087e-05, + "loss": 0.0116, + "step": 89760 + }, + { + "epoch": 19.011189947462494, + "grad_norm": 0.008320018649101257, + "learning_rate": 2.8543573633753993e-05, + "loss": 0.018, + "step": 89770 + }, + { + "epoch": 19.011244109841304, + "grad_norm": 0.015819422900676727, + "learning_rate": 2.8540564612708903e-05, + "loss": 0.1038, + "step": 89780 + }, + { + "epoch": 19.011298272220117, + "grad_norm": 0.034180544316768646, + "learning_rate": 2.8537555591663813e-05, + "loss": 0.023, + "step": 89790 + }, + { + "epoch": 19.011352434598926, + "grad_norm": 0.0615554079413414, + "learning_rate": 2.8534546570618716e-05, + "loss": 0.0412, + "step": 89800 + }, + { + "epoch": 19.01140659697774, + "grad_norm": 1.5429996252059937, + "learning_rate": 2.8531537549573622e-05, + "loss": 0.0408, + "step": 89810 + }, + { + "epoch": 19.011460759356552, + "grad_norm": 0.009603675454854965, + "learning_rate": 2.852852852852853e-05, + "loss": 0.0082, + "step": 89820 + }, + { + "epoch": 19.011514921735362, + "grad_norm": 0.0045327674597501755, + "learning_rate": 2.8525519507483438e-05, + "loss": 0.0144, + "step": 89830 + }, + { + "epoch": 19.011569084114175, + "grad_norm": 0.4926535189151764, + "learning_rate": 2.852251048643834e-05, + "loss": 0.0051, + "step": 89840 + }, + { + "epoch": 19.011623246492984, + "grad_norm": 0.030907128006219864, + "learning_rate": 2.851950146539325e-05, + "loss": 0.0021, + "step": 89850 + }, + { + "epoch": 19.011677408871797, + "grad_norm": 0.0028501469641923904, + "learning_rate": 2.851649244434816e-05, + "loss": 0.0824, + "step": 89860 + }, + { + "epoch": 19.01173157125061, + "grad_norm": 0.11628931760787964, + "learning_rate": 2.8513483423303063e-05, + "loss": 0.0356, + "step": 89870 + }, + { + "epoch": 19.01178573362942, + "grad_norm": 0.007104337215423584, + "learning_rate": 2.851047440225797e-05, + "loss": 0.0945, + "step": 89880 + }, + { + "epoch": 19.011839896008233, + "grad_norm": 0.003009240375831723, + "learning_rate": 2.850746538121288e-05, + "loss": 0.2289, + "step": 89890 + }, + { + "epoch": 19.011894058387043, + "grad_norm": 0.016926757991313934, + "learning_rate": 2.8504456360167785e-05, + "loss": 0.0901, + "step": 89900 + }, + { + "epoch": 19.011948220765856, + "grad_norm": 0.2533503472805023, + "learning_rate": 2.8501447339122688e-05, + "loss": 0.0031, + "step": 89910 + }, + { + "epoch": 19.01200238314467, + "grad_norm": 0.003185603301972151, + "learning_rate": 2.8498438318077598e-05, + "loss": 0.1243, + "step": 89920 + }, + { + "epoch": 19.01205654552348, + "grad_norm": 0.014094509184360504, + "learning_rate": 2.8495429297032504e-05, + "loss": 0.038, + "step": 89930 + }, + { + "epoch": 19.01211070790229, + "grad_norm": 0.0035059682559221983, + "learning_rate": 2.8492420275987414e-05, + "loss": 0.0393, + "step": 89940 + }, + { + "epoch": 19.012164870281104, + "grad_norm": 0.004007234238088131, + "learning_rate": 2.8489411254942316e-05, + "loss": 0.0071, + "step": 89950 + }, + { + "epoch": 
19.012219032659914, + "grad_norm": 0.0038846649695187807, + "learning_rate": 2.8486402233897226e-05, + "loss": 0.0475, + "step": 89960 + }, + { + "epoch": 19.012273195038727, + "grad_norm": 0.01232766080647707, + "learning_rate": 2.8483393212852132e-05, + "loss": 0.1033, + "step": 89970 + }, + { + "epoch": 19.012327357417536, + "grad_norm": 1.2100735902786255, + "learning_rate": 2.8480384191807042e-05, + "loss": 0.0441, + "step": 89980 + }, + { + "epoch": 19.01238151979635, + "grad_norm": 0.12090238928794861, + "learning_rate": 2.8477375170761945e-05, + "loss": 0.1409, + "step": 89990 + }, + { + "epoch": 19.012435682175163, + "grad_norm": 0.02103259228169918, + "learning_rate": 2.847436614971685e-05, + "loss": 0.1246, + "step": 90000 + }, + { + "epoch": 19.012489844553972, + "grad_norm": 0.0036160671152174473, + "learning_rate": 2.847135712867176e-05, + "loss": 0.0088, + "step": 90010 + }, + { + "epoch": 19.012544006932785, + "grad_norm": 0.0058298916555941105, + "learning_rate": 2.8468348107626664e-05, + "loss": 0.1072, + "step": 90020 + }, + { + "epoch": 19.012598169311595, + "grad_norm": 3.654022216796875, + "learning_rate": 2.846533908658157e-05, + "loss": 0.0634, + "step": 90030 + }, + { + "epoch": 19.012652331690408, + "grad_norm": 0.003977067768573761, + "learning_rate": 2.846233006553648e-05, + "loss": 0.0651, + "step": 90040 + }, + { + "epoch": 19.01270649406922, + "grad_norm": 0.10766804963350296, + "learning_rate": 2.845932104449139e-05, + "loss": 0.0399, + "step": 90050 + }, + { + "epoch": 19.01276065644803, + "grad_norm": 0.037930309772491455, + "learning_rate": 2.8456312023446292e-05, + "loss": 0.0602, + "step": 90060 + }, + { + "epoch": 19.012814818826843, + "grad_norm": 0.005284378305077553, + "learning_rate": 2.84533030024012e-05, + "loss": 0.0769, + "step": 90070 + }, + { + "epoch": 19.012868981205653, + "grad_norm": 0.03961551561951637, + "learning_rate": 2.8450293981356108e-05, + "loss": 0.0744, + "step": 90080 + }, + { + "epoch": 19.012923143584466, + "grad_norm": 3.8926138877868652, + "learning_rate": 2.8447284960311014e-05, + "loss": 0.0761, + "step": 90090 + }, + { + "epoch": 19.01297730596328, + "grad_norm": 0.005378416273742914, + "learning_rate": 2.8444275939265917e-05, + "loss": 0.0442, + "step": 90100 + }, + { + "epoch": 19.01303146834209, + "grad_norm": 0.20737357437610626, + "learning_rate": 2.8441266918220827e-05, + "loss": 0.011, + "step": 90110 + }, + { + "epoch": 19.0130856307209, + "grad_norm": 0.34313055872917175, + "learning_rate": 2.8438257897175737e-05, + "loss": 0.028, + "step": 90120 + }, + { + "epoch": 19.013139793099715, + "grad_norm": 1.4560275077819824, + "learning_rate": 2.8435248876130643e-05, + "loss": 0.0464, + "step": 90130 + }, + { + "epoch": 19.013193955478524, + "grad_norm": 7.390440940856934, + "learning_rate": 2.8432239855085546e-05, + "loss": 0.1547, + "step": 90140 + }, + { + "epoch": 19.013248117857337, + "grad_norm": 0.1252073496580124, + "learning_rate": 2.8429230834040455e-05, + "loss": 0.0514, + "step": 90150 + }, + { + "epoch": 19.013302280236147, + "grad_norm": 0.003305608406662941, + "learning_rate": 2.8426221812995362e-05, + "loss": 0.0664, + "step": 90160 + }, + { + "epoch": 19.01335644261496, + "grad_norm": 0.003342210315167904, + "learning_rate": 2.8423212791950265e-05, + "loss": 0.0064, + "step": 90170 + }, + { + "epoch": 19.013410604993773, + "grad_norm": 0.04461684823036194, + "learning_rate": 2.8420203770905174e-05, + "loss": 0.0046, + "step": 90180 + }, + { + "epoch": 19.013464767372582, + "grad_norm": 
0.0035326266661286354, + "learning_rate": 2.841719474986008e-05, + "loss": 0.0894, + "step": 90190 + }, + { + "epoch": 19.013518929751395, + "grad_norm": 0.6493744850158691, + "learning_rate": 2.841418572881499e-05, + "loss": 0.003, + "step": 90200 + }, + { + "epoch": 19.013573092130205, + "grad_norm": 0.48701468110084534, + "learning_rate": 2.8411176707769893e-05, + "loss": 0.0428, + "step": 90210 + }, + { + "epoch": 19.013627254509018, + "grad_norm": 0.044412340968847275, + "learning_rate": 2.8408167686724803e-05, + "loss": 0.0976, + "step": 90220 + }, + { + "epoch": 19.01368141688783, + "grad_norm": 0.032399311661720276, + "learning_rate": 2.840515866567971e-05, + "loss": 0.0539, + "step": 90230 + }, + { + "epoch": 19.01373557926664, + "grad_norm": 0.30458712577819824, + "learning_rate": 2.840214964463462e-05, + "loss": 0.0189, + "step": 90240 + }, + { + "epoch": 19.013789741645454, + "grad_norm": 0.15228575468063354, + "learning_rate": 2.839914062358952e-05, + "loss": 0.0152, + "step": 90250 + }, + { + "epoch": 19.013843904024263, + "grad_norm": 0.004150874447077513, + "learning_rate": 2.8396131602544428e-05, + "loss": 0.0577, + "step": 90260 + }, + { + "epoch": 19.013898066403076, + "grad_norm": 3.1626272201538086, + "learning_rate": 2.8393122581499338e-05, + "loss": 0.0434, + "step": 90270 + }, + { + "epoch": 19.01395222878189, + "grad_norm": 2.3472814559936523, + "learning_rate": 2.8390113560454247e-05, + "loss": 0.0383, + "step": 90280 + }, + { + "epoch": 19.0140063911607, + "grad_norm": 0.12881071865558624, + "learning_rate": 2.8387104539409147e-05, + "loss": 0.0377, + "step": 90290 + }, + { + "epoch": 19.01406055353951, + "grad_norm": 0.30170291662216187, + "learning_rate": 2.8384095518364056e-05, + "loss": 0.1506, + "step": 90300 + }, + { + "epoch": 19.014114715918325, + "grad_norm": 0.0028045775834470987, + "learning_rate": 2.8381086497318966e-05, + "loss": 0.1289, + "step": 90310 + }, + { + "epoch": 19.014168878297134, + "grad_norm": 7.863699436187744, + "learning_rate": 2.837807747627387e-05, + "loss": 0.1263, + "step": 90320 + }, + { + "epoch": 19.014223040675947, + "grad_norm": 1.5359517335891724, + "learning_rate": 2.8375068455228775e-05, + "loss": 0.0273, + "step": 90330 + }, + { + "epoch": 19.014277203054757, + "grad_norm": 0.002646627137437463, + "learning_rate": 2.8372059434183685e-05, + "loss": 0.0372, + "step": 90340 + }, + { + "epoch": 19.01433136543357, + "grad_norm": 0.4263642430305481, + "learning_rate": 2.836905041313859e-05, + "loss": 0.0368, + "step": 90350 + }, + { + "epoch": 19.014385527812383, + "grad_norm": 0.0038463131058961153, + "learning_rate": 2.8366041392093494e-05, + "loss": 0.0264, + "step": 90360 + }, + { + "epoch": 19.014439690191193, + "grad_norm": 2.6281087398529053, + "learning_rate": 2.8363032371048404e-05, + "loss": 0.0225, + "step": 90370 + }, + { + "epoch": 19.014493852570006, + "grad_norm": 0.004109704401344061, + "learning_rate": 2.8360023350003313e-05, + "loss": 0.0072, + "step": 90380 + }, + { + "epoch": 19.014548014948815, + "grad_norm": 0.011359667405486107, + "learning_rate": 2.835701432895822e-05, + "loss": 0.1232, + "step": 90390 + }, + { + "epoch": 19.014602177327628, + "grad_norm": 0.06473593413829803, + "learning_rate": 2.8354005307913122e-05, + "loss": 0.0213, + "step": 90400 + }, + { + "epoch": 19.01465633970644, + "grad_norm": 0.11664841324090958, + "learning_rate": 2.8350996286868032e-05, + "loss": 0.0256, + "step": 90410 + }, + { + "epoch": 19.01471050208525, + "grad_norm": 2.847205638885498, + "learning_rate": 
2.834798726582294e-05, + "loss": 0.0565, + "step": 90420 + }, + { + "epoch": 19.014764664464064, + "grad_norm": 0.031611368060112, + "learning_rate": 2.8344978244777848e-05, + "loss": 0.062, + "step": 90430 + }, + { + "epoch": 19.014818826842873, + "grad_norm": 0.0028853637631982565, + "learning_rate": 2.834196922373275e-05, + "loss": 0.043, + "step": 90440 + }, + { + "epoch": 19.014872989221686, + "grad_norm": 0.04731445759534836, + "learning_rate": 2.8338960202687657e-05, + "loss": 0.0057, + "step": 90450 + }, + { + "epoch": 19.0149271516005, + "grad_norm": 2.305652379989624, + "learning_rate": 2.8335951181642567e-05, + "loss": 0.0688, + "step": 90460 + }, + { + "epoch": 19.01498131397931, + "grad_norm": 0.055396001785993576, + "learning_rate": 2.833294216059747e-05, + "loss": 0.1088, + "step": 90470 + }, + { + "epoch": 19.015035476358122, + "grad_norm": 2.370049476623535, + "learning_rate": 2.832993313955238e-05, + "loss": 0.1638, + "step": 90480 + }, + { + "epoch": 19.015089638736935, + "grad_norm": 0.032152850180864334, + "learning_rate": 2.8326924118507286e-05, + "loss": 0.1213, + "step": 90490 + }, + { + "epoch": 19.015143801115745, + "grad_norm": 0.20882762968540192, + "learning_rate": 2.8323915097462195e-05, + "loss": 0.1162, + "step": 90500 + }, + { + "epoch": 19.015197963494558, + "grad_norm": 0.033091988414525986, + "learning_rate": 2.8320906076417098e-05, + "loss": 0.0165, + "step": 90510 + }, + { + "epoch": 19.015252125873367, + "grad_norm": 0.03948738053441048, + "learning_rate": 2.8317897055372005e-05, + "loss": 0.0866, + "step": 90520 + }, + { + "epoch": 19.01530628825218, + "grad_norm": 0.0253798495978117, + "learning_rate": 2.8314888034326914e-05, + "loss": 0.0498, + "step": 90530 + }, + { + "epoch": 19.015360450630993, + "grad_norm": 0.033434540033340454, + "learning_rate": 2.8311879013281824e-05, + "loss": 0.0571, + "step": 90540 + }, + { + "epoch": 19.015414613009803, + "grad_norm": 0.0031627544667571783, + "learning_rate": 2.8308869992236727e-05, + "loss": 0.0309, + "step": 90550 + }, + { + "epoch": 19.015468775388616, + "grad_norm": 0.4012841582298279, + "learning_rate": 2.8305860971191633e-05, + "loss": 0.048, + "step": 90560 + }, + { + "epoch": 19.015522937767425, + "grad_norm": 2.893521785736084, + "learning_rate": 2.8302851950146543e-05, + "loss": 0.0384, + "step": 90570 + }, + { + "epoch": 19.01557710014624, + "grad_norm": 0.07723569869995117, + "learning_rate": 2.829984292910145e-05, + "loss": 0.0499, + "step": 90580 + }, + { + "epoch": 19.01563126252505, + "grad_norm": 0.025471068918704987, + "learning_rate": 2.8296833908056352e-05, + "loss": 0.0179, + "step": 90590 + }, + { + "epoch": 19.01568542490386, + "grad_norm": 0.10881607234477997, + "learning_rate": 2.829382488701126e-05, + "loss": 0.0109, + "step": 90600 + }, + { + "epoch": 19.015739587282674, + "grad_norm": 0.058695707470178604, + "learning_rate": 2.8290815865966168e-05, + "loss": 0.0011, + "step": 90610 + }, + { + "epoch": 19.015793749661484, + "grad_norm": 0.003971384838223457, + "learning_rate": 2.828780684492107e-05, + "loss": 0.0614, + "step": 90620 + }, + { + "epoch": 19.015847912040297, + "grad_norm": 1.8594446182250977, + "learning_rate": 2.828479782387598e-05, + "loss": 0.0319, + "step": 90630 + }, + { + "epoch": 19.01590207441911, + "grad_norm": 0.004293514881283045, + "learning_rate": 2.828178880283089e-05, + "loss": 0.0763, + "step": 90640 + }, + { + "epoch": 19.01595623679792, + "grad_norm": 5.519393444061279, + "learning_rate": 2.8278779781785796e-05, + "loss": 0.1468, + "step": 90650 
+ }, + { + "epoch": 19.016010399176732, + "grad_norm": 0.4885536730289459, + "learning_rate": 2.82757707607407e-05, + "loss": 0.0299, + "step": 90660 + }, + { + "epoch": 19.016064561555545, + "grad_norm": 0.5221139192581177, + "learning_rate": 2.827276173969561e-05, + "loss": 0.0156, + "step": 90670 + }, + { + "epoch": 19.016118723934355, + "grad_norm": 0.002880376297980547, + "learning_rate": 2.8269752718650515e-05, + "loss": 0.0369, + "step": 90680 + }, + { + "epoch": 19.016172886313168, + "grad_norm": 0.37421324849128723, + "learning_rate": 2.8266743697605425e-05, + "loss": 0.057, + "step": 90690 + }, + { + "epoch": 19.016227048691977, + "grad_norm": 4.627808570861816, + "learning_rate": 2.8263734676560328e-05, + "loss": 0.0552, + "step": 90700 + }, + { + "epoch": 19.01628121107079, + "grad_norm": 1.9426082372665405, + "learning_rate": 2.8260725655515234e-05, + "loss": 0.0595, + "step": 90710 + }, + { + "epoch": 19.016335373449603, + "grad_norm": 0.0029968232847750187, + "learning_rate": 2.8257716634470144e-05, + "loss": 0.0519, + "step": 90720 + }, + { + "epoch": 19.016389535828413, + "grad_norm": 2.9663519859313965, + "learning_rate": 2.8254707613425053e-05, + "loss": 0.1264, + "step": 90730 + }, + { + "epoch": 19.016443698207226, + "grad_norm": 0.13900525867938995, + "learning_rate": 2.8251698592379956e-05, + "loss": 0.0135, + "step": 90740 + }, + { + "epoch": 19.016497860586036, + "grad_norm": 1.3564074039459229, + "learning_rate": 2.8248689571334862e-05, + "loss": 0.1048, + "step": 90750 + }, + { + "epoch": 19.01655202296485, + "grad_norm": 0.011647377163171768, + "learning_rate": 2.8245680550289772e-05, + "loss": 0.0722, + "step": 90760 + }, + { + "epoch": 19.01660618534366, + "grad_norm": 0.0909603163599968, + "learning_rate": 2.8242671529244675e-05, + "loss": 0.0776, + "step": 90770 + }, + { + "epoch": 19.01666034772247, + "grad_norm": 3.508618116378784, + "learning_rate": 2.823966250819958e-05, + "loss": 0.0618, + "step": 90780 + }, + { + "epoch": 19.016714510101284, + "grad_norm": 0.1328079104423523, + "learning_rate": 2.823665348715449e-05, + "loss": 0.06, + "step": 90790 + }, + { + "epoch": 19.016768672480094, + "grad_norm": 0.0034572286531329155, + "learning_rate": 2.82336444661094e-05, + "loss": 0.0078, + "step": 90800 + }, + { + "epoch": 19.016822834858907, + "grad_norm": 6.197504997253418, + "learning_rate": 2.8230635445064303e-05, + "loss": 0.0242, + "step": 90810 + }, + { + "epoch": 19.01687699723772, + "grad_norm": 0.0036693913862109184, + "learning_rate": 2.822762642401921e-05, + "loss": 0.0075, + "step": 90820 + }, + { + "epoch": 19.01693115961653, + "grad_norm": 1.372385025024414, + "learning_rate": 2.822461740297412e-05, + "loss": 0.0201, + "step": 90830 + }, + { + "epoch": 19.016985321995342, + "grad_norm": 0.0024339575320482254, + "learning_rate": 2.8221608381929026e-05, + "loss": 0.0735, + "step": 90840 + }, + { + "epoch": 19.017039484374155, + "grad_norm": 0.00791244488209486, + "learning_rate": 2.821859936088393e-05, + "loss": 0.0131, + "step": 90850 + }, + { + "epoch": 19.017093646752965, + "grad_norm": 0.0396050326526165, + "learning_rate": 2.8215590339838838e-05, + "loss": 0.0397, + "step": 90860 + }, + { + "epoch": 19.017147809131778, + "grad_norm": 2.3745763301849365, + "learning_rate": 2.8212581318793744e-05, + "loss": 0.0763, + "step": 90870 + }, + { + "epoch": 19.017201971510588, + "grad_norm": 0.11599597334861755, + "learning_rate": 2.8209572297748654e-05, + "loss": 0.0357, + "step": 90880 + }, + { + "epoch": 19.0172561338894, + "grad_norm": 
0.005531660281121731, + "learning_rate": 2.8206563276703557e-05, + "loss": 0.0241, + "step": 90890 + }, + { + "epoch": 19.017310296268214, + "grad_norm": 0.09128675609827042, + "learning_rate": 2.8203554255658467e-05, + "loss": 0.1946, + "step": 90900 + }, + { + "epoch": 19.017364458647023, + "grad_norm": 0.002958750817924738, + "learning_rate": 2.8200545234613373e-05, + "loss": 0.0151, + "step": 90910 + }, + { + "epoch": 19.017418621025836, + "grad_norm": 0.7880770564079285, + "learning_rate": 2.8197536213568276e-05, + "loss": 0.0651, + "step": 90920 + }, + { + "epoch": 19.017472783404646, + "grad_norm": 4.558811664581299, + "learning_rate": 2.8194527192523186e-05, + "loss": 0.059, + "step": 90930 + }, + { + "epoch": 19.01752694578346, + "grad_norm": 0.9456709027290344, + "learning_rate": 2.8191518171478092e-05, + "loss": 0.0699, + "step": 90940 + }, + { + "epoch": 19.017581108162272, + "grad_norm": 0.0049814824014902115, + "learning_rate": 2.8188509150433e-05, + "loss": 0.0186, + "step": 90950 + }, + { + "epoch": 19.01763527054108, + "grad_norm": 3.3740086555480957, + "learning_rate": 2.8185500129387904e-05, + "loss": 0.067, + "step": 90960 + }, + { + "epoch": 19.017689432919894, + "grad_norm": 0.007825317792594433, + "learning_rate": 2.818249110834281e-05, + "loss": 0.0494, + "step": 90970 + }, + { + "epoch": 19.017743595298704, + "grad_norm": 0.006575364153832197, + "learning_rate": 2.817948208729772e-05, + "loss": 0.0807, + "step": 90980 + }, + { + "epoch": 19.017797757677517, + "grad_norm": 0.03627527132630348, + "learning_rate": 2.817647306625263e-05, + "loss": 0.0191, + "step": 90990 + }, + { + "epoch": 19.01785192005633, + "grad_norm": 0.058938488364219666, + "learning_rate": 2.8173464045207533e-05, + "loss": 0.0565, + "step": 91000 + }, + { + "epoch": 19.01790608243514, + "grad_norm": 0.34616807103157043, + "learning_rate": 2.817045502416244e-05, + "loss": 0.059, + "step": 91010 + }, + { + "epoch": 19.017960244813953, + "grad_norm": 0.002755546011030674, + "learning_rate": 2.816744600311735e-05, + "loss": 0.0359, + "step": 91020 + }, + { + "epoch": 19.018014407192762, + "grad_norm": 10.107378959655762, + "learning_rate": 2.8164436982072255e-05, + "loss": 0.2723, + "step": 91030 + }, + { + "epoch": 19.018068569571575, + "grad_norm": 0.002876996761187911, + "learning_rate": 2.8161427961027158e-05, + "loss": 0.106, + "step": 91040 + }, + { + "epoch": 19.01812273195039, + "grad_norm": 0.09243125468492508, + "learning_rate": 2.8158418939982068e-05, + "loss": 0.0043, + "step": 91050 + }, + { + "epoch": 19.018176894329198, + "grad_norm": 0.09822073578834534, + "learning_rate": 2.8155409918936977e-05, + "loss": 0.0801, + "step": 91060 + }, + { + "epoch": 19.01823105670801, + "grad_norm": 0.0032983256969600916, + "learning_rate": 2.815240089789188e-05, + "loss": 0.0974, + "step": 91070 + }, + { + "epoch": 19.018285219086824, + "grad_norm": 1.7596814632415771, + "learning_rate": 2.8149391876846786e-05, + "loss": 0.0189, + "step": 91080 + }, + { + "epoch": 19.018339381465633, + "grad_norm": 0.024921655654907227, + "learning_rate": 2.8146382855801696e-05, + "loss": 0.0407, + "step": 91090 + }, + { + "epoch": 19.018393543844446, + "grad_norm": 0.003415880026295781, + "learning_rate": 2.8143373834756602e-05, + "loss": 0.077, + "step": 91100 + }, + { + "epoch": 19.018447706223256, + "grad_norm": 0.042320091277360916, + "learning_rate": 2.8140364813711505e-05, + "loss": 0.0127, + "step": 91110 + }, + { + "epoch": 19.01850186860207, + "grad_norm": 0.006342081353068352, + "learning_rate": 
2.8137355792666415e-05, + "loss": 0.0333, + "step": 91120 + }, + { + "epoch": 19.018556030980882, + "grad_norm": 0.0023646452464163303, + "learning_rate": 2.813434677162132e-05, + "loss": 0.0011, + "step": 91130 + }, + { + "epoch": 19.01861019335969, + "grad_norm": 1.6895259618759155, + "learning_rate": 2.813133775057623e-05, + "loss": 0.0817, + "step": 91140 + }, + { + "epoch": 19.018664355738505, + "grad_norm": 0.03637620806694031, + "learning_rate": 2.8128328729531134e-05, + "loss": 0.0702, + "step": 91150 + }, + { + "epoch": 19.018718518117314, + "grad_norm": 0.0030694433953613043, + "learning_rate": 2.8125319708486043e-05, + "loss": 0.0783, + "step": 91160 + }, + { + "epoch": 19.018772680496127, + "grad_norm": 0.017964644357562065, + "learning_rate": 2.812231068744095e-05, + "loss": 0.0495, + "step": 91170 + }, + { + "epoch": 19.01882684287494, + "grad_norm": 1.236213207244873, + "learning_rate": 2.811930166639586e-05, + "loss": 0.0659, + "step": 91180 + }, + { + "epoch": 19.01888100525375, + "grad_norm": 0.0040317620150744915, + "learning_rate": 2.8116292645350762e-05, + "loss": 0.0869, + "step": 91190 + }, + { + "epoch": 19.018935167632563, + "grad_norm": 0.19218936562538147, + "learning_rate": 2.811328362430567e-05, + "loss": 0.0482, + "step": 91200 + }, + { + "epoch": 19.018989330011372, + "grad_norm": 0.7074936032295227, + "learning_rate": 2.8110274603260578e-05, + "loss": 0.0607, + "step": 91210 + }, + { + "epoch": 19.019043492390185, + "grad_norm": 0.0026942037511616945, + "learning_rate": 2.810726558221548e-05, + "loss": 0.0414, + "step": 91220 + }, + { + "epoch": 19.019097654769, + "grad_norm": 0.10483492910861969, + "learning_rate": 2.8104256561170387e-05, + "loss": 0.0512, + "step": 91230 + }, + { + "epoch": 19.019151817147808, + "grad_norm": 0.005706057883799076, + "learning_rate": 2.8101247540125297e-05, + "loss": 0.0914, + "step": 91240 + }, + { + "epoch": 19.01920597952662, + "grad_norm": 0.022633006796240807, + "learning_rate": 2.8098238519080207e-05, + "loss": 0.0067, + "step": 91250 + }, + { + "epoch": 19.019260141905434, + "grad_norm": 0.006576793733984232, + "learning_rate": 2.809522949803511e-05, + "loss": 0.0232, + "step": 91260 + }, + { + "epoch": 19.019314304284244, + "grad_norm": 0.003490651724860072, + "learning_rate": 2.8092220476990016e-05, + "loss": 0.0455, + "step": 91270 + }, + { + "epoch": 19.019368466663057, + "grad_norm": 0.002226861659437418, + "learning_rate": 2.8089211455944925e-05, + "loss": 0.0137, + "step": 91280 + }, + { + "epoch": 19.019422629041866, + "grad_norm": 0.011350653134286404, + "learning_rate": 2.8086202434899832e-05, + "loss": 0.0152, + "step": 91290 + }, + { + "epoch": 19.01947679142068, + "grad_norm": 0.0030969770159572363, + "learning_rate": 2.8083193413854735e-05, + "loss": 0.0542, + "step": 91300 + }, + { + "epoch": 19.019530953799492, + "grad_norm": 0.043711695820093155, + "learning_rate": 2.8080184392809644e-05, + "loss": 0.002, + "step": 91310 + }, + { + "epoch": 19.019585116178302, + "grad_norm": 0.00208380538970232, + "learning_rate": 2.8077175371764554e-05, + "loss": 0.0578, + "step": 91320 + }, + { + "epoch": 19.019639278557115, + "grad_norm": 0.7732433676719666, + "learning_rate": 2.807416635071946e-05, + "loss": 0.0685, + "step": 91330 + }, + { + "epoch": 19.019693440935924, + "grad_norm": 0.03250429779291153, + "learning_rate": 2.8071157329674363e-05, + "loss": 0.0355, + "step": 91340 + }, + { + "epoch": 19.019747603314737, + "grad_norm": 0.03286483883857727, + "learning_rate": 2.8068148308629273e-05, + "loss": 
0.0416, + "step": 91350 + }, + { + "epoch": 19.01980176569355, + "grad_norm": 0.002831399207934737, + "learning_rate": 2.806513928758418e-05, + "loss": 0.1415, + "step": 91360 + }, + { + "epoch": 19.01985592807236, + "grad_norm": 0.051898930221796036, + "learning_rate": 2.8062130266539082e-05, + "loss": 0.0303, + "step": 91370 + }, + { + "epoch": 19.019910090451173, + "grad_norm": 0.0033119404688477516, + "learning_rate": 2.805912124549399e-05, + "loss": 0.0206, + "step": 91380 + }, + { + "epoch": 19.019964252829983, + "grad_norm": 3.286005973815918, + "learning_rate": 2.8056112224448898e-05, + "loss": 0.0348, + "step": 91390 + }, + { + "epoch": 19.020018415208796, + "grad_norm": 0.0465640090405941, + "learning_rate": 2.8053103203403808e-05, + "loss": 0.0292, + "step": 91400 + }, + { + "epoch": 19.02007257758761, + "grad_norm": 0.32042816281318665, + "learning_rate": 2.805009418235871e-05, + "loss": 0.0039, + "step": 91410 + }, + { + "epoch": 19.02012673996642, + "grad_norm": 0.053959719836711884, + "learning_rate": 2.804708516131362e-05, + "loss": 0.0125, + "step": 91420 + }, + { + "epoch": 19.02018090234523, + "grad_norm": 7.347249507904053, + "learning_rate": 2.8044076140268526e-05, + "loss": 0.03, + "step": 91430 + }, + { + "epoch": 19.020235064724044, + "grad_norm": 3.000455617904663, + "learning_rate": 2.8041067119223436e-05, + "loss": 0.0456, + "step": 91440 + }, + { + "epoch": 19.020289227102854, + "grad_norm": 0.06974869221448898, + "learning_rate": 2.803805809817834e-05, + "loss": 0.0603, + "step": 91450 + }, + { + "epoch": 19.020343389481667, + "grad_norm": 0.003936776891350746, + "learning_rate": 2.8035049077133245e-05, + "loss": 0.0171, + "step": 91460 + }, + { + "epoch": 19.020397551860476, + "grad_norm": 0.001831494621001184, + "learning_rate": 2.8032040056088155e-05, + "loss": 0.0024, + "step": 91470 + }, + { + "epoch": 19.02045171423929, + "grad_norm": 0.35799384117126465, + "learning_rate": 2.8029031035043065e-05, + "loss": 0.0231, + "step": 91480 + }, + { + "epoch": 19.020505876618103, + "grad_norm": 0.0018550141248852015, + "learning_rate": 2.8026022013997964e-05, + "loss": 0.0334, + "step": 91490 + }, + { + "epoch": 19.020560038996912, + "grad_norm": 0.004617667756974697, + "learning_rate": 2.8023012992952874e-05, + "loss": 0.0825, + "step": 91500 + }, + { + "epoch": 19.020614201375725, + "grad_norm": 0.06968899816274643, + "learning_rate": 2.8020003971907783e-05, + "loss": 0.0064, + "step": 91510 + }, + { + "epoch": 19.020668363754535, + "grad_norm": 0.0016845765057951212, + "learning_rate": 2.8016994950862686e-05, + "loss": 0.0455, + "step": 91520 + }, + { + "epoch": 19.020722526133348, + "grad_norm": 0.002668507630005479, + "learning_rate": 2.8013985929817593e-05, + "loss": 0.0126, + "step": 91530 + }, + { + "epoch": 19.02077668851216, + "grad_norm": 0.5776388049125671, + "learning_rate": 2.8010976908772502e-05, + "loss": 0.1775, + "step": 91540 + }, + { + "epoch": 19.02083085089097, + "grad_norm": 1.0255544185638428, + "learning_rate": 2.800796788772741e-05, + "loss": 0.1186, + "step": 91550 + }, + { + "epoch": 19.020885013269783, + "grad_norm": 0.07540726661682129, + "learning_rate": 2.800495886668231e-05, + "loss": 0.0048, + "step": 91560 + }, + { + "epoch": 19.020939175648593, + "grad_norm": 0.32760974764823914, + "learning_rate": 2.800194984563722e-05, + "loss": 0.1111, + "step": 91570 + }, + { + "epoch": 19.020993338027406, + "grad_norm": 1.7599823474884033, + "learning_rate": 2.799894082459213e-05, + "loss": 0.121, + "step": 91580 + }, + { + "epoch": 
19.02104750040622, + "grad_norm": 0.006407394539564848, + "learning_rate": 2.7995931803547037e-05, + "loss": 0.081, + "step": 91590 + }, + { + "epoch": 19.02110166278503, + "grad_norm": 0.0029969452880322933, + "learning_rate": 2.799292278250194e-05, + "loss": 0.001, + "step": 91600 + }, + { + "epoch": 19.02115582516384, + "grad_norm": 0.3886762261390686, + "learning_rate": 2.798991376145685e-05, + "loss": 0.0563, + "step": 91610 + }, + { + "epoch": 19.021209987542655, + "grad_norm": 0.6612937450408936, + "learning_rate": 2.7986904740411756e-05, + "loss": 0.093, + "step": 91620 + }, + { + "epoch": 19.021264149921464, + "grad_norm": 0.03786264732480049, + "learning_rate": 2.7983895719366665e-05, + "loss": 0.0632, + "step": 91630 + }, + { + "epoch": 19.021318312300277, + "grad_norm": 0.301609069108963, + "learning_rate": 2.7980886698321568e-05, + "loss": 0.0389, + "step": 91640 + }, + { + "epoch": 19.021372474679087, + "grad_norm": 0.4612095355987549, + "learning_rate": 2.7977877677276475e-05, + "loss": 0.0464, + "step": 91650 + }, + { + "epoch": 19.0214266370579, + "grad_norm": 0.059691641479730606, + "learning_rate": 2.7974868656231384e-05, + "loss": 0.0127, + "step": 91660 + }, + { + "epoch": 19.021480799436713, + "grad_norm": 0.024047881364822388, + "learning_rate": 2.7971859635186287e-05, + "loss": 0.1181, + "step": 91670 + }, + { + "epoch": 19.021534961815522, + "grad_norm": 0.12297963351011276, + "learning_rate": 2.7968850614141197e-05, + "loss": 0.0385, + "step": 91680 + }, + { + "epoch": 19.021589124194335, + "grad_norm": 0.3638005256652832, + "learning_rate": 2.7965841593096103e-05, + "loss": 0.0858, + "step": 91690 + }, + { + "epoch": 19.021643286573145, + "grad_norm": 1.0085880756378174, + "learning_rate": 2.7962832572051013e-05, + "loss": 0.0339, + "step": 91700 + }, + { + "epoch": 19.021697448951958, + "grad_norm": 0.00401310995221138, + "learning_rate": 2.7959823551005916e-05, + "loss": 0.1611, + "step": 91710 + }, + { + "epoch": 19.02175161133077, + "grad_norm": 1.5987931489944458, + "learning_rate": 2.7956814529960822e-05, + "loss": 0.0153, + "step": 91720 + }, + { + "epoch": 19.02180577370958, + "grad_norm": 8.702102661132812, + "learning_rate": 2.795380550891573e-05, + "loss": 0.1392, + "step": 91730 + }, + { + "epoch": 19.021859936088394, + "grad_norm": 0.11103395372629166, + "learning_rate": 2.795079648787064e-05, + "loss": 0.0068, + "step": 91740 + }, + { + "epoch": 19.021914098467203, + "grad_norm": 0.008783380500972271, + "learning_rate": 2.794778746682554e-05, + "loss": 0.0263, + "step": 91750 + }, + { + "epoch": 19.021968260846016, + "grad_norm": 1.4134291410446167, + "learning_rate": 2.794477844578045e-05, + "loss": 0.1439, + "step": 91760 + }, + { + "epoch": 19.02202242322483, + "grad_norm": 5.429105281829834, + "learning_rate": 2.794176942473536e-05, + "loss": 0.1395, + "step": 91770 + }, + { + "epoch": 19.02207658560364, + "grad_norm": 0.07644331455230713, + "learning_rate": 2.7938760403690266e-05, + "loss": 0.0077, + "step": 91780 + }, + { + "epoch": 19.02213074798245, + "grad_norm": 8.067800521850586, + "learning_rate": 2.793575138264517e-05, + "loss": 0.0913, + "step": 91790 + }, + { + "epoch": 19.022184910361265, + "grad_norm": 2.1345417499542236, + "learning_rate": 2.793274236160008e-05, + "loss": 0.1281, + "step": 91800 + }, + { + "epoch": 19.022239072740074, + "grad_norm": 0.010762401856482029, + "learning_rate": 2.7929733340554985e-05, + "loss": 0.0739, + "step": 91810 + }, + { + "epoch": 19.022293235118887, + "grad_norm": 2.549609899520874, + 
"learning_rate": 2.7926724319509888e-05, + "loss": 0.0896, + "step": 91820 + }, + { + "epoch": 19.022347397497697, + "grad_norm": 0.6127600073814392, + "learning_rate": 2.7923715298464798e-05, + "loss": 0.0393, + "step": 91830 + }, + { + "epoch": 19.02240155987651, + "grad_norm": 0.09117835015058517, + "learning_rate": 2.7920706277419707e-05, + "loss": 0.0576, + "step": 91840 + }, + { + "epoch": 19.022455722255323, + "grad_norm": 3.0871615409851074, + "learning_rate": 2.7917697256374614e-05, + "loss": 0.0301, + "step": 91850 + }, + { + "epoch": 19.022509884634133, + "grad_norm": 0.04367199167609215, + "learning_rate": 2.7914688235329517e-05, + "loss": 0.0167, + "step": 91860 + }, + { + "epoch": 19.022564047012946, + "grad_norm": 0.0026883373502641916, + "learning_rate": 2.7911679214284426e-05, + "loss": 0.0917, + "step": 91870 + }, + { + "epoch": 19.022618209391755, + "grad_norm": 1.141575574874878, + "learning_rate": 2.7908670193239332e-05, + "loss": 0.0247, + "step": 91880 + }, + { + "epoch": 19.022672371770568, + "grad_norm": 0.0039621456526219845, + "learning_rate": 2.7905661172194242e-05, + "loss": 0.0705, + "step": 91890 + }, + { + "epoch": 19.02272653414938, + "grad_norm": 0.0023479643277823925, + "learning_rate": 2.7902652151149145e-05, + "loss": 0.0489, + "step": 91900 + }, + { + "epoch": 19.02278069652819, + "grad_norm": 0.17810675501823425, + "learning_rate": 2.789964313010405e-05, + "loss": 0.0074, + "step": 91910 + }, + { + "epoch": 19.022834858907004, + "grad_norm": 0.0026707854121923447, + "learning_rate": 2.789663410905896e-05, + "loss": 0.006, + "step": 91920 + }, + { + "epoch": 19.022889021285813, + "grad_norm": 0.0022376333363354206, + "learning_rate": 2.789362508801387e-05, + "loss": 0.095, + "step": 91930 + }, + { + "epoch": 19.022943183664626, + "grad_norm": 0.039162300527095795, + "learning_rate": 2.7890616066968773e-05, + "loss": 0.0901, + "step": 91940 + }, + { + "epoch": 19.02299734604344, + "grad_norm": 0.01530397031456232, + "learning_rate": 2.788760704592368e-05, + "loss": 0.0874, + "step": 91950 + }, + { + "epoch": 19.02305150842225, + "grad_norm": 0.005165558308362961, + "learning_rate": 2.788459802487859e-05, + "loss": 0.0489, + "step": 91960 + }, + { + "epoch": 19.023105670801062, + "grad_norm": 0.2001599669456482, + "learning_rate": 2.7881589003833492e-05, + "loss": 0.0313, + "step": 91970 + }, + { + "epoch": 19.023159833179875, + "grad_norm": 0.002549063181504607, + "learning_rate": 2.78785799827884e-05, + "loss": 0.031, + "step": 91980 + }, + { + "epoch": 19.023213995558685, + "grad_norm": 0.5150524377822876, + "learning_rate": 2.7875570961743308e-05, + "loss": 0.0082, + "step": 91990 + }, + { + "epoch": 19.023268157937498, + "grad_norm": 0.009727738797664642, + "learning_rate": 2.7872561940698218e-05, + "loss": 0.0641, + "step": 92000 + }, + { + "epoch": 19.023322320316307, + "grad_norm": 0.10660454630851746, + "learning_rate": 2.7869552919653117e-05, + "loss": 0.0589, + "step": 92010 + }, + { + "epoch": 19.02337648269512, + "grad_norm": 0.0029375413432717323, + "learning_rate": 2.7866543898608027e-05, + "loss": 0.0072, + "step": 92020 + }, + { + "epoch": 19.023430645073933, + "grad_norm": 0.3059910833835602, + "learning_rate": 2.7863534877562937e-05, + "loss": 0.0766, + "step": 92030 + }, + { + "epoch": 19.023484807452743, + "grad_norm": 2.9512791633605957, + "learning_rate": 2.7860525856517843e-05, + "loss": 0.0828, + "step": 92040 + }, + { + "epoch": 19.023538969831556, + "grad_norm": 0.19882754981517792, + "learning_rate": 2.7857516835472746e-05, + 
"loss": 0.006, + "step": 92050 + }, + { + "epoch": 19.023593132210365, + "grad_norm": 0.002480094088241458, + "learning_rate": 2.7854507814427656e-05, + "loss": 0.0299, + "step": 92060 + }, + { + "epoch": 19.02364729458918, + "grad_norm": 0.004224779084324837, + "learning_rate": 2.7851498793382562e-05, + "loss": 0.0271, + "step": 92070 + }, + { + "epoch": 19.02370145696799, + "grad_norm": 0.0021790452301502228, + "learning_rate": 2.784848977233747e-05, + "loss": 0.0146, + "step": 92080 + }, + { + "epoch": 19.0237556193468, + "grad_norm": 0.002116820542141795, + "learning_rate": 2.7845480751292374e-05, + "loss": 0.0425, + "step": 92090 + }, + { + "epoch": 19.023809781725614, + "grad_norm": 0.0032999610994011164, + "learning_rate": 2.7842471730247284e-05, + "loss": 0.0589, + "step": 92100 + }, + { + "epoch": 19.023863944104423, + "grad_norm": 0.0020583192817866802, + "learning_rate": 2.783946270920219e-05, + "loss": 0.0917, + "step": 92110 + }, + { + "epoch": 19.023918106483237, + "grad_norm": 6.337506294250488, + "learning_rate": 2.7836453688157093e-05, + "loss": 0.1416, + "step": 92120 + }, + { + "epoch": 19.02397226886205, + "grad_norm": 0.0023245876654982567, + "learning_rate": 2.7833444667112003e-05, + "loss": 0.0786, + "step": 92130 + }, + { + "epoch": 19.02402643124086, + "grad_norm": 0.036322176456451416, + "learning_rate": 2.783043564606691e-05, + "loss": 0.1781, + "step": 92140 + }, + { + "epoch": 19.024080593619672, + "grad_norm": 0.002921328879892826, + "learning_rate": 2.782742662502182e-05, + "loss": 0.0417, + "step": 92150 + }, + { + "epoch": 19.02413475599848, + "grad_norm": 0.0031165636610239744, + "learning_rate": 2.782441760397672e-05, + "loss": 0.0417, + "step": 92160 + }, + { + "epoch": 19.024188918377295, + "grad_norm": 0.006261691451072693, + "learning_rate": 2.7821408582931628e-05, + "loss": 0.0452, + "step": 92170 + }, + { + "epoch": 19.024243080756108, + "grad_norm": 0.003539418336004019, + "learning_rate": 2.7818399561886538e-05, + "loss": 0.0018, + "step": 92180 + }, + { + "epoch": 19.024297243134917, + "grad_norm": 0.0041514914482831955, + "learning_rate": 2.7815390540841447e-05, + "loss": 0.0271, + "step": 92190 + }, + { + "epoch": 19.02435140551373, + "grad_norm": 0.1774686723947525, + "learning_rate": 2.781238151979635e-05, + "loss": 0.064, + "step": 92200 + }, + { + "epoch": 19.024405567892543, + "grad_norm": 0.10281648486852646, + "learning_rate": 2.7809372498751256e-05, + "loss": 0.1055, + "step": 92210 + }, + { + "epoch": 19.024459730271353, + "grad_norm": 0.003853400470688939, + "learning_rate": 2.7806363477706166e-05, + "loss": 0.0708, + "step": 92220 + }, + { + "epoch": 19.024513892650166, + "grad_norm": 0.18637600541114807, + "learning_rate": 2.7803354456661072e-05, + "loss": 0.0038, + "step": 92230 + }, + { + "epoch": 19.024568055028976, + "grad_norm": 0.002431862521916628, + "learning_rate": 2.7800345435615975e-05, + "loss": 0.0305, + "step": 92240 + }, + { + "epoch": 19.02462221740779, + "grad_norm": 0.1318565458059311, + "learning_rate": 2.7797336414570885e-05, + "loss": 0.0475, + "step": 92250 + }, + { + "epoch": 19.0246763797866, + "grad_norm": 0.0024054001551121473, + "learning_rate": 2.7794327393525795e-05, + "loss": 0.0692, + "step": 92260 + }, + { + "epoch": 19.02473054216541, + "grad_norm": 0.43909749388694763, + "learning_rate": 2.7791318372480697e-05, + "loss": 0.0656, + "step": 92270 + }, + { + "epoch": 19.024784704544224, + "grad_norm": 0.024527497589588165, + "learning_rate": 2.7788309351435604e-05, + "loss": 0.0231, + "step": 92280 + 
}, + { + "epoch": 19.024838866923034, + "grad_norm": 1.8933910131454468, + "learning_rate": 2.7785300330390513e-05, + "loss": 0.0427, + "step": 92290 + }, + { + "epoch": 19.024893029301847, + "grad_norm": 0.003956683445721865, + "learning_rate": 2.778229130934542e-05, + "loss": 0.0023, + "step": 92300 + }, + { + "epoch": 19.02494719168066, + "grad_norm": 0.07071609050035477, + "learning_rate": 2.7779282288300323e-05, + "loss": 0.0756, + "step": 92310 + }, + { + "epoch": 19.02500135405947, + "grad_norm": 0.0038885497488081455, + "learning_rate": 2.7776273267255232e-05, + "loss": 0.0715, + "step": 92320 + }, + { + "epoch": 19.02500135405947, + "eval_accuracy": 0.8357282821685174, + "eval_loss": 0.785995364189148, + "eval_runtime": 118.4004, + "eval_samples_per_second": 25.861, + "eval_steps_per_second": 3.235, + "step": 92320 + }, + { + "epoch": 20.000054162378813, + "grad_norm": 0.1610623002052307, + "learning_rate": 2.777326424621014e-05, + "loss": 0.0077, + "step": 92330 + }, + { + "epoch": 20.000108324757623, + "grad_norm": 0.30737602710723877, + "learning_rate": 2.7770255225165048e-05, + "loss": 0.0311, + "step": 92340 + }, + { + "epoch": 20.000162487136436, + "grad_norm": 0.5943622589111328, + "learning_rate": 2.776724620411995e-05, + "loss": 0.0386, + "step": 92350 + }, + { + "epoch": 20.000216649515245, + "grad_norm": 0.036792732775211334, + "learning_rate": 2.776423718307486e-05, + "loss": 0.0221, + "step": 92360 + }, + { + "epoch": 20.000270811894058, + "grad_norm": 0.003087263787165284, + "learning_rate": 2.7761228162029767e-05, + "loss": 0.1284, + "step": 92370 + }, + { + "epoch": 20.00032497427287, + "grad_norm": 0.5419447422027588, + "learning_rate": 2.7758219140984677e-05, + "loss": 0.0164, + "step": 92380 + }, + { + "epoch": 20.00037913665168, + "grad_norm": 15.941893577575684, + "learning_rate": 2.775521011993958e-05, + "loss": 0.1874, + "step": 92390 + }, + { + "epoch": 20.000433299030494, + "grad_norm": 0.02789267711341381, + "learning_rate": 2.7752201098894486e-05, + "loss": 0.0203, + "step": 92400 + }, + { + "epoch": 20.000487461409303, + "grad_norm": 0.11710511147975922, + "learning_rate": 2.7749192077849395e-05, + "loss": 0.0148, + "step": 92410 + }, + { + "epoch": 20.000541623788116, + "grad_norm": 0.15932458639144897, + "learning_rate": 2.77461830568043e-05, + "loss": 0.0232, + "step": 92420 + }, + { + "epoch": 20.00059578616693, + "grad_norm": 0.0024751992896199226, + "learning_rate": 2.7743174035759205e-05, + "loss": 0.009, + "step": 92430 + }, + { + "epoch": 20.00064994854574, + "grad_norm": 0.7663392424583435, + "learning_rate": 2.7740165014714114e-05, + "loss": 0.0745, + "step": 92440 + }, + { + "epoch": 20.000704110924552, + "grad_norm": 0.11297160387039185, + "learning_rate": 2.7737155993669024e-05, + "loss": 0.0961, + "step": 92450 + }, + { + "epoch": 20.000758273303365, + "grad_norm": 0.1457342505455017, + "learning_rate": 2.7734146972623927e-05, + "loss": 0.003, + "step": 92460 + }, + { + "epoch": 20.000812435682175, + "grad_norm": 0.05883773788809776, + "learning_rate": 2.7731137951578833e-05, + "loss": 0.0208, + "step": 92470 + }, + { + "epoch": 20.000866598060988, + "grad_norm": 0.0026369623374193907, + "learning_rate": 2.7728128930533743e-05, + "loss": 0.012, + "step": 92480 + }, + { + "epoch": 20.000920760439797, + "grad_norm": 2.8421149253845215, + "learning_rate": 2.772511990948865e-05, + "loss": 0.0881, + "step": 92490 + }, + { + "epoch": 20.00097492281861, + "grad_norm": 0.003115796484053135, + "learning_rate": 2.7722110888443552e-05, + "loss": 
0.0291, + "step": 92500 + }, + { + "epoch": 20.001029085197423, + "grad_norm": 0.0025851300451904535, + "learning_rate": 2.771910186739846e-05, + "loss": 0.032, + "step": 92510 + }, + { + "epoch": 20.001083247576233, + "grad_norm": 0.003643767209723592, + "learning_rate": 2.771609284635337e-05, + "loss": 0.0221, + "step": 92520 + }, + { + "epoch": 20.001137409955046, + "grad_norm": 0.001974629471078515, + "learning_rate": 2.7713083825308278e-05, + "loss": 0.0126, + "step": 92530 + }, + { + "epoch": 20.001191572333855, + "grad_norm": 0.002624703338369727, + "learning_rate": 2.771007480426318e-05, + "loss": 0.0665, + "step": 92540 + }, + { + "epoch": 20.00124573471267, + "grad_norm": 0.13029929995536804, + "learning_rate": 2.770706578321809e-05, + "loss": 0.0016, + "step": 92550 + }, + { + "epoch": 20.00129989709148, + "grad_norm": 0.0020432365126907825, + "learning_rate": 2.7704056762172996e-05, + "loss": 0.0436, + "step": 92560 + }, + { + "epoch": 20.00135405947029, + "grad_norm": 0.03367690369486809, + "learning_rate": 2.77010477411279e-05, + "loss": 0.0287, + "step": 92570 + }, + { + "epoch": 20.001408221849104, + "grad_norm": 0.03915070742368698, + "learning_rate": 2.769803872008281e-05, + "loss": 0.1087, + "step": 92580 + }, + { + "epoch": 20.001462384227914, + "grad_norm": 9.239109992980957, + "learning_rate": 2.7695029699037715e-05, + "loss": 0.0971, + "step": 92590 + }, + { + "epoch": 20.001516546606727, + "grad_norm": 0.007606461178511381, + "learning_rate": 2.7692020677992625e-05, + "loss": 0.0907, + "step": 92600 + }, + { + "epoch": 20.00157070898554, + "grad_norm": 0.8995658159255981, + "learning_rate": 2.7689011656947528e-05, + "loss": 0.0274, + "step": 92610 + }, + { + "epoch": 20.00162487136435, + "grad_norm": 0.4317634105682373, + "learning_rate": 2.7686002635902437e-05, + "loss": 0.1049, + "step": 92620 + }, + { + "epoch": 20.001679033743162, + "grad_norm": 0.1729550063610077, + "learning_rate": 2.7682993614857344e-05, + "loss": 0.0189, + "step": 92630 + }, + { + "epoch": 20.001733196121975, + "grad_norm": 0.029551323503255844, + "learning_rate": 2.7679984593812253e-05, + "loss": 0.036, + "step": 92640 + }, + { + "epoch": 20.001787358500785, + "grad_norm": 0.002210029400885105, + "learning_rate": 2.7676975572767156e-05, + "loss": 0.0006, + "step": 92650 + }, + { + "epoch": 20.001841520879598, + "grad_norm": 0.03083566203713417, + "learning_rate": 2.7673966551722063e-05, + "loss": 0.1352, + "step": 92660 + }, + { + "epoch": 20.001895683258407, + "grad_norm": 0.2967512607574463, + "learning_rate": 2.7670957530676972e-05, + "loss": 0.0305, + "step": 92670 + }, + { + "epoch": 20.00194984563722, + "grad_norm": 0.0025496038142591715, + "learning_rate": 2.7667948509631882e-05, + "loss": 0.1334, + "step": 92680 + }, + { + "epoch": 20.002004008016034, + "grad_norm": 6.816539764404297, + "learning_rate": 2.766493948858678e-05, + "loss": 0.1419, + "step": 92690 + }, + { + "epoch": 20.002058170394843, + "grad_norm": 0.3266603648662567, + "learning_rate": 2.766193046754169e-05, + "loss": 0.0587, + "step": 92700 + }, + { + "epoch": 20.002112332773656, + "grad_norm": 0.0029103539418429136, + "learning_rate": 2.76589214464966e-05, + "loss": 0.0751, + "step": 92710 + }, + { + "epoch": 20.002166495152466, + "grad_norm": 0.004517014138400555, + "learning_rate": 2.7655912425451504e-05, + "loss": 0.0436, + "step": 92720 + }, + { + "epoch": 20.00222065753128, + "grad_norm": 0.02316325157880783, + "learning_rate": 2.765290340440641e-05, + "loss": 0.0381, + "step": 92730 + }, + { + "epoch": 
20.00227481991009, + "grad_norm": 2.5096631050109863, + "learning_rate": 2.764989438336132e-05, + "loss": 0.0633, + "step": 92740 + }, + { + "epoch": 20.0023289822889, + "grad_norm": 3.469195604324341, + "learning_rate": 2.7646885362316226e-05, + "loss": 0.0836, + "step": 92750 + }, + { + "epoch": 20.002383144667714, + "grad_norm": 3.330899238586426, + "learning_rate": 2.764387634127113e-05, + "loss": 0.0375, + "step": 92760 + }, + { + "epoch": 20.002437307046524, + "grad_norm": 0.3577406108379364, + "learning_rate": 2.764086732022604e-05, + "loss": 0.0297, + "step": 92770 + }, + { + "epoch": 20.002491469425337, + "grad_norm": 0.6503422260284424, + "learning_rate": 2.7637858299180948e-05, + "loss": 0.041, + "step": 92780 + }, + { + "epoch": 20.00254563180415, + "grad_norm": 0.30820968747138977, + "learning_rate": 2.7634849278135854e-05, + "loss": 0.012, + "step": 92790 + }, + { + "epoch": 20.00259979418296, + "grad_norm": 0.4806520640850067, + "learning_rate": 2.7631840257090757e-05, + "loss": 0.107, + "step": 92800 + }, + { + "epoch": 20.002653956561772, + "grad_norm": 10.599038124084473, + "learning_rate": 2.7628831236045667e-05, + "loss": 0.1068, + "step": 92810 + }, + { + "epoch": 20.002708118940586, + "grad_norm": 5.895978927612305, + "learning_rate": 2.7625822215000573e-05, + "loss": 0.0493, + "step": 92820 + }, + { + "epoch": 20.002762281319395, + "grad_norm": 0.19291898608207703, + "learning_rate": 2.7622813193955483e-05, + "loss": 0.0282, + "step": 92830 + }, + { + "epoch": 20.002816443698208, + "grad_norm": 0.03754507005214691, + "learning_rate": 2.7619804172910386e-05, + "loss": 0.0408, + "step": 92840 + }, + { + "epoch": 20.002870606077018, + "grad_norm": 1.79511296749115, + "learning_rate": 2.7616795151865292e-05, + "loss": 0.0954, + "step": 92850 + }, + { + "epoch": 20.00292476845583, + "grad_norm": 0.002252175472676754, + "learning_rate": 2.76137861308202e-05, + "loss": 0.0319, + "step": 92860 + }, + { + "epoch": 20.002978930834644, + "grad_norm": 0.6340799331665039, + "learning_rate": 2.7610777109775104e-05, + "loss": 0.0759, + "step": 92870 + }, + { + "epoch": 20.003033093213453, + "grad_norm": 0.0034477917943149805, + "learning_rate": 2.7607768088730014e-05, + "loss": 0.0143, + "step": 92880 + }, + { + "epoch": 20.003087255592266, + "grad_norm": 0.04738501086831093, + "learning_rate": 2.760475906768492e-05, + "loss": 0.0315, + "step": 92890 + }, + { + "epoch": 20.003141417971076, + "grad_norm": 0.026651879772543907, + "learning_rate": 2.760175004663983e-05, + "loss": 0.0302, + "step": 92900 + }, + { + "epoch": 20.00319558034989, + "grad_norm": 0.031210102140903473, + "learning_rate": 2.7598741025594733e-05, + "loss": 0.0045, + "step": 92910 + }, + { + "epoch": 20.003249742728702, + "grad_norm": 0.0017973771318793297, + "learning_rate": 2.759573200454964e-05, + "loss": 0.0237, + "step": 92920 + }, + { + "epoch": 20.00330390510751, + "grad_norm": 0.004127204418182373, + "learning_rate": 2.759272298350455e-05, + "loss": 0.1457, + "step": 92930 + }, + { + "epoch": 20.003358067486325, + "grad_norm": 0.0022354726679623127, + "learning_rate": 2.758971396245946e-05, + "loss": 0.0038, + "step": 92940 + }, + { + "epoch": 20.003412229865134, + "grad_norm": 0.001932488172315061, + "learning_rate": 2.7586704941414358e-05, + "loss": 0.0362, + "step": 92950 + }, + { + "epoch": 20.003466392243947, + "grad_norm": 8.198939323425293, + "learning_rate": 2.7583695920369268e-05, + "loss": 0.0762, + "step": 92960 + }, + { + "epoch": 20.00352055462276, + "grad_norm": 2.8033530712127686, + 
"learning_rate": 2.7580686899324177e-05, + "loss": 0.0544, + "step": 92970 + }, + { + "epoch": 20.00357471700157, + "grad_norm": 0.0017629576614126563, + "learning_rate": 2.7577677878279084e-05, + "loss": 0.0364, + "step": 92980 + }, + { + "epoch": 20.003628879380383, + "grad_norm": 0.0018352679908275604, + "learning_rate": 2.7574668857233987e-05, + "loss": 0.0736, + "step": 92990 + }, + { + "epoch": 20.003683041759196, + "grad_norm": 0.13454098999500275, + "learning_rate": 2.7571659836188896e-05, + "loss": 0.0288, + "step": 93000 + }, + { + "epoch": 20.003737204138005, + "grad_norm": 2.0882391929626465, + "learning_rate": 2.7568650815143802e-05, + "loss": 0.1329, + "step": 93010 + }, + { + "epoch": 20.00379136651682, + "grad_norm": 0.0018299393123015761, + "learning_rate": 2.7565641794098705e-05, + "loss": 0.0035, + "step": 93020 + }, + { + "epoch": 20.003845528895628, + "grad_norm": 0.001988290809094906, + "learning_rate": 2.7562632773053615e-05, + "loss": 0.0033, + "step": 93030 + }, + { + "epoch": 20.00389969127444, + "grad_norm": 0.12211811542510986, + "learning_rate": 2.7559623752008525e-05, + "loss": 0.0984, + "step": 93040 + }, + { + "epoch": 20.003953853653254, + "grad_norm": 4.071129322052002, + "learning_rate": 2.755661473096343e-05, + "loss": 0.0948, + "step": 93050 + }, + { + "epoch": 20.004008016032063, + "grad_norm": 0.001915379660204053, + "learning_rate": 2.7553605709918334e-05, + "loss": 0.0361, + "step": 93060 + }, + { + "epoch": 20.004062178410877, + "grad_norm": 0.002053502481430769, + "learning_rate": 2.7550596688873244e-05, + "loss": 0.0519, + "step": 93070 + }, + { + "epoch": 20.004116340789686, + "grad_norm": 0.0021844403818249702, + "learning_rate": 2.754758766782815e-05, + "loss": 0.0053, + "step": 93080 + }, + { + "epoch": 20.0041705031685, + "grad_norm": 2.445098400115967, + "learning_rate": 2.754457864678306e-05, + "loss": 0.0579, + "step": 93090 + }, + { + "epoch": 20.004224665547312, + "grad_norm": 0.35765132308006287, + "learning_rate": 2.7541569625737962e-05, + "loss": 0.0715, + "step": 93100 + }, + { + "epoch": 20.00427882792612, + "grad_norm": 0.1357228308916092, + "learning_rate": 2.753856060469287e-05, + "loss": 0.0359, + "step": 93110 + }, + { + "epoch": 20.004332990304935, + "grad_norm": 0.10104131698608398, + "learning_rate": 2.7535551583647778e-05, + "loss": 0.0114, + "step": 93120 + }, + { + "epoch": 20.004387152683744, + "grad_norm": 0.0017690121894702315, + "learning_rate": 2.7532542562602688e-05, + "loss": 0.0012, + "step": 93130 + }, + { + "epoch": 20.004441315062557, + "grad_norm": 0.14609526097774506, + "learning_rate": 2.752953354155759e-05, + "loss": 0.0011, + "step": 93140 + }, + { + "epoch": 20.00449547744137, + "grad_norm": 0.0016621008981019258, + "learning_rate": 2.7526524520512497e-05, + "loss": 0.0822, + "step": 93150 + }, + { + "epoch": 20.00454963982018, + "grad_norm": 0.002176041482016444, + "learning_rate": 2.7523515499467407e-05, + "loss": 0.0472, + "step": 93160 + }, + { + "epoch": 20.004603802198993, + "grad_norm": 0.17661158740520477, + "learning_rate": 2.752050647842231e-05, + "loss": 0.0753, + "step": 93170 + }, + { + "epoch": 20.004657964577806, + "grad_norm": 0.0018383085262030363, + "learning_rate": 2.7517497457377216e-05, + "loss": 0.0442, + "step": 93180 + }, + { + "epoch": 20.004712126956615, + "grad_norm": 2.397197961807251, + "learning_rate": 2.7514488436332126e-05, + "loss": 0.0526, + "step": 93190 + }, + { + "epoch": 20.00476628933543, + "grad_norm": 0.1695949286222458, + "learning_rate": 2.7511479415287035e-05, 
+ "loss": 0.0141, + "step": 93200 + }, + { + "epoch": 20.004820451714238, + "grad_norm": 0.0017177645349875093, + "learning_rate": 2.7508470394241935e-05, + "loss": 0.0383, + "step": 93210 + }, + { + "epoch": 20.00487461409305, + "grad_norm": 3.4242103099823, + "learning_rate": 2.7505461373196844e-05, + "loss": 0.0425, + "step": 93220 + }, + { + "epoch": 20.004928776471864, + "grad_norm": 1.2689273357391357, + "learning_rate": 2.7502452352151754e-05, + "loss": 0.0124, + "step": 93230 + }, + { + "epoch": 20.004982938850674, + "grad_norm": 0.005736934021115303, + "learning_rate": 2.749944333110666e-05, + "loss": 0.0074, + "step": 93240 + }, + { + "epoch": 20.005037101229487, + "grad_norm": 0.00685268547385931, + "learning_rate": 2.7496434310061563e-05, + "loss": 0.0847, + "step": 93250 + }, + { + "epoch": 20.005091263608296, + "grad_norm": 0.43713676929473877, + "learning_rate": 2.7493425289016473e-05, + "loss": 0.0173, + "step": 93260 + }, + { + "epoch": 20.00514542598711, + "grad_norm": 0.001559908501803875, + "learning_rate": 2.749041626797138e-05, + "loss": 0.1267, + "step": 93270 + }, + { + "epoch": 20.005199588365922, + "grad_norm": 0.043636053800582886, + "learning_rate": 2.748740724692629e-05, + "loss": 0.0019, + "step": 93280 + }, + { + "epoch": 20.005253750744732, + "grad_norm": 0.08674392104148865, + "learning_rate": 2.7484398225881192e-05, + "loss": 0.0298, + "step": 93290 + }, + { + "epoch": 20.005307913123545, + "grad_norm": 0.0016253755893558264, + "learning_rate": 2.74813892048361e-05, + "loss": 0.0026, + "step": 93300 + }, + { + "epoch": 20.005362075502354, + "grad_norm": 0.00800950825214386, + "learning_rate": 2.7478380183791008e-05, + "loss": 0.0429, + "step": 93310 + }, + { + "epoch": 20.005416237881168, + "grad_norm": 5.963685035705566, + "learning_rate": 2.747537116274591e-05, + "loss": 0.1365, + "step": 93320 + }, + { + "epoch": 20.00547040025998, + "grad_norm": 0.053062569350004196, + "learning_rate": 2.747236214170082e-05, + "loss": 0.023, + "step": 93330 + }, + { + "epoch": 20.00552456263879, + "grad_norm": 0.0016039862530305982, + "learning_rate": 2.7469353120655726e-05, + "loss": 0.0343, + "step": 93340 + }, + { + "epoch": 20.005578725017603, + "grad_norm": 19.413490295410156, + "learning_rate": 2.7466344099610636e-05, + "loss": 0.0716, + "step": 93350 + }, + { + "epoch": 20.005632887396416, + "grad_norm": 0.0016633367631584406, + "learning_rate": 2.746333507856554e-05, + "loss": 0.0638, + "step": 93360 + }, + { + "epoch": 20.005687049775226, + "grad_norm": 0.1059771403670311, + "learning_rate": 2.7460326057520445e-05, + "loss": 0.0102, + "step": 93370 + }, + { + "epoch": 20.00574121215404, + "grad_norm": 0.04642447829246521, + "learning_rate": 2.7457317036475355e-05, + "loss": 0.1407, + "step": 93380 + }, + { + "epoch": 20.00579537453285, + "grad_norm": 0.0018182103522121906, + "learning_rate": 2.7454308015430265e-05, + "loss": 0.0243, + "step": 93390 + }, + { + "epoch": 20.00584953691166, + "grad_norm": 0.0017259921878576279, + "learning_rate": 2.7451298994385168e-05, + "loss": 0.0726, + "step": 93400 + }, + { + "epoch": 20.005903699290474, + "grad_norm": 0.40984219312667847, + "learning_rate": 2.7448289973340074e-05, + "loss": 0.0419, + "step": 93410 + }, + { + "epoch": 20.005957861669284, + "grad_norm": 0.38906237483024597, + "learning_rate": 2.7445280952294983e-05, + "loss": 0.0345, + "step": 93420 + }, + { + "epoch": 20.006012024048097, + "grad_norm": 0.0015645762905478477, + "learning_rate": 2.744227193124989e-05, + "loss": 0.1108, + "step": 93430 + }, + { 
+ "epoch": 20.006066186426906, + "grad_norm": 2.085345983505249, + "learning_rate": 2.7439262910204793e-05, + "loss": 0.0573, + "step": 93440 + }, + { + "epoch": 20.00612034880572, + "grad_norm": 0.001728697563521564, + "learning_rate": 2.7436253889159702e-05, + "loss": 0.1048, + "step": 93450 + }, + { + "epoch": 20.006174511184533, + "grad_norm": 0.02620992250740528, + "learning_rate": 2.7433244868114612e-05, + "loss": 0.0719, + "step": 93460 + }, + { + "epoch": 20.006228673563342, + "grad_norm": 0.0026380731724202633, + "learning_rate": 2.743023584706951e-05, + "loss": 0.143, + "step": 93470 + }, + { + "epoch": 20.006282835942155, + "grad_norm": 0.5545454025268555, + "learning_rate": 2.742722682602442e-05, + "loss": 0.0329, + "step": 93480 + }, + { + "epoch": 20.006336998320965, + "grad_norm": 1.4724773168563843, + "learning_rate": 2.742421780497933e-05, + "loss": 0.0125, + "step": 93490 + }, + { + "epoch": 20.006391160699778, + "grad_norm": 15.55195426940918, + "learning_rate": 2.7421208783934237e-05, + "loss": 0.1037, + "step": 93500 + }, + { + "epoch": 20.00644532307859, + "grad_norm": 0.6978394389152527, + "learning_rate": 2.741819976288914e-05, + "loss": 0.1702, + "step": 93510 + }, + { + "epoch": 20.0064994854574, + "grad_norm": 0.0018948916113004088, + "learning_rate": 2.741519074184405e-05, + "loss": 0.0117, + "step": 93520 + }, + { + "epoch": 20.006553647836213, + "grad_norm": 0.026319505646824837, + "learning_rate": 2.7412181720798956e-05, + "loss": 0.0284, + "step": 93530 + }, + { + "epoch": 20.006607810215023, + "grad_norm": 0.03831809386610985, + "learning_rate": 2.7409172699753866e-05, + "loss": 0.0023, + "step": 93540 + }, + { + "epoch": 20.006661972593836, + "grad_norm": 2.7726287841796875, + "learning_rate": 2.740616367870877e-05, + "loss": 0.1033, + "step": 93550 + }, + { + "epoch": 20.00671613497265, + "grad_norm": 0.002636911580339074, + "learning_rate": 2.7403154657663678e-05, + "loss": 0.0772, + "step": 93560 + }, + { + "epoch": 20.00677029735146, + "grad_norm": 0.7842618227005005, + "learning_rate": 2.7400145636618584e-05, + "loss": 0.0601, + "step": 93570 + }, + { + "epoch": 20.00682445973027, + "grad_norm": 0.002189836697652936, + "learning_rate": 2.7397136615573494e-05, + "loss": 0.0523, + "step": 93580 + }, + { + "epoch": 20.006878622109085, + "grad_norm": 2.995713949203491, + "learning_rate": 2.7394127594528397e-05, + "loss": 0.0619, + "step": 93590 + }, + { + "epoch": 20.006932784487894, + "grad_norm": 0.0020622266456484795, + "learning_rate": 2.7391118573483303e-05, + "loss": 0.0202, + "step": 93600 + }, + { + "epoch": 20.006986946866707, + "grad_norm": 0.004175739828497171, + "learning_rate": 2.7388109552438213e-05, + "loss": 0.0544, + "step": 93610 + }, + { + "epoch": 20.007041109245517, + "grad_norm": 0.002791434293612838, + "learning_rate": 2.7385100531393116e-05, + "loss": 0.0482, + "step": 93620 + }, + { + "epoch": 20.00709527162433, + "grad_norm": 0.14531360566616058, + "learning_rate": 2.7382091510348022e-05, + "loss": 0.0903, + "step": 93630 + }, + { + "epoch": 20.007149434003143, + "grad_norm": 0.06046357750892639, + "learning_rate": 2.737908248930293e-05, + "loss": 0.0401, + "step": 93640 + }, + { + "epoch": 20.007203596381952, + "grad_norm": 0.3079821765422821, + "learning_rate": 2.737607346825784e-05, + "loss": 0.082, + "step": 93650 + }, + { + "epoch": 20.007257758760765, + "grad_norm": 0.43529221415519714, + "learning_rate": 2.7373064447212744e-05, + "loss": 0.0226, + "step": 93660 + }, + { + "epoch": 20.007311921139575, + "grad_norm": 
0.08384162187576294, + "learning_rate": 2.737005542616765e-05, + "loss": 0.047, + "step": 93670 + }, + { + "epoch": 20.007366083518388, + "grad_norm": 10.696191787719727, + "learning_rate": 2.736704640512256e-05, + "loss": 0.0645, + "step": 93680 + }, + { + "epoch": 20.0074202458972, + "grad_norm": 0.2958439588546753, + "learning_rate": 2.7364037384077466e-05, + "loss": 0.0667, + "step": 93690 + }, + { + "epoch": 20.00747440827601, + "grad_norm": 0.08339865505695343, + "learning_rate": 2.736102836303237e-05, + "loss": 0.0523, + "step": 93700 + }, + { + "epoch": 20.007528570654824, + "grad_norm": 0.6901150345802307, + "learning_rate": 2.735801934198728e-05, + "loss": 0.0055, + "step": 93710 + }, + { + "epoch": 20.007582733033633, + "grad_norm": 1.323882818222046, + "learning_rate": 2.735501032094219e-05, + "loss": 0.0091, + "step": 93720 + }, + { + "epoch": 20.007636895412446, + "grad_norm": 0.5682179927825928, + "learning_rate": 2.7352001299897095e-05, + "loss": 0.0228, + "step": 93730 + }, + { + "epoch": 20.00769105779126, + "grad_norm": 0.020917925983667374, + "learning_rate": 2.7348992278851998e-05, + "loss": 0.0555, + "step": 93740 + }, + { + "epoch": 20.00774522017007, + "grad_norm": 0.11788579821586609, + "learning_rate": 2.7345983257806907e-05, + "loss": 0.0012, + "step": 93750 + }, + { + "epoch": 20.007799382548882, + "grad_norm": 0.005816661287099123, + "learning_rate": 2.7342974236761814e-05, + "loss": 0.0028, + "step": 93760 + }, + { + "epoch": 20.007853544927695, + "grad_norm": 0.015436283312737942, + "learning_rate": 2.7339965215716717e-05, + "loss": 0.0042, + "step": 93770 + }, + { + "epoch": 20.007907707306504, + "grad_norm": 0.007983904331922531, + "learning_rate": 2.7336956194671626e-05, + "loss": 0.0414, + "step": 93780 + }, + { + "epoch": 20.007961869685317, + "grad_norm": 0.1074032336473465, + "learning_rate": 2.7333947173626533e-05, + "loss": 0.1541, + "step": 93790 + }, + { + "epoch": 20.008016032064127, + "grad_norm": 0.0017884973203763366, + "learning_rate": 2.7330938152581442e-05, + "loss": 0.0923, + "step": 93800 + }, + { + "epoch": 20.00807019444294, + "grad_norm": 0.002214506734162569, + "learning_rate": 2.7327929131536345e-05, + "loss": 0.0856, + "step": 93810 + }, + { + "epoch": 20.008124356821753, + "grad_norm": 0.09336012601852417, + "learning_rate": 2.7324920110491255e-05, + "loss": 0.021, + "step": 93820 + }, + { + "epoch": 20.008178519200563, + "grad_norm": 0.21779289841651917, + "learning_rate": 2.732191108944616e-05, + "loss": 0.1986, + "step": 93830 + }, + { + "epoch": 20.008232681579376, + "grad_norm": 0.05891859158873558, + "learning_rate": 2.731890206840107e-05, + "loss": 0.0087, + "step": 93840 + }, + { + "epoch": 20.008286843958185, + "grad_norm": 0.028928538784384727, + "learning_rate": 2.7315893047355974e-05, + "loss": 0.1466, + "step": 93850 + }, + { + "epoch": 20.008341006336998, + "grad_norm": 0.1465793251991272, + "learning_rate": 2.731288402631088e-05, + "loss": 0.0038, + "step": 93860 + }, + { + "epoch": 20.00839516871581, + "grad_norm": 0.0024001391138881445, + "learning_rate": 2.730987500526579e-05, + "loss": 0.0608, + "step": 93870 + }, + { + "epoch": 20.00844933109462, + "grad_norm": 0.002874233992770314, + "learning_rate": 2.73068659842207e-05, + "loss": 0.1154, + "step": 93880 + }, + { + "epoch": 20.008503493473434, + "grad_norm": 0.002535998122766614, + "learning_rate": 2.73038569631756e-05, + "loss": 0.0081, + "step": 93890 + }, + { + "epoch": 20.008557655852243, + "grad_norm": 0.002383689396083355, + "learning_rate": 
2.730084794213051e-05, + "loss": 0.0502, + "step": 93900 + }, + { + "epoch": 20.008611818231056, + "grad_norm": 12.150867462158203, + "learning_rate": 2.7297838921085418e-05, + "loss": 0.0539, + "step": 93910 + }, + { + "epoch": 20.00866598060987, + "grad_norm": 0.004310421645641327, + "learning_rate": 2.729482990004032e-05, + "loss": 0.0179, + "step": 93920 + }, + { + "epoch": 20.00872014298868, + "grad_norm": 0.007136146072298288, + "learning_rate": 2.7291820878995227e-05, + "loss": 0.1003, + "step": 93930 + }, + { + "epoch": 20.008774305367492, + "grad_norm": 0.003066245699301362, + "learning_rate": 2.7288811857950137e-05, + "loss": 0.0534, + "step": 93940 + }, + { + "epoch": 20.008828467746305, + "grad_norm": 0.0023149203043431044, + "learning_rate": 2.7285802836905043e-05, + "loss": 0.1199, + "step": 93950 + }, + { + "epoch": 20.008882630125115, + "grad_norm": 0.18191012740135193, + "learning_rate": 2.7282793815859946e-05, + "loss": 0.0536, + "step": 93960 + }, + { + "epoch": 20.008936792503928, + "grad_norm": 0.1883397251367569, + "learning_rate": 2.7279784794814856e-05, + "loss": 0.1199, + "step": 93970 + }, + { + "epoch": 20.008990954882737, + "grad_norm": 1.7628672122955322, + "learning_rate": 2.7276775773769765e-05, + "loss": 0.0417, + "step": 93980 + }, + { + "epoch": 20.00904511726155, + "grad_norm": 0.05492020770907402, + "learning_rate": 2.727376675272467e-05, + "loss": 0.0524, + "step": 93990 + }, + { + "epoch": 20.009099279640363, + "grad_norm": 1.1036357879638672, + "learning_rate": 2.7270757731679574e-05, + "loss": 0.0449, + "step": 94000 + }, + { + "epoch": 20.009153442019173, + "grad_norm": 0.3848893642425537, + "learning_rate": 2.7267748710634484e-05, + "loss": 0.0255, + "step": 94010 + }, + { + "epoch": 20.009207604397986, + "grad_norm": 0.460563987493515, + "learning_rate": 2.726473968958939e-05, + "loss": 0.0701, + "step": 94020 + }, + { + "epoch": 20.009261766776795, + "grad_norm": 0.0022593028843402863, + "learning_rate": 2.72617306685443e-05, + "loss": 0.0131, + "step": 94030 + }, + { + "epoch": 20.00931592915561, + "grad_norm": 2.4650115966796875, + "learning_rate": 2.7258721647499203e-05, + "loss": 0.0958, + "step": 94040 + }, + { + "epoch": 20.00937009153442, + "grad_norm": 2.7335259914398193, + "learning_rate": 2.725571262645411e-05, + "loss": 0.045, + "step": 94050 + }, + { + "epoch": 20.00942425391323, + "grad_norm": 0.013399980030953884, + "learning_rate": 2.725270360540902e-05, + "loss": 0.081, + "step": 94060 + }, + { + "epoch": 20.009478416292044, + "grad_norm": 0.09446106106042862, + "learning_rate": 2.7249694584363922e-05, + "loss": 0.0283, + "step": 94070 + }, + { + "epoch": 20.009532578670854, + "grad_norm": 0.0021128207445144653, + "learning_rate": 2.724668556331883e-05, + "loss": 0.005, + "step": 94080 + }, + { + "epoch": 20.009586741049667, + "grad_norm": 26.447452545166016, + "learning_rate": 2.7243676542273738e-05, + "loss": 0.2111, + "step": 94090 + }, + { + "epoch": 20.00964090342848, + "grad_norm": 0.002231459366157651, + "learning_rate": 2.7240667521228647e-05, + "loss": 0.0009, + "step": 94100 + }, + { + "epoch": 20.00969506580729, + "grad_norm": 0.502987802028656, + "learning_rate": 2.723765850018355e-05, + "loss": 0.0246, + "step": 94110 + }, + { + "epoch": 20.009749228186102, + "grad_norm": 0.9037148952484131, + "learning_rate": 2.7234649479138457e-05, + "loss": 0.0111, + "step": 94120 + }, + { + "epoch": 20.009803390564915, + "grad_norm": 4.445488452911377, + "learning_rate": 2.7231640458093366e-05, + "loss": 0.0943, + "step": 94130 + 
}, + { + "epoch": 20.009857552943725, + "grad_norm": 0.004323424771428108, + "learning_rate": 2.7228631437048276e-05, + "loss": 0.0493, + "step": 94140 + }, + { + "epoch": 20.009911715322538, + "grad_norm": 0.004194071516394615, + "learning_rate": 2.7225622416003175e-05, + "loss": 0.1247, + "step": 94150 + }, + { + "epoch": 20.009965877701347, + "grad_norm": 0.0077565754763782024, + "learning_rate": 2.7222613394958085e-05, + "loss": 0.059, + "step": 94160 + }, + { + "epoch": 20.01002004008016, + "grad_norm": 15.054800987243652, + "learning_rate": 2.7219604373912995e-05, + "loss": 0.1565, + "step": 94170 + }, + { + "epoch": 20.010074202458974, + "grad_norm": 8.280964851379395, + "learning_rate": 2.72165953528679e-05, + "loss": 0.2251, + "step": 94180 + }, + { + "epoch": 20.010128364837783, + "grad_norm": 0.006371773779392242, + "learning_rate": 2.7213586331822804e-05, + "loss": 0.0273, + "step": 94190 + }, + { + "epoch": 20.010182527216596, + "grad_norm": 0.06601741164922714, + "learning_rate": 2.7210577310777714e-05, + "loss": 0.0966, + "step": 94200 + }, + { + "epoch": 20.010236689595406, + "grad_norm": 3.599378824234009, + "learning_rate": 2.720756828973262e-05, + "loss": 0.1182, + "step": 94210 + }, + { + "epoch": 20.01029085197422, + "grad_norm": 4.377819538116455, + "learning_rate": 2.7204559268687523e-05, + "loss": 0.0749, + "step": 94220 + }, + { + "epoch": 20.01034501435303, + "grad_norm": 0.004091551527380943, + "learning_rate": 2.7201550247642432e-05, + "loss": 0.0511, + "step": 94230 + }, + { + "epoch": 20.01039917673184, + "grad_norm": 7.608785152435303, + "learning_rate": 2.7198541226597342e-05, + "loss": 0.0861, + "step": 94240 + }, + { + "epoch": 20.010453339110654, + "grad_norm": 0.24303507804870605, + "learning_rate": 2.7195532205552248e-05, + "loss": 0.1415, + "step": 94250 + }, + { + "epoch": 20.010507501489464, + "grad_norm": 0.004931447561830282, + "learning_rate": 2.719252318450715e-05, + "loss": 0.0205, + "step": 94260 + }, + { + "epoch": 20.010561663868277, + "grad_norm": 0.006939876824617386, + "learning_rate": 2.718951416346206e-05, + "loss": 0.1503, + "step": 94270 + }, + { + "epoch": 20.01061582624709, + "grad_norm": 0.06803911924362183, + "learning_rate": 2.7186505142416967e-05, + "loss": 0.0645, + "step": 94280 + }, + { + "epoch": 20.0106699886259, + "grad_norm": 0.1184546947479248, + "learning_rate": 2.7183496121371877e-05, + "loss": 0.036, + "step": 94290 + }, + { + "epoch": 20.010724151004712, + "grad_norm": 2.1144065856933594, + "learning_rate": 2.718048710032678e-05, + "loss": 0.0906, + "step": 94300 + }, + { + "epoch": 20.010778313383526, + "grad_norm": 0.3085380494594574, + "learning_rate": 2.7177478079281686e-05, + "loss": 0.0575, + "step": 94310 + }, + { + "epoch": 20.010832475762335, + "grad_norm": 0.051680371165275574, + "learning_rate": 2.7174469058236596e-05, + "loss": 0.0212, + "step": 94320 + }, + { + "epoch": 20.010886638141148, + "grad_norm": 0.00974586047232151, + "learning_rate": 2.7171460037191505e-05, + "loss": 0.0677, + "step": 94330 + }, + { + "epoch": 20.010940800519958, + "grad_norm": 0.0070203328505158424, + "learning_rate": 2.7168451016146408e-05, + "loss": 0.0318, + "step": 94340 + }, + { + "epoch": 20.01099496289877, + "grad_norm": 0.004766813479363918, + "learning_rate": 2.7165441995101314e-05, + "loss": 0.0142, + "step": 94350 + }, + { + "epoch": 20.011049125277584, + "grad_norm": 0.06842163950204849, + "learning_rate": 2.7162432974056224e-05, + "loss": 0.004, + "step": 94360 + }, + { + "epoch": 20.011103287656393, + "grad_norm": 
0.004550015088170767, + "learning_rate": 2.7159423953011127e-05, + "loss": 0.0004, + "step": 94370 + }, + { + "epoch": 20.011157450035206, + "grad_norm": 0.9671139121055603, + "learning_rate": 2.7156414931966033e-05, + "loss": 0.0921, + "step": 94380 + }, + { + "epoch": 20.011211612414016, + "grad_norm": 0.5865395665168762, + "learning_rate": 2.7153405910920943e-05, + "loss": 0.0135, + "step": 94390 + }, + { + "epoch": 20.01126577479283, + "grad_norm": 0.0042292275466024876, + "learning_rate": 2.7150396889875853e-05, + "loss": 0.0749, + "step": 94400 + }, + { + "epoch": 20.011319937171642, + "grad_norm": 0.005681551992893219, + "learning_rate": 2.7147387868830752e-05, + "loss": 0.0519, + "step": 94410 + }, + { + "epoch": 20.01137409955045, + "grad_norm": 0.005482684355229139, + "learning_rate": 2.7144378847785662e-05, + "loss": 0.0843, + "step": 94420 + }, + { + "epoch": 20.011428261929264, + "grad_norm": 0.025651808828115463, + "learning_rate": 2.714136982674057e-05, + "loss": 0.0388, + "step": 94430 + }, + { + "epoch": 20.011482424308074, + "grad_norm": 0.07345505803823471, + "learning_rate": 2.7138360805695478e-05, + "loss": 0.0163, + "step": 94440 + }, + { + "epoch": 20.011536586686887, + "grad_norm": 0.0023786653764545918, + "learning_rate": 2.713535178465038e-05, + "loss": 0.0492, + "step": 94450 + }, + { + "epoch": 20.0115907490657, + "grad_norm": 0.0035417750477790833, + "learning_rate": 2.713234276360529e-05, + "loss": 0.0131, + "step": 94460 + }, + { + "epoch": 20.01164491144451, + "grad_norm": 1.4760743379592896, + "learning_rate": 2.7129333742560196e-05, + "loss": 0.0507, + "step": 94470 + }, + { + "epoch": 20.011699073823323, + "grad_norm": 0.0023567096795886755, + "learning_rate": 2.7126324721515106e-05, + "loss": 0.0678, + "step": 94480 + }, + { + "epoch": 20.011753236202136, + "grad_norm": 0.0024818931706249714, + "learning_rate": 2.712331570047001e-05, + "loss": 0.0676, + "step": 94490 + }, + { + "epoch": 20.011807398580945, + "grad_norm": 0.6408600807189941, + "learning_rate": 2.712030667942492e-05, + "loss": 0.02, + "step": 94500 + }, + { + "epoch": 20.01186156095976, + "grad_norm": 0.02781158685684204, + "learning_rate": 2.7117297658379825e-05, + "loss": 0.0988, + "step": 94510 + }, + { + "epoch": 20.011915723338568, + "grad_norm": 19.499855041503906, + "learning_rate": 2.7114288637334728e-05, + "loss": 0.0343, + "step": 94520 + }, + { + "epoch": 20.01196988571738, + "grad_norm": 0.002533120568841696, + "learning_rate": 2.7111279616289638e-05, + "loss": 0.1258, + "step": 94530 + }, + { + "epoch": 20.012024048096194, + "grad_norm": 0.0047176191583275795, + "learning_rate": 2.7108270595244544e-05, + "loss": 0.0044, + "step": 94540 + }, + { + "epoch": 20.012078210475003, + "grad_norm": 0.46018874645233154, + "learning_rate": 2.7105261574199453e-05, + "loss": 0.1091, + "step": 94550 + }, + { + "epoch": 20.012132372853817, + "grad_norm": 0.058956392109394073, + "learning_rate": 2.7102252553154356e-05, + "loss": 0.0213, + "step": 94560 + }, + { + "epoch": 20.012186535232626, + "grad_norm": 3.3860161304473877, + "learning_rate": 2.7099243532109263e-05, + "loss": 0.1209, + "step": 94570 + }, + { + "epoch": 20.01224069761144, + "grad_norm": 0.6034712195396423, + "learning_rate": 2.7096234511064172e-05, + "loss": 0.0081, + "step": 94580 + }, + { + "epoch": 20.012294859990252, + "grad_norm": 0.0024161881301552057, + "learning_rate": 2.7093225490019082e-05, + "loss": 0.0567, + "step": 94590 + }, + { + "epoch": 20.01234902236906, + "grad_norm": 2.8551478385925293, + "learning_rate": 
2.7090216468973985e-05, + "loss": 0.0672, + "step": 94600 + }, + { + "epoch": 20.012403184747875, + "grad_norm": 2.368082046508789, + "learning_rate": 2.708720744792889e-05, + "loss": 0.2626, + "step": 94610 + }, + { + "epoch": 20.012457347126684, + "grad_norm": 0.011897725984454155, + "learning_rate": 2.70841984268838e-05, + "loss": 0.0396, + "step": 94620 + }, + { + "epoch": 20.012511509505497, + "grad_norm": 0.026628755033016205, + "learning_rate": 2.7081189405838707e-05, + "loss": 0.0094, + "step": 94630 + }, + { + "epoch": 20.01256567188431, + "grad_norm": 0.0027589723467826843, + "learning_rate": 2.707818038479361e-05, + "loss": 0.0193, + "step": 94640 + }, + { + "epoch": 20.01261983426312, + "grad_norm": 0.030702272430062294, + "learning_rate": 2.707517136374852e-05, + "loss": 0.0199, + "step": 94650 + }, + { + "epoch": 20.012673996641933, + "grad_norm": 2.479313850402832, + "learning_rate": 2.707216234270343e-05, + "loss": 0.0747, + "step": 94660 + }, + { + "epoch": 20.012728159020742, + "grad_norm": 0.060346782207489014, + "learning_rate": 2.706915332165833e-05, + "loss": 0.0232, + "step": 94670 + }, + { + "epoch": 20.012782321399555, + "grad_norm": 0.32797905802726746, + "learning_rate": 2.706614430061324e-05, + "loss": 0.0417, + "step": 94680 + }, + { + "epoch": 20.01283648377837, + "grad_norm": 0.004330570809543133, + "learning_rate": 2.7063135279568148e-05, + "loss": 0.0664, + "step": 94690 + }, + { + "epoch": 20.012890646157178, + "grad_norm": 0.3007182478904724, + "learning_rate": 2.7060126258523054e-05, + "loss": 0.0617, + "step": 94700 + }, + { + "epoch": 20.01294480853599, + "grad_norm": 0.004721327684819698, + "learning_rate": 2.7057117237477957e-05, + "loss": 0.112, + "step": 94710 + }, + { + "epoch": 20.012998970914804, + "grad_norm": 2.2088518142700195, + "learning_rate": 2.7054108216432867e-05, + "loss": 0.0507, + "step": 94720 + }, + { + "epoch": 20.013053133293614, + "grad_norm": 0.0025882974732667208, + "learning_rate": 2.7051099195387773e-05, + "loss": 0.1953, + "step": 94730 + }, + { + "epoch": 20.013107295672427, + "grad_norm": 0.4670311212539673, + "learning_rate": 2.7048090174342683e-05, + "loss": 0.0922, + "step": 94740 + }, + { + "epoch": 20.013161458051236, + "grad_norm": 0.005196999758481979, + "learning_rate": 2.7045081153297586e-05, + "loss": 0.0232, + "step": 94750 + }, + { + "epoch": 20.01321562043005, + "grad_norm": 0.27265286445617676, + "learning_rate": 2.7042072132252495e-05, + "loss": 0.0494, + "step": 94760 + }, + { + "epoch": 20.013269782808862, + "grad_norm": 0.014795482158660889, + "learning_rate": 2.70390631112074e-05, + "loss": 0.0386, + "step": 94770 + }, + { + "epoch": 20.013323945187672, + "grad_norm": 0.5649568438529968, + "learning_rate": 2.703605409016231e-05, + "loss": 0.0492, + "step": 94780 + }, + { + "epoch": 20.013378107566485, + "grad_norm": 0.3911621868610382, + "learning_rate": 2.7033045069117214e-05, + "loss": 0.0498, + "step": 94790 + }, + { + "epoch": 20.013432269945294, + "grad_norm": 0.10493011027574539, + "learning_rate": 2.703003604807212e-05, + "loss": 0.0031, + "step": 94800 + }, + { + "epoch": 20.013486432324108, + "grad_norm": 0.03078480437397957, + "learning_rate": 2.702702702702703e-05, + "loss": 0.017, + "step": 94810 + }, + { + "epoch": 20.01354059470292, + "grad_norm": 0.06432899087667465, + "learning_rate": 2.7024018005981933e-05, + "loss": 0.0408, + "step": 94820 + }, + { + "epoch": 20.01359475708173, + "grad_norm": 1.0799177885055542, + "learning_rate": 2.702100898493684e-05, + "loss": 0.014, + "step": 94830 
+ }, + { + "epoch": 20.013648919460543, + "grad_norm": 2.5497612953186035, + "learning_rate": 2.701799996389175e-05, + "loss": 0.0614, + "step": 94840 + }, + { + "epoch": 20.013703081839353, + "grad_norm": 3.0181803703308105, + "learning_rate": 2.701499094284666e-05, + "loss": 0.0719, + "step": 94850 + }, + { + "epoch": 20.013757244218166, + "grad_norm": 0.8188510537147522, + "learning_rate": 2.701198192180156e-05, + "loss": 0.1312, + "step": 94860 + }, + { + "epoch": 20.01381140659698, + "grad_norm": 0.818214476108551, + "learning_rate": 2.7008972900756468e-05, + "loss": 0.0226, + "step": 94870 + }, + { + "epoch": 20.01386556897579, + "grad_norm": 0.0024503704626113176, + "learning_rate": 2.7005963879711377e-05, + "loss": 0.0362, + "step": 94880 + }, + { + "epoch": 20.0139197313546, + "grad_norm": 2.257596254348755, + "learning_rate": 2.7002954858666284e-05, + "loss": 0.0728, + "step": 94890 + }, + { + "epoch": 20.013973893733414, + "grad_norm": 0.0025585442781448364, + "learning_rate": 2.6999945837621187e-05, + "loss": 0.0042, + "step": 94900 + }, + { + "epoch": 20.014028056112224, + "grad_norm": 1.3701629638671875, + "learning_rate": 2.6996936816576096e-05, + "loss": 0.0172, + "step": 94910 + }, + { + "epoch": 20.014082218491037, + "grad_norm": 0.761780858039856, + "learning_rate": 2.6993927795531006e-05, + "loss": 0.0106, + "step": 94920 + }, + { + "epoch": 20.014136380869846, + "grad_norm": 0.005454350262880325, + "learning_rate": 2.6990918774485912e-05, + "loss": 0.0315, + "step": 94930 + }, + { + "epoch": 20.01419054324866, + "grad_norm": 0.0025792401283979416, + "learning_rate": 2.6987909753440815e-05, + "loss": 0.0821, + "step": 94940 + }, + { + "epoch": 20.014244705627473, + "grad_norm": 2.3049027919769287, + "learning_rate": 2.6984900732395725e-05, + "loss": 0.0669, + "step": 94950 + }, + { + "epoch": 20.014298868006282, + "grad_norm": 0.0027335018385201693, + "learning_rate": 2.698189171135063e-05, + "loss": 0.0053, + "step": 94960 + }, + { + "epoch": 20.014353030385095, + "grad_norm": 0.5750669240951538, + "learning_rate": 2.6978882690305534e-05, + "loss": 0.0396, + "step": 94970 + }, + { + "epoch": 20.014407192763905, + "grad_norm": 0.0026030049193650484, + "learning_rate": 2.6975873669260444e-05, + "loss": 0.0252, + "step": 94980 + }, + { + "epoch": 20.014461355142718, + "grad_norm": 0.003752464661374688, + "learning_rate": 2.697286464821535e-05, + "loss": 0.0898, + "step": 94990 + }, + { + "epoch": 20.01451551752153, + "grad_norm": 0.0022702233400195837, + "learning_rate": 2.696985562717026e-05, + "loss": 0.0022, + "step": 95000 + }, + { + "epoch": 20.01456967990034, + "grad_norm": 0.08039423823356628, + "learning_rate": 2.6966846606125162e-05, + "loss": 0.0108, + "step": 95010 + }, + { + "epoch": 20.014623842279153, + "grad_norm": 0.003758417908102274, + "learning_rate": 2.6963837585080072e-05, + "loss": 0.1291, + "step": 95020 + }, + { + "epoch": 20.014678004657963, + "grad_norm": 0.004933053161948919, + "learning_rate": 2.696082856403498e-05, + "loss": 0.0099, + "step": 95030 + }, + { + "epoch": 20.014732167036776, + "grad_norm": 0.05063778534531593, + "learning_rate": 2.6957819542989888e-05, + "loss": 0.0234, + "step": 95040 + }, + { + "epoch": 20.01478632941559, + "grad_norm": 0.019318994134664536, + "learning_rate": 2.695481052194479e-05, + "loss": 0.0296, + "step": 95050 + }, + { + "epoch": 20.0148404917944, + "grad_norm": 0.001992806326597929, + "learning_rate": 2.6951801500899697e-05, + "loss": 0.0453, + "step": 95060 + }, + { + "epoch": 20.01489465417321, + 
"grad_norm": 0.2773083448410034, + "learning_rate": 2.6948792479854607e-05, + "loss": 0.1042, + "step": 95070 + }, + { + "epoch": 20.014948816552025, + "grad_norm": 0.40727680921554565, + "learning_rate": 2.6945783458809517e-05, + "loss": 0.0523, + "step": 95080 + }, + { + "epoch": 20.015002978930834, + "grad_norm": 0.047197140753269196, + "learning_rate": 2.6942774437764416e-05, + "loss": 0.1296, + "step": 95090 + }, + { + "epoch": 20.015057141309647, + "grad_norm": 5.664767742156982, + "learning_rate": 2.6939765416719326e-05, + "loss": 0.1317, + "step": 95100 + }, + { + "epoch": 20.015111303688457, + "grad_norm": 0.7541495561599731, + "learning_rate": 2.6936756395674235e-05, + "loss": 0.0421, + "step": 95110 + }, + { + "epoch": 20.01516546606727, + "grad_norm": 0.006737226154655218, + "learning_rate": 2.6933747374629138e-05, + "loss": 0.0148, + "step": 95120 + }, + { + "epoch": 20.015219628446083, + "grad_norm": 0.42523548007011414, + "learning_rate": 2.6930738353584044e-05, + "loss": 0.0214, + "step": 95130 + }, + { + "epoch": 20.015273790824892, + "grad_norm": 0.39704787731170654, + "learning_rate": 2.6927729332538954e-05, + "loss": 0.0125, + "step": 95140 + }, + { + "epoch": 20.015327953203705, + "grad_norm": 0.1329508274793625, + "learning_rate": 2.692472031149386e-05, + "loss": 0.0103, + "step": 95150 + }, + { + "epoch": 20.015382115582515, + "grad_norm": 0.6245369911193848, + "learning_rate": 2.6921711290448763e-05, + "loss": 0.0576, + "step": 95160 + }, + { + "epoch": 20.015436277961328, + "grad_norm": 0.05108930170536041, + "learning_rate": 2.6918702269403673e-05, + "loss": 0.0619, + "step": 95170 + }, + { + "epoch": 20.01549044034014, + "grad_norm": 0.1488204151391983, + "learning_rate": 2.6915693248358583e-05, + "loss": 0.0111, + "step": 95180 + }, + { + "epoch": 20.01554460271895, + "grad_norm": 0.04438699409365654, + "learning_rate": 2.691268422731349e-05, + "loss": 0.0348, + "step": 95190 + }, + { + "epoch": 20.015598765097764, + "grad_norm": 0.002049648202955723, + "learning_rate": 2.6909675206268392e-05, + "loss": 0.0493, + "step": 95200 + }, + { + "epoch": 20.015652927476573, + "grad_norm": 0.09715186059474945, + "learning_rate": 2.69066661852233e-05, + "loss": 0.0154, + "step": 95210 + }, + { + "epoch": 20.015707089855386, + "grad_norm": 0.011022977530956268, + "learning_rate": 2.6903657164178208e-05, + "loss": 0.0728, + "step": 95220 + }, + { + "epoch": 20.0157612522342, + "grad_norm": 3.486898183822632, + "learning_rate": 2.6900648143133117e-05, + "loss": 0.008, + "step": 95230 + }, + { + "epoch": 20.01581541461301, + "grad_norm": 0.1305031180381775, + "learning_rate": 2.689763912208802e-05, + "loss": 0.1484, + "step": 95240 + }, + { + "epoch": 20.015869576991822, + "grad_norm": 1.970547080039978, + "learning_rate": 2.6894630101042927e-05, + "loss": 0.1078, + "step": 95250 + }, + { + "epoch": 20.015923739370635, + "grad_norm": 0.04563349857926369, + "learning_rate": 2.6891621079997836e-05, + "loss": 0.0224, + "step": 95260 + }, + { + "epoch": 20.015977901749444, + "grad_norm": 0.002645814325660467, + "learning_rate": 2.688861205895274e-05, + "loss": 0.1207, + "step": 95270 + }, + { + "epoch": 20.016032064128257, + "grad_norm": 0.025361834093928337, + "learning_rate": 2.688560303790765e-05, + "loss": 0.0287, + "step": 95280 + }, + { + "epoch": 20.016086226507067, + "grad_norm": 0.003298556199297309, + "learning_rate": 2.6882594016862555e-05, + "loss": 0.095, + "step": 95290 + }, + { + "epoch": 20.01614038888588, + "grad_norm": 0.03256012499332428, + "learning_rate": 
2.6879584995817465e-05, + "loss": 0.0634, + "step": 95300 + }, + { + "epoch": 20.016194551264693, + "grad_norm": 0.016092605888843536, + "learning_rate": 2.6876575974772368e-05, + "loss": 0.1073, + "step": 95310 + }, + { + "epoch": 20.016248713643503, + "grad_norm": 0.0026407656259834766, + "learning_rate": 2.6873566953727274e-05, + "loss": 0.0374, + "step": 95320 + }, + { + "epoch": 20.016302876022316, + "grad_norm": 0.10591991245746613, + "learning_rate": 2.6870557932682184e-05, + "loss": 0.1679, + "step": 95330 + }, + { + "epoch": 20.016357038401125, + "grad_norm": 0.0027914640959352255, + "learning_rate": 2.6867548911637093e-05, + "loss": 0.0177, + "step": 95340 + }, + { + "epoch": 20.016411200779938, + "grad_norm": 0.0846368744969368, + "learning_rate": 2.6864539890591993e-05, + "loss": 0.0184, + "step": 95350 + }, + { + "epoch": 20.01646536315875, + "grad_norm": 0.0025965722743421793, + "learning_rate": 2.6861530869546902e-05, + "loss": 0.0632, + "step": 95360 + }, + { + "epoch": 20.01651952553756, + "grad_norm": 0.1023019477725029, + "learning_rate": 2.6858521848501812e-05, + "loss": 0.0332, + "step": 95370 + }, + { + "epoch": 20.016573687916374, + "grad_norm": 0.003103623865172267, + "learning_rate": 2.6855512827456718e-05, + "loss": 0.0283, + "step": 95380 + }, + { + "epoch": 20.016627850295183, + "grad_norm": 0.0034937667660415173, + "learning_rate": 2.685250380641162e-05, + "loss": 0.0514, + "step": 95390 + }, + { + "epoch": 20.016682012673996, + "grad_norm": 0.08688759803771973, + "learning_rate": 2.684949478536653e-05, + "loss": 0.0398, + "step": 95400 + }, + { + "epoch": 20.01673617505281, + "grad_norm": 0.002192852320149541, + "learning_rate": 2.6846485764321437e-05, + "loss": 0.0472, + "step": 95410 + }, + { + "epoch": 20.01679033743162, + "grad_norm": 1.4980734586715698, + "learning_rate": 2.684347674327634e-05, + "loss": 0.0128, + "step": 95420 + }, + { + "epoch": 20.016844499810432, + "grad_norm": 0.0029683285392820835, + "learning_rate": 2.684046772223125e-05, + "loss": 0.0274, + "step": 95430 + }, + { + "epoch": 20.016898662189245, + "grad_norm": 0.033189479261636734, + "learning_rate": 2.683745870118616e-05, + "loss": 0.0307, + "step": 95440 + }, + { + "epoch": 20.016952824568055, + "grad_norm": 0.002572460565716028, + "learning_rate": 2.6834449680141066e-05, + "loss": 0.0072, + "step": 95450 + }, + { + "epoch": 20.017006986946868, + "grad_norm": 0.002403482561931014, + "learning_rate": 2.683144065909597e-05, + "loss": 0.1493, + "step": 95460 + }, + { + "epoch": 20.017061149325677, + "grad_norm": 4.174609661102295, + "learning_rate": 2.6828431638050878e-05, + "loss": 0.0586, + "step": 95470 + }, + { + "epoch": 20.01711531170449, + "grad_norm": 0.0019744199234992266, + "learning_rate": 2.6825422617005784e-05, + "loss": 0.046, + "step": 95480 + }, + { + "epoch": 20.017169474083303, + "grad_norm": 0.011137861758470535, + "learning_rate": 2.6822413595960694e-05, + "loss": 0.0191, + "step": 95490 + }, + { + "epoch": 20.017223636462113, + "grad_norm": 0.0020466847345232964, + "learning_rate": 2.6819404574915597e-05, + "loss": 0.0333, + "step": 95500 + }, + { + "epoch": 20.017277798840926, + "grad_norm": 0.0019232981139793992, + "learning_rate": 2.6816395553870503e-05, + "loss": 0.0026, + "step": 95510 + }, + { + "epoch": 20.017331961219735, + "grad_norm": 0.03678976371884346, + "learning_rate": 2.6813386532825413e-05, + "loss": 0.0591, + "step": 95520 + }, + { + "epoch": 20.01738612359855, + "grad_norm": 1.9483627080917358, + "learning_rate": 2.6810377511780323e-05, + 
"loss": 0.0522, + "step": 95530 + }, + { + "epoch": 20.01744028597736, + "grad_norm": 0.0027507548220455647, + "learning_rate": 2.6807368490735225e-05, + "loss": 0.0092, + "step": 95540 + }, + { + "epoch": 20.01749444835617, + "grad_norm": 0.0020787192042917013, + "learning_rate": 2.6804359469690132e-05, + "loss": 0.0094, + "step": 95550 + }, + { + "epoch": 20.017548610734984, + "grad_norm": 0.015342471189796925, + "learning_rate": 2.680135044864504e-05, + "loss": 0.1198, + "step": 95560 + }, + { + "epoch": 20.017602773113794, + "grad_norm": 0.0018022165168076754, + "learning_rate": 2.6798341427599944e-05, + "loss": 0.0055, + "step": 95570 + }, + { + "epoch": 20.017656935492607, + "grad_norm": 0.0016958977794274688, + "learning_rate": 2.679533240655485e-05, + "loss": 0.0436, + "step": 95580 + }, + { + "epoch": 20.01771109787142, + "grad_norm": 0.0025995734613388777, + "learning_rate": 2.679232338550976e-05, + "loss": 0.0484, + "step": 95590 + }, + { + "epoch": 20.01776526025023, + "grad_norm": 0.0017962814308702946, + "learning_rate": 2.678931436446467e-05, + "loss": 0.0251, + "step": 95600 + }, + { + "epoch": 20.017819422629042, + "grad_norm": 0.0369584895670414, + "learning_rate": 2.678630534341957e-05, + "loss": 0.0452, + "step": 95610 + }, + { + "epoch": 20.017873585007855, + "grad_norm": 4.4941816329956055, + "learning_rate": 2.678329632237448e-05, + "loss": 0.0488, + "step": 95620 + }, + { + "epoch": 20.017927747386665, + "grad_norm": 0.01745089888572693, + "learning_rate": 2.678028730132939e-05, + "loss": 0.0723, + "step": 95630 + }, + { + "epoch": 20.017981909765478, + "grad_norm": 0.013807397335767746, + "learning_rate": 2.6777278280284295e-05, + "loss": 0.0791, + "step": 95640 + }, + { + "epoch": 20.018036072144287, + "grad_norm": 0.6491586565971375, + "learning_rate": 2.6774269259239198e-05, + "loss": 0.0985, + "step": 95650 + }, + { + "epoch": 20.0180902345231, + "grad_norm": 0.010597633197903633, + "learning_rate": 2.6771260238194108e-05, + "loss": 0.085, + "step": 95660 + }, + { + "epoch": 20.018144396901913, + "grad_norm": 0.005208297166973352, + "learning_rate": 2.6768251217149014e-05, + "loss": 0.0371, + "step": 95670 + }, + { + "epoch": 20.018198559280723, + "grad_norm": 0.7513444423675537, + "learning_rate": 2.6765242196103923e-05, + "loss": 0.0387, + "step": 95680 + }, + { + "epoch": 20.018252721659536, + "grad_norm": 0.004323295783251524, + "learning_rate": 2.6762233175058826e-05, + "loss": 0.0498, + "step": 95690 + }, + { + "epoch": 20.018306884038346, + "grad_norm": 0.0035372443962842226, + "learning_rate": 2.6759224154013736e-05, + "loss": 0.0915, + "step": 95700 + }, + { + "epoch": 20.01836104641716, + "grad_norm": 0.003466893220320344, + "learning_rate": 2.6756215132968642e-05, + "loss": 0.0492, + "step": 95710 + }, + { + "epoch": 20.01841520879597, + "grad_norm": 0.04094162955880165, + "learning_rate": 2.6753206111923545e-05, + "loss": 0.0188, + "step": 95720 + }, + { + "epoch": 20.01846937117478, + "grad_norm": 0.061947040259838104, + "learning_rate": 2.6750197090878455e-05, + "loss": 0.0133, + "step": 95730 + }, + { + "epoch": 20.018523533553594, + "grad_norm": 0.04830978438258171, + "learning_rate": 2.674718806983336e-05, + "loss": 0.0192, + "step": 95740 + }, + { + "epoch": 20.018577695932404, + "grad_norm": 0.0021031666547060013, + "learning_rate": 2.674417904878827e-05, + "loss": 0.1271, + "step": 95750 + }, + { + "epoch": 20.018631858311217, + "grad_norm": 0.001944219577126205, + "learning_rate": 2.6741170027743174e-05, + "loss": 0.0046, + "step": 95760 + 
}, + { + "epoch": 20.01868602069003, + "grad_norm": 0.0019484171643853188, + "learning_rate": 2.673816100669808e-05, + "loss": 0.0594, + "step": 95770 + }, + { + "epoch": 20.01874018306884, + "grad_norm": 0.0018640849739313126, + "learning_rate": 2.673515198565299e-05, + "loss": 0.0792, + "step": 95780 + }, + { + "epoch": 20.018794345447652, + "grad_norm": 0.26490160822868347, + "learning_rate": 2.67321429646079e-05, + "loss": 0.051, + "step": 95790 + }, + { + "epoch": 20.018848507826462, + "grad_norm": 0.0021143334452062845, + "learning_rate": 2.6729133943562802e-05, + "loss": 0.0727, + "step": 95800 + }, + { + "epoch": 20.018902670205275, + "grad_norm": 0.03402922302484512, + "learning_rate": 2.672612492251771e-05, + "loss": 0.0572, + "step": 95810 + }, + { + "epoch": 20.018956832584088, + "grad_norm": 2.0350987911224365, + "learning_rate": 2.6723115901472618e-05, + "loss": 0.091, + "step": 95820 + }, + { + "epoch": 20.019010994962898, + "grad_norm": 1.3331769704818726, + "learning_rate": 2.6720106880427524e-05, + "loss": 0.0351, + "step": 95830 + }, + { + "epoch": 20.01906515734171, + "grad_norm": 0.5771913528442383, + "learning_rate": 2.6717097859382427e-05, + "loss": 0.0046, + "step": 95840 + }, + { + "epoch": 20.019119319720524, + "grad_norm": 0.01815073750913143, + "learning_rate": 2.6714088838337337e-05, + "loss": 0.0177, + "step": 95850 + }, + { + "epoch": 20.019173482099333, + "grad_norm": 3.7202792167663574, + "learning_rate": 2.6711079817292247e-05, + "loss": 0.0317, + "step": 95860 + }, + { + "epoch": 20.019227644478146, + "grad_norm": 0.001954390900209546, + "learning_rate": 2.6708070796247146e-05, + "loss": 0.0695, + "step": 95870 + }, + { + "epoch": 20.019281806856956, + "grad_norm": 0.04453924298286438, + "learning_rate": 2.6705061775202056e-05, + "loss": 0.0398, + "step": 95880 + }, + { + "epoch": 20.01933596923577, + "grad_norm": 4.860450267791748, + "learning_rate": 2.6702052754156965e-05, + "loss": 0.0727, + "step": 95890 + }, + { + "epoch": 20.019390131614582, + "grad_norm": 0.22001594305038452, + "learning_rate": 2.669904373311187e-05, + "loss": 0.0652, + "step": 95900 + }, + { + "epoch": 20.01944429399339, + "grad_norm": 0.015363773331046104, + "learning_rate": 2.6696034712066775e-05, + "loss": 0.0063, + "step": 95910 + }, + { + "epoch": 20.019498456372204, + "grad_norm": 0.0026056517381221056, + "learning_rate": 2.6693025691021684e-05, + "loss": 0.0172, + "step": 95920 + }, + { + "epoch": 20.019552618751014, + "grad_norm": 0.0065946681424975395, + "learning_rate": 2.669001666997659e-05, + "loss": 0.002, + "step": 95930 + }, + { + "epoch": 20.019606781129827, + "grad_norm": 0.002238259417936206, + "learning_rate": 2.66870076489315e-05, + "loss": 0.1536, + "step": 95940 + }, + { + "epoch": 20.01966094350864, + "grad_norm": 0.0034607108682394028, + "learning_rate": 2.6683998627886403e-05, + "loss": 0.0354, + "step": 95950 + }, + { + "epoch": 20.01971510588745, + "grad_norm": 0.002612688113003969, + "learning_rate": 2.6680989606841313e-05, + "loss": 0.0308, + "step": 95960 + }, + { + "epoch": 20.019769268266263, + "grad_norm": 12.176459312438965, + "learning_rate": 2.667798058579622e-05, + "loss": 0.1868, + "step": 95970 + }, + { + "epoch": 20.019823430645072, + "grad_norm": 0.003115291940048337, + "learning_rate": 2.667497156475113e-05, + "loss": 0.0209, + "step": 95980 + }, + { + "epoch": 20.019877593023885, + "grad_norm": 0.004529856611043215, + "learning_rate": 2.667196254370603e-05, + "loss": 0.0127, + "step": 95990 + }, + { + "epoch": 20.0199317554027, + 
"grad_norm": 0.003342653391882777, + "learning_rate": 2.6668953522660938e-05, + "loss": 0.0321, + "step": 96000 + }, + { + "epoch": 20.019985917781508, + "grad_norm": 0.002660399070009589, + "learning_rate": 2.6665944501615847e-05, + "loss": 0.0315, + "step": 96010 + }, + { + "epoch": 20.02004008016032, + "grad_norm": 0.19498389959335327, + "learning_rate": 2.666293548057075e-05, + "loss": 0.001, + "step": 96020 + }, + { + "epoch": 20.020094242539134, + "grad_norm": 0.0035495550837367773, + "learning_rate": 2.6659926459525657e-05, + "loss": 0.019, + "step": 96030 + }, + { + "epoch": 20.020148404917943, + "grad_norm": 0.003600415773689747, + "learning_rate": 2.6656917438480566e-05, + "loss": 0.0007, + "step": 96040 + }, + { + "epoch": 20.020202567296757, + "grad_norm": 0.08565489947795868, + "learning_rate": 2.6653908417435476e-05, + "loss": 0.0857, + "step": 96050 + }, + { + "epoch": 20.020256729675566, + "grad_norm": 0.0058264173567295074, + "learning_rate": 2.665089939639038e-05, + "loss": 0.0097, + "step": 96060 + }, + { + "epoch": 20.02031089205438, + "grad_norm": 0.0041724881157279015, + "learning_rate": 2.6647890375345285e-05, + "loss": 0.0296, + "step": 96070 + }, + { + "epoch": 20.020365054433192, + "grad_norm": 0.0028760135173797607, + "learning_rate": 2.6644881354300195e-05, + "loss": 0.0042, + "step": 96080 + }, + { + "epoch": 20.020419216812, + "grad_norm": 3.6380083560943604, + "learning_rate": 2.66418723332551e-05, + "loss": 0.033, + "step": 96090 + }, + { + "epoch": 20.020473379190815, + "grad_norm": 0.011511585675179958, + "learning_rate": 2.6638863312210004e-05, + "loss": 0.055, + "step": 96100 + }, + { + "epoch": 20.020527541569624, + "grad_norm": 0.2716904282569885, + "learning_rate": 2.6635854291164914e-05, + "loss": 0.0918, + "step": 96110 + }, + { + "epoch": 20.020581703948437, + "grad_norm": 2.4850587844848633, + "learning_rate": 2.6632845270119823e-05, + "loss": 0.0173, + "step": 96120 + }, + { + "epoch": 20.02063586632725, + "grad_norm": 0.7377625703811646, + "learning_rate": 2.662983624907473e-05, + "loss": 0.0441, + "step": 96130 + }, + { + "epoch": 20.02069002870606, + "grad_norm": 0.01645105890929699, + "learning_rate": 2.6626827228029632e-05, + "loss": 0.0278, + "step": 96140 + }, + { + "epoch": 20.020744191084873, + "grad_norm": 0.0027650187257677317, + "learning_rate": 2.6623818206984542e-05, + "loss": 0.0469, + "step": 96150 + }, + { + "epoch": 20.020798353463682, + "grad_norm": 0.05638166889548302, + "learning_rate": 2.662080918593945e-05, + "loss": 0.0429, + "step": 96160 + }, + { + "epoch": 20.020852515842495, + "grad_norm": 0.04158312454819679, + "learning_rate": 2.661780016489435e-05, + "loss": 0.0933, + "step": 96170 + }, + { + "epoch": 20.02090667822131, + "grad_norm": 2.227999687194824, + "learning_rate": 2.661479114384926e-05, + "loss": 0.0324, + "step": 96180 + }, + { + "epoch": 20.020960840600118, + "grad_norm": 0.048612792044878006, + "learning_rate": 2.6611782122804167e-05, + "loss": 0.0187, + "step": 96190 + }, + { + "epoch": 20.02101500297893, + "grad_norm": 2.4464683532714844, + "learning_rate": 2.6608773101759077e-05, + "loss": 0.0534, + "step": 96200 + }, + { + "epoch": 20.021069165357744, + "grad_norm": 0.0022743719164282084, + "learning_rate": 2.660576408071398e-05, + "loss": 0.0119, + "step": 96210 + }, + { + "epoch": 20.021123327736554, + "grad_norm": 0.002347994362935424, + "learning_rate": 2.660275505966889e-05, + "loss": 0.1009, + "step": 96220 + }, + { + "epoch": 20.021177490115367, + "grad_norm": 1.8660781383514404, + 
"learning_rate": 2.6599746038623796e-05, + "loss": 0.0374, + "step": 96230 + }, + { + "epoch": 20.021231652494176, + "grad_norm": 0.009823647327721119, + "learning_rate": 2.6596737017578705e-05, + "loss": 0.034, + "step": 96240 + }, + { + "epoch": 20.02128581487299, + "grad_norm": 0.002195831621065736, + "learning_rate": 2.6593727996533608e-05, + "loss": 0.0182, + "step": 96250 + }, + { + "epoch": 20.021339977251802, + "grad_norm": 0.012743648141622543, + "learning_rate": 2.6590718975488515e-05, + "loss": 0.0649, + "step": 96260 + }, + { + "epoch": 20.021394139630612, + "grad_norm": 1.9706391096115112, + "learning_rate": 2.6587709954443424e-05, + "loss": 0.0373, + "step": 96270 + }, + { + "epoch": 20.021448302009425, + "grad_norm": 0.022528178989887238, + "learning_rate": 2.658470093339833e-05, + "loss": 0.1377, + "step": 96280 + }, + { + "epoch": 20.021502464388234, + "grad_norm": 0.31889718770980835, + "learning_rate": 2.6581691912353233e-05, + "loss": 0.0295, + "step": 96290 + }, + { + "epoch": 20.021556626767048, + "grad_norm": 0.31014484167099, + "learning_rate": 2.6578682891308143e-05, + "loss": 0.1144, + "step": 96300 + }, + { + "epoch": 20.02161078914586, + "grad_norm": 0.0022790278308093548, + "learning_rate": 2.6575673870263053e-05, + "loss": 0.0644, + "step": 96310 + }, + { + "epoch": 20.02166495152467, + "grad_norm": 5.018631935119629, + "learning_rate": 2.6572664849217956e-05, + "loss": 0.0673, + "step": 96320 + }, + { + "epoch": 20.021719113903483, + "grad_norm": 0.003619932569563389, + "learning_rate": 2.6569655828172862e-05, + "loss": 0.0088, + "step": 96330 + }, + { + "epoch": 20.021773276282293, + "grad_norm": 0.0035784912761300802, + "learning_rate": 2.656664680712777e-05, + "loss": 0.0384, + "step": 96340 + }, + { + "epoch": 20.021827438661106, + "grad_norm": 0.011979945003986359, + "learning_rate": 2.6563637786082678e-05, + "loss": 0.1046, + "step": 96350 + }, + { + "epoch": 20.02188160103992, + "grad_norm": 0.16861477494239807, + "learning_rate": 2.656062876503758e-05, + "loss": 0.1261, + "step": 96360 + }, + { + "epoch": 20.02193576341873, + "grad_norm": 0.005337418057024479, + "learning_rate": 2.655761974399249e-05, + "loss": 0.0181, + "step": 96370 + }, + { + "epoch": 20.02198992579754, + "grad_norm": 0.00248343194834888, + "learning_rate": 2.65546107229474e-05, + "loss": 0.0279, + "step": 96380 + }, + { + "epoch": 20.022044088176354, + "grad_norm": 0.029389383271336555, + "learning_rate": 2.6551601701902306e-05, + "loss": 0.0293, + "step": 96390 + }, + { + "epoch": 20.022098250555164, + "grad_norm": 0.014989884570240974, + "learning_rate": 2.654859268085721e-05, + "loss": 0.0902, + "step": 96400 + }, + { + "epoch": 20.022152412933977, + "grad_norm": 0.802049994468689, + "learning_rate": 2.654558365981212e-05, + "loss": 0.1032, + "step": 96410 + }, + { + "epoch": 20.022206575312786, + "grad_norm": 2.518878698348999, + "learning_rate": 2.6542574638767025e-05, + "loss": 0.1413, + "step": 96420 + }, + { + "epoch": 20.0222607376916, + "grad_norm": 0.011869882233440876, + "learning_rate": 2.6539565617721935e-05, + "loss": 0.0448, + "step": 96430 + }, + { + "epoch": 20.022314900070413, + "grad_norm": 0.08624164015054703, + "learning_rate": 2.6536556596676838e-05, + "loss": 0.0726, + "step": 96440 + }, + { + "epoch": 20.022369062449222, + "grad_norm": 0.33002176880836487, + "learning_rate": 2.6533547575631744e-05, + "loss": 0.048, + "step": 96450 + }, + { + "epoch": 20.022423224828035, + "grad_norm": 0.004019600339233875, + "learning_rate": 2.6530538554586654e-05, + 
"loss": 0.087, + "step": 96460 + }, + { + "epoch": 20.022477387206845, + "grad_norm": 0.023656485602259636, + "learning_rate": 2.6527529533541556e-05, + "loss": 0.0054, + "step": 96470 + }, + { + "epoch": 20.022531549585658, + "grad_norm": 0.3643645644187927, + "learning_rate": 2.6524520512496466e-05, + "loss": 0.1012, + "step": 96480 + }, + { + "epoch": 20.02258571196447, + "grad_norm": 0.5808979868888855, + "learning_rate": 2.6521511491451372e-05, + "loss": 0.0161, + "step": 96490 + }, + { + "epoch": 20.02263987434328, + "grad_norm": 0.3421561121940613, + "learning_rate": 2.6518502470406282e-05, + "loss": 0.0914, + "step": 96500 + }, + { + "epoch": 20.022694036722093, + "grad_norm": 1.6618260145187378, + "learning_rate": 2.6515493449361185e-05, + "loss": 0.084, + "step": 96510 + }, + { + "epoch": 20.022748199100903, + "grad_norm": 0.6138107776641846, + "learning_rate": 2.651248442831609e-05, + "loss": 0.0336, + "step": 96520 + }, + { + "epoch": 20.022802361479716, + "grad_norm": 0.0024047954939305782, + "learning_rate": 2.6509475407271e-05, + "loss": 0.0888, + "step": 96530 + }, + { + "epoch": 20.02285652385853, + "grad_norm": 0.27512985467910767, + "learning_rate": 2.6506466386225907e-05, + "loss": 0.0484, + "step": 96540 + }, + { + "epoch": 20.02291068623734, + "grad_norm": 0.0022410349920392036, + "learning_rate": 2.650345736518081e-05, + "loss": 0.012, + "step": 96550 + }, + { + "epoch": 20.02296484861615, + "grad_norm": 0.0021957706194370985, + "learning_rate": 2.650044834413572e-05, + "loss": 0.0274, + "step": 96560 + }, + { + "epoch": 20.023019010994965, + "grad_norm": 11.903705596923828, + "learning_rate": 2.649743932309063e-05, + "loss": 0.0874, + "step": 96570 + }, + { + "epoch": 20.023073173373774, + "grad_norm": 0.02816258743405342, + "learning_rate": 2.6494430302045536e-05, + "loss": 0.1231, + "step": 96580 + }, + { + "epoch": 20.023127335752587, + "grad_norm": 0.010142980143427849, + "learning_rate": 2.649142128100044e-05, + "loss": 0.0051, + "step": 96590 + }, + { + "epoch": 20.023181498131397, + "grad_norm": 0.0045918351970613, + "learning_rate": 2.6488412259955348e-05, + "loss": 0.0349, + "step": 96600 + }, + { + "epoch": 20.02323566051021, + "grad_norm": 0.9670122265815735, + "learning_rate": 2.6485403238910254e-05, + "loss": 0.0449, + "step": 96610 + }, + { + "epoch": 20.023289822889023, + "grad_norm": 0.8263936638832092, + "learning_rate": 2.6482394217865157e-05, + "loss": 0.0556, + "step": 96620 + }, + { + "epoch": 20.023343985267832, + "grad_norm": 0.06090526282787323, + "learning_rate": 2.6479385196820067e-05, + "loss": 0.0698, + "step": 96630 + }, + { + "epoch": 20.023398147646645, + "grad_norm": 0.003991320263594389, + "learning_rate": 2.6476376175774977e-05, + "loss": 0.0246, + "step": 96640 + }, + { + "epoch": 20.023452310025455, + "grad_norm": 0.3504849076271057, + "learning_rate": 2.6473367154729883e-05, + "loss": 0.1359, + "step": 96650 + }, + { + "epoch": 20.023506472404268, + "grad_norm": 0.12708452343940735, + "learning_rate": 2.6470358133684786e-05, + "loss": 0.0102, + "step": 96660 + }, + { + "epoch": 20.02356063478308, + "grad_norm": 0.060891006141901016, + "learning_rate": 2.6467349112639695e-05, + "loss": 0.0195, + "step": 96670 + }, + { + "epoch": 20.02361479716189, + "grad_norm": 1.7886089086532593, + "learning_rate": 2.6464340091594602e-05, + "loss": 0.0501, + "step": 96680 + }, + { + "epoch": 20.023668959540704, + "grad_norm": 2.296623468399048, + "learning_rate": 2.646133107054951e-05, + "loss": 0.0607, + "step": 96690 + }, + { + "epoch": 
20.023723121919513, + "grad_norm": 0.009368136525154114, + "learning_rate": 2.6458322049504414e-05, + "loss": 0.0117, + "step": 96700 + }, + { + "epoch": 20.023777284298326, + "grad_norm": 0.002741452306509018, + "learning_rate": 2.645531302845932e-05, + "loss": 0.0252, + "step": 96710 + }, + { + "epoch": 20.02383144667714, + "grad_norm": 0.7364794611930847, + "learning_rate": 2.645230400741423e-05, + "loss": 0.0223, + "step": 96720 + }, + { + "epoch": 20.02388560905595, + "grad_norm": 1.3524982929229736, + "learning_rate": 2.644929498636914e-05, + "loss": 0.0213, + "step": 96730 + }, + { + "epoch": 20.023939771434762, + "grad_norm": 0.13902278244495392, + "learning_rate": 2.6446285965324043e-05, + "loss": 0.0474, + "step": 96740 + }, + { + "epoch": 20.023993933813575, + "grad_norm": 0.001755065401084721, + "learning_rate": 2.644327694427895e-05, + "loss": 0.0732, + "step": 96750 + }, + { + "epoch": 20.024048096192384, + "grad_norm": 0.00249551166780293, + "learning_rate": 2.644026792323386e-05, + "loss": 0.0373, + "step": 96760 + }, + { + "epoch": 20.024102258571197, + "grad_norm": 0.045030031353235245, + "learning_rate": 2.643725890218876e-05, + "loss": 0.0791, + "step": 96770 + }, + { + "epoch": 20.024156420950007, + "grad_norm": 0.07057449966669083, + "learning_rate": 2.6434249881143668e-05, + "loss": 0.114, + "step": 96780 + }, + { + "epoch": 20.02421058332882, + "grad_norm": 0.0034901013132184744, + "learning_rate": 2.6431240860098578e-05, + "loss": 0.1233, + "step": 96790 + }, + { + "epoch": 20.024264745707633, + "grad_norm": 0.8684284090995789, + "learning_rate": 2.6428231839053487e-05, + "loss": 0.032, + "step": 96800 + }, + { + "epoch": 20.024318908086443, + "grad_norm": 0.00200284062884748, + "learning_rate": 2.6425222818008387e-05, + "loss": 0.0063, + "step": 96810 + }, + { + "epoch": 20.024373070465256, + "grad_norm": 0.0031470940448343754, + "learning_rate": 2.6422213796963296e-05, + "loss": 0.0486, + "step": 96820 + }, + { + "epoch": 20.024427232844065, + "grad_norm": 1.4770162105560303, + "learning_rate": 2.6419204775918206e-05, + "loss": 0.0503, + "step": 96830 + }, + { + "epoch": 20.024481395222878, + "grad_norm": 0.0019461706979200244, + "learning_rate": 2.6416195754873112e-05, + "loss": 0.0202, + "step": 96840 + }, + { + "epoch": 20.02453555760169, + "grad_norm": 0.14142398536205292, + "learning_rate": 2.6413186733828015e-05, + "loss": 0.0107, + "step": 96850 + }, + { + "epoch": 20.0245897199805, + "grad_norm": 3.9467384815216064, + "learning_rate": 2.6410177712782925e-05, + "loss": 0.0616, + "step": 96860 + }, + { + "epoch": 20.024643882359314, + "grad_norm": 0.002006291877478361, + "learning_rate": 2.640716869173783e-05, + "loss": 0.0011, + "step": 96870 + }, + { + "epoch": 20.024698044738123, + "grad_norm": 0.002447952749207616, + "learning_rate": 2.640415967069274e-05, + "loss": 0.0193, + "step": 96880 + }, + { + "epoch": 20.024752207116936, + "grad_norm": 0.6734604835510254, + "learning_rate": 2.6401150649647644e-05, + "loss": 0.0865, + "step": 96890 + }, + { + "epoch": 20.02480636949575, + "grad_norm": 0.32361307740211487, + "learning_rate": 2.6398141628602553e-05, + "loss": 0.0824, + "step": 96900 + }, + { + "epoch": 20.02486053187456, + "grad_norm": 3.9667317867279053, + "learning_rate": 2.639513260755746e-05, + "loss": 0.0954, + "step": 96910 + }, + { + "epoch": 20.024914694253372, + "grad_norm": 0.7700733542442322, + "learning_rate": 2.6392123586512363e-05, + "loss": 0.0409, + "step": 96920 + }, + { + "epoch": 20.02496885663218, + "grad_norm": 
0.0028884869534522295, + "learning_rate": 2.6389114565467272e-05, + "loss": 0.1008, + "step": 96930 + }, + { + "epoch": 20.02500135405947, + "eval_accuracy": 0.8393207054212932, + "eval_loss": 0.7182582020759583, + "eval_runtime": 117.8694, + "eval_samples_per_second": 25.978, + "eval_steps_per_second": 3.249, + "step": 96936 + }, + { + "epoch": 21.000021664951525, + "grad_norm": 0.005256430711597204, + "learning_rate": 2.638610554442218e-05, + "loss": 0.0236, + "step": 96940 + }, + { + "epoch": 21.000075827330335, + "grad_norm": 0.28877297043800354, + "learning_rate": 2.6383096523377088e-05, + "loss": 0.0033, + "step": 96950 + }, + { + "epoch": 21.000129989709148, + "grad_norm": 0.0019391259411349893, + "learning_rate": 2.638008750233199e-05, + "loss": 0.013, + "step": 96960 + }, + { + "epoch": 21.00018415208796, + "grad_norm": 0.016895968466997147, + "learning_rate": 2.6377078481286897e-05, + "loss": 0.0534, + "step": 96970 + }, + { + "epoch": 21.00023831446677, + "grad_norm": 0.002261117100715637, + "learning_rate": 2.6374069460241807e-05, + "loss": 0.0825, + "step": 96980 + }, + { + "epoch": 21.000292476845583, + "grad_norm": 0.305070698261261, + "learning_rate": 2.6371060439196717e-05, + "loss": 0.0105, + "step": 96990 + }, + { + "epoch": 21.000346639224396, + "grad_norm": 0.0019494104199111462, + "learning_rate": 2.636805141815162e-05, + "loss": 0.0045, + "step": 97000 + }, + { + "epoch": 21.000400801603206, + "grad_norm": 0.01877741515636444, + "learning_rate": 2.6365042397106526e-05, + "loss": 0.0804, + "step": 97010 + }, + { + "epoch": 21.00045496398202, + "grad_norm": 0.002138343872502446, + "learning_rate": 2.6362033376061435e-05, + "loss": 0.003, + "step": 97020 + }, + { + "epoch": 21.00050912636083, + "grad_norm": 0.00270341569557786, + "learning_rate": 2.6359024355016342e-05, + "loss": 0.0024, + "step": 97030 + }, + { + "epoch": 21.00056328873964, + "grad_norm": 2.8805384635925293, + "learning_rate": 2.6356015333971245e-05, + "loss": 0.034, + "step": 97040 + }, + { + "epoch": 21.000617451118455, + "grad_norm": 0.3010755777359009, + "learning_rate": 2.6353006312926154e-05, + "loss": 0.0663, + "step": 97050 + }, + { + "epoch": 21.000671613497264, + "grad_norm": 0.009529931470751762, + "learning_rate": 2.6349997291881064e-05, + "loss": 0.0059, + "step": 97060 + }, + { + "epoch": 21.000725775876077, + "grad_norm": 0.19636982679367065, + "learning_rate": 2.6346988270835963e-05, + "loss": 0.0788, + "step": 97070 + }, + { + "epoch": 21.000779938254887, + "grad_norm": 1.4885095357894897, + "learning_rate": 2.6343979249790873e-05, + "loss": 0.007, + "step": 97080 + }, + { + "epoch": 21.0008341006337, + "grad_norm": 0.0018719966756179929, + "learning_rate": 2.6340970228745783e-05, + "loss": 0.1381, + "step": 97090 + }, + { + "epoch": 21.000888263012513, + "grad_norm": 0.30152809619903564, + "learning_rate": 2.633796120770069e-05, + "loss": 0.0028, + "step": 97100 + }, + { + "epoch": 21.000942425391322, + "grad_norm": 0.14925625920295715, + "learning_rate": 2.6334952186655592e-05, + "loss": 0.0202, + "step": 97110 + }, + { + "epoch": 21.000996587770135, + "grad_norm": 0.0017242045141756535, + "learning_rate": 2.63319431656105e-05, + "loss": 0.0227, + "step": 97120 + }, + { + "epoch": 21.001050750148945, + "grad_norm": 0.0016924890223890543, + "learning_rate": 2.6328934144565408e-05, + "loss": 0.0275, + "step": 97130 + }, + { + "epoch": 21.001104912527758, + "grad_norm": 0.025101004168391228, + "learning_rate": 2.6325925123520318e-05, + "loss": 0.0999, + "step": 97140 + }, + { + "epoch": 
21.00115907490657, + "grad_norm": 0.00164126290474087, + "learning_rate": 2.632291610247522e-05, + "loss": 0.0866, + "step": 97150 + }, + { + "epoch": 21.00121323728538, + "grad_norm": 0.06972163915634155, + "learning_rate": 2.631990708143013e-05, + "loss": 0.0598, + "step": 97160 + }, + { + "epoch": 21.001267399664194, + "grad_norm": 0.03592763468623161, + "learning_rate": 2.6316898060385036e-05, + "loss": 0.0003, + "step": 97170 + }, + { + "epoch": 21.001321562043003, + "grad_norm": 1.9474605321884155, + "learning_rate": 2.6313889039339946e-05, + "loss": 0.1271, + "step": 97180 + }, + { + "epoch": 21.001375724421816, + "grad_norm": 0.001925988239236176, + "learning_rate": 2.631088001829485e-05, + "loss": 0.1478, + "step": 97190 + }, + { + "epoch": 21.00142988680063, + "grad_norm": 0.005174159072339535, + "learning_rate": 2.6307870997249755e-05, + "loss": 0.0021, + "step": 97200 + }, + { + "epoch": 21.00148404917944, + "grad_norm": 1.505248785018921, + "learning_rate": 2.6304861976204665e-05, + "loss": 0.0494, + "step": 97210 + }, + { + "epoch": 21.001538211558252, + "grad_norm": 0.07317541539669037, + "learning_rate": 2.6301852955159568e-05, + "loss": 0.0113, + "step": 97220 + }, + { + "epoch": 21.001592373937065, + "grad_norm": 0.0033918945118784904, + "learning_rate": 2.6298843934114474e-05, + "loss": 0.0444, + "step": 97230 + }, + { + "epoch": 21.001646536315874, + "grad_norm": 0.001965647330507636, + "learning_rate": 2.6295834913069384e-05, + "loss": 0.0778, + "step": 97240 + }, + { + "epoch": 21.001700698694687, + "grad_norm": 3.4816784858703613, + "learning_rate": 2.6292825892024293e-05, + "loss": 0.0969, + "step": 97250 + }, + { + "epoch": 21.001754861073497, + "grad_norm": 0.18013659119606018, + "learning_rate": 2.6289816870979196e-05, + "loss": 0.0605, + "step": 97260 + }, + { + "epoch": 21.00180902345231, + "grad_norm": 1.0483345985412598, + "learning_rate": 2.6286807849934102e-05, + "loss": 0.0624, + "step": 97270 + }, + { + "epoch": 21.001863185831123, + "grad_norm": 0.029979707673192024, + "learning_rate": 2.6283798828889012e-05, + "loss": 0.004, + "step": 97280 + }, + { + "epoch": 21.001917348209933, + "grad_norm": 0.0018820228287950158, + "learning_rate": 2.628078980784392e-05, + "loss": 0.0453, + "step": 97290 + }, + { + "epoch": 21.001971510588746, + "grad_norm": 0.054014384746551514, + "learning_rate": 2.627778078679882e-05, + "loss": 0.0598, + "step": 97300 + }, + { + "epoch": 21.002025672967555, + "grad_norm": 2.3739333152770996, + "learning_rate": 2.627477176575373e-05, + "loss": 0.0214, + "step": 97310 + }, + { + "epoch": 21.00207983534637, + "grad_norm": 0.001855912385508418, + "learning_rate": 2.627176274470864e-05, + "loss": 0.0017, + "step": 97320 + }, + { + "epoch": 21.00213399772518, + "grad_norm": 0.06445277482271194, + "learning_rate": 2.6268753723663547e-05, + "loss": 0.0474, + "step": 97330 + }, + { + "epoch": 21.00218816010399, + "grad_norm": 0.0018765190616250038, + "learning_rate": 2.626574470261845e-05, + "loss": 0.0653, + "step": 97340 + }, + { + "epoch": 21.002242322482804, + "grad_norm": 1.5517650842666626, + "learning_rate": 2.626273568157336e-05, + "loss": 0.0723, + "step": 97350 + }, + { + "epoch": 21.002296484861613, + "grad_norm": 0.0019433615962043405, + "learning_rate": 2.6259726660528266e-05, + "loss": 0.093, + "step": 97360 + }, + { + "epoch": 21.002350647240426, + "grad_norm": 0.0031474425923079252, + "learning_rate": 2.625671763948317e-05, + "loss": 0.0342, + "step": 97370 + }, + { + "epoch": 21.00240480961924, + "grad_norm": 
9.67113208770752, + "learning_rate": 2.6253708618438078e-05, + "loss": 0.103, + "step": 97380 + }, + { + "epoch": 21.00245897199805, + "grad_norm": 0.0019725074525922537, + "learning_rate": 2.6250699597392985e-05, + "loss": 0.0395, + "step": 97390 + }, + { + "epoch": 21.002513134376862, + "grad_norm": 0.16251085698604584, + "learning_rate": 2.6247690576347894e-05, + "loss": 0.004, + "step": 97400 + }, + { + "epoch": 21.002567296755675, + "grad_norm": 0.8009486198425293, + "learning_rate": 2.6244681555302797e-05, + "loss": 0.0392, + "step": 97410 + }, + { + "epoch": 21.002621459134485, + "grad_norm": 0.0020827469415962696, + "learning_rate": 2.6241672534257707e-05, + "loss": 0.0746, + "step": 97420 + }, + { + "epoch": 21.002675621513298, + "grad_norm": 0.00191681704018265, + "learning_rate": 2.6238663513212613e-05, + "loss": 0.0493, + "step": 97430 + }, + { + "epoch": 21.002729783892107, + "grad_norm": 1.1241304874420166, + "learning_rate": 2.6235654492167523e-05, + "loss": 0.0191, + "step": 97440 + }, + { + "epoch": 21.00278394627092, + "grad_norm": 1.0705739259719849, + "learning_rate": 2.6232645471122426e-05, + "loss": 0.0655, + "step": 97450 + }, + { + "epoch": 21.002838108649733, + "grad_norm": 0.02639171853661537, + "learning_rate": 2.6229636450077332e-05, + "loss": 0.0644, + "step": 97460 + }, + { + "epoch": 21.002892271028543, + "grad_norm": 0.21300628781318665, + "learning_rate": 2.622662742903224e-05, + "loss": 0.0693, + "step": 97470 + }, + { + "epoch": 21.002946433407356, + "grad_norm": 0.00237604440189898, + "learning_rate": 2.6223618407987148e-05, + "loss": 0.0041, + "step": 97480 + }, + { + "epoch": 21.003000595786165, + "grad_norm": 0.0021626874804496765, + "learning_rate": 2.622060938694205e-05, + "loss": 0.0422, + "step": 97490 + }, + { + "epoch": 21.00305475816498, + "grad_norm": 0.002958860481157899, + "learning_rate": 2.621760036589696e-05, + "loss": 0.0821, + "step": 97500 + }, + { + "epoch": 21.00310892054379, + "grad_norm": 0.023852013051509857, + "learning_rate": 2.621459134485187e-05, + "loss": 0.0172, + "step": 97510 + }, + { + "epoch": 21.0031630829226, + "grad_norm": 0.0020415412727743387, + "learning_rate": 2.6211582323806773e-05, + "loss": 0.0805, + "step": 97520 + }, + { + "epoch": 21.003217245301414, + "grad_norm": 0.0024309465661644936, + "learning_rate": 2.620857330276168e-05, + "loss": 0.056, + "step": 97530 + }, + { + "epoch": 21.003271407680224, + "grad_norm": 10.572484016418457, + "learning_rate": 2.620556428171659e-05, + "loss": 0.1742, + "step": 97540 + }, + { + "epoch": 21.003325570059037, + "grad_norm": 0.7713026404380798, + "learning_rate": 2.6202555260671495e-05, + "loss": 0.0579, + "step": 97550 + }, + { + "epoch": 21.00337973243785, + "grad_norm": 0.0019965562969446182, + "learning_rate": 2.6199546239626398e-05, + "loss": 0.0299, + "step": 97560 + }, + { + "epoch": 21.00343389481666, + "grad_norm": 0.0031781531870365143, + "learning_rate": 2.6196537218581308e-05, + "loss": 0.0283, + "step": 97570 + }, + { + "epoch": 21.003488057195472, + "grad_norm": 0.05634618178009987, + "learning_rate": 2.6193528197536217e-05, + "loss": 0.0192, + "step": 97580 + }, + { + "epoch": 21.003542219574285, + "grad_norm": 0.0032531649339944124, + "learning_rate": 2.6190519176491124e-05, + "loss": 0.0775, + "step": 97590 + }, + { + "epoch": 21.003596381953095, + "grad_norm": 0.001952708000317216, + "learning_rate": 2.6187510155446026e-05, + "loss": 0.0663, + "step": 97600 + }, + { + "epoch": 21.003650544331908, + "grad_norm": 0.5336933135986328, + "learning_rate": 
2.6184501134400936e-05, + "loss": 0.2317, + "step": 97610 + }, + { + "epoch": 21.003704706710717, + "grad_norm": 0.002182829426601529, + "learning_rate": 2.6181492113355842e-05, + "loss": 0.0222, + "step": 97620 + }, + { + "epoch": 21.00375886908953, + "grad_norm": 0.4130668044090271, + "learning_rate": 2.6178483092310752e-05, + "loss": 0.0367, + "step": 97630 + }, + { + "epoch": 21.003813031468344, + "grad_norm": 1.0761494636535645, + "learning_rate": 2.6175474071265655e-05, + "loss": 0.0257, + "step": 97640 + }, + { + "epoch": 21.003867193847153, + "grad_norm": 0.1021333783864975, + "learning_rate": 2.617246505022056e-05, + "loss": 0.0196, + "step": 97650 + }, + { + "epoch": 21.003921356225966, + "grad_norm": 0.001992648234590888, + "learning_rate": 2.616945602917547e-05, + "loss": 0.0146, + "step": 97660 + }, + { + "epoch": 21.003975518604776, + "grad_norm": 0.19405673444271088, + "learning_rate": 2.6166447008130374e-05, + "loss": 0.1399, + "step": 97670 + }, + { + "epoch": 21.00402968098359, + "grad_norm": 0.04988935589790344, + "learning_rate": 2.6163437987085283e-05, + "loss": 0.0446, + "step": 97680 + }, + { + "epoch": 21.0040838433624, + "grad_norm": 0.07018885016441345, + "learning_rate": 2.616042896604019e-05, + "loss": 0.0504, + "step": 97690 + }, + { + "epoch": 21.00413800574121, + "grad_norm": 0.004257873632013798, + "learning_rate": 2.61574199449951e-05, + "loss": 0.0766, + "step": 97700 + }, + { + "epoch": 21.004192168120024, + "grad_norm": 0.05968055501580238, + "learning_rate": 2.6154410923950002e-05, + "loss": 0.0124, + "step": 97710 + }, + { + "epoch": 21.004246330498834, + "grad_norm": 0.0029054274782538414, + "learning_rate": 2.615140190290491e-05, + "loss": 0.1014, + "step": 97720 + }, + { + "epoch": 21.004300492877647, + "grad_norm": 0.05353769659996033, + "learning_rate": 2.6148392881859818e-05, + "loss": 0.0765, + "step": 97730 + }, + { + "epoch": 21.00435465525646, + "grad_norm": 3.969069719314575, + "learning_rate": 2.6145383860814724e-05, + "loss": 0.0594, + "step": 97740 + }, + { + "epoch": 21.00440881763527, + "grad_norm": 0.05975166708230972, + "learning_rate": 2.6142374839769627e-05, + "loss": 0.0151, + "step": 97750 + }, + { + "epoch": 21.004462980014083, + "grad_norm": 2.8060076236724854, + "learning_rate": 2.6139365818724537e-05, + "loss": 0.0516, + "step": 97760 + }, + { + "epoch": 21.004517142392896, + "grad_norm": 0.010894769802689552, + "learning_rate": 2.6136356797679447e-05, + "loss": 0.0251, + "step": 97770 + }, + { + "epoch": 21.004571304771705, + "grad_norm": 0.47182267904281616, + "learning_rate": 2.6133347776634353e-05, + "loss": 0.0608, + "step": 97780 + }, + { + "epoch": 21.004625467150518, + "grad_norm": 2.27372407913208, + "learning_rate": 2.6130338755589256e-05, + "loss": 0.1725, + "step": 97790 + }, + { + "epoch": 21.004679629529328, + "grad_norm": 0.01636236160993576, + "learning_rate": 2.6127329734544166e-05, + "loss": 0.0445, + "step": 97800 + }, + { + "epoch": 21.00473379190814, + "grad_norm": 1.5963480472564697, + "learning_rate": 2.6124320713499072e-05, + "loss": 0.0249, + "step": 97810 + }, + { + "epoch": 21.004787954286954, + "grad_norm": 0.7261400818824768, + "learning_rate": 2.6121311692453975e-05, + "loss": 0.0989, + "step": 97820 + }, + { + "epoch": 21.004842116665763, + "grad_norm": 0.0024093242827802896, + "learning_rate": 2.6118302671408884e-05, + "loss": 0.0306, + "step": 97830 + }, + { + "epoch": 21.004896279044576, + "grad_norm": 0.010622064583003521, + "learning_rate": 2.6115293650363794e-05, + "loss": 0.0149, + "step": 
97840 + }, + { + "epoch": 21.004950441423386, + "grad_norm": 3.586791515350342, + "learning_rate": 2.61122846293187e-05, + "loss": 0.1143, + "step": 97850 + }, + { + "epoch": 21.0050046038022, + "grad_norm": 0.0022460634354501963, + "learning_rate": 2.6109275608273603e-05, + "loss": 0.0289, + "step": 97860 + }, + { + "epoch": 21.005058766181012, + "grad_norm": 0.0019798199646174908, + "learning_rate": 2.6106266587228513e-05, + "loss": 0.0044, + "step": 97870 + }, + { + "epoch": 21.00511292855982, + "grad_norm": 0.319415807723999, + "learning_rate": 2.610325756618342e-05, + "loss": 0.0849, + "step": 97880 + }, + { + "epoch": 21.005167090938635, + "grad_norm": 0.35820338129997253, + "learning_rate": 2.610024854513833e-05, + "loss": 0.026, + "step": 97890 + }, + { + "epoch": 21.005221253317444, + "grad_norm": 0.0019043234642595053, + "learning_rate": 2.609723952409323e-05, + "loss": 0.0207, + "step": 97900 + }, + { + "epoch": 21.005275415696257, + "grad_norm": 0.0024290597066283226, + "learning_rate": 2.6094230503048138e-05, + "loss": 0.084, + "step": 97910 + }, + { + "epoch": 21.00532957807507, + "grad_norm": 1.4574390649795532, + "learning_rate": 2.6091221482003048e-05, + "loss": 0.023, + "step": 97920 + }, + { + "epoch": 21.00538374045388, + "grad_norm": 0.6285720467567444, + "learning_rate": 2.6088212460957957e-05, + "loss": 0.0437, + "step": 97930 + }, + { + "epoch": 21.005437902832693, + "grad_norm": 0.0018049426143988967, + "learning_rate": 2.608520343991286e-05, + "loss": 0.0507, + "step": 97940 + }, + { + "epoch": 21.005492065211506, + "grad_norm": 0.2127385139465332, + "learning_rate": 2.6082194418867766e-05, + "loss": 0.0175, + "step": 97950 + }, + { + "epoch": 21.005546227590315, + "grad_norm": 0.35145339369773865, + "learning_rate": 2.6079185397822676e-05, + "loss": 0.1728, + "step": 97960 + }, + { + "epoch": 21.00560038996913, + "grad_norm": 0.0022757514379918575, + "learning_rate": 2.607617637677758e-05, + "loss": 0.0256, + "step": 97970 + }, + { + "epoch": 21.005654552347938, + "grad_norm": 0.004268258344382048, + "learning_rate": 2.6073167355732485e-05, + "loss": 0.0669, + "step": 97980 + }, + { + "epoch": 21.00570871472675, + "grad_norm": 0.7567241787910461, + "learning_rate": 2.6070158334687395e-05, + "loss": 0.0427, + "step": 97990 + }, + { + "epoch": 21.005762877105564, + "grad_norm": 0.13003231585025787, + "learning_rate": 2.60671493136423e-05, + "loss": 0.0206, + "step": 98000 + }, + { + "epoch": 21.005817039484374, + "grad_norm": 0.011407604441046715, + "learning_rate": 2.6064140292597204e-05, + "loss": 0.0462, + "step": 98010 + }, + { + "epoch": 21.005871201863187, + "grad_norm": 0.0024322245735675097, + "learning_rate": 2.6061131271552114e-05, + "loss": 0.0034, + "step": 98020 + }, + { + "epoch": 21.005925364241996, + "grad_norm": 1.940609335899353, + "learning_rate": 2.6058122250507023e-05, + "loss": 0.0486, + "step": 98030 + }, + { + "epoch": 21.00597952662081, + "grad_norm": 0.0020640408620238304, + "learning_rate": 2.605511322946193e-05, + "loss": 0.0041, + "step": 98040 + }, + { + "epoch": 21.006033688999622, + "grad_norm": 0.002006355905905366, + "learning_rate": 2.6052104208416833e-05, + "loss": 0.0014, + "step": 98050 + }, + { + "epoch": 21.00608785137843, + "grad_norm": 0.00203239475376904, + "learning_rate": 2.6049095187371742e-05, + "loss": 0.2193, + "step": 98060 + }, + { + "epoch": 21.006142013757245, + "grad_norm": 0.3297191560268402, + "learning_rate": 2.604608616632665e-05, + "loss": 0.1406, + "step": 98070 + }, + { + "epoch": 21.006196176136054, + 
"grad_norm": 0.9842365980148315, + "learning_rate": 2.6043077145281558e-05, + "loss": 0.0118, + "step": 98080 + }, + { + "epoch": 21.006250338514867, + "grad_norm": 0.0025180119555443525, + "learning_rate": 2.604006812423646e-05, + "loss": 0.0992, + "step": 98090 + }, + { + "epoch": 21.00630450089368, + "grad_norm": 0.02455550618469715, + "learning_rate": 2.603705910319137e-05, + "loss": 0.0733, + "step": 98100 + }, + { + "epoch": 21.00635866327249, + "grad_norm": 0.11519995331764221, + "learning_rate": 2.6034050082146277e-05, + "loss": 0.0073, + "step": 98110 + }, + { + "epoch": 21.006412825651303, + "grad_norm": 0.0035064872354269028, + "learning_rate": 2.603104106110118e-05, + "loss": 0.0428, + "step": 98120 + }, + { + "epoch": 21.006466988030116, + "grad_norm": 0.9895692467689514, + "learning_rate": 2.602803204005609e-05, + "loss": 0.1567, + "step": 98130 + }, + { + "epoch": 21.006521150408926, + "grad_norm": 0.15430708229541779, + "learning_rate": 2.6025023019010996e-05, + "loss": 0.0808, + "step": 98140 + }, + { + "epoch": 21.00657531278774, + "grad_norm": 0.07383282482624054, + "learning_rate": 2.6022013997965905e-05, + "loss": 0.0404, + "step": 98150 + }, + { + "epoch": 21.006629475166548, + "grad_norm": 0.5461089015007019, + "learning_rate": 2.601900497692081e-05, + "loss": 0.064, + "step": 98160 + }, + { + "epoch": 21.00668363754536, + "grad_norm": 0.03282592073082924, + "learning_rate": 2.6015995955875715e-05, + "loss": 0.0639, + "step": 98170 + }, + { + "epoch": 21.006737799924174, + "grad_norm": 0.022575240582227707, + "learning_rate": 2.6012986934830624e-05, + "loss": 0.0073, + "step": 98180 + }, + { + "epoch": 21.006791962302984, + "grad_norm": 0.006096336990594864, + "learning_rate": 2.6009977913785534e-05, + "loss": 0.1429, + "step": 98190 + }, + { + "epoch": 21.006846124681797, + "grad_norm": 0.15704891085624695, + "learning_rate": 2.6006968892740437e-05, + "loss": 0.1051, + "step": 98200 + }, + { + "epoch": 21.006900287060606, + "grad_norm": 0.05349527299404144, + "learning_rate": 2.6003959871695343e-05, + "loss": 0.1014, + "step": 98210 + }, + { + "epoch": 21.00695444943942, + "grad_norm": 0.13550551235675812, + "learning_rate": 2.6000950850650253e-05, + "loss": 0.043, + "step": 98220 + }, + { + "epoch": 21.007008611818232, + "grad_norm": 0.3956553637981415, + "learning_rate": 2.599794182960516e-05, + "loss": 0.1648, + "step": 98230 + }, + { + "epoch": 21.007062774197042, + "grad_norm": 0.0038412127178162336, + "learning_rate": 2.5994932808560062e-05, + "loss": 0.0008, + "step": 98240 + }, + { + "epoch": 21.007116936575855, + "grad_norm": 0.030327364802360535, + "learning_rate": 2.599192378751497e-05, + "loss": 0.0297, + "step": 98250 + }, + { + "epoch": 21.007171098954665, + "grad_norm": 0.06246018037199974, + "learning_rate": 2.5988914766469878e-05, + "loss": 0.0303, + "step": 98260 + }, + { + "epoch": 21.007225261333478, + "grad_norm": 0.0033455851953476667, + "learning_rate": 2.598590574542478e-05, + "loss": 0.0095, + "step": 98270 + }, + { + "epoch": 21.00727942371229, + "grad_norm": 0.9269759654998779, + "learning_rate": 2.598289672437969e-05, + "loss": 0.014, + "step": 98280 + }, + { + "epoch": 21.0073335860911, + "grad_norm": 4.130246639251709, + "learning_rate": 2.59798877033346e-05, + "loss": 0.1217, + "step": 98290 + }, + { + "epoch": 21.007387748469913, + "grad_norm": 0.7852204442024231, + "learning_rate": 2.5976878682289506e-05, + "loss": 0.0178, + "step": 98300 + }, + { + "epoch": 21.007441910848726, + "grad_norm": 1.2060966491699219, + "learning_rate": 
2.597386966124441e-05, + "loss": 0.0505, + "step": 98310 + }, + { + "epoch": 21.007496073227536, + "grad_norm": 0.8086978197097778, + "learning_rate": 2.597086064019932e-05, + "loss": 0.076, + "step": 98320 + }, + { + "epoch": 21.00755023560635, + "grad_norm": 2.208083391189575, + "learning_rate": 2.5967851619154225e-05, + "loss": 0.0817, + "step": 98330 + }, + { + "epoch": 21.00760439798516, + "grad_norm": 0.12476234883069992, + "learning_rate": 2.5964842598109135e-05, + "loss": 0.1197, + "step": 98340 + }, + { + "epoch": 21.00765856036397, + "grad_norm": 0.008586226962506771, + "learning_rate": 2.5961833577064038e-05, + "loss": 0.0875, + "step": 98350 + }, + { + "epoch": 21.007712722742784, + "grad_norm": 0.0027545636985450983, + "learning_rate": 2.5958824556018947e-05, + "loss": 0.0315, + "step": 98360 + }, + { + "epoch": 21.007766885121594, + "grad_norm": 0.15213090181350708, + "learning_rate": 2.5955815534973854e-05, + "loss": 0.03, + "step": 98370 + }, + { + "epoch": 21.007821047500407, + "grad_norm": 3.241919755935669, + "learning_rate": 2.5952806513928763e-05, + "loss": 0.0377, + "step": 98380 + }, + { + "epoch": 21.007875209879217, + "grad_norm": 0.11649468541145325, + "learning_rate": 2.5949797492883666e-05, + "loss": 0.0767, + "step": 98390 + }, + { + "epoch": 21.00792937225803, + "grad_norm": 1.389238715171814, + "learning_rate": 2.5946788471838572e-05, + "loss": 0.0717, + "step": 98400 + }, + { + "epoch": 21.007983534636843, + "grad_norm": 0.43055954575538635, + "learning_rate": 2.5943779450793482e-05, + "loss": 0.0212, + "step": 98410 + }, + { + "epoch": 21.008037697015652, + "grad_norm": 0.0023620619904249907, + "learning_rate": 2.5940770429748385e-05, + "loss": 0.0103, + "step": 98420 + }, + { + "epoch": 21.008091859394465, + "grad_norm": 0.06963256746530533, + "learning_rate": 2.593776140870329e-05, + "loss": 0.0039, + "step": 98430 + }, + { + "epoch": 21.008146021773275, + "grad_norm": 0.04657465219497681, + "learning_rate": 2.59347523876582e-05, + "loss": 0.0579, + "step": 98440 + }, + { + "epoch": 21.008200184152088, + "grad_norm": 0.0022962663788348436, + "learning_rate": 2.593174336661311e-05, + "loss": 0.064, + "step": 98450 + }, + { + "epoch": 21.0082543465309, + "grad_norm": 0.9327406883239746, + "learning_rate": 2.5928734345568014e-05, + "loss": 0.0606, + "step": 98460 + }, + { + "epoch": 21.00830850890971, + "grad_norm": 1.7022364139556885, + "learning_rate": 2.592572532452292e-05, + "loss": 0.0384, + "step": 98470 + }, + { + "epoch": 21.008362671288523, + "grad_norm": 2.8788375854492188, + "learning_rate": 2.592271630347783e-05, + "loss": 0.0848, + "step": 98480 + }, + { + "epoch": 21.008416833667333, + "grad_norm": 1.9177842140197754, + "learning_rate": 2.5919707282432736e-05, + "loss": 0.064, + "step": 98490 + }, + { + "epoch": 21.008470996046146, + "grad_norm": 0.00512679060921073, + "learning_rate": 2.591669826138764e-05, + "loss": 0.0961, + "step": 98500 + }, + { + "epoch": 21.00852515842496, + "grad_norm": 0.002605549991130829, + "learning_rate": 2.5913689240342548e-05, + "loss": 0.0351, + "step": 98510 + }, + { + "epoch": 21.00857932080377, + "grad_norm": 0.052205126732587814, + "learning_rate": 2.5910680219297458e-05, + "loss": 0.107, + "step": 98520 + }, + { + "epoch": 21.00863348318258, + "grad_norm": 0.002272265497595072, + "learning_rate": 2.5907671198252364e-05, + "loss": 0.0033, + "step": 98530 + }, + { + "epoch": 21.008687645561395, + "grad_norm": 2.5939269065856934, + "learning_rate": 2.5904662177207267e-05, + "loss": 0.1396, + "step": 98540 + }, 
+ { + "epoch": 21.008741807940204, + "grad_norm": 0.059891022741794586, + "learning_rate": 2.5901653156162177e-05, + "loss": 0.0939, + "step": 98550 + }, + { + "epoch": 21.008795970319017, + "grad_norm": 0.0033932882361114025, + "learning_rate": 2.5898644135117083e-05, + "loss": 0.0672, + "step": 98560 + }, + { + "epoch": 21.008850132697827, + "grad_norm": 0.1603430062532425, + "learning_rate": 2.5895635114071986e-05, + "loss": 0.0587, + "step": 98570 + }, + { + "epoch": 21.00890429507664, + "grad_norm": 0.20548245310783386, + "learning_rate": 2.5892626093026896e-05, + "loss": 0.0436, + "step": 98580 + }, + { + "epoch": 21.008958457455453, + "grad_norm": 0.14678455889225006, + "learning_rate": 2.5889617071981802e-05, + "loss": 0.0359, + "step": 98590 + }, + { + "epoch": 21.009012619834262, + "grad_norm": 1.619713544845581, + "learning_rate": 2.588660805093671e-05, + "loss": 0.1139, + "step": 98600 + }, + { + "epoch": 21.009066782213075, + "grad_norm": 0.6171128153800964, + "learning_rate": 2.5883599029891614e-05, + "loss": 0.1086, + "step": 98610 + }, + { + "epoch": 21.009120944591885, + "grad_norm": 0.037877507507801056, + "learning_rate": 2.5880590008846524e-05, + "loss": 0.1418, + "step": 98620 + }, + { + "epoch": 21.009175106970698, + "grad_norm": 0.3497214615345001, + "learning_rate": 2.587758098780143e-05, + "loss": 0.0489, + "step": 98630 + }, + { + "epoch": 21.00922926934951, + "grad_norm": 0.22318235039710999, + "learning_rate": 2.587457196675634e-05, + "loss": 0.0136, + "step": 98640 + }, + { + "epoch": 21.00928343172832, + "grad_norm": 0.6283579468727112, + "learning_rate": 2.5871562945711243e-05, + "loss": 0.0605, + "step": 98650 + }, + { + "epoch": 21.009337594107134, + "grad_norm": 0.004928564187139273, + "learning_rate": 2.586855392466615e-05, + "loss": 0.0239, + "step": 98660 + }, + { + "epoch": 21.009391756485943, + "grad_norm": 8.36874771118164, + "learning_rate": 2.586554490362106e-05, + "loss": 0.0829, + "step": 98670 + }, + { + "epoch": 21.009445918864756, + "grad_norm": 0.004727130755782127, + "learning_rate": 2.5862535882575965e-05, + "loss": 0.0016, + "step": 98680 + }, + { + "epoch": 21.00950008124357, + "grad_norm": 0.003969965968281031, + "learning_rate": 2.5859526861530868e-05, + "loss": 0.0393, + "step": 98690 + }, + { + "epoch": 21.00955424362238, + "grad_norm": 0.06736376136541367, + "learning_rate": 2.5856517840485778e-05, + "loss": 0.0643, + "step": 98700 + }, + { + "epoch": 21.009608406001192, + "grad_norm": 0.006445087026804686, + "learning_rate": 2.5853508819440687e-05, + "loss": 0.0189, + "step": 98710 + }, + { + "epoch": 21.009662568380005, + "grad_norm": 0.0056525482796132565, + "learning_rate": 2.585049979839559e-05, + "loss": 0.0701, + "step": 98720 + }, + { + "epoch": 21.009716730758814, + "grad_norm": 0.3849724233150482, + "learning_rate": 2.5847490777350496e-05, + "loss": 0.0025, + "step": 98730 + }, + { + "epoch": 21.009770893137627, + "grad_norm": 0.0020280585158616304, + "learning_rate": 2.5844481756305406e-05, + "loss": 0.03, + "step": 98740 + }, + { + "epoch": 21.009825055516437, + "grad_norm": 0.0021052504889667034, + "learning_rate": 2.5841472735260312e-05, + "loss": 0.0087, + "step": 98750 + }, + { + "epoch": 21.00987921789525, + "grad_norm": 0.27675819396972656, + "learning_rate": 2.5838463714215215e-05, + "loss": 0.0582, + "step": 98760 + }, + { + "epoch": 21.009933380274063, + "grad_norm": 0.1743270605802536, + "learning_rate": 2.5835454693170125e-05, + "loss": 0.0353, + "step": 98770 + }, + { + "epoch": 21.009987542652873, + 
"grad_norm": 4.160787582397461, + "learning_rate": 2.5832445672125035e-05, + "loss": 0.0606, + "step": 98780 + }, + { + "epoch": 21.010041705031686, + "grad_norm": 0.5753592252731323, + "learning_rate": 2.582943665107994e-05, + "loss": 0.0012, + "step": 98790 + }, + { + "epoch": 21.010095867410495, + "grad_norm": 1.5988081693649292, + "learning_rate": 2.5826427630034844e-05, + "loss": 0.0529, + "step": 98800 + }, + { + "epoch": 21.01015002978931, + "grad_norm": 0.015602223575115204, + "learning_rate": 2.5823418608989753e-05, + "loss": 0.0735, + "step": 98810 + }, + { + "epoch": 21.01020419216812, + "grad_norm": 0.038965191692113876, + "learning_rate": 2.582040958794466e-05, + "loss": 0.0888, + "step": 98820 + }, + { + "epoch": 21.01025835454693, + "grad_norm": 0.007964706048369408, + "learning_rate": 2.581740056689957e-05, + "loss": 0.0279, + "step": 98830 + }, + { + "epoch": 21.010312516925744, + "grad_norm": 0.6690701842308044, + "learning_rate": 2.5814391545854472e-05, + "loss": 0.0386, + "step": 98840 + }, + { + "epoch": 21.010366679304553, + "grad_norm": 0.06738877296447754, + "learning_rate": 2.581138252480938e-05, + "loss": 0.0105, + "step": 98850 + }, + { + "epoch": 21.010420841683366, + "grad_norm": 0.014012674801051617, + "learning_rate": 2.5808373503764288e-05, + "loss": 0.0421, + "step": 98860 + }, + { + "epoch": 21.01047500406218, + "grad_norm": 0.08192218840122223, + "learning_rate": 2.580536448271919e-05, + "loss": 0.0287, + "step": 98870 + }, + { + "epoch": 21.01052916644099, + "grad_norm": 0.00638024415820837, + "learning_rate": 2.58023554616741e-05, + "loss": 0.0063, + "step": 98880 + }, + { + "epoch": 21.010583328819802, + "grad_norm": 0.002082273829728365, + "learning_rate": 2.5799346440629007e-05, + "loss": 0.0048, + "step": 98890 + }, + { + "epoch": 21.010637491198615, + "grad_norm": 0.23803092539310455, + "learning_rate": 2.5796337419583917e-05, + "loss": 0.0325, + "step": 98900 + }, + { + "epoch": 21.010691653577425, + "grad_norm": 0.10448542982339859, + "learning_rate": 2.579332839853882e-05, + "loss": 0.0376, + "step": 98910 + }, + { + "epoch": 21.010745815956238, + "grad_norm": 0.01012720912694931, + "learning_rate": 2.5790319377493726e-05, + "loss": 0.0401, + "step": 98920 + }, + { + "epoch": 21.010799978335047, + "grad_norm": 0.003615354187786579, + "learning_rate": 2.5787310356448636e-05, + "loss": 0.0095, + "step": 98930 + }, + { + "epoch": 21.01085414071386, + "grad_norm": 0.0021807083394378424, + "learning_rate": 2.5784301335403542e-05, + "loss": 0.1494, + "step": 98940 + }, + { + "epoch": 21.010908303092673, + "grad_norm": 0.0022137269843369722, + "learning_rate": 2.5781292314358445e-05, + "loss": 0.034, + "step": 98950 + }, + { + "epoch": 21.010962465471483, + "grad_norm": 0.09951620548963547, + "learning_rate": 2.5778283293313354e-05, + "loss": 0.0612, + "step": 98960 + }, + { + "epoch": 21.011016627850296, + "grad_norm": 0.2409048229455948, + "learning_rate": 2.5775274272268264e-05, + "loss": 0.0454, + "step": 98970 + }, + { + "epoch": 21.011070790229105, + "grad_norm": 0.02159353345632553, + "learning_rate": 2.577226525122317e-05, + "loss": 0.1133, + "step": 98980 + }, + { + "epoch": 21.01112495260792, + "grad_norm": 0.3253210484981537, + "learning_rate": 2.5769256230178073e-05, + "loss": 0.0422, + "step": 98990 + }, + { + "epoch": 21.01117911498673, + "grad_norm": 1.5191028118133545, + "learning_rate": 2.5766247209132983e-05, + "loss": 0.0168, + "step": 99000 + }, + { + "epoch": 21.01123327736554, + "grad_norm": 0.02819526195526123, + "learning_rate": 
2.576323818808789e-05, + "loss": 0.0076, + "step": 99010 + }, + { + "epoch": 21.011287439744354, + "grad_norm": 0.0017772208666428924, + "learning_rate": 2.5760229167042792e-05, + "loss": 0.0012, + "step": 99020 + }, + { + "epoch": 21.011341602123164, + "grad_norm": 0.07698265463113785, + "learning_rate": 2.57572201459977e-05, + "loss": 0.0899, + "step": 99030 + }, + { + "epoch": 21.011395764501977, + "grad_norm": 0.2864196300506592, + "learning_rate": 2.575421112495261e-05, + "loss": 0.0913, + "step": 99040 + }, + { + "epoch": 21.01144992688079, + "grad_norm": 0.0017490703612565994, + "learning_rate": 2.5751202103907518e-05, + "loss": 0.03, + "step": 99050 + }, + { + "epoch": 21.0115040892596, + "grad_norm": 0.14595554769039154, + "learning_rate": 2.574819308286242e-05, + "loss": 0.0101, + "step": 99060 + }, + { + "epoch": 21.011558251638412, + "grad_norm": 2.525158643722534, + "learning_rate": 2.574518406181733e-05, + "loss": 0.025, + "step": 99070 + }, + { + "epoch": 21.011612414017225, + "grad_norm": 0.001630927436053753, + "learning_rate": 2.5742175040772236e-05, + "loss": 0.0633, + "step": 99080 + }, + { + "epoch": 21.011666576396035, + "grad_norm": 0.001677106018178165, + "learning_rate": 2.5739166019727146e-05, + "loss": 0.0355, + "step": 99090 + }, + { + "epoch": 21.011720738774848, + "grad_norm": 0.0016231030458584428, + "learning_rate": 2.573615699868205e-05, + "loss": 0.0333, + "step": 99100 + }, + { + "epoch": 21.011774901153657, + "grad_norm": 0.0026709914673119783, + "learning_rate": 2.5733147977636955e-05, + "loss": 0.0481, + "step": 99110 + }, + { + "epoch": 21.01182906353247, + "grad_norm": 0.030059849843382835, + "learning_rate": 2.5730138956591865e-05, + "loss": 0.059, + "step": 99120 + }, + { + "epoch": 21.011883225911284, + "grad_norm": 0.5928305983543396, + "learning_rate": 2.5727129935546775e-05, + "loss": 0.0648, + "step": 99130 + }, + { + "epoch": 21.011937388290093, + "grad_norm": 0.24618637561798096, + "learning_rate": 2.5724120914501677e-05, + "loss": 0.1398, + "step": 99140 + }, + { + "epoch": 21.011991550668906, + "grad_norm": 3.58597993850708, + "learning_rate": 2.5721111893456584e-05, + "loss": 0.0243, + "step": 99150 + }, + { + "epoch": 21.012045713047716, + "grad_norm": 4.158473968505859, + "learning_rate": 2.5718102872411493e-05, + "loss": 0.0778, + "step": 99160 + }, + { + "epoch": 21.01209987542653, + "grad_norm": 0.0020461673848330975, + "learning_rate": 2.5715093851366396e-05, + "loss": 0.1603, + "step": 99170 + }, + { + "epoch": 21.01215403780534, + "grad_norm": 0.0357590951025486, + "learning_rate": 2.5712084830321303e-05, + "loss": 0.0026, + "step": 99180 + }, + { + "epoch": 21.01220820018415, + "grad_norm": 0.0025135590694844723, + "learning_rate": 2.5709075809276212e-05, + "loss": 0.0132, + "step": 99190 + }, + { + "epoch": 21.012262362562964, + "grad_norm": 0.02231612242758274, + "learning_rate": 2.570606678823112e-05, + "loss": 0.009, + "step": 99200 + }, + { + "epoch": 21.012316524941774, + "grad_norm": 0.0637345090508461, + "learning_rate": 2.570305776718602e-05, + "loss": 0.0706, + "step": 99210 + }, + { + "epoch": 21.012370687320587, + "grad_norm": 0.023610197007656097, + "learning_rate": 2.570004874614093e-05, + "loss": 0.1975, + "step": 99220 + }, + { + "epoch": 21.0124248496994, + "grad_norm": 0.16773080825805664, + "learning_rate": 2.569703972509584e-05, + "loss": 0.0278, + "step": 99230 + }, + { + "epoch": 21.01247901207821, + "grad_norm": 0.5404071807861328, + "learning_rate": 2.5694030704050747e-05, + "loss": 0.0103, + "step": 99240 
+ }, + { + "epoch": 21.012533174457023, + "grad_norm": 0.11702386289834976, + "learning_rate": 2.569102168300565e-05, + "loss": 0.0891, + "step": 99250 + }, + { + "epoch": 21.012587336835836, + "grad_norm": 1.11049485206604, + "learning_rate": 2.568801266196056e-05, + "loss": 0.0766, + "step": 99260 + }, + { + "epoch": 21.012641499214645, + "grad_norm": 0.07669618725776672, + "learning_rate": 2.5685003640915466e-05, + "loss": 0.0195, + "step": 99270 + }, + { + "epoch": 21.012695661593458, + "grad_norm": 0.5769846439361572, + "learning_rate": 2.5681994619870375e-05, + "loss": 0.0066, + "step": 99280 + }, + { + "epoch": 21.012749823972268, + "grad_norm": 1.0491893291473389, + "learning_rate": 2.567898559882528e-05, + "loss": 0.0698, + "step": 99290 + }, + { + "epoch": 21.01280398635108, + "grad_norm": 2.577437400817871, + "learning_rate": 2.5675976577780188e-05, + "loss": 0.037, + "step": 99300 + }, + { + "epoch": 21.012858148729894, + "grad_norm": 0.0025105420500040054, + "learning_rate": 2.5672967556735094e-05, + "loss": 0.0163, + "step": 99310 + }, + { + "epoch": 21.012912311108703, + "grad_norm": 0.0019028750248253345, + "learning_rate": 2.5669958535689997e-05, + "loss": 0.0118, + "step": 99320 + }, + { + "epoch": 21.012966473487516, + "grad_norm": 0.14816084504127502, + "learning_rate": 2.5666949514644907e-05, + "loss": 0.0156, + "step": 99330 + }, + { + "epoch": 21.013020635866326, + "grad_norm": 0.001711381133645773, + "learning_rate": 2.5663940493599813e-05, + "loss": 0.0119, + "step": 99340 + }, + { + "epoch": 21.01307479824514, + "grad_norm": 0.0016435772413387895, + "learning_rate": 2.5660931472554723e-05, + "loss": 0.0144, + "step": 99350 + }, + { + "epoch": 21.013128960623952, + "grad_norm": 2.0387814044952393, + "learning_rate": 2.5657922451509626e-05, + "loss": 0.0703, + "step": 99360 + }, + { + "epoch": 21.01318312300276, + "grad_norm": 0.007997160777449608, + "learning_rate": 2.5654913430464532e-05, + "loss": 0.0141, + "step": 99370 + }, + { + "epoch": 21.013237285381575, + "grad_norm": 0.21867403388023376, + "learning_rate": 2.565190440941944e-05, + "loss": 0.1397, + "step": 99380 + }, + { + "epoch": 21.013291447760384, + "grad_norm": 0.002195917069911957, + "learning_rate": 2.564889538837435e-05, + "loss": 0.0151, + "step": 99390 + }, + { + "epoch": 21.013345610139197, + "grad_norm": 0.19134385883808136, + "learning_rate": 2.5645886367329254e-05, + "loss": 0.0556, + "step": 99400 + }, + { + "epoch": 21.01339977251801, + "grad_norm": 0.005207630805671215, + "learning_rate": 2.564287734628416e-05, + "loss": 0.0934, + "step": 99410 + }, + { + "epoch": 21.01345393489682, + "grad_norm": 0.13410499691963196, + "learning_rate": 2.563986832523907e-05, + "loss": 0.0532, + "step": 99420 + }, + { + "epoch": 21.013508097275633, + "grad_norm": 0.06004227325320244, + "learning_rate": 2.5636859304193976e-05, + "loss": 0.0324, + "step": 99430 + }, + { + "epoch": 21.013562259654446, + "grad_norm": 3.54917573928833, + "learning_rate": 2.563385028314888e-05, + "loss": 0.0335, + "step": 99440 + }, + { + "epoch": 21.013616422033255, + "grad_norm": 0.28921595215797424, + "learning_rate": 2.563084126210379e-05, + "loss": 0.0635, + "step": 99450 + }, + { + "epoch": 21.01367058441207, + "grad_norm": 0.0018961818423122168, + "learning_rate": 2.5627832241058695e-05, + "loss": 0.0152, + "step": 99460 + }, + { + "epoch": 21.013724746790878, + "grad_norm": 1.5746079683303833, + "learning_rate": 2.5624823220013598e-05, + "loss": 0.0963, + "step": 99470 + }, + { + "epoch": 21.01377890916969, + 
"grad_norm": 0.5131257176399231, + "learning_rate": 2.5621814198968508e-05, + "loss": 0.0086, + "step": 99480 + }, + { + "epoch": 21.013833071548504, + "grad_norm": 0.10804323107004166, + "learning_rate": 2.5618805177923417e-05, + "loss": 0.126, + "step": 99490 + }, + { + "epoch": 21.013887233927314, + "grad_norm": 0.4099733531475067, + "learning_rate": 2.5615796156878324e-05, + "loss": 0.0054, + "step": 99500 + }, + { + "epoch": 21.013941396306127, + "grad_norm": 0.004259428009390831, + "learning_rate": 2.5612787135833227e-05, + "loss": 0.0639, + "step": 99510 + }, + { + "epoch": 21.013995558684936, + "grad_norm": 0.01408042386174202, + "learning_rate": 2.5609778114788136e-05, + "loss": 0.0859, + "step": 99520 + }, + { + "epoch": 21.01404972106375, + "grad_norm": 0.0677160769701004, + "learning_rate": 2.5606769093743042e-05, + "loss": 0.0748, + "step": 99530 + }, + { + "epoch": 21.014103883442562, + "grad_norm": 0.005303757265210152, + "learning_rate": 2.5603760072697952e-05, + "loss": 0.0051, + "step": 99540 + }, + { + "epoch": 21.01415804582137, + "grad_norm": 0.003626642283052206, + "learning_rate": 2.5600751051652855e-05, + "loss": 0.0062, + "step": 99550 + }, + { + "epoch": 21.014212208200185, + "grad_norm": 0.0020252091344445944, + "learning_rate": 2.5597742030607765e-05, + "loss": 0.0789, + "step": 99560 + }, + { + "epoch": 21.014266370578994, + "grad_norm": 3.349801540374756, + "learning_rate": 2.559473300956267e-05, + "loss": 0.0751, + "step": 99570 + }, + { + "epoch": 21.014320532957807, + "grad_norm": 0.07173556089401245, + "learning_rate": 2.559172398851758e-05, + "loss": 0.0745, + "step": 99580 + }, + { + "epoch": 21.01437469533662, + "grad_norm": 0.11854970455169678, + "learning_rate": 2.5588714967472484e-05, + "loss": 0.1769, + "step": 99590 + }, + { + "epoch": 21.01442885771543, + "grad_norm": 0.018280936405062675, + "learning_rate": 2.558570594642739e-05, + "loss": 0.0389, + "step": 99600 + }, + { + "epoch": 21.014483020094243, + "grad_norm": 0.042683571577072144, + "learning_rate": 2.55826969253823e-05, + "loss": 0.0706, + "step": 99610 + }, + { + "epoch": 21.014537182473052, + "grad_norm": 0.10199908912181854, + "learning_rate": 2.5579687904337202e-05, + "loss": 0.1183, + "step": 99620 + }, + { + "epoch": 21.014591344851866, + "grad_norm": 0.37970271706581116, + "learning_rate": 2.557667888329211e-05, + "loss": 0.0227, + "step": 99630 + }, + { + "epoch": 21.01464550723068, + "grad_norm": 0.019207604229450226, + "learning_rate": 2.5573669862247018e-05, + "loss": 0.0031, + "step": 99640 + }, + { + "epoch": 21.014699669609488, + "grad_norm": 0.030582129955291748, + "learning_rate": 2.5570660841201928e-05, + "loss": 0.0338, + "step": 99650 + }, + { + "epoch": 21.0147538319883, + "grad_norm": 0.0230389516800642, + "learning_rate": 2.556765182015683e-05, + "loss": 0.0698, + "step": 99660 + }, + { + "epoch": 21.014807994367114, + "grad_norm": 0.0808221772313118, + "learning_rate": 2.5564642799111737e-05, + "loss": 0.0438, + "step": 99670 + }, + { + "epoch": 21.014862156745924, + "grad_norm": 0.23860861361026764, + "learning_rate": 2.5561633778066647e-05, + "loss": 0.0046, + "step": 99680 + }, + { + "epoch": 21.014916319124737, + "grad_norm": 0.005894686095416546, + "learning_rate": 2.5558624757021553e-05, + "loss": 0.0097, + "step": 99690 + }, + { + "epoch": 21.014970481503546, + "grad_norm": 0.03567054122686386, + "learning_rate": 2.5555615735976456e-05, + "loss": 0.0018, + "step": 99700 + }, + { + "epoch": 21.01502464388236, + "grad_norm": 0.20138372480869293, + 
"learning_rate": 2.5552606714931366e-05, + "loss": 0.0396, + "step": 99710 + }, + { + "epoch": 21.015078806261172, + "grad_norm": 0.005183570086956024, + "learning_rate": 2.5549597693886272e-05, + "loss": 0.0172, + "step": 99720 + }, + { + "epoch": 21.015132968639982, + "grad_norm": 0.001685679191723466, + "learning_rate": 2.554658867284118e-05, + "loss": 0.062, + "step": 99730 + }, + { + "epoch": 21.015187131018795, + "grad_norm": 0.02917654998600483, + "learning_rate": 2.5543579651796084e-05, + "loss": 0.1135, + "step": 99740 + }, + { + "epoch": 21.015241293397604, + "grad_norm": 0.0028428465593606234, + "learning_rate": 2.5540570630750994e-05, + "loss": 0.2167, + "step": 99750 + }, + { + "epoch": 21.015295455776418, + "grad_norm": 0.05646153911948204, + "learning_rate": 2.55375616097059e-05, + "loss": 0.0479, + "step": 99760 + }, + { + "epoch": 21.01534961815523, + "grad_norm": 0.18013666570186615, + "learning_rate": 2.5534552588660803e-05, + "loss": 0.0636, + "step": 99770 + }, + { + "epoch": 21.01540378053404, + "grad_norm": 0.0020300440955907106, + "learning_rate": 2.5531543567615713e-05, + "loss": 0.0353, + "step": 99780 + }, + { + "epoch": 21.015457942912853, + "grad_norm": 0.7692350745201111, + "learning_rate": 2.552853454657062e-05, + "loss": 0.0434, + "step": 99790 + }, + { + "epoch": 21.015512105291663, + "grad_norm": 0.8559945225715637, + "learning_rate": 2.552552552552553e-05, + "loss": 0.1046, + "step": 99800 + }, + { + "epoch": 21.015566267670476, + "grad_norm": 0.31042754650115967, + "learning_rate": 2.5522516504480432e-05, + "loss": 0.0524, + "step": 99810 + }, + { + "epoch": 21.01562043004929, + "grad_norm": 0.08661746233701706, + "learning_rate": 2.551950748343534e-05, + "loss": 0.0651, + "step": 99820 + }, + { + "epoch": 21.0156745924281, + "grad_norm": 10.737921714782715, + "learning_rate": 2.5516498462390248e-05, + "loss": 0.0411, + "step": 99830 + }, + { + "epoch": 21.01572875480691, + "grad_norm": 0.03627188876271248, + "learning_rate": 2.5513489441345157e-05, + "loss": 0.07, + "step": 99840 + }, + { + "epoch": 21.015782917185724, + "grad_norm": 0.26481908559799194, + "learning_rate": 2.551048042030006e-05, + "loss": 0.0719, + "step": 99850 + }, + { + "epoch": 21.015837079564534, + "grad_norm": 0.019587349146604538, + "learning_rate": 2.5507471399254967e-05, + "loss": 0.0041, + "step": 99860 + }, + { + "epoch": 21.015891241943347, + "grad_norm": 0.439730703830719, + "learning_rate": 2.5504462378209876e-05, + "loss": 0.0045, + "step": 99870 + }, + { + "epoch": 21.015945404322157, + "grad_norm": 0.004488307051360607, + "learning_rate": 2.550145335716478e-05, + "loss": 0.0469, + "step": 99880 + }, + { + "epoch": 21.01599956670097, + "grad_norm": 0.8496699333190918, + "learning_rate": 2.5498444336119685e-05, + "loss": 0.0452, + "step": 99890 + }, + { + "epoch": 21.016053729079783, + "grad_norm": 0.002811343874782324, + "learning_rate": 2.5495435315074595e-05, + "loss": 0.0251, + "step": 99900 + }, + { + "epoch": 21.016107891458592, + "grad_norm": 0.003428869880735874, + "learning_rate": 2.5492426294029505e-05, + "loss": 0.0185, + "step": 99910 + }, + { + "epoch": 21.016162053837405, + "grad_norm": 0.004410326015204191, + "learning_rate": 2.5489417272984408e-05, + "loss": 0.0229, + "step": 99920 + }, + { + "epoch": 21.016216216216215, + "grad_norm": 0.005799031350761652, + "learning_rate": 2.5486408251939314e-05, + "loss": 0.0587, + "step": 99930 + }, + { + "epoch": 21.016270378595028, + "grad_norm": 0.029069390147924423, + "learning_rate": 2.5483399230894223e-05, + 
"loss": 0.0403, + "step": 99940 + }, + { + "epoch": 21.01632454097384, + "grad_norm": 0.1875336468219757, + "learning_rate": 2.548039020984913e-05, + "loss": 0.0242, + "step": 99950 + }, + { + "epoch": 21.01637870335265, + "grad_norm": 0.002408982953056693, + "learning_rate": 2.5477381188804033e-05, + "loss": 0.0288, + "step": 99960 + }, + { + "epoch": 21.016432865731463, + "grad_norm": 0.013812161050736904, + "learning_rate": 2.5474372167758942e-05, + "loss": 0.03, + "step": 99970 + }, + { + "epoch": 21.016487028110273, + "grad_norm": 0.041899532079696655, + "learning_rate": 2.547136314671385e-05, + "loss": 0.0769, + "step": 99980 + }, + { + "epoch": 21.016541190489086, + "grad_norm": 0.0018523848848417401, + "learning_rate": 2.5468354125668758e-05, + "loss": 0.0303, + "step": 99990 + }, + { + "epoch": 21.0165953528679, + "grad_norm": 0.01001475378870964, + "learning_rate": 2.546534510462366e-05, + "loss": 0.0133, + "step": 100000 + }, + { + "epoch": 21.01664951524671, + "grad_norm": 0.5216675996780396, + "learning_rate": 2.546233608357857e-05, + "loss": 0.0522, + "step": 100010 + }, + { + "epoch": 21.01670367762552, + "grad_norm": 0.041607800871133804, + "learning_rate": 2.5459327062533477e-05, + "loss": 0.0252, + "step": 100020 + }, + { + "epoch": 21.016757840004335, + "grad_norm": 2.359884262084961, + "learning_rate": 2.545631804148838e-05, + "loss": 0.0298, + "step": 100030 + }, + { + "epoch": 21.016812002383144, + "grad_norm": 2.3004841804504395, + "learning_rate": 2.545330902044329e-05, + "loss": 0.0305, + "step": 100040 + }, + { + "epoch": 21.016866164761957, + "grad_norm": 2.345144510269165, + "learning_rate": 2.5450299999398196e-05, + "loss": 0.0397, + "step": 100050 + }, + { + "epoch": 21.016920327140767, + "grad_norm": 0.001794771640561521, + "learning_rate": 2.5447290978353106e-05, + "loss": 0.0768, + "step": 100060 + }, + { + "epoch": 21.01697448951958, + "grad_norm": 0.144999697804451, + "learning_rate": 2.544428195730801e-05, + "loss": 0.0698, + "step": 100070 + }, + { + "epoch": 21.017028651898393, + "grad_norm": 2.6672167778015137, + "learning_rate": 2.5441272936262918e-05, + "loss": 0.0616, + "step": 100080 + }, + { + "epoch": 21.017082814277202, + "grad_norm": 0.0018152233678847551, + "learning_rate": 2.5438263915217824e-05, + "loss": 0.0412, + "step": 100090 + }, + { + "epoch": 21.017136976656015, + "grad_norm": 0.020682498812675476, + "learning_rate": 2.5435254894172734e-05, + "loss": 0.0883, + "step": 100100 + }, + { + "epoch": 21.017191139034825, + "grad_norm": 0.7990724444389343, + "learning_rate": 2.5432245873127637e-05, + "loss": 0.0443, + "step": 100110 + }, + { + "epoch": 21.017245301413638, + "grad_norm": 0.012245926074683666, + "learning_rate": 2.5429236852082543e-05, + "loss": 0.0248, + "step": 100120 + }, + { + "epoch": 21.01729946379245, + "grad_norm": 1.5867624282836914, + "learning_rate": 2.5426227831037453e-05, + "loss": 0.0966, + "step": 100130 + }, + { + "epoch": 21.01735362617126, + "grad_norm": 0.002010100521147251, + "learning_rate": 2.542321880999236e-05, + "loss": 0.0803, + "step": 100140 + }, + { + "epoch": 21.017407788550074, + "grad_norm": 0.12018482387065887, + "learning_rate": 2.5420209788947262e-05, + "loss": 0.0073, + "step": 100150 + }, + { + "epoch": 21.017461950928883, + "grad_norm": 0.08006370067596436, + "learning_rate": 2.541720076790217e-05, + "loss": 0.0185, + "step": 100160 + }, + { + "epoch": 21.017516113307696, + "grad_norm": 0.0019313261145725846, + "learning_rate": 2.541419174685708e-05, + "loss": 0.0081, + "step": 100170 + }, 
+ { + "epoch": 21.01757027568651, + "grad_norm": 0.0032233777455985546, + "learning_rate": 2.5411182725811984e-05, + "loss": 0.0042, + "step": 100180 + }, + { + "epoch": 21.01762443806532, + "grad_norm": 0.0018616458401083946, + "learning_rate": 2.540817370476689e-05, + "loss": 0.0082, + "step": 100190 + }, + { + "epoch": 21.017678600444132, + "grad_norm": 2.355017900466919, + "learning_rate": 2.54051646837218e-05, + "loss": 0.0565, + "step": 100200 + }, + { + "epoch": 21.017732762822945, + "grad_norm": 0.004422638099640608, + "learning_rate": 2.5402155662676706e-05, + "loss": 0.0767, + "step": 100210 + }, + { + "epoch": 21.017786925201754, + "grad_norm": 0.0018901731818914413, + "learning_rate": 2.539914664163161e-05, + "loss": 0.0273, + "step": 100220 + }, + { + "epoch": 21.017841087580567, + "grad_norm": 0.0017854681937023997, + "learning_rate": 2.539613762058652e-05, + "loss": 0.0515, + "step": 100230 + }, + { + "epoch": 21.017895249959377, + "grad_norm": 0.003985359333455563, + "learning_rate": 2.539312859954143e-05, + "loss": 0.0449, + "step": 100240 + }, + { + "epoch": 21.01794941233819, + "grad_norm": 0.003127308562397957, + "learning_rate": 2.5390119578496335e-05, + "loss": 0.0096, + "step": 100250 + }, + { + "epoch": 21.018003574717003, + "grad_norm": 0.4269263446331024, + "learning_rate": 2.5387110557451238e-05, + "loss": 0.0125, + "step": 100260 + }, + { + "epoch": 21.018057737095813, + "grad_norm": 0.0016356052365154028, + "learning_rate": 2.5384101536406147e-05, + "loss": 0.0132, + "step": 100270 + }, + { + "epoch": 21.018111899474626, + "grad_norm": 0.003874956863000989, + "learning_rate": 2.5381092515361054e-05, + "loss": 0.041, + "step": 100280 + }, + { + "epoch": 21.018166061853435, + "grad_norm": 1.9892487525939941, + "learning_rate": 2.5378083494315963e-05, + "loss": 0.0561, + "step": 100290 + }, + { + "epoch": 21.01822022423225, + "grad_norm": 0.0020785301458090544, + "learning_rate": 2.5375074473270866e-05, + "loss": 0.0421, + "step": 100300 + }, + { + "epoch": 21.01827438661106, + "grad_norm": 0.08422765880823135, + "learning_rate": 2.5372065452225773e-05, + "loss": 0.0534, + "step": 100310 + }, + { + "epoch": 21.01832854898987, + "grad_norm": 0.11135346442461014, + "learning_rate": 2.5369056431180682e-05, + "loss": 0.0937, + "step": 100320 + }, + { + "epoch": 21.018382711368684, + "grad_norm": 0.0016339889261871576, + "learning_rate": 2.5366047410135585e-05, + "loss": 0.0172, + "step": 100330 + }, + { + "epoch": 21.018436873747493, + "grad_norm": 0.001739337109029293, + "learning_rate": 2.5363038389090495e-05, + "loss": 0.0553, + "step": 100340 + }, + { + "epoch": 21.018491036126306, + "grad_norm": 0.003057498252019286, + "learning_rate": 2.53600293680454e-05, + "loss": 0.0269, + "step": 100350 + }, + { + "epoch": 21.01854519850512, + "grad_norm": 4.43930196762085, + "learning_rate": 2.535702034700031e-05, + "loss": 0.016, + "step": 100360 + }, + { + "epoch": 21.01859936088393, + "grad_norm": 0.00161645351909101, + "learning_rate": 2.5354011325955214e-05, + "loss": 0.0224, + "step": 100370 + }, + { + "epoch": 21.018653523262742, + "grad_norm": 0.6750757694244385, + "learning_rate": 2.535100230491012e-05, + "loss": 0.0343, + "step": 100380 + }, + { + "epoch": 21.018707685641555, + "grad_norm": 0.0026305033825337887, + "learning_rate": 2.534799328386503e-05, + "loss": 0.0218, + "step": 100390 + }, + { + "epoch": 21.018761848020365, + "grad_norm": 0.0016482025384902954, + "learning_rate": 2.5344984262819936e-05, + "loss": 0.0207, + "step": 100400 + }, + { + "epoch": 
21.018816010399178, + "grad_norm": 0.34343433380126953, + "learning_rate": 2.534197524177484e-05, + "loss": 0.0059, + "step": 100410 + }, + { + "epoch": 21.018870172777987, + "grad_norm": 3.5246903896331787, + "learning_rate": 2.533896622072975e-05, + "loss": 0.0382, + "step": 100420 + }, + { + "epoch": 21.0189243351568, + "grad_norm": 4.757564544677734, + "learning_rate": 2.5335957199684658e-05, + "loss": 0.09, + "step": 100430 + }, + { + "epoch": 21.018978497535613, + "grad_norm": 1.8353739976882935, + "learning_rate": 2.5332948178639564e-05, + "loss": 0.1202, + "step": 100440 + }, + { + "epoch": 21.019032659914423, + "grad_norm": 0.11559576541185379, + "learning_rate": 2.5329939157594467e-05, + "loss": 0.1511, + "step": 100450 + }, + { + "epoch": 21.019086822293236, + "grad_norm": 0.2140749990940094, + "learning_rate": 2.5326930136549377e-05, + "loss": 0.0632, + "step": 100460 + }, + { + "epoch": 21.019140984672045, + "grad_norm": 0.0014482177793979645, + "learning_rate": 2.5323921115504283e-05, + "loss": 0.0037, + "step": 100470 + }, + { + "epoch": 21.01919514705086, + "grad_norm": 0.29297900199890137, + "learning_rate": 2.5320912094459186e-05, + "loss": 0.0126, + "step": 100480 + }, + { + "epoch": 21.01924930942967, + "grad_norm": 0.0015811558114364743, + "learning_rate": 2.5317903073414096e-05, + "loss": 0.1947, + "step": 100490 + }, + { + "epoch": 21.01930347180848, + "grad_norm": 0.07310039550065994, + "learning_rate": 2.5314894052369005e-05, + "loss": 0.1467, + "step": 100500 + }, + { + "epoch": 21.019357634187294, + "grad_norm": 0.006989673245698214, + "learning_rate": 2.531188503132391e-05, + "loss": 0.1161, + "step": 100510 + }, + { + "epoch": 21.019411796566104, + "grad_norm": 1.3437166213989258, + "learning_rate": 2.5308876010278815e-05, + "loss": 0.0396, + "step": 100520 + }, + { + "epoch": 21.019465958944917, + "grad_norm": 0.002288941526785493, + "learning_rate": 2.5305866989233724e-05, + "loss": 0.02, + "step": 100530 + }, + { + "epoch": 21.01952012132373, + "grad_norm": 0.0023644224274903536, + "learning_rate": 2.530285796818863e-05, + "loss": 0.0089, + "step": 100540 + }, + { + "epoch": 21.01957428370254, + "grad_norm": 0.0022124173119664192, + "learning_rate": 2.529984894714354e-05, + "loss": 0.0087, + "step": 100550 + }, + { + "epoch": 21.019628446081352, + "grad_norm": 0.007931110449135303, + "learning_rate": 2.5296839926098443e-05, + "loss": 0.0555, + "step": 100560 + }, + { + "epoch": 21.019682608460165, + "grad_norm": 0.0020068828016519547, + "learning_rate": 2.529383090505335e-05, + "loss": 0.0558, + "step": 100570 + }, + { + "epoch": 21.019736770838975, + "grad_norm": 0.002048543654382229, + "learning_rate": 2.529082188400826e-05, + "loss": 0.031, + "step": 100580 + }, + { + "epoch": 21.019790933217788, + "grad_norm": 0.08889581263065338, + "learning_rate": 2.528781286296317e-05, + "loss": 0.0752, + "step": 100590 + }, + { + "epoch": 21.019845095596597, + "grad_norm": 0.004627130925655365, + "learning_rate": 2.528480384191807e-05, + "loss": 0.0019, + "step": 100600 + }, + { + "epoch": 21.01989925797541, + "grad_norm": 0.1516059786081314, + "learning_rate": 2.5281794820872978e-05, + "loss": 0.1103, + "step": 100610 + }, + { + "epoch": 21.019953420354224, + "grad_norm": 0.5268750190734863, + "learning_rate": 2.5278785799827887e-05, + "loss": 0.0031, + "step": 100620 + }, + { + "epoch": 21.020007582733033, + "grad_norm": 0.0909598246216774, + "learning_rate": 2.527577677878279e-05, + "loss": 0.0709, + "step": 100630 + }, + { + "epoch": 21.020061745111846, + 
"grad_norm": 0.17460647225379944, + "learning_rate": 2.5272767757737697e-05, + "loss": 0.0399, + "step": 100640 + }, + { + "epoch": 21.020115907490656, + "grad_norm": 0.0024471376091241837, + "learning_rate": 2.5269758736692606e-05, + "loss": 0.0331, + "step": 100650 + }, + { + "epoch": 21.02017006986947, + "grad_norm": 1.9739000797271729, + "learning_rate": 2.5266749715647513e-05, + "loss": 0.091, + "step": 100660 + }, + { + "epoch": 21.02022423224828, + "grad_norm": 0.017127113416790962, + "learning_rate": 2.5263740694602415e-05, + "loss": 0.0525, + "step": 100670 + }, + { + "epoch": 21.02027839462709, + "grad_norm": 0.0038104867562651634, + "learning_rate": 2.5260731673557325e-05, + "loss": 0.0307, + "step": 100680 + }, + { + "epoch": 21.020332557005904, + "grad_norm": 3.6029505729675293, + "learning_rate": 2.5257722652512235e-05, + "loss": 0.0486, + "step": 100690 + }, + { + "epoch": 21.020386719384714, + "grad_norm": 0.002107449574396014, + "learning_rate": 2.525471363146714e-05, + "loss": 0.0097, + "step": 100700 + }, + { + "epoch": 21.020440881763527, + "grad_norm": 0.0031398457940667868, + "learning_rate": 2.5251704610422044e-05, + "loss": 0.0761, + "step": 100710 + }, + { + "epoch": 21.02049504414234, + "grad_norm": 0.6183173060417175, + "learning_rate": 2.5248695589376954e-05, + "loss": 0.042, + "step": 100720 + }, + { + "epoch": 21.02054920652115, + "grad_norm": 0.001996099017560482, + "learning_rate": 2.524568656833186e-05, + "loss": 0.0283, + "step": 100730 + }, + { + "epoch": 21.020603368899963, + "grad_norm": 0.08937390148639679, + "learning_rate": 2.524267754728677e-05, + "loss": 0.0648, + "step": 100740 + }, + { + "epoch": 21.020657531278772, + "grad_norm": 0.002191385021433234, + "learning_rate": 2.5239668526241672e-05, + "loss": 0.0016, + "step": 100750 + }, + { + "epoch": 21.020711693657585, + "grad_norm": 1.7768454551696777, + "learning_rate": 2.5236659505196582e-05, + "loss": 0.1132, + "step": 100760 + }, + { + "epoch": 21.020765856036398, + "grad_norm": 0.27491456270217896, + "learning_rate": 2.523365048415149e-05, + "loss": 0.0662, + "step": 100770 + }, + { + "epoch": 21.020820018415208, + "grad_norm": 0.0026387330144643784, + "learning_rate": 2.523064146310639e-05, + "loss": 0.0025, + "step": 100780 + }, + { + "epoch": 21.02087418079402, + "grad_norm": 1.0489697456359863, + "learning_rate": 2.52276324420613e-05, + "loss": 0.0081, + "step": 100790 + }, + { + "epoch": 21.020928343172834, + "grad_norm": 0.0018992229597643018, + "learning_rate": 2.5224623421016207e-05, + "loss": 0.018, + "step": 100800 + }, + { + "epoch": 21.020982505551643, + "grad_norm": 0.22158817946910858, + "learning_rate": 2.5221614399971117e-05, + "loss": 0.0802, + "step": 100810 + }, + { + "epoch": 21.021036667930456, + "grad_norm": 0.0855960100889206, + "learning_rate": 2.521860537892602e-05, + "loss": 0.1365, + "step": 100820 + }, + { + "epoch": 21.021090830309266, + "grad_norm": 0.003369309473782778, + "learning_rate": 2.5215596357880926e-05, + "loss": 0.0638, + "step": 100830 + }, + { + "epoch": 21.02114499268808, + "grad_norm": 1.2302263975143433, + "learning_rate": 2.5212587336835836e-05, + "loss": 0.0553, + "step": 100840 + }, + { + "epoch": 21.021199155066892, + "grad_norm": 0.30933132767677307, + "learning_rate": 2.5209578315790745e-05, + "loss": 0.0717, + "step": 100850 + }, + { + "epoch": 21.0212533174457, + "grad_norm": 0.0034986843820661306, + "learning_rate": 2.5206569294745648e-05, + "loss": 0.0729, + "step": 100860 + }, + { + "epoch": 21.021307479824515, + "grad_norm": 
0.0020316792652010918, + "learning_rate": 2.5203560273700554e-05, + "loss": 0.0378, + "step": 100870 + }, + { + "epoch": 21.021361642203324, + "grad_norm": 0.002355351345613599, + "learning_rate": 2.5200551252655464e-05, + "loss": 0.0333, + "step": 100880 + }, + { + "epoch": 21.021415804582137, + "grad_norm": 0.519023060798645, + "learning_rate": 2.519754223161037e-05, + "loss": 0.1386, + "step": 100890 + }, + { + "epoch": 21.02146996696095, + "grad_norm": 0.11393988877534866, + "learning_rate": 2.5194533210565273e-05, + "loss": 0.0061, + "step": 100900 + }, + { + "epoch": 21.02152412933976, + "grad_norm": 0.0024936178233474493, + "learning_rate": 2.5191524189520183e-05, + "loss": 0.0168, + "step": 100910 + }, + { + "epoch": 21.021578291718573, + "grad_norm": 0.539995014667511, + "learning_rate": 2.518851516847509e-05, + "loss": 0.0045, + "step": 100920 + }, + { + "epoch": 21.021632454097382, + "grad_norm": 0.0019844393245875835, + "learning_rate": 2.5185506147429992e-05, + "loss": 0.0022, + "step": 100930 + }, + { + "epoch": 21.021686616476195, + "grad_norm": 0.10200336575508118, + "learning_rate": 2.5182497126384902e-05, + "loss": 0.0513, + "step": 100940 + }, + { + "epoch": 21.02174077885501, + "grad_norm": 0.012235449627041817, + "learning_rate": 2.517948810533981e-05, + "loss": 0.0172, + "step": 100950 + }, + { + "epoch": 21.021794941233818, + "grad_norm": 0.5915061831474304, + "learning_rate": 2.5176479084294718e-05, + "loss": 0.0536, + "step": 100960 + }, + { + "epoch": 21.02184910361263, + "grad_norm": 0.001914295251481235, + "learning_rate": 2.517347006324962e-05, + "loss": 0.0082, + "step": 100970 + }, + { + "epoch": 21.021903265991444, + "grad_norm": 0.0017410508589819074, + "learning_rate": 2.517046104220453e-05, + "loss": 0.0009, + "step": 100980 + }, + { + "epoch": 21.021957428370253, + "grad_norm": 0.03900052607059479, + "learning_rate": 2.5167452021159437e-05, + "loss": 0.0422, + "step": 100990 + }, + { + "epoch": 21.022011590749067, + "grad_norm": 0.009709260426461697, + "learning_rate": 2.5164443000114346e-05, + "loss": 0.0542, + "step": 101000 + }, + { + "epoch": 21.022065753127876, + "grad_norm": 0.0016957944026216865, + "learning_rate": 2.516143397906925e-05, + "loss": 0.0044, + "step": 101010 + }, + { + "epoch": 21.02211991550669, + "grad_norm": 0.003303790930658579, + "learning_rate": 2.515842495802416e-05, + "loss": 0.0043, + "step": 101020 + }, + { + "epoch": 21.022174077885502, + "grad_norm": 0.18731220066547394, + "learning_rate": 2.5155415936979065e-05, + "loss": 0.0052, + "step": 101030 + }, + { + "epoch": 21.02222824026431, + "grad_norm": 0.0016660008113831282, + "learning_rate": 2.5152406915933975e-05, + "loss": 0.0623, + "step": 101040 + }, + { + "epoch": 21.022282402643125, + "grad_norm": 0.002143308985978365, + "learning_rate": 2.5149397894888878e-05, + "loss": 0.0794, + "step": 101050 + }, + { + "epoch": 21.022336565021934, + "grad_norm": 0.03017214499413967, + "learning_rate": 2.5146388873843784e-05, + "loss": 0.0624, + "step": 101060 + }, + { + "epoch": 21.022390727400747, + "grad_norm": 0.0016661044210195541, + "learning_rate": 2.5143379852798693e-05, + "loss": 0.0959, + "step": 101070 + }, + { + "epoch": 21.02244488977956, + "grad_norm": 0.017056798562407494, + "learning_rate": 2.5140370831753596e-05, + "loss": 0.1046, + "step": 101080 + }, + { + "epoch": 21.02249905215837, + "grad_norm": 0.02190474048256874, + "learning_rate": 2.5137361810708503e-05, + "loss": 0.1287, + "step": 101090 + }, + { + "epoch": 21.022553214537183, + "grad_norm": 
0.25610530376434326, + "learning_rate": 2.5134352789663412e-05, + "loss": 0.171, + "step": 101100 + }, + { + "epoch": 21.022607376915992, + "grad_norm": 0.3766373097896576, + "learning_rate": 2.5131343768618322e-05, + "loss": 0.1394, + "step": 101110 + }, + { + "epoch": 21.022661539294806, + "grad_norm": 0.5646913051605225, + "learning_rate": 2.5128334747573225e-05, + "loss": 0.0976, + "step": 101120 + }, + { + "epoch": 21.02271570167362, + "grad_norm": 0.013991497457027435, + "learning_rate": 2.512532572652813e-05, + "loss": 0.0181, + "step": 101130 + }, + { + "epoch": 21.022769864052428, + "grad_norm": 0.0021750254090875387, + "learning_rate": 2.512231670548304e-05, + "loss": 0.0623, + "step": 101140 + }, + { + "epoch": 21.02282402643124, + "grad_norm": 0.002111440757289529, + "learning_rate": 2.5119307684437947e-05, + "loss": 0.0127, + "step": 101150 + }, + { + "epoch": 21.022878188810054, + "grad_norm": 0.002052828436717391, + "learning_rate": 2.511629866339285e-05, + "loss": 0.008, + "step": 101160 + }, + { + "epoch": 21.022932351188864, + "grad_norm": 0.6928426623344421, + "learning_rate": 2.511328964234776e-05, + "loss": 0.0511, + "step": 101170 + }, + { + "epoch": 21.022986513567677, + "grad_norm": 0.0017544825095683336, + "learning_rate": 2.5110280621302666e-05, + "loss": 0.0734, + "step": 101180 + }, + { + "epoch": 21.023040675946486, + "grad_norm": 0.0018120839959010482, + "learning_rate": 2.5107271600257576e-05, + "loss": 0.0132, + "step": 101190 + }, + { + "epoch": 21.0230948383253, + "grad_norm": 0.38225120306015015, + "learning_rate": 2.510426257921248e-05, + "loss": 0.0489, + "step": 101200 + }, + { + "epoch": 21.023149000704112, + "grad_norm": 0.44370028376579285, + "learning_rate": 2.5101253558167388e-05, + "loss": 0.0944, + "step": 101210 + }, + { + "epoch": 21.023203163082922, + "grad_norm": 10.013155937194824, + "learning_rate": 2.5098244537122294e-05, + "loss": 0.0801, + "step": 101220 + }, + { + "epoch": 21.023257325461735, + "grad_norm": 0.3207588195800781, + "learning_rate": 2.5095235516077197e-05, + "loss": 0.0057, + "step": 101230 + }, + { + "epoch": 21.023311487840544, + "grad_norm": 0.0017901425017043948, + "learning_rate": 2.5092226495032107e-05, + "loss": 0.0386, + "step": 101240 + }, + { + "epoch": 21.023365650219358, + "grad_norm": 0.03196423500776291, + "learning_rate": 2.5089217473987013e-05, + "loss": 0.0292, + "step": 101250 + }, + { + "epoch": 21.02341981259817, + "grad_norm": 0.0019604081753641367, + "learning_rate": 2.5086208452941923e-05, + "loss": 0.0525, + "step": 101260 + }, + { + "epoch": 21.02347397497698, + "grad_norm": 0.9122229814529419, + "learning_rate": 2.5083199431896826e-05, + "loss": 0.0066, + "step": 101270 + }, + { + "epoch": 21.023528137355793, + "grad_norm": 0.7736020684242249, + "learning_rate": 2.5080190410851735e-05, + "loss": 0.0233, + "step": 101280 + }, + { + "epoch": 21.023582299734603, + "grad_norm": 0.09387915581464767, + "learning_rate": 2.5077181389806642e-05, + "loss": 0.0266, + "step": 101290 + }, + { + "epoch": 21.023636462113416, + "grad_norm": 0.02102135866880417, + "learning_rate": 2.507417236876155e-05, + "loss": 0.028, + "step": 101300 + }, + { + "epoch": 21.02369062449223, + "grad_norm": 0.3137955367565155, + "learning_rate": 2.5071163347716454e-05, + "loss": 0.0686, + "step": 101310 + }, + { + "epoch": 21.02374478687104, + "grad_norm": 0.03135678172111511, + "learning_rate": 2.506815432667136e-05, + "loss": 0.069, + "step": 101320 + }, + { + "epoch": 21.02379894924985, + "grad_norm": 0.009134954772889614, + 
"learning_rate": 2.506514530562627e-05, + "loss": 0.017, + "step": 101330 + }, + { + "epoch": 21.023853111628664, + "grad_norm": 0.002557945204898715, + "learning_rate": 2.5062136284581176e-05, + "loss": 0.0144, + "step": 101340 + }, + { + "epoch": 21.023907274007474, + "grad_norm": 0.0016823344631120563, + "learning_rate": 2.505912726353608e-05, + "loss": 0.1017, + "step": 101350 + }, + { + "epoch": 21.023961436386287, + "grad_norm": 0.5909912586212158, + "learning_rate": 2.505611824249099e-05, + "loss": 0.1119, + "step": 101360 + }, + { + "epoch": 21.024015598765097, + "grad_norm": 0.0018050232902169228, + "learning_rate": 2.50531092214459e-05, + "loss": 0.0218, + "step": 101370 + }, + { + "epoch": 21.02406976114391, + "grad_norm": 0.0020033842884004116, + "learning_rate": 2.50501002004008e-05, + "loss": 0.0168, + "step": 101380 + }, + { + "epoch": 21.024123923522723, + "grad_norm": 0.0025254010688513517, + "learning_rate": 2.5047091179355708e-05, + "loss": 0.0023, + "step": 101390 + }, + { + "epoch": 21.024178085901532, + "grad_norm": 0.0015486277407035232, + "learning_rate": 2.5044082158310617e-05, + "loss": 0.0183, + "step": 101400 + }, + { + "epoch": 21.024232248280345, + "grad_norm": 0.0017179279820993543, + "learning_rate": 2.5041073137265524e-05, + "loss": 0.0418, + "step": 101410 + }, + { + "epoch": 21.024286410659155, + "grad_norm": 0.22190938889980316, + "learning_rate": 2.5038064116220427e-05, + "loss": 0.0644, + "step": 101420 + }, + { + "epoch": 21.024340573037968, + "grad_norm": 2.548443555831909, + "learning_rate": 2.5035055095175336e-05, + "loss": 0.052, + "step": 101430 + }, + { + "epoch": 21.02439473541678, + "grad_norm": 0.06520179659128189, + "learning_rate": 2.5032046074130243e-05, + "loss": 0.0692, + "step": 101440 + }, + { + "epoch": 21.02444889779559, + "grad_norm": 3.370251417160034, + "learning_rate": 2.5029037053085152e-05, + "loss": 0.1711, + "step": 101450 + }, + { + "epoch": 21.024503060174403, + "grad_norm": 0.09391475468873978, + "learning_rate": 2.5026028032040055e-05, + "loss": 0.0314, + "step": 101460 + }, + { + "epoch": 21.024557222553213, + "grad_norm": 0.10852747410535812, + "learning_rate": 2.5023019010994965e-05, + "loss": 0.0096, + "step": 101470 + }, + { + "epoch": 21.024611384932026, + "grad_norm": 1.2675586938858032, + "learning_rate": 2.502000998994987e-05, + "loss": 0.0934, + "step": 101480 + }, + { + "epoch": 21.02466554731084, + "grad_norm": 0.12705956399440765, + "learning_rate": 2.501700096890478e-05, + "loss": 0.0905, + "step": 101490 + }, + { + "epoch": 21.02471970968965, + "grad_norm": 0.06015264242887497, + "learning_rate": 2.5013991947859684e-05, + "loss": 0.0176, + "step": 101500 + }, + { + "epoch": 21.02477387206846, + "grad_norm": 0.021787920966744423, + "learning_rate": 2.501098292681459e-05, + "loss": 0.0614, + "step": 101510 + }, + { + "epoch": 21.024828034447275, + "grad_norm": 0.05554509907960892, + "learning_rate": 2.50079739057695e-05, + "loss": 0.0092, + "step": 101520 + }, + { + "epoch": 21.024882196826084, + "grad_norm": 3.0047454833984375, + "learning_rate": 2.5004964884724402e-05, + "loss": 0.1439, + "step": 101530 + }, + { + "epoch": 21.024936359204897, + "grad_norm": 0.06555503606796265, + "learning_rate": 2.5001955863679312e-05, + "loss": 0.0088, + "step": 101540 + }, + { + "epoch": 21.024990521583707, + "grad_norm": 0.003095227526500821, + "learning_rate": 2.499894684263422e-05, + "loss": 0.0808, + "step": 101550 + }, + { + "epoch": 21.02500135405947, + "eval_accuracy": 0.8301763553233181, + "eval_loss": 
0.8277378082275391, + "eval_runtime": 114.9906, + "eval_samples_per_second": 26.628, + "eval_steps_per_second": 3.331, + "step": 101552 + }, + { + "epoch": 22.00004332990305, + "grad_norm": 0.022143878042697906, + "learning_rate": 2.4995937821589125e-05, + "loss": 0.0443, + "step": 101560 + }, + { + "epoch": 22.00009749228186, + "grad_norm": 0.0019415731076151133, + "learning_rate": 2.499292880054403e-05, + "loss": 0.0935, + "step": 101570 + }, + { + "epoch": 22.000151654660673, + "grad_norm": 0.0019301131833344698, + "learning_rate": 2.4989919779498937e-05, + "loss": 0.0671, + "step": 101580 + }, + { + "epoch": 22.000205817039486, + "grad_norm": 0.01620473340153694, + "learning_rate": 2.4986910758453847e-05, + "loss": 0.0955, + "step": 101590 + }, + { + "epoch": 22.000259979418296, + "grad_norm": 0.4359208941459656, + "learning_rate": 2.4983901737408753e-05, + "loss": 0.0082, + "step": 101600 + }, + { + "epoch": 22.00031414179711, + "grad_norm": 0.14419138431549072, + "learning_rate": 2.498089271636366e-05, + "loss": 0.0143, + "step": 101610 + }, + { + "epoch": 22.000368304175918, + "grad_norm": 0.0166554544121027, + "learning_rate": 2.4977883695318566e-05, + "loss": 0.0096, + "step": 101620 + }, + { + "epoch": 22.00042246655473, + "grad_norm": 1.8155901432037354, + "learning_rate": 2.4974874674273475e-05, + "loss": 0.0525, + "step": 101630 + }, + { + "epoch": 22.000476628933544, + "grad_norm": 6.479307651519775, + "learning_rate": 2.4971865653228378e-05, + "loss": 0.0642, + "step": 101640 + }, + { + "epoch": 22.000530791312354, + "grad_norm": 0.026782456785440445, + "learning_rate": 2.4968856632183288e-05, + "loss": 0.0076, + "step": 101650 + }, + { + "epoch": 22.000584953691167, + "grad_norm": 0.22880838811397552, + "learning_rate": 2.4965847611138194e-05, + "loss": 0.0122, + "step": 101660 + }, + { + "epoch": 22.000639116069976, + "grad_norm": 0.15146945416927338, + "learning_rate": 2.49628385900931e-05, + "loss": 0.0645, + "step": 101670 + }, + { + "epoch": 22.00069327844879, + "grad_norm": 0.20000465214252472, + "learning_rate": 2.4959829569048007e-05, + "loss": 0.0587, + "step": 101680 + }, + { + "epoch": 22.000747440827602, + "grad_norm": 0.8270396590232849, + "learning_rate": 2.4956820548002913e-05, + "loss": 0.0187, + "step": 101690 + }, + { + "epoch": 22.000801603206412, + "grad_norm": 0.0726112574338913, + "learning_rate": 2.4953811526957823e-05, + "loss": 0.1053, + "step": 101700 + }, + { + "epoch": 22.000855765585225, + "grad_norm": 0.05257405713200569, + "learning_rate": 2.4950802505912726e-05, + "loss": 0.005, + "step": 101710 + }, + { + "epoch": 22.000909927964035, + "grad_norm": 0.002062662271782756, + "learning_rate": 2.4947793484867635e-05, + "loss": 0.0225, + "step": 101720 + }, + { + "epoch": 22.000964090342848, + "grad_norm": 0.025556569918990135, + "learning_rate": 2.494478446382254e-05, + "loss": 0.0352, + "step": 101730 + }, + { + "epoch": 22.00101825272166, + "grad_norm": 0.00888539757579565, + "learning_rate": 2.4941775442777448e-05, + "loss": 0.0207, + "step": 101740 + }, + { + "epoch": 22.00107241510047, + "grad_norm": 0.0024196321610361338, + "learning_rate": 2.4938766421732354e-05, + "loss": 0.0018, + "step": 101750 + }, + { + "epoch": 22.001126577479283, + "grad_norm": 0.005594124551862478, + "learning_rate": 2.4935757400687264e-05, + "loss": 0.0816, + "step": 101760 + }, + { + "epoch": 22.001180739858096, + "grad_norm": 0.08106105774641037, + "learning_rate": 2.4932748379642167e-05, + "loss": 0.0063, + "step": 101770 + }, + { + "epoch": 22.001234902236906, 
+ "grad_norm": 1.23317551612854, + "learning_rate": 2.4929739358597076e-05, + "loss": 0.0829, + "step": 101780 + }, + { + "epoch": 22.00128906461572, + "grad_norm": 0.370013952255249, + "learning_rate": 2.4926730337551983e-05, + "loss": 0.0718, + "step": 101790 + }, + { + "epoch": 22.00134322699453, + "grad_norm": 0.651416540145874, + "learning_rate": 2.492372131650689e-05, + "loss": 0.1139, + "step": 101800 + }, + { + "epoch": 22.00139738937334, + "grad_norm": 3.179840087890625, + "learning_rate": 2.4920712295461795e-05, + "loss": 0.0362, + "step": 101810 + }, + { + "epoch": 22.001451551752155, + "grad_norm": 0.0016941084759309888, + "learning_rate": 2.4917703274416705e-05, + "loss": 0.011, + "step": 101820 + }, + { + "epoch": 22.001505714130964, + "grad_norm": 0.0018102091271430254, + "learning_rate": 2.491469425337161e-05, + "loss": 0.0038, + "step": 101830 + }, + { + "epoch": 22.001559876509777, + "grad_norm": 0.004396073054522276, + "learning_rate": 2.4911685232326514e-05, + "loss": 0.0282, + "step": 101840 + }, + { + "epoch": 22.001614038888587, + "grad_norm": 1.2555434703826904, + "learning_rate": 2.4908676211281424e-05, + "loss": 0.0107, + "step": 101850 + }, + { + "epoch": 22.0016682012674, + "grad_norm": 0.00955589022487402, + "learning_rate": 2.490566719023633e-05, + "loss": 0.0533, + "step": 101860 + }, + { + "epoch": 22.001722363646213, + "grad_norm": 0.3820533752441406, + "learning_rate": 2.4902658169191236e-05, + "loss": 0.0305, + "step": 101870 + }, + { + "epoch": 22.001776526025022, + "grad_norm": 0.10427551716566086, + "learning_rate": 2.4899649148146142e-05, + "loss": 0.0371, + "step": 101880 + }, + { + "epoch": 22.001830688403835, + "grad_norm": 0.05803282931447029, + "learning_rate": 2.4896640127101052e-05, + "loss": 0.0028, + "step": 101890 + }, + { + "epoch": 22.001884850782645, + "grad_norm": 0.0015010455390438437, + "learning_rate": 2.4893631106055955e-05, + "loss": 0.1163, + "step": 101900 + }, + { + "epoch": 22.001939013161458, + "grad_norm": 0.15705110132694244, + "learning_rate": 2.4890622085010865e-05, + "loss": 0.0486, + "step": 101910 + }, + { + "epoch": 22.00199317554027, + "grad_norm": 0.0016201717080548406, + "learning_rate": 2.488761306396577e-05, + "loss": 0.0429, + "step": 101920 + }, + { + "epoch": 22.00204733791908, + "grad_norm": 2.8670713901519775, + "learning_rate": 2.4884604042920677e-05, + "loss": 0.0354, + "step": 101930 + }, + { + "epoch": 22.002101500297893, + "grad_norm": 0.001578882453031838, + "learning_rate": 2.4881595021875583e-05, + "loss": 0.0686, + "step": 101940 + }, + { + "epoch": 22.002155662676707, + "grad_norm": 0.2207588404417038, + "learning_rate": 2.4878586000830493e-05, + "loss": 0.0341, + "step": 101950 + }, + { + "epoch": 22.002209825055516, + "grad_norm": 0.02905878610908985, + "learning_rate": 2.48755769797854e-05, + "loss": 0.0239, + "step": 101960 + }, + { + "epoch": 22.00226398743433, + "grad_norm": 0.0019506568787619472, + "learning_rate": 2.4872567958740306e-05, + "loss": 0.0329, + "step": 101970 + }, + { + "epoch": 22.00231814981314, + "grad_norm": 0.14751403033733368, + "learning_rate": 2.4869558937695212e-05, + "loss": 0.0761, + "step": 101980 + }, + { + "epoch": 22.00237231219195, + "grad_norm": 0.002017493825405836, + "learning_rate": 2.4866549916650118e-05, + "loss": 0.0661, + "step": 101990 + }, + { + "epoch": 22.002426474570765, + "grad_norm": 0.19739115238189697, + "learning_rate": 2.4863540895605024e-05, + "loss": 0.0057, + "step": 102000 + }, + { + "epoch": 22.002480636949574, + "grad_norm": 
2.3238182067871094, + "learning_rate": 2.486053187455993e-05, + "loss": 0.0401, + "step": 102010 + }, + { + "epoch": 22.002534799328387, + "grad_norm": 0.002103889361023903, + "learning_rate": 2.485752285351484e-05, + "loss": 0.02, + "step": 102020 + }, + { + "epoch": 22.002588961707197, + "grad_norm": 0.001977722393348813, + "learning_rate": 2.4854513832469743e-05, + "loss": 0.0096, + "step": 102030 + }, + { + "epoch": 22.00264312408601, + "grad_norm": 0.0047136470675468445, + "learning_rate": 2.4851504811424653e-05, + "loss": 0.0237, + "step": 102040 + }, + { + "epoch": 22.002697286464823, + "grad_norm": 0.0030474208761006594, + "learning_rate": 2.484849579037956e-05, + "loss": 0.0496, + "step": 102050 + }, + { + "epoch": 22.002751448843632, + "grad_norm": 0.0015322957187891006, + "learning_rate": 2.4845486769334466e-05, + "loss": 0.0373, + "step": 102060 + }, + { + "epoch": 22.002805611222445, + "grad_norm": 0.035518575459718704, + "learning_rate": 2.4842477748289372e-05, + "loss": 0.0063, + "step": 102070 + }, + { + "epoch": 22.002859773601255, + "grad_norm": 0.001849277294240892, + "learning_rate": 2.483946872724428e-05, + "loss": 0.0234, + "step": 102080 + }, + { + "epoch": 22.002913935980068, + "grad_norm": 0.06103372573852539, + "learning_rate": 2.4836459706199188e-05, + "loss": 0.005, + "step": 102090 + }, + { + "epoch": 22.00296809835888, + "grad_norm": 0.0022745735477656126, + "learning_rate": 2.4833450685154094e-05, + "loss": 0.0805, + "step": 102100 + }, + { + "epoch": 22.00302226073769, + "grad_norm": 0.012859470210969448, + "learning_rate": 2.4830441664109e-05, + "loss": 0.0794, + "step": 102110 + }, + { + "epoch": 22.003076423116504, + "grad_norm": 0.002457009395584464, + "learning_rate": 2.4827432643063907e-05, + "loss": 0.0112, + "step": 102120 + }, + { + "epoch": 22.003130585495313, + "grad_norm": 0.0015070172958076, + "learning_rate": 2.4824423622018813e-05, + "loss": 0.0196, + "step": 102130 + }, + { + "epoch": 22.003184747874126, + "grad_norm": 0.056855153292417526, + "learning_rate": 2.482141460097372e-05, + "loss": 0.0042, + "step": 102140 + }, + { + "epoch": 22.00323891025294, + "grad_norm": 0.266215980052948, + "learning_rate": 2.481840557992863e-05, + "loss": 0.0793, + "step": 102150 + }, + { + "epoch": 22.00329307263175, + "grad_norm": 0.0014834745088592172, + "learning_rate": 2.481539655888353e-05, + "loss": 0.1061, + "step": 102160 + }, + { + "epoch": 22.003347235010562, + "grad_norm": 0.011989876627922058, + "learning_rate": 2.481238753783844e-05, + "loss": 0.0033, + "step": 102170 + }, + { + "epoch": 22.003401397389375, + "grad_norm": 0.23901985585689545, + "learning_rate": 2.4809378516793348e-05, + "loss": 0.0837, + "step": 102180 + }, + { + "epoch": 22.003455559768184, + "grad_norm": 0.019825858995318413, + "learning_rate": 2.4806369495748254e-05, + "loss": 0.0794, + "step": 102190 + }, + { + "epoch": 22.003509722146998, + "grad_norm": 0.19895774126052856, + "learning_rate": 2.480336047470316e-05, + "loss": 0.1378, + "step": 102200 + }, + { + "epoch": 22.003563884525807, + "grad_norm": 0.0015060979640111327, + "learning_rate": 2.480035145365807e-05, + "loss": 0.0263, + "step": 102210 + }, + { + "epoch": 22.00361804690462, + "grad_norm": 0.16123715043067932, + "learning_rate": 2.4797342432612976e-05, + "loss": 0.0751, + "step": 102220 + }, + { + "epoch": 22.003672209283433, + "grad_norm": 0.06172638759016991, + "learning_rate": 2.4794333411567882e-05, + "loss": 0.0919, + "step": 102230 + }, + { + "epoch": 22.003726371662243, + "grad_norm": 
0.4246143698692322, + "learning_rate": 2.479132439052279e-05, + "loss": 0.0103, + "step": 102240 + }, + { + "epoch": 22.003780534041056, + "grad_norm": 0.0030197477899491787, + "learning_rate": 2.4788315369477695e-05, + "loss": 0.0016, + "step": 102250 + }, + { + "epoch": 22.003834696419865, + "grad_norm": 0.002887916285544634, + "learning_rate": 2.47853063484326e-05, + "loss": 0.116, + "step": 102260 + }, + { + "epoch": 22.00388885879868, + "grad_norm": 0.025418441742658615, + "learning_rate": 2.478229732738751e-05, + "loss": 0.0097, + "step": 102270 + }, + { + "epoch": 22.00394302117749, + "grad_norm": 0.0044163987040519714, + "learning_rate": 2.4779288306342417e-05, + "loss": 0.0488, + "step": 102280 + }, + { + "epoch": 22.0039971835563, + "grad_norm": 1.4237191677093506, + "learning_rate": 2.477627928529732e-05, + "loss": 0.0459, + "step": 102290 + }, + { + "epoch": 22.004051345935114, + "grad_norm": 0.974800169467926, + "learning_rate": 2.477327026425223e-05, + "loss": 0.0482, + "step": 102300 + }, + { + "epoch": 22.004105508313923, + "grad_norm": 0.859653115272522, + "learning_rate": 2.4770261243207136e-05, + "loss": 0.0375, + "step": 102310 + }, + { + "epoch": 22.004159670692736, + "grad_norm": 0.005854060873389244, + "learning_rate": 2.4767252222162042e-05, + "loss": 0.0063, + "step": 102320 + }, + { + "epoch": 22.00421383307155, + "grad_norm": 0.0023894747719168663, + "learning_rate": 2.476424320111695e-05, + "loss": 0.0248, + "step": 102330 + }, + { + "epoch": 22.00426799545036, + "grad_norm": 0.0017120731063187122, + "learning_rate": 2.4761234180071858e-05, + "loss": 0.0604, + "step": 102340 + }, + { + "epoch": 22.004322157829172, + "grad_norm": 0.0018855404341593385, + "learning_rate": 2.4758225159026764e-05, + "loss": 0.0498, + "step": 102350 + }, + { + "epoch": 22.004376320207985, + "grad_norm": 0.001765925670042634, + "learning_rate": 2.475521613798167e-05, + "loss": 0.0965, + "step": 102360 + }, + { + "epoch": 22.004430482586795, + "grad_norm": 0.05708399415016174, + "learning_rate": 2.4752207116936577e-05, + "loss": 0.0851, + "step": 102370 + }, + { + "epoch": 22.004484644965608, + "grad_norm": 0.0018671610159799457, + "learning_rate": 2.4749198095891483e-05, + "loss": 0.0364, + "step": 102380 + }, + { + "epoch": 22.004538807344417, + "grad_norm": 0.5427826046943665, + "learning_rate": 2.474618907484639e-05, + "loss": 0.0111, + "step": 102390 + }, + { + "epoch": 22.00459296972323, + "grad_norm": 0.00196211040019989, + "learning_rate": 2.47431800538013e-05, + "loss": 0.0127, + "step": 102400 + }, + { + "epoch": 22.004647132102043, + "grad_norm": 0.001826060120947659, + "learning_rate": 2.4740171032756205e-05, + "loss": 0.0891, + "step": 102410 + }, + { + "epoch": 22.004701294480853, + "grad_norm": 0.7538915276527405, + "learning_rate": 2.4737162011711112e-05, + "loss": 0.049, + "step": 102420 + }, + { + "epoch": 22.004755456859666, + "grad_norm": 0.0025386984925717115, + "learning_rate": 2.4734152990666018e-05, + "loss": 0.0443, + "step": 102430 + }, + { + "epoch": 22.004809619238475, + "grad_norm": 0.6765029430389404, + "learning_rate": 2.4731143969620924e-05, + "loss": 0.0476, + "step": 102440 + }, + { + "epoch": 22.00486378161729, + "grad_norm": 0.0020560191478580236, + "learning_rate": 2.472813494857583e-05, + "loss": 0.0902, + "step": 102450 + }, + { + "epoch": 22.0049179439961, + "grad_norm": 0.005404469091445208, + "learning_rate": 2.4725125927530737e-05, + "loss": 0.0031, + "step": 102460 + }, + { + "epoch": 22.00497210637491, + "grad_norm": 0.05399349331855774, + 
"learning_rate": 2.4722116906485646e-05, + "loss": 0.0055, + "step": 102470 + }, + { + "epoch": 22.005026268753724, + "grad_norm": 0.06590558588504791, + "learning_rate": 2.4719107885440553e-05, + "loss": 0.0133, + "step": 102480 + }, + { + "epoch": 22.005080431132534, + "grad_norm": 0.002157864859327674, + "learning_rate": 2.471609886439546e-05, + "loss": 0.0024, + "step": 102490 + }, + { + "epoch": 22.005134593511347, + "grad_norm": 1.7073664665222168, + "learning_rate": 2.4713089843350365e-05, + "loss": 0.0191, + "step": 102500 + }, + { + "epoch": 22.00518875589016, + "grad_norm": 0.23138390481472015, + "learning_rate": 2.471008082230527e-05, + "loss": 0.0167, + "step": 102510 + }, + { + "epoch": 22.00524291826897, + "grad_norm": 1.9340509176254272, + "learning_rate": 2.4707071801260178e-05, + "loss": 0.0225, + "step": 102520 + }, + { + "epoch": 22.005297080647782, + "grad_norm": 1.4151124954223633, + "learning_rate": 2.4704062780215088e-05, + "loss": 0.0766, + "step": 102530 + }, + { + "epoch": 22.005351243026595, + "grad_norm": 0.0018489513313397765, + "learning_rate": 2.4701053759169994e-05, + "loss": 0.0271, + "step": 102540 + }, + { + "epoch": 22.005405405405405, + "grad_norm": 5.809009552001953, + "learning_rate": 2.46980447381249e-05, + "loss": 0.0544, + "step": 102550 + }, + { + "epoch": 22.005459567784218, + "grad_norm": 0.006225453224033117, + "learning_rate": 2.4695035717079806e-05, + "loss": 0.0255, + "step": 102560 + }, + { + "epoch": 22.005513730163027, + "grad_norm": 1.8439345359802246, + "learning_rate": 2.4692026696034716e-05, + "loss": 0.0646, + "step": 102570 + }, + { + "epoch": 22.00556789254184, + "grad_norm": 0.7390115857124329, + "learning_rate": 2.468901767498962e-05, + "loss": 0.0454, + "step": 102580 + }, + { + "epoch": 22.005622054920654, + "grad_norm": 0.07241781800985336, + "learning_rate": 2.4686008653944525e-05, + "loss": 0.1167, + "step": 102590 + }, + { + "epoch": 22.005676217299463, + "grad_norm": 0.0018484359607100487, + "learning_rate": 2.4682999632899435e-05, + "loss": 0.083, + "step": 102600 + }, + { + "epoch": 22.005730379678276, + "grad_norm": 1.5559401512145996, + "learning_rate": 2.467999061185434e-05, + "loss": 0.0074, + "step": 102610 + }, + { + "epoch": 22.005784542057086, + "grad_norm": 0.002455232199281454, + "learning_rate": 2.4676981590809247e-05, + "loss": 0.084, + "step": 102620 + }, + { + "epoch": 22.0058387044359, + "grad_norm": 0.0031145205721259117, + "learning_rate": 2.4673972569764154e-05, + "loss": 0.0135, + "step": 102630 + }, + { + "epoch": 22.005892866814712, + "grad_norm": 0.03376670181751251, + "learning_rate": 2.467096354871906e-05, + "loss": 0.0174, + "step": 102640 + }, + { + "epoch": 22.00594702919352, + "grad_norm": 0.10549793392419815, + "learning_rate": 2.4667954527673966e-05, + "loss": 0.0561, + "step": 102650 + }, + { + "epoch": 22.006001191572334, + "grad_norm": 0.002371403155848384, + "learning_rate": 2.4664945506628876e-05, + "loss": 0.0757, + "step": 102660 + }, + { + "epoch": 22.006055353951144, + "grad_norm": 0.5767815113067627, + "learning_rate": 2.4661936485583782e-05, + "loss": 0.02, + "step": 102670 + }, + { + "epoch": 22.006109516329957, + "grad_norm": 0.0025937682949006557, + "learning_rate": 2.465892746453869e-05, + "loss": 0.0118, + "step": 102680 + }, + { + "epoch": 22.00616367870877, + "grad_norm": 0.002030187053605914, + "learning_rate": 2.4655918443493595e-05, + "loss": 0.047, + "step": 102690 + }, + { + "epoch": 22.00621784108758, + "grad_norm": 0.057176072150468826, + "learning_rate": 
2.4652909422448504e-05, + "loss": 0.0702, + "step": 102700 + }, + { + "epoch": 22.006272003466393, + "grad_norm": 0.7450338006019592, + "learning_rate": 2.4649900401403407e-05, + "loss": 0.015, + "step": 102710 + }, + { + "epoch": 22.006326165845206, + "grad_norm": 0.0022719628177583218, + "learning_rate": 2.4646891380358314e-05, + "loss": 0.0569, + "step": 102720 + }, + { + "epoch": 22.006380328224015, + "grad_norm": 0.0019745994359254837, + "learning_rate": 2.4643882359313223e-05, + "loss": 0.0147, + "step": 102730 + }, + { + "epoch": 22.006434490602828, + "grad_norm": 0.0021387008018791676, + "learning_rate": 2.464087333826813e-05, + "loss": 0.0592, + "step": 102740 + }, + { + "epoch": 22.006488652981638, + "grad_norm": 0.9179924726486206, + "learning_rate": 2.4637864317223036e-05, + "loss": 0.0061, + "step": 102750 + }, + { + "epoch": 22.00654281536045, + "grad_norm": 0.24836280941963196, + "learning_rate": 2.4634855296177942e-05, + "loss": 0.081, + "step": 102760 + }, + { + "epoch": 22.006596977739264, + "grad_norm": 0.764030396938324, + "learning_rate": 2.4631846275132848e-05, + "loss": 0.0433, + "step": 102770 + }, + { + "epoch": 22.006651140118073, + "grad_norm": 0.46483802795410156, + "learning_rate": 2.4628837254087755e-05, + "loss": 0.0092, + "step": 102780 + }, + { + "epoch": 22.006705302496886, + "grad_norm": 0.3199487328529358, + "learning_rate": 2.4625828233042664e-05, + "loss": 0.0712, + "step": 102790 + }, + { + "epoch": 22.006759464875696, + "grad_norm": 0.005070808343589306, + "learning_rate": 2.462281921199757e-05, + "loss": 0.0424, + "step": 102800 + }, + { + "epoch": 22.00681362725451, + "grad_norm": 0.003192715346813202, + "learning_rate": 2.4619810190952477e-05, + "loss": 0.0267, + "step": 102810 + }, + { + "epoch": 22.006867789633322, + "grad_norm": 0.08264616876840591, + "learning_rate": 2.4616801169907383e-05, + "loss": 0.0434, + "step": 102820 + }, + { + "epoch": 22.00692195201213, + "grad_norm": 0.8161893486976624, + "learning_rate": 2.4613792148862293e-05, + "loss": 0.046, + "step": 102830 + }, + { + "epoch": 22.006976114390945, + "grad_norm": 0.0017573353834450245, + "learning_rate": 2.4610783127817196e-05, + "loss": 0.0033, + "step": 102840 + }, + { + "epoch": 22.007030276769754, + "grad_norm": 0.0018053943058475852, + "learning_rate": 2.4607774106772105e-05, + "loss": 0.0362, + "step": 102850 + }, + { + "epoch": 22.007084439148567, + "grad_norm": 0.6559213995933533, + "learning_rate": 2.460476508572701e-05, + "loss": 0.0175, + "step": 102860 + }, + { + "epoch": 22.00713860152738, + "grad_norm": 0.0026246500201523304, + "learning_rate": 2.4601756064681918e-05, + "loss": 0.1346, + "step": 102870 + }, + { + "epoch": 22.00719276390619, + "grad_norm": 0.05893804877996445, + "learning_rate": 2.4598747043636824e-05, + "loss": 0.0156, + "step": 102880 + }, + { + "epoch": 22.007246926285003, + "grad_norm": 0.003224913962185383, + "learning_rate": 2.459573802259173e-05, + "loss": 0.0512, + "step": 102890 + }, + { + "epoch": 22.007301088663816, + "grad_norm": 1.2122726440429688, + "learning_rate": 2.4592729001546637e-05, + "loss": 0.0038, + "step": 102900 + }, + { + "epoch": 22.007355251042625, + "grad_norm": 0.14179307222366333, + "learning_rate": 2.4589719980501543e-05, + "loss": 0.0622, + "step": 102910 + }, + { + "epoch": 22.00740941342144, + "grad_norm": 0.001904285978525877, + "learning_rate": 2.4586710959456453e-05, + "loss": 0.0285, + "step": 102920 + }, + { + "epoch": 22.007463575800248, + "grad_norm": 0.4446732699871063, + "learning_rate": 
2.458370193841136e-05, + "loss": 0.0333, + "step": 102930 + }, + { + "epoch": 22.00751773817906, + "grad_norm": 0.011886199936270714, + "learning_rate": 2.4580692917366265e-05, + "loss": 0.0148, + "step": 102940 + }, + { + "epoch": 22.007571900557874, + "grad_norm": 0.00870982464402914, + "learning_rate": 2.457768389632117e-05, + "loss": 0.0995, + "step": 102950 + }, + { + "epoch": 22.007626062936684, + "grad_norm": 0.03796743229031563, + "learning_rate": 2.457467487527608e-05, + "loss": 0.003, + "step": 102960 + }, + { + "epoch": 22.007680225315497, + "grad_norm": 0.00175680301617831, + "learning_rate": 2.4571665854230984e-05, + "loss": 0.0334, + "step": 102970 + }, + { + "epoch": 22.007734387694306, + "grad_norm": 0.005076531320810318, + "learning_rate": 2.4568656833185894e-05, + "loss": 0.0252, + "step": 102980 + }, + { + "epoch": 22.00778855007312, + "grad_norm": 0.001699388143606484, + "learning_rate": 2.45656478121408e-05, + "loss": 0.0516, + "step": 102990 + }, + { + "epoch": 22.007842712451932, + "grad_norm": 4.83137321472168, + "learning_rate": 2.4562638791095706e-05, + "loss": 0.1068, + "step": 103000 + }, + { + "epoch": 22.00789687483074, + "grad_norm": 7.721273422241211, + "learning_rate": 2.4559629770050612e-05, + "loss": 0.0386, + "step": 103010 + }, + { + "epoch": 22.007951037209555, + "grad_norm": 0.5584099888801575, + "learning_rate": 2.455662074900552e-05, + "loss": 0.0505, + "step": 103020 + }, + { + "epoch": 22.008005199588364, + "grad_norm": 0.0017264606431126595, + "learning_rate": 2.4553611727960425e-05, + "loss": 0.048, + "step": 103030 + }, + { + "epoch": 22.008059361967177, + "grad_norm": 0.005552404560148716, + "learning_rate": 2.455060270691533e-05, + "loss": 0.0008, + "step": 103040 + }, + { + "epoch": 22.00811352434599, + "grad_norm": 0.0016813125694170594, + "learning_rate": 2.454759368587024e-05, + "loss": 0.0017, + "step": 103050 + }, + { + "epoch": 22.0081676867248, + "grad_norm": 0.8707745671272278, + "learning_rate": 2.4544584664825147e-05, + "loss": 0.0675, + "step": 103060 + }, + { + "epoch": 22.008221849103613, + "grad_norm": 0.0032533020712435246, + "learning_rate": 2.4541575643780053e-05, + "loss": 0.0206, + "step": 103070 + }, + { + "epoch": 22.008276011482426, + "grad_norm": 0.6916283369064331, + "learning_rate": 2.453856662273496e-05, + "loss": 0.0606, + "step": 103080 + }, + { + "epoch": 22.008330173861236, + "grad_norm": 2.458286762237549, + "learning_rate": 2.453555760168987e-05, + "loss": 0.0252, + "step": 103090 + }, + { + "epoch": 22.00838433624005, + "grad_norm": 0.001778973382897675, + "learning_rate": 2.4532548580644772e-05, + "loss": 0.0841, + "step": 103100 + }, + { + "epoch": 22.008438498618858, + "grad_norm": 0.04073784872889519, + "learning_rate": 2.4529539559599682e-05, + "loss": 0.1668, + "step": 103110 + }, + { + "epoch": 22.00849266099767, + "grad_norm": 0.0017155333189293742, + "learning_rate": 2.4526530538554588e-05, + "loss": 0.0022, + "step": 103120 + }, + { + "epoch": 22.008546823376484, + "grad_norm": 0.001680096611380577, + "learning_rate": 2.4523521517509494e-05, + "loss": 0.0177, + "step": 103130 + }, + { + "epoch": 22.008600985755294, + "grad_norm": 0.024898642674088478, + "learning_rate": 2.45205124964644e-05, + "loss": 0.1125, + "step": 103140 + }, + { + "epoch": 22.008655148134107, + "grad_norm": 0.0017414261819794774, + "learning_rate": 2.451750347541931e-05, + "loss": 0.1231, + "step": 103150 + }, + { + "epoch": 22.008709310512916, + "grad_norm": 0.01212683692574501, + "learning_rate": 2.4514494454374213e-05, + 
"loss": 0.0479, + "step": 103160 + }, + { + "epoch": 22.00876347289173, + "grad_norm": 0.0019228830933570862, + "learning_rate": 2.451148543332912e-05, + "loss": 0.0008, + "step": 103170 + }, + { + "epoch": 22.008817635270542, + "grad_norm": 9.055424690246582, + "learning_rate": 2.450847641228403e-05, + "loss": 0.1076, + "step": 103180 + }, + { + "epoch": 22.008871797649352, + "grad_norm": 0.0022772338707000017, + "learning_rate": 2.4505467391238936e-05, + "loss": 0.0576, + "step": 103190 + }, + { + "epoch": 22.008925960028165, + "grad_norm": 0.0018140305764973164, + "learning_rate": 2.4502458370193842e-05, + "loss": 0.0253, + "step": 103200 + }, + { + "epoch": 22.008980122406975, + "grad_norm": 0.018661795184016228, + "learning_rate": 2.4499449349148748e-05, + "loss": 0.0175, + "step": 103210 + }, + { + "epoch": 22.009034284785788, + "grad_norm": 0.0018887703772634268, + "learning_rate": 2.4496440328103658e-05, + "loss": 0.0378, + "step": 103220 + }, + { + "epoch": 22.0090884471646, + "grad_norm": 0.003262239508330822, + "learning_rate": 2.449343130705856e-05, + "loss": 0.0508, + "step": 103230 + }, + { + "epoch": 22.00914260954341, + "grad_norm": 0.6387301087379456, + "learning_rate": 2.449042228601347e-05, + "loss": 0.1465, + "step": 103240 + }, + { + "epoch": 22.009196771922223, + "grad_norm": 0.19220282137393951, + "learning_rate": 2.4487413264968377e-05, + "loss": 0.0231, + "step": 103250 + }, + { + "epoch": 22.009250934301033, + "grad_norm": 0.15543295443058014, + "learning_rate": 2.4484404243923283e-05, + "loss": 0.063, + "step": 103260 + }, + { + "epoch": 22.009305096679846, + "grad_norm": 0.0019883590284734964, + "learning_rate": 2.448139522287819e-05, + "loss": 0.051, + "step": 103270 + }, + { + "epoch": 22.00935925905866, + "grad_norm": 0.0020601535215973854, + "learning_rate": 2.44783862018331e-05, + "loss": 0.0268, + "step": 103280 + }, + { + "epoch": 22.00941342143747, + "grad_norm": 0.04511368274688721, + "learning_rate": 2.4475377180788e-05, + "loss": 0.0501, + "step": 103290 + }, + { + "epoch": 22.00946758381628, + "grad_norm": 0.005884250160306692, + "learning_rate": 2.447236815974291e-05, + "loss": 0.0158, + "step": 103300 + }, + { + "epoch": 22.009521746195094, + "grad_norm": 0.030254632234573364, + "learning_rate": 2.4469359138697818e-05, + "loss": 0.0042, + "step": 103310 + }, + { + "epoch": 22.009575908573904, + "grad_norm": 3.57952618598938, + "learning_rate": 2.4466350117652724e-05, + "loss": 0.0375, + "step": 103320 + }, + { + "epoch": 22.009630070952717, + "grad_norm": 4.664054870605469, + "learning_rate": 2.446334109660763e-05, + "loss": 0.1186, + "step": 103330 + }, + { + "epoch": 22.009684233331527, + "grad_norm": 0.001961097354069352, + "learning_rate": 2.4460332075562536e-05, + "loss": 0.0048, + "step": 103340 + }, + { + "epoch": 22.00973839571034, + "grad_norm": 0.004267385695129633, + "learning_rate": 2.4457323054517446e-05, + "loss": 0.0849, + "step": 103350 + }, + { + "epoch": 22.009792558089153, + "grad_norm": 0.5582884550094604, + "learning_rate": 2.445431403347235e-05, + "loss": 0.074, + "step": 103360 + }, + { + "epoch": 22.009846720467962, + "grad_norm": 0.024852510541677475, + "learning_rate": 2.445130501242726e-05, + "loss": 0.0819, + "step": 103370 + }, + { + "epoch": 22.009900882846775, + "grad_norm": 7.938240051269531, + "learning_rate": 2.4448295991382165e-05, + "loss": 0.1112, + "step": 103380 + }, + { + "epoch": 22.009955045225585, + "grad_norm": 0.12886139750480652, + "learning_rate": 2.444528697033707e-05, + "loss": 0.1082, + "step": 
103390 + }, + { + "epoch": 22.010009207604398, + "grad_norm": 0.002829421078786254, + "learning_rate": 2.4442277949291977e-05, + "loss": 0.1268, + "step": 103400 + }, + { + "epoch": 22.01006336998321, + "grad_norm": 0.039311185479164124, + "learning_rate": 2.4439268928246887e-05, + "loss": 0.0312, + "step": 103410 + }, + { + "epoch": 22.01011753236202, + "grad_norm": 0.02082144096493721, + "learning_rate": 2.4436259907201793e-05, + "loss": 0.0515, + "step": 103420 + }, + { + "epoch": 22.010171694740833, + "grad_norm": 0.0027940503787249327, + "learning_rate": 2.44332508861567e-05, + "loss": 0.0741, + "step": 103430 + }, + { + "epoch": 22.010225857119643, + "grad_norm": 0.004319139290601015, + "learning_rate": 2.4430241865111606e-05, + "loss": 0.0327, + "step": 103440 + }, + { + "epoch": 22.010280019498456, + "grad_norm": 0.7083796262741089, + "learning_rate": 2.4427232844066512e-05, + "loss": 0.0667, + "step": 103450 + }, + { + "epoch": 22.01033418187727, + "grad_norm": 0.020240409299731255, + "learning_rate": 2.442422382302142e-05, + "loss": 0.0184, + "step": 103460 + }, + { + "epoch": 22.01038834425608, + "grad_norm": 0.006769239436835051, + "learning_rate": 2.4421214801976325e-05, + "loss": 0.0582, + "step": 103470 + }, + { + "epoch": 22.01044250663489, + "grad_norm": 1.4065043926239014, + "learning_rate": 2.4418205780931234e-05, + "loss": 0.0162, + "step": 103480 + }, + { + "epoch": 22.010496669013705, + "grad_norm": 0.0037532190326601267, + "learning_rate": 2.4415196759886137e-05, + "loss": 0.0064, + "step": 103490 + }, + { + "epoch": 22.010550831392514, + "grad_norm": 0.004484208300709724, + "learning_rate": 2.4412187738841047e-05, + "loss": 0.0711, + "step": 103500 + }, + { + "epoch": 22.010604993771327, + "grad_norm": 0.052573807537555695, + "learning_rate": 2.4409178717795953e-05, + "loss": 0.0293, + "step": 103510 + }, + { + "epoch": 22.010659156150137, + "grad_norm": 4.090267658233643, + "learning_rate": 2.440616969675086e-05, + "loss": 0.0636, + "step": 103520 + }, + { + "epoch": 22.01071331852895, + "grad_norm": 0.0020907612051814795, + "learning_rate": 2.4403160675705766e-05, + "loss": 0.0232, + "step": 103530 + }, + { + "epoch": 22.010767480907763, + "grad_norm": 0.0020860808435827494, + "learning_rate": 2.4400151654660675e-05, + "loss": 0.0023, + "step": 103540 + }, + { + "epoch": 22.010821643286572, + "grad_norm": 0.09741821140050888, + "learning_rate": 2.4397142633615582e-05, + "loss": 0.0403, + "step": 103550 + }, + { + "epoch": 22.010875805665385, + "grad_norm": 2.124116897583008, + "learning_rate": 2.4394133612570488e-05, + "loss": 0.0457, + "step": 103560 + }, + { + "epoch": 22.010929968044195, + "grad_norm": 0.7423585057258606, + "learning_rate": 2.4391124591525394e-05, + "loss": 0.0142, + "step": 103570 + }, + { + "epoch": 22.010984130423008, + "grad_norm": 0.0025667985901236534, + "learning_rate": 2.43881155704803e-05, + "loss": 0.0005, + "step": 103580 + }, + { + "epoch": 22.01103829280182, + "grad_norm": 0.0061273169703781605, + "learning_rate": 2.4385106549435207e-05, + "loss": 0.0601, + "step": 103590 + }, + { + "epoch": 22.01109245518063, + "grad_norm": 0.001966688083484769, + "learning_rate": 2.4382097528390117e-05, + "loss": 0.0638, + "step": 103600 + }, + { + "epoch": 22.011146617559444, + "grad_norm": 1.017674446105957, + "learning_rate": 2.4379088507345023e-05, + "loss": 0.0307, + "step": 103610 + }, + { + "epoch": 22.011200779938253, + "grad_norm": 0.44925740361213684, + "learning_rate": 2.4376079486299926e-05, + "loss": 0.0256, + "step": 103620 + }, + { 
+ "epoch": 22.011254942317066, + "grad_norm": 0.002290178555995226, + "learning_rate": 2.4373070465254835e-05, + "loss": 0.0212, + "step": 103630 + }, + { + "epoch": 22.01130910469588, + "grad_norm": 0.002386608626693487, + "learning_rate": 2.437006144420974e-05, + "loss": 0.0161, + "step": 103640 + }, + { + "epoch": 22.01136326707469, + "grad_norm": 0.006941372994333506, + "learning_rate": 2.4367052423164648e-05, + "loss": 0.011, + "step": 103650 + }, + { + "epoch": 22.011417429453502, + "grad_norm": 0.006101915612816811, + "learning_rate": 2.4364043402119554e-05, + "loss": 0.0224, + "step": 103660 + }, + { + "epoch": 22.011471591832315, + "grad_norm": 0.0023641716688871384, + "learning_rate": 2.4361034381074464e-05, + "loss": 0.0617, + "step": 103670 + }, + { + "epoch": 22.011525754211124, + "grad_norm": 0.0022457411978393793, + "learning_rate": 2.435802536002937e-05, + "loss": 0.0126, + "step": 103680 + }, + { + "epoch": 22.011579916589938, + "grad_norm": 0.013175413943827152, + "learning_rate": 2.4355016338984276e-05, + "loss": 0.089, + "step": 103690 + }, + { + "epoch": 22.011634078968747, + "grad_norm": 0.001876925234682858, + "learning_rate": 2.4352007317939183e-05, + "loss": 0.0196, + "step": 103700 + }, + { + "epoch": 22.01168824134756, + "grad_norm": 0.02490648254752159, + "learning_rate": 2.434899829689409e-05, + "loss": 0.0171, + "step": 103710 + }, + { + "epoch": 22.011742403726373, + "grad_norm": 0.00315101002342999, + "learning_rate": 2.4345989275848995e-05, + "loss": 0.1018, + "step": 103720 + }, + { + "epoch": 22.011796566105183, + "grad_norm": 0.0017694798298180103, + "learning_rate": 2.4342980254803905e-05, + "loss": 0.0352, + "step": 103730 + }, + { + "epoch": 22.011850728483996, + "grad_norm": 0.0026283508632332087, + "learning_rate": 2.433997123375881e-05, + "loss": 0.0005, + "step": 103740 + }, + { + "epoch": 22.011904890862805, + "grad_norm": 0.0015556096332147717, + "learning_rate": 2.4336962212713717e-05, + "loss": 0.028, + "step": 103750 + }, + { + "epoch": 22.01195905324162, + "grad_norm": 0.04265059903264046, + "learning_rate": 2.4333953191668624e-05, + "loss": 0.0319, + "step": 103760 + }, + { + "epoch": 22.01201321562043, + "grad_norm": 0.445473849773407, + "learning_rate": 2.433094417062353e-05, + "loss": 0.0057, + "step": 103770 + }, + { + "epoch": 22.01206737799924, + "grad_norm": 2.2672977447509766, + "learning_rate": 2.4327935149578436e-05, + "loss": 0.0339, + "step": 103780 + }, + { + "epoch": 22.012121540378054, + "grad_norm": 0.0707869902253151, + "learning_rate": 2.4324926128533342e-05, + "loss": 0.0277, + "step": 103790 + }, + { + "epoch": 22.012175702756863, + "grad_norm": 0.006573288701474667, + "learning_rate": 2.4321917107488252e-05, + "loss": 0.042, + "step": 103800 + }, + { + "epoch": 22.012229865135676, + "grad_norm": 0.282339483499527, + "learning_rate": 2.431890808644316e-05, + "loss": 0.0161, + "step": 103810 + }, + { + "epoch": 22.01228402751449, + "grad_norm": 0.002086301799863577, + "learning_rate": 2.4315899065398065e-05, + "loss": 0.0438, + "step": 103820 + }, + { + "epoch": 22.0123381898933, + "grad_norm": 0.9569835662841797, + "learning_rate": 2.431289004435297e-05, + "loss": 0.1863, + "step": 103830 + }, + { + "epoch": 22.012392352272112, + "grad_norm": 0.011897038668394089, + "learning_rate": 2.4309881023307877e-05, + "loss": 0.0056, + "step": 103840 + }, + { + "epoch": 22.012446514650925, + "grad_norm": 6.8858819007873535, + "learning_rate": 2.4306872002262784e-05, + "loss": 0.0514, + "step": 103850 + }, + { + "epoch": 
22.012500677029735, + "grad_norm": 0.014576892368495464, + "learning_rate": 2.4303862981217693e-05, + "loss": 0.0054, + "step": 103860 + }, + { + "epoch": 22.012554839408548, + "grad_norm": 1.5203489065170288, + "learning_rate": 2.43008539601726e-05, + "loss": 0.0318, + "step": 103870 + }, + { + "epoch": 22.012609001787357, + "grad_norm": 0.006208596285432577, + "learning_rate": 2.4297844939127506e-05, + "loss": 0.2114, + "step": 103880 + }, + { + "epoch": 22.01266316416617, + "grad_norm": 9.447468757629395, + "learning_rate": 2.4294835918082412e-05, + "loss": 0.1592, + "step": 103890 + }, + { + "epoch": 22.012717326544983, + "grad_norm": 0.328490287065506, + "learning_rate": 2.429182689703732e-05, + "loss": 0.0221, + "step": 103900 + }, + { + "epoch": 22.012771488923793, + "grad_norm": 0.0639338567852974, + "learning_rate": 2.4288817875992225e-05, + "loss": 0.0714, + "step": 103910 + }, + { + "epoch": 22.012825651302606, + "grad_norm": 3.8726065158843994, + "learning_rate": 2.428580885494713e-05, + "loss": 0.0686, + "step": 103920 + }, + { + "epoch": 22.012879813681415, + "grad_norm": 0.23454521596431732, + "learning_rate": 2.428279983390204e-05, + "loss": 0.0603, + "step": 103930 + }, + { + "epoch": 22.01293397606023, + "grad_norm": 0.686766505241394, + "learning_rate": 2.4279790812856947e-05, + "loss": 0.0942, + "step": 103940 + }, + { + "epoch": 22.01298813843904, + "grad_norm": 0.15309910476207733, + "learning_rate": 2.4276781791811853e-05, + "loss": 0.0041, + "step": 103950 + }, + { + "epoch": 22.01304230081785, + "grad_norm": 0.2667436897754669, + "learning_rate": 2.427377277076676e-05, + "loss": 0.0302, + "step": 103960 + }, + { + "epoch": 22.013096463196664, + "grad_norm": 0.30543360114097595, + "learning_rate": 2.4270763749721666e-05, + "loss": 0.0587, + "step": 103970 + }, + { + "epoch": 22.013150625575474, + "grad_norm": 0.0033649615943431854, + "learning_rate": 2.4267754728676572e-05, + "loss": 0.1276, + "step": 103980 + }, + { + "epoch": 22.013204787954287, + "grad_norm": 0.1568613350391388, + "learning_rate": 2.426474570763148e-05, + "loss": 0.0119, + "step": 103990 + }, + { + "epoch": 22.0132589503331, + "grad_norm": 0.25008487701416016, + "learning_rate": 2.4261736686586388e-05, + "loss": 0.0179, + "step": 104000 + }, + { + "epoch": 22.01331311271191, + "grad_norm": 0.42648962140083313, + "learning_rate": 2.4258727665541294e-05, + "loss": 0.0439, + "step": 104010 + }, + { + "epoch": 22.013367275090722, + "grad_norm": 0.3322126567363739, + "learning_rate": 2.42557186444962e-05, + "loss": 0.0846, + "step": 104020 + }, + { + "epoch": 22.013421437469535, + "grad_norm": 2.8112893104553223, + "learning_rate": 2.425270962345111e-05, + "loss": 0.0411, + "step": 104030 + }, + { + "epoch": 22.013475599848345, + "grad_norm": 0.4682772755622864, + "learning_rate": 2.4249700602406013e-05, + "loss": 0.1157, + "step": 104040 + }, + { + "epoch": 22.013529762227158, + "grad_norm": 0.051073428243398666, + "learning_rate": 2.4246691581360923e-05, + "loss": 0.0469, + "step": 104050 + }, + { + "epoch": 22.013583924605967, + "grad_norm": 0.004179670009762049, + "learning_rate": 2.424368256031583e-05, + "loss": 0.0244, + "step": 104060 + }, + { + "epoch": 22.01363808698478, + "grad_norm": 0.0625658854842186, + "learning_rate": 2.4240673539270735e-05, + "loss": 0.0227, + "step": 104070 + }, + { + "epoch": 22.013692249363594, + "grad_norm": 0.32986655831336975, + "learning_rate": 2.423766451822564e-05, + "loss": 0.1245, + "step": 104080 + }, + { + "epoch": 22.013746411742403, + "grad_norm": 
0.003457451006397605, + "learning_rate": 2.4234655497180548e-05, + "loss": 0.1123, + "step": 104090 + }, + { + "epoch": 22.013800574121216, + "grad_norm": 2.668128490447998, + "learning_rate": 2.4231646476135454e-05, + "loss": 0.0458, + "step": 104100 + }, + { + "epoch": 22.013854736500026, + "grad_norm": 0.06576848775148392, + "learning_rate": 2.422863745509036e-05, + "loss": 0.113, + "step": 104110 + }, + { + "epoch": 22.01390889887884, + "grad_norm": 0.08368135988712311, + "learning_rate": 2.422562843404527e-05, + "loss": 0.0383, + "step": 104120 + }, + { + "epoch": 22.013963061257652, + "grad_norm": 0.06467413902282715, + "learning_rate": 2.4222619413000176e-05, + "loss": 0.1285, + "step": 104130 + }, + { + "epoch": 22.01401722363646, + "grad_norm": 0.03716299310326576, + "learning_rate": 2.4219610391955082e-05, + "loss": 0.0277, + "step": 104140 + }, + { + "epoch": 22.014071386015274, + "grad_norm": 0.01060076430439949, + "learning_rate": 2.421660137090999e-05, + "loss": 0.035, + "step": 104150 + }, + { + "epoch": 22.014125548394084, + "grad_norm": 0.08368218690156937, + "learning_rate": 2.42135923498649e-05, + "loss": 0.0283, + "step": 104160 + }, + { + "epoch": 22.014179710772897, + "grad_norm": 3.138340473175049, + "learning_rate": 2.42105833288198e-05, + "loss": 0.0438, + "step": 104170 + }, + { + "epoch": 22.01423387315171, + "grad_norm": 0.010427797213196754, + "learning_rate": 2.420757430777471e-05, + "loss": 0.0265, + "step": 104180 + }, + { + "epoch": 22.01428803553052, + "grad_norm": 1.1929187774658203, + "learning_rate": 2.4204565286729617e-05, + "loss": 0.0414, + "step": 104190 + }, + { + "epoch": 22.014342197909333, + "grad_norm": 0.003937163855880499, + "learning_rate": 2.4201556265684523e-05, + "loss": 0.0691, + "step": 104200 + }, + { + "epoch": 22.014396360288146, + "grad_norm": 0.007829326204955578, + "learning_rate": 2.419854724463943e-05, + "loss": 0.0394, + "step": 104210 + }, + { + "epoch": 22.014450522666955, + "grad_norm": 0.0036137450952082872, + "learning_rate": 2.4195538223594336e-05, + "loss": 0.0684, + "step": 104220 + }, + { + "epoch": 22.014504685045768, + "grad_norm": 0.04667451232671738, + "learning_rate": 2.4192529202549242e-05, + "loss": 0.0485, + "step": 104230 + }, + { + "epoch": 22.014558847424578, + "grad_norm": 5.041412353515625, + "learning_rate": 2.418952018150415e-05, + "loss": 0.0869, + "step": 104240 + }, + { + "epoch": 22.01461300980339, + "grad_norm": 0.03070872649550438, + "learning_rate": 2.4186511160459058e-05, + "loss": 0.1411, + "step": 104250 + }, + { + "epoch": 22.014667172182204, + "grad_norm": 0.7106316685676575, + "learning_rate": 2.4183502139413965e-05, + "loss": 0.0664, + "step": 104260 + }, + { + "epoch": 22.014721334561013, + "grad_norm": 0.05118921771645546, + "learning_rate": 2.418049311836887e-05, + "loss": 0.0045, + "step": 104270 + }, + { + "epoch": 22.014775496939826, + "grad_norm": 0.14347697794437408, + "learning_rate": 2.4177484097323777e-05, + "loss": 0.0186, + "step": 104280 + }, + { + "epoch": 22.014829659318636, + "grad_norm": 0.002487270627170801, + "learning_rate": 2.4174475076278687e-05, + "loss": 0.0824, + "step": 104290 + }, + { + "epoch": 22.01488382169745, + "grad_norm": 0.0033725996036082506, + "learning_rate": 2.417146605523359e-05, + "loss": 0.0279, + "step": 104300 + }, + { + "epoch": 22.014937984076262, + "grad_norm": 0.004682490602135658, + "learning_rate": 2.41684570341885e-05, + "loss": 0.0394, + "step": 104310 + }, + { + "epoch": 22.01499214645507, + "grad_norm": 0.031485725194215775, + 
"learning_rate": 2.4165448013143406e-05, + "loss": 0.0242, + "step": 104320 + }, + { + "epoch": 22.015046308833885, + "grad_norm": 0.0027875779196619987, + "learning_rate": 2.4162438992098312e-05, + "loss": 0.0061, + "step": 104330 + }, + { + "epoch": 22.015100471212694, + "grad_norm": 0.021730629727244377, + "learning_rate": 2.4159429971053218e-05, + "loss": 0.0047, + "step": 104340 + }, + { + "epoch": 22.015154633591507, + "grad_norm": 0.4467077851295471, + "learning_rate": 2.4156420950008128e-05, + "loss": 0.0317, + "step": 104350 + }, + { + "epoch": 22.01520879597032, + "grad_norm": 0.016218047589063644, + "learning_rate": 2.415341192896303e-05, + "loss": 0.0433, + "step": 104360 + }, + { + "epoch": 22.01526295834913, + "grad_norm": 0.12951792776584625, + "learning_rate": 2.4150402907917937e-05, + "loss": 0.019, + "step": 104370 + }, + { + "epoch": 22.015317120727943, + "grad_norm": 1.726689100265503, + "learning_rate": 2.4147393886872847e-05, + "loss": 0.057, + "step": 104380 + }, + { + "epoch": 22.015371283106752, + "grad_norm": 0.028342995792627335, + "learning_rate": 2.4144384865827753e-05, + "loss": 0.0384, + "step": 104390 + }, + { + "epoch": 22.015425445485565, + "grad_norm": 0.0019446220248937607, + "learning_rate": 2.414137584478266e-05, + "loss": 0.0029, + "step": 104400 + }, + { + "epoch": 22.01547960786438, + "grad_norm": 0.02780722826719284, + "learning_rate": 2.4138366823737565e-05, + "loss": 0.0064, + "step": 104410 + }, + { + "epoch": 22.015533770243188, + "grad_norm": 0.00222004484385252, + "learning_rate": 2.4135357802692475e-05, + "loss": 0.1633, + "step": 104420 + }, + { + "epoch": 22.015587932622, + "grad_norm": 0.03326239436864853, + "learning_rate": 2.4132348781647378e-05, + "loss": 0.0668, + "step": 104430 + }, + { + "epoch": 22.015642095000814, + "grad_norm": 0.003723681438714266, + "learning_rate": 2.4129339760602288e-05, + "loss": 0.036, + "step": 104440 + }, + { + "epoch": 22.015696257379624, + "grad_norm": 1.2665046453475952, + "learning_rate": 2.4126330739557194e-05, + "loss": 0.0404, + "step": 104450 + }, + { + "epoch": 22.015750419758437, + "grad_norm": 0.0020134022925049067, + "learning_rate": 2.41233217185121e-05, + "loss": 0.0342, + "step": 104460 + }, + { + "epoch": 22.015804582137246, + "grad_norm": 0.329886794090271, + "learning_rate": 2.4120312697467006e-05, + "loss": 0.0373, + "step": 104470 + }, + { + "epoch": 22.01585874451606, + "grad_norm": 0.0025065161753445864, + "learning_rate": 2.4117303676421916e-05, + "loss": 0.0374, + "step": 104480 + }, + { + "epoch": 22.015912906894872, + "grad_norm": 0.0021926749031990767, + "learning_rate": 2.411429465537682e-05, + "loss": 0.051, + "step": 104490 + }, + { + "epoch": 22.01596706927368, + "grad_norm": 0.08059217780828476, + "learning_rate": 2.411128563433173e-05, + "loss": 0.0587, + "step": 104500 + }, + { + "epoch": 22.016021231652495, + "grad_norm": 0.10569336265325546, + "learning_rate": 2.4108276613286635e-05, + "loss": 0.1247, + "step": 104510 + }, + { + "epoch": 22.016075394031304, + "grad_norm": 0.02476891689002514, + "learning_rate": 2.410526759224154e-05, + "loss": 0.0936, + "step": 104520 + }, + { + "epoch": 22.016129556410117, + "grad_norm": 0.0032029934227466583, + "learning_rate": 2.4102258571196447e-05, + "loss": 0.0408, + "step": 104530 + }, + { + "epoch": 22.01618371878893, + "grad_norm": 0.06977403163909912, + "learning_rate": 2.4099249550151354e-05, + "loss": 0.0541, + "step": 104540 + }, + { + "epoch": 22.01623788116774, + "grad_norm": 0.16437867283821106, + "learning_rate": 
2.4096240529106263e-05, + "loss": 0.0399, + "step": 104550 + }, + { + "epoch": 22.016292043546553, + "grad_norm": 1.3035717010498047, + "learning_rate": 2.4093231508061166e-05, + "loss": 0.0587, + "step": 104560 + }, + { + "epoch": 22.016346205925363, + "grad_norm": 0.3558902442455292, + "learning_rate": 2.4090222487016076e-05, + "loss": 0.0467, + "step": 104570 + }, + { + "epoch": 22.016400368304176, + "grad_norm": 0.09231079369783401, + "learning_rate": 2.4087213465970982e-05, + "loss": 0.0595, + "step": 104580 + }, + { + "epoch": 22.01645453068299, + "grad_norm": 0.0021262732334434986, + "learning_rate": 2.408420444492589e-05, + "loss": 0.075, + "step": 104590 + }, + { + "epoch": 22.016508693061798, + "grad_norm": 0.04212004318833351, + "learning_rate": 2.4081195423880795e-05, + "loss": 0.0539, + "step": 104600 + }, + { + "epoch": 22.01656285544061, + "grad_norm": 1.3350950479507446, + "learning_rate": 2.4078186402835704e-05, + "loss": 0.0362, + "step": 104610 + }, + { + "epoch": 22.016617017819424, + "grad_norm": 0.0019258444663137197, + "learning_rate": 2.4075177381790607e-05, + "loss": 0.0962, + "step": 104620 + }, + { + "epoch": 22.016671180198234, + "grad_norm": 0.5932328104972839, + "learning_rate": 2.4072168360745517e-05, + "loss": 0.1292, + "step": 104630 + }, + { + "epoch": 22.016725342577047, + "grad_norm": 0.0026855480391532183, + "learning_rate": 2.4069159339700423e-05, + "loss": 0.0069, + "step": 104640 + }, + { + "epoch": 22.016779504955856, + "grad_norm": 0.9578983783721924, + "learning_rate": 2.406615031865533e-05, + "loss": 0.0154, + "step": 104650 + }, + { + "epoch": 22.01683366733467, + "grad_norm": 1.0543417930603027, + "learning_rate": 2.4063141297610236e-05, + "loss": 0.0865, + "step": 104660 + }, + { + "epoch": 22.016887829713482, + "grad_norm": 2.0832102298736572, + "learning_rate": 2.4060132276565142e-05, + "loss": 0.0721, + "step": 104670 + }, + { + "epoch": 22.016941992092292, + "grad_norm": 0.13716214895248413, + "learning_rate": 2.4057123255520052e-05, + "loss": 0.1117, + "step": 104680 + }, + { + "epoch": 22.016996154471105, + "grad_norm": 0.06740757077932358, + "learning_rate": 2.4054114234474955e-05, + "loss": 0.0359, + "step": 104690 + }, + { + "epoch": 22.017050316849915, + "grad_norm": 0.0018787737935781479, + "learning_rate": 2.4051105213429864e-05, + "loss": 0.0949, + "step": 104700 + }, + { + "epoch": 22.017104479228728, + "grad_norm": 0.0222768634557724, + "learning_rate": 2.404809619238477e-05, + "loss": 0.0419, + "step": 104710 + }, + { + "epoch": 22.01715864160754, + "grad_norm": 0.3569668233394623, + "learning_rate": 2.4045087171339677e-05, + "loss": 0.0235, + "step": 104720 + }, + { + "epoch": 22.01721280398635, + "grad_norm": 0.001793992705643177, + "learning_rate": 2.4042078150294583e-05, + "loss": 0.0098, + "step": 104730 + }, + { + "epoch": 22.017266966365163, + "grad_norm": 0.023645061999559402, + "learning_rate": 2.4039069129249493e-05, + "loss": 0.0032, + "step": 104740 + }, + { + "epoch": 22.017321128743973, + "grad_norm": 0.11094474792480469, + "learning_rate": 2.4036060108204396e-05, + "loss": 0.0169, + "step": 104750 + }, + { + "epoch": 22.017375291122786, + "grad_norm": 0.17437277734279633, + "learning_rate": 2.4033051087159305e-05, + "loss": 0.0076, + "step": 104760 + }, + { + "epoch": 22.0174294535016, + "grad_norm": 0.1219262182712555, + "learning_rate": 2.403004206611421e-05, + "loss": 0.1379, + "step": 104770 + }, + { + "epoch": 22.01748361588041, + "grad_norm": 0.037468891590833664, + "learning_rate": 2.4027033045069118e-05, 
+ "loss": 0.0984, + "step": 104780 + }, + { + "epoch": 22.01753777825922, + "grad_norm": 0.0021105085033923388, + "learning_rate": 2.4024024024024024e-05, + "loss": 0.0249, + "step": 104790 + }, + { + "epoch": 22.017591940638034, + "grad_norm": 0.06091690808534622, + "learning_rate": 2.4021015002978934e-05, + "loss": 0.0579, + "step": 104800 + }, + { + "epoch": 22.017646103016844, + "grad_norm": 1.9444787502288818, + "learning_rate": 2.401800598193384e-05, + "loss": 0.0552, + "step": 104810 + }, + { + "epoch": 22.017700265395657, + "grad_norm": 0.25110241770744324, + "learning_rate": 2.4014996960888743e-05, + "loss": 0.0048, + "step": 104820 + }, + { + "epoch": 22.017754427774467, + "grad_norm": 0.005609241779893637, + "learning_rate": 2.4011987939843653e-05, + "loss": 0.1068, + "step": 104830 + }, + { + "epoch": 22.01780859015328, + "grad_norm": 0.960174560546875, + "learning_rate": 2.400897891879856e-05, + "loss": 0.0112, + "step": 104840 + }, + { + "epoch": 22.017862752532093, + "grad_norm": 0.00401797192171216, + "learning_rate": 2.4005969897753465e-05, + "loss": 0.0045, + "step": 104850 + }, + { + "epoch": 22.017916914910902, + "grad_norm": 0.355872243642807, + "learning_rate": 2.400296087670837e-05, + "loss": 0.0391, + "step": 104860 + }, + { + "epoch": 22.017971077289715, + "grad_norm": 0.00797437783330679, + "learning_rate": 2.399995185566328e-05, + "loss": 0.1047, + "step": 104870 + }, + { + "epoch": 22.018025239668525, + "grad_norm": 0.12233935296535492, + "learning_rate": 2.3996942834618184e-05, + "loss": 0.0514, + "step": 104880 + }, + { + "epoch": 22.018079402047338, + "grad_norm": 0.0028916148003190756, + "learning_rate": 2.3993933813573094e-05, + "loss": 0.0601, + "step": 104890 + }, + { + "epoch": 22.01813356442615, + "grad_norm": 0.09516549110412598, + "learning_rate": 2.3990924792528e-05, + "loss": 0.0213, + "step": 104900 + }, + { + "epoch": 22.01818772680496, + "grad_norm": 0.0027583467308431864, + "learning_rate": 2.3987915771482906e-05, + "loss": 0.0244, + "step": 104910 + }, + { + "epoch": 22.018241889183773, + "grad_norm": 0.003148033283650875, + "learning_rate": 2.3984906750437813e-05, + "loss": 0.0432, + "step": 104920 + }, + { + "epoch": 22.018296051562583, + "grad_norm": 0.050825051963329315, + "learning_rate": 2.3981897729392722e-05, + "loss": 0.0196, + "step": 104930 + }, + { + "epoch": 22.018350213941396, + "grad_norm": 0.0023244197946041822, + "learning_rate": 2.397888870834763e-05, + "loss": 0.0568, + "step": 104940 + }, + { + "epoch": 22.01840437632021, + "grad_norm": 0.002278275554999709, + "learning_rate": 2.3975879687302535e-05, + "loss": 0.0384, + "step": 104950 + }, + { + "epoch": 22.01845853869902, + "grad_norm": 0.01249349769204855, + "learning_rate": 2.397287066625744e-05, + "loss": 0.0663, + "step": 104960 + }, + { + "epoch": 22.01851270107783, + "grad_norm": 0.002379494719207287, + "learning_rate": 2.3969861645212347e-05, + "loss": 0.014, + "step": 104970 + }, + { + "epoch": 22.018566863456645, + "grad_norm": 0.0019438134040683508, + "learning_rate": 2.3966852624167254e-05, + "loss": 0.085, + "step": 104980 + }, + { + "epoch": 22.018621025835454, + "grad_norm": 0.059263940900564194, + "learning_rate": 2.396384360312216e-05, + "loss": 0.0348, + "step": 104990 + }, + { + "epoch": 22.018675188214267, + "grad_norm": 1.0313035249710083, + "learning_rate": 2.396083458207707e-05, + "loss": 0.0946, + "step": 105000 + }, + { + "epoch": 22.018729350593077, + "grad_norm": 0.002938975812867284, + "learning_rate": 2.3957825561031972e-05, + "loss": 0.039, + 
"step": 105010 + }, + { + "epoch": 22.01878351297189, + "grad_norm": 1.827419400215149, + "learning_rate": 2.3954816539986882e-05, + "loss": 0.0468, + "step": 105020 + }, + { + "epoch": 22.018837675350703, + "grad_norm": 1.0652698278427124, + "learning_rate": 2.395180751894179e-05, + "loss": 0.0151, + "step": 105030 + }, + { + "epoch": 22.018891837729512, + "grad_norm": 0.2902905344963074, + "learning_rate": 2.3948798497896695e-05, + "loss": 0.0104, + "step": 105040 + }, + { + "epoch": 22.018946000108325, + "grad_norm": 0.0022225026041269302, + "learning_rate": 2.39457894768516e-05, + "loss": 0.0271, + "step": 105050 + }, + { + "epoch": 22.019000162487135, + "grad_norm": 0.006619415245950222, + "learning_rate": 2.394278045580651e-05, + "loss": 0.0283, + "step": 105060 + }, + { + "epoch": 22.019054324865948, + "grad_norm": 3.2694015502929688, + "learning_rate": 2.3939771434761417e-05, + "loss": 0.0508, + "step": 105070 + }, + { + "epoch": 22.01910848724476, + "grad_norm": 0.00393706327304244, + "learning_rate": 2.3936762413716323e-05, + "loss": 0.0711, + "step": 105080 + }, + { + "epoch": 22.01916264962357, + "grad_norm": 0.0018715993501245975, + "learning_rate": 2.393375339267123e-05, + "loss": 0.0744, + "step": 105090 + }, + { + "epoch": 22.019216812002384, + "grad_norm": 0.004842312540858984, + "learning_rate": 2.393074437162614e-05, + "loss": 0.0395, + "step": 105100 + }, + { + "epoch": 22.019270974381193, + "grad_norm": 0.002102744532749057, + "learning_rate": 2.3927735350581042e-05, + "loss": 0.0494, + "step": 105110 + }, + { + "epoch": 22.019325136760006, + "grad_norm": 0.01149003952741623, + "learning_rate": 2.3924726329535948e-05, + "loss": 0.0337, + "step": 105120 + }, + { + "epoch": 22.01937929913882, + "grad_norm": 0.0018989309901371598, + "learning_rate": 2.3921717308490858e-05, + "loss": 0.002, + "step": 105130 + }, + { + "epoch": 22.01943346151763, + "grad_norm": 0.14141644537448883, + "learning_rate": 2.3918708287445764e-05, + "loss": 0.0487, + "step": 105140 + }, + { + "epoch": 22.019487623896442, + "grad_norm": 0.02399980090558529, + "learning_rate": 2.391569926640067e-05, + "loss": 0.1395, + "step": 105150 + }, + { + "epoch": 22.019541786275255, + "grad_norm": 0.880787193775177, + "learning_rate": 2.3912690245355577e-05, + "loss": 0.0837, + "step": 105160 + }, + { + "epoch": 22.019595948654064, + "grad_norm": 0.543516218662262, + "learning_rate": 2.3909681224310483e-05, + "loss": 0.0548, + "step": 105170 + }, + { + "epoch": 22.019650111032878, + "grad_norm": 1.4213261604309082, + "learning_rate": 2.390667220326539e-05, + "loss": 0.0473, + "step": 105180 + }, + { + "epoch": 22.019704273411687, + "grad_norm": 0.058446306735277176, + "learning_rate": 2.39036631822203e-05, + "loss": 0.0092, + "step": 105190 + }, + { + "epoch": 22.0197584357905, + "grad_norm": 0.0017413184978067875, + "learning_rate": 2.3900654161175205e-05, + "loss": 0.0084, + "step": 105200 + }, + { + "epoch": 22.019812598169313, + "grad_norm": 0.18107980489730835, + "learning_rate": 2.389764514013011e-05, + "loss": 0.021, + "step": 105210 + }, + { + "epoch": 22.019866760548123, + "grad_norm": 0.0034601634833961725, + "learning_rate": 2.3894636119085018e-05, + "loss": 0.0127, + "step": 105220 + }, + { + "epoch": 22.019920922926936, + "grad_norm": 0.6095376014709473, + "learning_rate": 2.3891627098039927e-05, + "loss": 0.0098, + "step": 105230 + }, + { + "epoch": 22.019975085305745, + "grad_norm": 0.0016025612130761147, + "learning_rate": 2.388861807699483e-05, + "loss": 0.1422, + "step": 105240 + }, + { + 
"epoch": 22.02002924768456, + "grad_norm": 0.2670147716999054, + "learning_rate": 2.388560905594974e-05, + "loss": 0.0467, + "step": 105250 + }, + { + "epoch": 22.02008341006337, + "grad_norm": 0.0039268783293664455, + "learning_rate": 2.3882600034904646e-05, + "loss": 0.0782, + "step": 105260 + }, + { + "epoch": 22.02013757244218, + "grad_norm": 0.034687723964452744, + "learning_rate": 2.3879591013859552e-05, + "loss": 0.0265, + "step": 105270 + }, + { + "epoch": 22.020191734820994, + "grad_norm": 0.004119109828025103, + "learning_rate": 2.387658199281446e-05, + "loss": 0.0287, + "step": 105280 + }, + { + "epoch": 22.020245897199803, + "grad_norm": 0.013723297044634819, + "learning_rate": 2.3873572971769365e-05, + "loss": 0.0035, + "step": 105290 + }, + { + "epoch": 22.020300059578616, + "grad_norm": 1.854074478149414, + "learning_rate": 2.387056395072427e-05, + "loss": 0.2137, + "step": 105300 + }, + { + "epoch": 22.02035422195743, + "grad_norm": 33.66188049316406, + "learning_rate": 2.3867554929679178e-05, + "loss": 0.061, + "step": 105310 + }, + { + "epoch": 22.02040838433624, + "grad_norm": 0.026657912880182266, + "learning_rate": 2.3864545908634087e-05, + "loss": 0.0025, + "step": 105320 + }, + { + "epoch": 22.020462546715052, + "grad_norm": 0.004028350580483675, + "learning_rate": 2.3861536887588993e-05, + "loss": 0.08, + "step": 105330 + }, + { + "epoch": 22.020516709093865, + "grad_norm": 0.0034987828694283962, + "learning_rate": 2.38585278665439e-05, + "loss": 0.0331, + "step": 105340 + }, + { + "epoch": 22.020570871472675, + "grad_norm": 0.05955209210515022, + "learning_rate": 2.3855518845498806e-05, + "loss": 0.07, + "step": 105350 + }, + { + "epoch": 22.020625033851488, + "grad_norm": 0.1970023512840271, + "learning_rate": 2.3852509824453716e-05, + "loss": 0.0042, + "step": 105360 + }, + { + "epoch": 22.020679196230297, + "grad_norm": 7.775887489318848, + "learning_rate": 2.384950080340862e-05, + "loss": 0.0309, + "step": 105370 + }, + { + "epoch": 22.02073335860911, + "grad_norm": 0.3122694492340088, + "learning_rate": 2.3846491782363528e-05, + "loss": 0.0399, + "step": 105380 + }, + { + "epoch": 22.020787520987923, + "grad_norm": 0.0020819976925849915, + "learning_rate": 2.3843482761318435e-05, + "loss": 0.0124, + "step": 105390 + }, + { + "epoch": 22.020841683366733, + "grad_norm": 0.0018421558197587729, + "learning_rate": 2.384047374027334e-05, + "loss": 0.0604, + "step": 105400 + }, + { + "epoch": 22.020895845745546, + "grad_norm": 0.036234259605407715, + "learning_rate": 2.3837464719228247e-05, + "loss": 0.0106, + "step": 105410 + }, + { + "epoch": 22.020950008124355, + "grad_norm": 0.001993720419704914, + "learning_rate": 2.3834455698183153e-05, + "loss": 0.1717, + "step": 105420 + }, + { + "epoch": 22.02100417050317, + "grad_norm": 0.002966871252283454, + "learning_rate": 2.383144667713806e-05, + "loss": 0.1243, + "step": 105430 + }, + { + "epoch": 22.02105833288198, + "grad_norm": 0.0017515082145109773, + "learning_rate": 2.3828437656092966e-05, + "loss": 0.049, + "step": 105440 + }, + { + "epoch": 22.02111249526079, + "grad_norm": 0.0015766045544296503, + "learning_rate": 2.3825428635047876e-05, + "loss": 0.0093, + "step": 105450 + }, + { + "epoch": 22.021166657639604, + "grad_norm": 4.3913655281066895, + "learning_rate": 2.3822419614002782e-05, + "loss": 0.0387, + "step": 105460 + }, + { + "epoch": 22.021220820018414, + "grad_norm": 0.005907343700528145, + "learning_rate": 2.3819410592957688e-05, + "loss": 0.0223, + "step": 105470 + }, + { + "epoch": 
22.021274982397227, + "grad_norm": 0.9506145715713501, + "learning_rate": 2.3816401571912594e-05, + "loss": 0.0439, + "step": 105480 + }, + { + "epoch": 22.02132914477604, + "grad_norm": 0.12697525322437286, + "learning_rate": 2.3813392550867504e-05, + "loss": 0.0635, + "step": 105490 + }, + { + "epoch": 22.02138330715485, + "grad_norm": 0.028409559279680252, + "learning_rate": 2.3810383529822407e-05, + "loss": 0.0051, + "step": 105500 + }, + { + "epoch": 22.021437469533662, + "grad_norm": 0.001506446162238717, + "learning_rate": 2.3807374508777317e-05, + "loss": 0.0395, + "step": 105510 + }, + { + "epoch": 22.021491631912472, + "grad_norm": 0.012034744955599308, + "learning_rate": 2.3804365487732223e-05, + "loss": 0.0288, + "step": 105520 + }, + { + "epoch": 22.021545794291285, + "grad_norm": 2.674027442932129, + "learning_rate": 2.380135646668713e-05, + "loss": 0.0414, + "step": 105530 + }, + { + "epoch": 22.021599956670098, + "grad_norm": 4.623109340667725, + "learning_rate": 2.3798347445642035e-05, + "loss": 0.1884, + "step": 105540 + }, + { + "epoch": 22.021654119048907, + "grad_norm": 0.543425440788269, + "learning_rate": 2.3795338424596945e-05, + "loss": 0.0136, + "step": 105550 + }, + { + "epoch": 22.02170828142772, + "grad_norm": 3.6912992000579834, + "learning_rate": 2.3792329403551848e-05, + "loss": 0.0399, + "step": 105560 + }, + { + "epoch": 22.021762443806534, + "grad_norm": 1.8356165885925293, + "learning_rate": 2.3789320382506754e-05, + "loss": 0.1055, + "step": 105570 + }, + { + "epoch": 22.021816606185343, + "grad_norm": 1.1328600645065308, + "learning_rate": 2.3786311361461664e-05, + "loss": 0.016, + "step": 105580 + }, + { + "epoch": 22.021870768564156, + "grad_norm": 0.0018258083146065474, + "learning_rate": 2.378330234041657e-05, + "loss": 0.0667, + "step": 105590 + }, + { + "epoch": 22.021924930942966, + "grad_norm": 0.5426905155181885, + "learning_rate": 2.3780293319371476e-05, + "loss": 0.1919, + "step": 105600 + }, + { + "epoch": 22.02197909332178, + "grad_norm": 0.49489277601242065, + "learning_rate": 2.3777284298326383e-05, + "loss": 0.0178, + "step": 105610 + }, + { + "epoch": 22.022033255700592, + "grad_norm": 0.001804159488528967, + "learning_rate": 2.3774275277281292e-05, + "loss": 0.0222, + "step": 105620 + }, + { + "epoch": 22.0220874180794, + "grad_norm": 0.005756039172410965, + "learning_rate": 2.3771266256236195e-05, + "loss": 0.0191, + "step": 105630 + }, + { + "epoch": 22.022141580458214, + "grad_norm": 0.0015565030043944716, + "learning_rate": 2.3768257235191105e-05, + "loss": 0.023, + "step": 105640 + }, + { + "epoch": 22.022195742837024, + "grad_norm": 4.120847702026367, + "learning_rate": 2.376524821414601e-05, + "loss": 0.0555, + "step": 105650 + }, + { + "epoch": 22.022249905215837, + "grad_norm": 0.22194147109985352, + "learning_rate": 2.3762239193100917e-05, + "loss": 0.0821, + "step": 105660 + }, + { + "epoch": 22.02230406759465, + "grad_norm": 0.0032924539409577847, + "learning_rate": 2.3759230172055824e-05, + "loss": 0.0458, + "step": 105670 + }, + { + "epoch": 22.02235822997346, + "grad_norm": 0.05382060632109642, + "learning_rate": 2.3756221151010733e-05, + "loss": 0.01, + "step": 105680 + }, + { + "epoch": 22.022412392352273, + "grad_norm": 4.650063514709473, + "learning_rate": 2.3753212129965636e-05, + "loss": 0.0719, + "step": 105690 + }, + { + "epoch": 22.022466554731082, + "grad_norm": 0.0015189478872343898, + "learning_rate": 2.3750203108920546e-05, + "loss": 0.0007, + "step": 105700 + }, + { + "epoch": 22.022520717109895, + 
"grad_norm": 0.0016504732193425298, + "learning_rate": 2.3747194087875452e-05, + "loss": 0.0813, + "step": 105710 + }, + { + "epoch": 22.022574879488708, + "grad_norm": 0.01218233909457922, + "learning_rate": 2.374418506683036e-05, + "loss": 0.1055, + "step": 105720 + }, + { + "epoch": 22.022629041867518, + "grad_norm": 0.002429451560601592, + "learning_rate": 2.3741176045785265e-05, + "loss": 0.0345, + "step": 105730 + }, + { + "epoch": 22.02268320424633, + "grad_norm": 13.755796432495117, + "learning_rate": 2.373816702474017e-05, + "loss": 0.1593, + "step": 105740 + }, + { + "epoch": 22.022737366625144, + "grad_norm": 0.002289822557941079, + "learning_rate": 2.373515800369508e-05, + "loss": 0.1217, + "step": 105750 + }, + { + "epoch": 22.022791529003953, + "grad_norm": 0.04963906481862068, + "learning_rate": 2.3732148982649984e-05, + "loss": 0.0699, + "step": 105760 + }, + { + "epoch": 22.022845691382766, + "grad_norm": 0.029923392459750175, + "learning_rate": 2.3729139961604893e-05, + "loss": 0.0613, + "step": 105770 + }, + { + "epoch": 22.022899853761576, + "grad_norm": 1.0549136400222778, + "learning_rate": 2.37261309405598e-05, + "loss": 0.0548, + "step": 105780 + }, + { + "epoch": 22.02295401614039, + "grad_norm": 0.49438032507896423, + "learning_rate": 2.3723121919514706e-05, + "loss": 0.0331, + "step": 105790 + }, + { + "epoch": 22.023008178519202, + "grad_norm": 0.5741178393363953, + "learning_rate": 2.3720112898469612e-05, + "loss": 0.0894, + "step": 105800 + }, + { + "epoch": 22.02306234089801, + "grad_norm": 0.06487226486206055, + "learning_rate": 2.3717103877424522e-05, + "loss": 0.0293, + "step": 105810 + }, + { + "epoch": 22.023116503276825, + "grad_norm": 1.0851534605026245, + "learning_rate": 2.3714094856379425e-05, + "loss": 0.0372, + "step": 105820 + }, + { + "epoch": 22.023170665655634, + "grad_norm": 0.030629126355051994, + "learning_rate": 2.3711085835334334e-05, + "loss": 0.0074, + "step": 105830 + }, + { + "epoch": 22.023224828034447, + "grad_norm": 0.0016963761299848557, + "learning_rate": 2.370807681428924e-05, + "loss": 0.042, + "step": 105840 + }, + { + "epoch": 22.02327899041326, + "grad_norm": 0.022454023361206055, + "learning_rate": 2.3705067793244147e-05, + "loss": 0.0103, + "step": 105850 + }, + { + "epoch": 22.02333315279207, + "grad_norm": 0.0015967541839927435, + "learning_rate": 2.3702058772199053e-05, + "loss": 0.0101, + "step": 105860 + }, + { + "epoch": 22.023387315170883, + "grad_norm": 2.0791687965393066, + "learning_rate": 2.369904975115396e-05, + "loss": 0.0184, + "step": 105870 + }, + { + "epoch": 22.023441477549692, + "grad_norm": 0.0018801623955368996, + "learning_rate": 2.369604073010887e-05, + "loss": 0.1207, + "step": 105880 + }, + { + "epoch": 22.023495639928505, + "grad_norm": 0.015141607262194157, + "learning_rate": 2.3693031709063772e-05, + "loss": 0.0028, + "step": 105890 + }, + { + "epoch": 22.02354980230732, + "grad_norm": 0.001861299155279994, + "learning_rate": 2.369002268801868e-05, + "loss": 0.0032, + "step": 105900 + }, + { + "epoch": 22.023603964686128, + "grad_norm": 0.0017889493610709906, + "learning_rate": 2.3687013666973588e-05, + "loss": 0.0168, + "step": 105910 + }, + { + "epoch": 22.02365812706494, + "grad_norm": 0.002523469738662243, + "learning_rate": 2.3684004645928494e-05, + "loss": 0.029, + "step": 105920 + }, + { + "epoch": 22.023712289443754, + "grad_norm": 0.15991637110710144, + "learning_rate": 2.36809956248834e-05, + "loss": 0.0059, + "step": 105930 + }, + { + "epoch": 22.023766451822564, + "grad_norm": 
0.0023300889879465103, + "learning_rate": 2.367798660383831e-05, + "loss": 0.0042, + "step": 105940 + }, + { + "epoch": 22.023820614201377, + "grad_norm": 0.14624778926372528, + "learning_rate": 2.3674977582793213e-05, + "loss": 0.0089, + "step": 105950 + }, + { + "epoch": 22.023874776580186, + "grad_norm": 0.003844026243314147, + "learning_rate": 2.3671968561748123e-05, + "loss": 0.0499, + "step": 105960 + }, + { + "epoch": 22.023928938959, + "grad_norm": 0.15634331107139587, + "learning_rate": 2.366895954070303e-05, + "loss": 0.0653, + "step": 105970 + }, + { + "epoch": 22.023983101337812, + "grad_norm": 0.43384015560150146, + "learning_rate": 2.3665950519657935e-05, + "loss": 0.0058, + "step": 105980 + }, + { + "epoch": 22.02403726371662, + "grad_norm": 0.002920906525105238, + "learning_rate": 2.366294149861284e-05, + "loss": 0.0059, + "step": 105990 + }, + { + "epoch": 22.024091426095435, + "grad_norm": 0.008241130039095879, + "learning_rate": 2.365993247756775e-05, + "loss": 0.1182, + "step": 106000 + }, + { + "epoch": 22.024145588474244, + "grad_norm": 3.173616409301758, + "learning_rate": 2.3656923456522657e-05, + "loss": 0.0322, + "step": 106010 + }, + { + "epoch": 22.024199750853057, + "grad_norm": 2.9878602027893066, + "learning_rate": 2.365391443547756e-05, + "loss": 0.0457, + "step": 106020 + }, + { + "epoch": 22.02425391323187, + "grad_norm": 0.00147437141276896, + "learning_rate": 2.365090541443247e-05, + "loss": 0.0557, + "step": 106030 + }, + { + "epoch": 22.02430807561068, + "grad_norm": 0.0016306116012856364, + "learning_rate": 2.3647896393387376e-05, + "loss": 0.0723, + "step": 106040 + }, + { + "epoch": 22.024362237989493, + "grad_norm": 0.029392488300800323, + "learning_rate": 2.3644887372342283e-05, + "loss": 0.0318, + "step": 106050 + }, + { + "epoch": 22.024416400368303, + "grad_norm": 3.3468692302703857, + "learning_rate": 2.364187835129719e-05, + "loss": 0.0612, + "step": 106060 + }, + { + "epoch": 22.024470562747116, + "grad_norm": 0.03435279056429863, + "learning_rate": 2.36388693302521e-05, + "loss": 0.1573, + "step": 106070 + }, + { + "epoch": 22.02452472512593, + "grad_norm": 0.0015246360562741756, + "learning_rate": 2.3635860309207e-05, + "loss": 0.1333, + "step": 106080 + }, + { + "epoch": 22.024578887504738, + "grad_norm": 0.15828432142734528, + "learning_rate": 2.363285128816191e-05, + "loss": 0.0156, + "step": 106090 + }, + { + "epoch": 22.02463304988355, + "grad_norm": 0.9370366334915161, + "learning_rate": 2.3629842267116817e-05, + "loss": 0.0428, + "step": 106100 + }, + { + "epoch": 22.024687212262364, + "grad_norm": 0.18918028473854065, + "learning_rate": 2.3626833246071724e-05, + "loss": 0.0558, + "step": 106110 + }, + { + "epoch": 22.024741374641174, + "grad_norm": 1.5960065126419067, + "learning_rate": 2.362382422502663e-05, + "loss": 0.0875, + "step": 106120 + }, + { + "epoch": 22.024795537019987, + "grad_norm": 5.633467197418213, + "learning_rate": 2.362081520398154e-05, + "loss": 0.0707, + "step": 106130 + }, + { + "epoch": 22.024849699398796, + "grad_norm": 0.38734501600265503, + "learning_rate": 2.3617806182936446e-05, + "loss": 0.0472, + "step": 106140 + }, + { + "epoch": 22.02490386177761, + "grad_norm": 2.723412036895752, + "learning_rate": 2.3614797161891352e-05, + "loss": 0.0531, + "step": 106150 + }, + { + "epoch": 22.024958024156422, + "grad_norm": 0.0015723000979050994, + "learning_rate": 2.361178814084626e-05, + "loss": 0.0324, + "step": 106160 + }, + { + "epoch": 22.02500135405947, + "eval_accuracy": 0.8484650555192684, + 
"eval_loss": 0.7807546854019165, + "eval_runtime": 117.3266, + "eval_samples_per_second": 26.098, + "eval_steps_per_second": 3.264, + "step": 106168 + }, + { + "epoch": 23.000010832475763, + "grad_norm": 0.0014313565334305167, + "learning_rate": 2.3608779119801165e-05, + "loss": 0.0034, + "step": 106170 + }, + { + "epoch": 23.000064994854576, + "grad_norm": 0.6104956269264221, + "learning_rate": 2.360577009875607e-05, + "loss": 0.0731, + "step": 106180 + }, + { + "epoch": 23.000119157233385, + "grad_norm": 0.0015504815382882953, + "learning_rate": 2.3602761077710977e-05, + "loss": 0.0039, + "step": 106190 + }, + { + "epoch": 23.0001733196122, + "grad_norm": 0.0013942731311544776, + "learning_rate": 2.3599752056665887e-05, + "loss": 0.0182, + "step": 106200 + }, + { + "epoch": 23.000227481991008, + "grad_norm": 0.9473475217819214, + "learning_rate": 2.359674303562079e-05, + "loss": 0.0457, + "step": 106210 + }, + { + "epoch": 23.00028164436982, + "grad_norm": 1.8474962711334229, + "learning_rate": 2.35937340145757e-05, + "loss": 0.0938, + "step": 106220 + }, + { + "epoch": 23.000335806748634, + "grad_norm": 1.040013074874878, + "learning_rate": 2.3590724993530606e-05, + "loss": 0.0337, + "step": 106230 + }, + { + "epoch": 23.000389969127443, + "grad_norm": 0.16905531287193298, + "learning_rate": 2.3587715972485512e-05, + "loss": 0.0452, + "step": 106240 + }, + { + "epoch": 23.000444131506256, + "grad_norm": 1.1326135396957397, + "learning_rate": 2.3584706951440418e-05, + "loss": 0.037, + "step": 106250 + }, + { + "epoch": 23.000498293885066, + "grad_norm": 0.0013948165578767657, + "learning_rate": 2.3581697930395328e-05, + "loss": 0.004, + "step": 106260 + }, + { + "epoch": 23.00055245626388, + "grad_norm": 0.1389145851135254, + "learning_rate": 2.3578688909350234e-05, + "loss": 0.0313, + "step": 106270 + }, + { + "epoch": 23.000606618642692, + "grad_norm": 0.0013722266303375363, + "learning_rate": 2.357567988830514e-05, + "loss": 0.0005, + "step": 106280 + }, + { + "epoch": 23.0006607810215, + "grad_norm": 0.006914348807185888, + "learning_rate": 2.3572670867260047e-05, + "loss": 0.0003, + "step": 106290 + }, + { + "epoch": 23.000714943400315, + "grad_norm": 0.4345429241657257, + "learning_rate": 2.3569661846214956e-05, + "loss": 0.0475, + "step": 106300 + }, + { + "epoch": 23.000769105779124, + "grad_norm": 0.01547437347471714, + "learning_rate": 2.356665282516986e-05, + "loss": 0.159, + "step": 106310 + }, + { + "epoch": 23.000823268157937, + "grad_norm": 1.0513253211975098, + "learning_rate": 2.3563643804124766e-05, + "loss": 0.045, + "step": 106320 + }, + { + "epoch": 23.00087743053675, + "grad_norm": 0.0017610556678846478, + "learning_rate": 2.3560634783079675e-05, + "loss": 0.0004, + "step": 106330 + }, + { + "epoch": 23.00093159291556, + "grad_norm": 0.03435102850198746, + "learning_rate": 2.3557625762034578e-05, + "loss": 0.0401, + "step": 106340 + }, + { + "epoch": 23.000985755294373, + "grad_norm": 1.3211530447006226, + "learning_rate": 2.3554616740989488e-05, + "loss": 0.0482, + "step": 106350 + }, + { + "epoch": 23.001039917673186, + "grad_norm": 0.0017357270698994398, + "learning_rate": 2.3551607719944394e-05, + "loss": 0.0191, + "step": 106360 + }, + { + "epoch": 23.001094080051995, + "grad_norm": 6.01875114440918, + "learning_rate": 2.35485986988993e-05, + "loss": 0.0315, + "step": 106370 + }, + { + "epoch": 23.00114824243081, + "grad_norm": 0.0761415958404541, + "learning_rate": 2.3545589677854207e-05, + "loss": 0.0375, + "step": 106380 + }, + { + "epoch": 
23.001202404809618, + "grad_norm": 0.001385535579174757, + "learning_rate": 2.3542580656809116e-05, + "loss": 0.0003, + "step": 106390 + }, + { + "epoch": 23.00125656718843, + "grad_norm": 2.744001626968384, + "learning_rate": 2.3539571635764022e-05, + "loss": 0.1257, + "step": 106400 + }, + { + "epoch": 23.001310729567244, + "grad_norm": 2.261899709701538, + "learning_rate": 2.353656261471893e-05, + "loss": 0.0126, + "step": 106410 + }, + { + "epoch": 23.001364891946054, + "grad_norm": 9.365140914916992, + "learning_rate": 2.3533553593673835e-05, + "loss": 0.1326, + "step": 106420 + }, + { + "epoch": 23.001419054324867, + "grad_norm": 10.044968605041504, + "learning_rate": 2.3530544572628745e-05, + "loss": 0.1705, + "step": 106430 + }, + { + "epoch": 23.001473216703676, + "grad_norm": 0.0013647829182446003, + "learning_rate": 2.3527535551583648e-05, + "loss": 0.1026, + "step": 106440 + }, + { + "epoch": 23.00152737908249, + "grad_norm": 0.006722067482769489, + "learning_rate": 2.3524526530538557e-05, + "loss": 0.0012, + "step": 106450 + }, + { + "epoch": 23.001581541461302, + "grad_norm": 0.0017306440277025104, + "learning_rate": 2.3521517509493464e-05, + "loss": 0.0069, + "step": 106460 + }, + { + "epoch": 23.001635703840112, + "grad_norm": 1.5961713790893555, + "learning_rate": 2.3518508488448366e-05, + "loss": 0.0658, + "step": 106470 + }, + { + "epoch": 23.001689866218925, + "grad_norm": 2.2805020809173584, + "learning_rate": 2.3515499467403276e-05, + "loss": 0.0705, + "step": 106480 + }, + { + "epoch": 23.001744028597734, + "grad_norm": 0.48338937759399414, + "learning_rate": 2.3512490446358182e-05, + "loss": 0.0418, + "step": 106490 + }, + { + "epoch": 23.001798190976547, + "grad_norm": 0.04607585445046425, + "learning_rate": 2.350948142531309e-05, + "loss": 0.042, + "step": 106500 + }, + { + "epoch": 23.00185235335536, + "grad_norm": 7.336219310760498, + "learning_rate": 2.3506472404267995e-05, + "loss": 0.0395, + "step": 106510 + }, + { + "epoch": 23.00190651573417, + "grad_norm": 0.03255310654640198, + "learning_rate": 2.3503463383222905e-05, + "loss": 0.1645, + "step": 106520 + }, + { + "epoch": 23.001960678112983, + "grad_norm": 0.002266752067953348, + "learning_rate": 2.350045436217781e-05, + "loss": 0.0502, + "step": 106530 + }, + { + "epoch": 23.002014840491796, + "grad_norm": 0.002157798269763589, + "learning_rate": 2.3497445341132717e-05, + "loss": 0.0164, + "step": 106540 + }, + { + "epoch": 23.002069002870606, + "grad_norm": 0.08941686153411865, + "learning_rate": 2.3494436320087623e-05, + "loss": 0.0449, + "step": 106550 + }, + { + "epoch": 23.00212316524942, + "grad_norm": 0.0018702230881899595, + "learning_rate": 2.3491427299042533e-05, + "loss": 0.0021, + "step": 106560 + }, + { + "epoch": 23.002177327628228, + "grad_norm": 0.0053636860102415085, + "learning_rate": 2.3488418277997436e-05, + "loss": 0.036, + "step": 106570 + }, + { + "epoch": 23.00223149000704, + "grad_norm": 0.002583312103524804, + "learning_rate": 2.3485409256952346e-05, + "loss": 0.0999, + "step": 106580 + }, + { + "epoch": 23.002285652385854, + "grad_norm": 0.01862751506268978, + "learning_rate": 2.3482400235907252e-05, + "loss": 0.0199, + "step": 106590 + }, + { + "epoch": 23.002339814764664, + "grad_norm": 0.6736955642700195, + "learning_rate": 2.3479391214862158e-05, + "loss": 0.0211, + "step": 106600 + }, + { + "epoch": 23.002393977143477, + "grad_norm": 1.0588405132293701, + "learning_rate": 2.3476382193817064e-05, + "loss": 0.0153, + "step": 106610 + }, + { + "epoch": 23.002448139522286, + 
"grad_norm": 1.713447093963623, + "learning_rate": 2.347337317277197e-05, + "loss": 0.037, + "step": 106620 + }, + { + "epoch": 23.0025023019011, + "grad_norm": 4.636550426483154, + "learning_rate": 2.3470364151726877e-05, + "loss": 0.1637, + "step": 106630 + }, + { + "epoch": 23.002556464279913, + "grad_norm": 0.0017158051487058401, + "learning_rate": 2.3467355130681783e-05, + "loss": 0.0246, + "step": 106640 + }, + { + "epoch": 23.002610626658722, + "grad_norm": 1.4512343406677246, + "learning_rate": 2.3464346109636693e-05, + "loss": 0.1125, + "step": 106650 + }, + { + "epoch": 23.002664789037535, + "grad_norm": 0.4704143702983856, + "learning_rate": 2.34613370885916e-05, + "loss": 0.0296, + "step": 106660 + }, + { + "epoch": 23.002718951416345, + "grad_norm": 0.02483762428164482, + "learning_rate": 2.3458328067546505e-05, + "loss": 0.029, + "step": 106670 + }, + { + "epoch": 23.002773113795158, + "grad_norm": 0.0022490990813821554, + "learning_rate": 2.3455319046501412e-05, + "loss": 0.0669, + "step": 106680 + }, + { + "epoch": 23.00282727617397, + "grad_norm": 0.024291399866342545, + "learning_rate": 2.345231002545632e-05, + "loss": 0.0009, + "step": 106690 + }, + { + "epoch": 23.00288143855278, + "grad_norm": 0.006522486452013254, + "learning_rate": 2.3449301004411224e-05, + "loss": 0.0784, + "step": 106700 + }, + { + "epoch": 23.002935600931593, + "grad_norm": 0.0015565728535875678, + "learning_rate": 2.3446291983366134e-05, + "loss": 0.022, + "step": 106710 + }, + { + "epoch": 23.002989763310406, + "grad_norm": 0.4464746415615082, + "learning_rate": 2.344328296232104e-05, + "loss": 0.0498, + "step": 106720 + }, + { + "epoch": 23.003043925689216, + "grad_norm": 0.04566536843776703, + "learning_rate": 2.3440273941275946e-05, + "loss": 0.0254, + "step": 106730 + }, + { + "epoch": 23.00309808806803, + "grad_norm": 0.10411117970943451, + "learning_rate": 2.3437264920230853e-05, + "loss": 0.0476, + "step": 106740 + }, + { + "epoch": 23.00315225044684, + "grad_norm": 0.0032453006133437157, + "learning_rate": 2.3434255899185762e-05, + "loss": 0.0772, + "step": 106750 + }, + { + "epoch": 23.00320641282565, + "grad_norm": 0.3948831856250763, + "learning_rate": 2.3431246878140665e-05, + "loss": 0.0602, + "step": 106760 + }, + { + "epoch": 23.003260575204465, + "grad_norm": 0.08055372536182404, + "learning_rate": 2.342823785709557e-05, + "loss": 0.0471, + "step": 106770 + }, + { + "epoch": 23.003314737583274, + "grad_norm": 0.04405328258872032, + "learning_rate": 2.342522883605048e-05, + "loss": 0.0344, + "step": 106780 + }, + { + "epoch": 23.003368899962087, + "grad_norm": 0.006329078692942858, + "learning_rate": 2.3422219815005388e-05, + "loss": 0.0244, + "step": 106790 + }, + { + "epoch": 23.003423062340897, + "grad_norm": 0.0021907046902924776, + "learning_rate": 2.3419210793960294e-05, + "loss": 0.0594, + "step": 106800 + }, + { + "epoch": 23.00347722471971, + "grad_norm": 0.3285902440547943, + "learning_rate": 2.34162017729152e-05, + "loss": 0.0293, + "step": 106810 + }, + { + "epoch": 23.003531387098523, + "grad_norm": 0.03547278419137001, + "learning_rate": 2.341319275187011e-05, + "loss": 0.0018, + "step": 106820 + }, + { + "epoch": 23.003585549477332, + "grad_norm": 0.12692324817180634, + "learning_rate": 2.3410183730825013e-05, + "loss": 0.0082, + "step": 106830 + }, + { + "epoch": 23.003639711856145, + "grad_norm": 0.0013976155314594507, + "learning_rate": 2.3407174709779922e-05, + "loss": 0.0468, + "step": 106840 + }, + { + "epoch": 23.003693874234955, + "grad_norm": 
0.0021948525682091713, + "learning_rate": 2.340416568873483e-05, + "loss": 0.0547, + "step": 106850 + }, + { + "epoch": 23.003748036613768, + "grad_norm": 0.46322697401046753, + "learning_rate": 2.3401156667689735e-05, + "loss": 0.0503, + "step": 106860 + }, + { + "epoch": 23.00380219899258, + "grad_norm": 0.26525333523750305, + "learning_rate": 2.339814764664464e-05, + "loss": 0.1336, + "step": 106870 + }, + { + "epoch": 23.00385636137139, + "grad_norm": 0.7324000000953674, + "learning_rate": 2.339513862559955e-05, + "loss": 0.0098, + "step": 106880 + }, + { + "epoch": 23.003910523750204, + "grad_norm": 0.627208948135376, + "learning_rate": 2.3392129604554454e-05, + "loss": 0.0878, + "step": 106890 + }, + { + "epoch": 23.003964686129013, + "grad_norm": 0.0020517020020633936, + "learning_rate": 2.3389120583509363e-05, + "loss": 0.0083, + "step": 106900 + }, + { + "epoch": 23.004018848507826, + "grad_norm": 16.04061508178711, + "learning_rate": 2.338611156246427e-05, + "loss": 0.0818, + "step": 106910 + }, + { + "epoch": 23.00407301088664, + "grad_norm": 0.0036011310294270515, + "learning_rate": 2.3383102541419176e-05, + "loss": 0.0697, + "step": 106920 + }, + { + "epoch": 23.00412717326545, + "grad_norm": 0.004517156630754471, + "learning_rate": 2.3380093520374082e-05, + "loss": 0.0458, + "step": 106930 + }, + { + "epoch": 23.00418133564426, + "grad_norm": 0.013297967612743378, + "learning_rate": 2.337708449932899e-05, + "loss": 0.0581, + "step": 106940 + }, + { + "epoch": 23.004235498023075, + "grad_norm": 0.0021922586020082235, + "learning_rate": 2.3374075478283898e-05, + "loss": 0.0248, + "step": 106950 + }, + { + "epoch": 23.004289660401884, + "grad_norm": 0.002187256468459964, + "learning_rate": 2.33710664572388e-05, + "loss": 0.0285, + "step": 106960 + }, + { + "epoch": 23.004343822780697, + "grad_norm": 0.0014165631728246808, + "learning_rate": 2.336805743619371e-05, + "loss": 0.0241, + "step": 106970 + }, + { + "epoch": 23.004397985159507, + "grad_norm": 0.10014567524194717, + "learning_rate": 2.3365048415148617e-05, + "loss": 0.0318, + "step": 106980 + }, + { + "epoch": 23.00445214753832, + "grad_norm": 0.05098872259259224, + "learning_rate": 2.3362039394103523e-05, + "loss": 0.0388, + "step": 106990 + }, + { + "epoch": 23.004506309917133, + "grad_norm": 0.07601124048233032, + "learning_rate": 2.335903037305843e-05, + "loss": 0.1667, + "step": 107000 + }, + { + "epoch": 23.004560472295942, + "grad_norm": 0.05255204811692238, + "learning_rate": 2.335602135201334e-05, + "loss": 0.0141, + "step": 107010 + }, + { + "epoch": 23.004614634674756, + "grad_norm": 0.4095138907432556, + "learning_rate": 2.3353012330968242e-05, + "loss": 0.0337, + "step": 107020 + }, + { + "epoch": 23.004668797053565, + "grad_norm": 0.2338481843471527, + "learning_rate": 2.335000330992315e-05, + "loss": 0.0519, + "step": 107030 + }, + { + "epoch": 23.004722959432378, + "grad_norm": 0.44344910979270935, + "learning_rate": 2.3346994288878058e-05, + "loss": 0.0708, + "step": 107040 + }, + { + "epoch": 23.00477712181119, + "grad_norm": 3.6312835216522217, + "learning_rate": 2.3343985267832964e-05, + "loss": 0.0644, + "step": 107050 + }, + { + "epoch": 23.00483128419, + "grad_norm": 0.01768883503973484, + "learning_rate": 2.334097624678787e-05, + "loss": 0.0372, + "step": 107060 + }, + { + "epoch": 23.004885446568814, + "grad_norm": 0.4933013617992401, + "learning_rate": 2.3337967225742777e-05, + "loss": 0.0979, + "step": 107070 + }, + { + "epoch": 23.004939608947623, + "grad_norm": 0.1839456856250763, + 
"learning_rate": 2.3334958204697686e-05, + "loss": 0.052, + "step": 107080 + }, + { + "epoch": 23.004993771326436, + "grad_norm": 1.4203850030899048, + "learning_rate": 2.333194918365259e-05, + "loss": 0.013, + "step": 107090 + }, + { + "epoch": 23.00504793370525, + "grad_norm": 0.3390403687953949, + "learning_rate": 2.33289401626075e-05, + "loss": 0.0513, + "step": 107100 + }, + { + "epoch": 23.00510209608406, + "grad_norm": 1.2012385129928589, + "learning_rate": 2.3325931141562405e-05, + "loss": 0.0211, + "step": 107110 + }, + { + "epoch": 23.005156258462872, + "grad_norm": 0.030578020960092545, + "learning_rate": 2.332292212051731e-05, + "loss": 0.0037, + "step": 107120 + }, + { + "epoch": 23.005210420841685, + "grad_norm": 2.5271754264831543, + "learning_rate": 2.3319913099472218e-05, + "loss": 0.0263, + "step": 107130 + }, + { + "epoch": 23.005264583220495, + "grad_norm": 0.043785613030195236, + "learning_rate": 2.3316904078427127e-05, + "loss": 0.0314, + "step": 107140 + }, + { + "epoch": 23.005318745599308, + "grad_norm": 2.036740303039551, + "learning_rate": 2.331389505738203e-05, + "loss": 0.1248, + "step": 107150 + }, + { + "epoch": 23.005372907978117, + "grad_norm": 0.12820766866207123, + "learning_rate": 2.331088603633694e-05, + "loss": 0.0108, + "step": 107160 + }, + { + "epoch": 23.00542707035693, + "grad_norm": 0.07483287900686264, + "learning_rate": 2.3307877015291846e-05, + "loss": 0.0051, + "step": 107170 + }, + { + "epoch": 23.005481232735743, + "grad_norm": 0.0013112592278048396, + "learning_rate": 2.3304867994246753e-05, + "loss": 0.0512, + "step": 107180 + }, + { + "epoch": 23.005535395114553, + "grad_norm": 0.04921048507094383, + "learning_rate": 2.330185897320166e-05, + "loss": 0.0124, + "step": 107190 + }, + { + "epoch": 23.005589557493366, + "grad_norm": 0.0021154924761503935, + "learning_rate": 2.329884995215657e-05, + "loss": 0.0533, + "step": 107200 + }, + { + "epoch": 23.005643719872175, + "grad_norm": 0.4829463064670563, + "learning_rate": 2.3295840931111475e-05, + "loss": 0.0351, + "step": 107210 + }, + { + "epoch": 23.00569788225099, + "grad_norm": 0.06339601427316666, + "learning_rate": 2.3292831910066378e-05, + "loss": 0.0351, + "step": 107220 + }, + { + "epoch": 23.0057520446298, + "grad_norm": 0.002167812082916498, + "learning_rate": 2.3289822889021287e-05, + "loss": 0.0374, + "step": 107230 + }, + { + "epoch": 23.00580620700861, + "grad_norm": 0.06862837076187134, + "learning_rate": 2.3286813867976194e-05, + "loss": 0.0569, + "step": 107240 + }, + { + "epoch": 23.005860369387424, + "grad_norm": 0.0037541110068559647, + "learning_rate": 2.32838048469311e-05, + "loss": 0.0854, + "step": 107250 + }, + { + "epoch": 23.005914531766233, + "grad_norm": 0.02524840459227562, + "learning_rate": 2.3280795825886006e-05, + "loss": 0.0133, + "step": 107260 + }, + { + "epoch": 23.005968694145047, + "grad_norm": 0.0014225090853869915, + "learning_rate": 2.3277786804840916e-05, + "loss": 0.0076, + "step": 107270 + }, + { + "epoch": 23.00602285652386, + "grad_norm": 1.1466931104660034, + "learning_rate": 2.327477778379582e-05, + "loss": 0.0392, + "step": 107280 + }, + { + "epoch": 23.00607701890267, + "grad_norm": 0.059432923793792725, + "learning_rate": 2.327176876275073e-05, + "loss": 0.042, + "step": 107290 + }, + { + "epoch": 23.006131181281482, + "grad_norm": 4.678475379943848, + "learning_rate": 2.3268759741705635e-05, + "loss": 0.0292, + "step": 107300 + }, + { + "epoch": 23.006185343660295, + "grad_norm": 0.8122572898864746, + "learning_rate": 
2.326575072066054e-05, + "loss": 0.0058, + "step": 107310 + }, + { + "epoch": 23.006239506039105, + "grad_norm": 0.001193423056975007, + "learning_rate": 2.3262741699615447e-05, + "loss": 0.0353, + "step": 107320 + }, + { + "epoch": 23.006293668417918, + "grad_norm": 0.12384961545467377, + "learning_rate": 2.3259732678570357e-05, + "loss": 0.0367, + "step": 107330 + }, + { + "epoch": 23.006347830796727, + "grad_norm": 0.0011959044495597482, + "learning_rate": 2.3256723657525263e-05, + "loss": 0.0334, + "step": 107340 + }, + { + "epoch": 23.00640199317554, + "grad_norm": 0.0042528859339654446, + "learning_rate": 2.325371463648017e-05, + "loss": 0.132, + "step": 107350 + }, + { + "epoch": 23.006456155554353, + "grad_norm": 0.002638715086504817, + "learning_rate": 2.3250705615435076e-05, + "loss": 0.0421, + "step": 107360 + }, + { + "epoch": 23.006510317933163, + "grad_norm": 0.0012270637089386582, + "learning_rate": 2.3247696594389982e-05, + "loss": 0.075, + "step": 107370 + }, + { + "epoch": 23.006564480311976, + "grad_norm": 0.0012313760817050934, + "learning_rate": 2.3244687573344888e-05, + "loss": 0.073, + "step": 107380 + }, + { + "epoch": 23.006618642690785, + "grad_norm": 0.0012427284382283688, + "learning_rate": 2.3241678552299794e-05, + "loss": 0.0127, + "step": 107390 + }, + { + "epoch": 23.0066728050696, + "grad_norm": 0.02089519239962101, + "learning_rate": 2.3238669531254704e-05, + "loss": 0.0483, + "step": 107400 + }, + { + "epoch": 23.00672696744841, + "grad_norm": 0.009073445573449135, + "learning_rate": 2.3235660510209607e-05, + "loss": 0.028, + "step": 107410 + }, + { + "epoch": 23.00678112982722, + "grad_norm": 0.0012410705676302314, + "learning_rate": 2.3232651489164517e-05, + "loss": 0.0194, + "step": 107420 + }, + { + "epoch": 23.006835292206034, + "grad_norm": 3.412658452987671, + "learning_rate": 2.3229642468119423e-05, + "loss": 0.06, + "step": 107430 + }, + { + "epoch": 23.006889454584844, + "grad_norm": 0.47317996621131897, + "learning_rate": 2.322663344707433e-05, + "loss": 0.0275, + "step": 107440 + }, + { + "epoch": 23.006943616963657, + "grad_norm": 0.028473176062107086, + "learning_rate": 2.3223624426029236e-05, + "loss": 0.0709, + "step": 107450 + }, + { + "epoch": 23.00699777934247, + "grad_norm": 0.0022597601637244225, + "learning_rate": 2.3220615404984145e-05, + "loss": 0.0229, + "step": 107460 + }, + { + "epoch": 23.00705194172128, + "grad_norm": 0.050597697496414185, + "learning_rate": 2.321760638393905e-05, + "loss": 0.0717, + "step": 107470 + }, + { + "epoch": 23.007106104100092, + "grad_norm": 0.001330336439423263, + "learning_rate": 2.3214597362893958e-05, + "loss": 0.0383, + "step": 107480 + }, + { + "epoch": 23.007160266478905, + "grad_norm": 0.2119753211736679, + "learning_rate": 2.3211588341848864e-05, + "loss": 0.0422, + "step": 107490 + }, + { + "epoch": 23.007214428857715, + "grad_norm": 0.2455664426088333, + "learning_rate": 2.3208579320803774e-05, + "loss": 0.0059, + "step": 107500 + }, + { + "epoch": 23.007268591236528, + "grad_norm": 0.514948844909668, + "learning_rate": 2.3205570299758677e-05, + "loss": 0.1281, + "step": 107510 + }, + { + "epoch": 23.007322753615338, + "grad_norm": 2.742877244949341, + "learning_rate": 2.3202561278713583e-05, + "loss": 0.0553, + "step": 107520 + }, + { + "epoch": 23.00737691599415, + "grad_norm": 0.6283391714096069, + "learning_rate": 2.3199552257668492e-05, + "loss": 0.0316, + "step": 107530 + }, + { + "epoch": 23.007431078372964, + "grad_norm": 0.701327383518219, + "learning_rate": 
2.3196543236623395e-05, + "loss": 0.0266, + "step": 107540 + }, + { + "epoch": 23.007485240751773, + "grad_norm": 3.9219930171966553, + "learning_rate": 2.3193534215578305e-05, + "loss": 0.0453, + "step": 107550 + }, + { + "epoch": 23.007539403130586, + "grad_norm": 1.6686527729034424, + "learning_rate": 2.319052519453321e-05, + "loss": 0.0169, + "step": 107560 + }, + { + "epoch": 23.007593565509396, + "grad_norm": 0.5552170276641846, + "learning_rate": 2.3187516173488118e-05, + "loss": 0.118, + "step": 107570 + }, + { + "epoch": 23.00764772788821, + "grad_norm": 0.0015036090044304729, + "learning_rate": 2.3184507152443024e-05, + "loss": 0.0192, + "step": 107580 + }, + { + "epoch": 23.007701890267022, + "grad_norm": 0.0038282712921500206, + "learning_rate": 2.3181498131397934e-05, + "loss": 0.006, + "step": 107590 + }, + { + "epoch": 23.00775605264583, + "grad_norm": 0.005604168865829706, + "learning_rate": 2.317848911035284e-05, + "loss": 0.0066, + "step": 107600 + }, + { + "epoch": 23.007810215024644, + "grad_norm": 1.2839058637619019, + "learning_rate": 2.3175480089307746e-05, + "loss": 0.1174, + "step": 107610 + }, + { + "epoch": 23.007864377403454, + "grad_norm": 0.2945883870124817, + "learning_rate": 2.3172471068262652e-05, + "loss": 0.0252, + "step": 107620 + }, + { + "epoch": 23.007918539782267, + "grad_norm": 0.0014038387453183532, + "learning_rate": 2.3169462047217562e-05, + "loss": 0.0282, + "step": 107630 + }, + { + "epoch": 23.00797270216108, + "grad_norm": 0.4435470402240753, + "learning_rate": 2.3166453026172465e-05, + "loss": 0.0448, + "step": 107640 + }, + { + "epoch": 23.00802686453989, + "grad_norm": 0.001913105952553451, + "learning_rate": 2.3163444005127375e-05, + "loss": 0.0312, + "step": 107650 + }, + { + "epoch": 23.008081026918703, + "grad_norm": 0.13524246215820312, + "learning_rate": 2.316043498408228e-05, + "loss": 0.0014, + "step": 107660 + }, + { + "epoch": 23.008135189297516, + "grad_norm": 0.0012869001366198063, + "learning_rate": 2.3157425963037184e-05, + "loss": 0.0383, + "step": 107670 + }, + { + "epoch": 23.008189351676325, + "grad_norm": 0.6893039345741272, + "learning_rate": 2.3154416941992093e-05, + "loss": 0.0344, + "step": 107680 + }, + { + "epoch": 23.00824351405514, + "grad_norm": 0.9926559925079346, + "learning_rate": 2.3151407920947e-05, + "loss": 0.0292, + "step": 107690 + }, + { + "epoch": 23.008297676433948, + "grad_norm": 0.0014494735514745116, + "learning_rate": 2.3148398899901906e-05, + "loss": 0.1711, + "step": 107700 + }, + { + "epoch": 23.00835183881276, + "grad_norm": 1.2372105121612549, + "learning_rate": 2.3145389878856812e-05, + "loss": 0.0724, + "step": 107710 + }, + { + "epoch": 23.008406001191574, + "grad_norm": 0.9532870650291443, + "learning_rate": 2.3142380857811722e-05, + "loss": 0.0374, + "step": 107720 + }, + { + "epoch": 23.008460163570383, + "grad_norm": 0.011664208956062794, + "learning_rate": 2.3139371836766628e-05, + "loss": 0.0112, + "step": 107730 + }, + { + "epoch": 23.008514325949196, + "grad_norm": 0.0016174778575077653, + "learning_rate": 2.3136362815721534e-05, + "loss": 0.0181, + "step": 107740 + }, + { + "epoch": 23.008568488328006, + "grad_norm": 0.003228050423786044, + "learning_rate": 2.313335379467644e-05, + "loss": 0.0584, + "step": 107750 + }, + { + "epoch": 23.00862265070682, + "grad_norm": 0.025496983900666237, + "learning_rate": 2.313034477363135e-05, + "loss": 0.0231, + "step": 107760 + }, + { + "epoch": 23.008676813085632, + "grad_norm": 0.019956840202212334, + "learning_rate": 
2.3127335752586253e-05, + "loss": 0.06, + "step": 107770 + }, + { + "epoch": 23.00873097546444, + "grad_norm": 0.003491549286991358, + "learning_rate": 2.3124326731541163e-05, + "loss": 0.0305, + "step": 107780 + }, + { + "epoch": 23.008785137843255, + "grad_norm": 0.022620810195803642, + "learning_rate": 2.312131771049607e-05, + "loss": 0.091, + "step": 107790 + }, + { + "epoch": 23.008839300222064, + "grad_norm": 0.8273584842681885, + "learning_rate": 2.3118308689450975e-05, + "loss": 0.1215, + "step": 107800 + }, + { + "epoch": 23.008893462600877, + "grad_norm": 1.9393776655197144, + "learning_rate": 2.3115299668405882e-05, + "loss": 0.1211, + "step": 107810 + }, + { + "epoch": 23.00894762497969, + "grad_norm": 1.1001588106155396, + "learning_rate": 2.3112290647360788e-05, + "loss": 0.0367, + "step": 107820 + }, + { + "epoch": 23.0090017873585, + "grad_norm": 0.3025175631046295, + "learning_rate": 2.3109281626315694e-05, + "loss": 0.0643, + "step": 107830 + }, + { + "epoch": 23.009055949737313, + "grad_norm": 0.023638617247343063, + "learning_rate": 2.31062726052706e-05, + "loss": 0.0894, + "step": 107840 + }, + { + "epoch": 23.009110112116126, + "grad_norm": 0.00265858368948102, + "learning_rate": 2.310326358422551e-05, + "loss": 0.0018, + "step": 107850 + }, + { + "epoch": 23.009164274494935, + "grad_norm": 5.509610176086426, + "learning_rate": 2.3100254563180416e-05, + "loss": 0.035, + "step": 107860 + }, + { + "epoch": 23.00921843687375, + "grad_norm": 9.659786224365234, + "learning_rate": 2.3097245542135323e-05, + "loss": 0.0973, + "step": 107870 + }, + { + "epoch": 23.009272599252558, + "grad_norm": 0.9078571200370789, + "learning_rate": 2.309423652109023e-05, + "loss": 0.1694, + "step": 107880 + }, + { + "epoch": 23.00932676163137, + "grad_norm": 0.2075580209493637, + "learning_rate": 2.309122750004514e-05, + "loss": 0.0268, + "step": 107890 + }, + { + "epoch": 23.009380924010184, + "grad_norm": 0.0019722902216017246, + "learning_rate": 2.308821847900004e-05, + "loss": 0.0645, + "step": 107900 + }, + { + "epoch": 23.009435086388994, + "grad_norm": 0.10566259920597076, + "learning_rate": 2.308520945795495e-05, + "loss": 0.0462, + "step": 107910 + }, + { + "epoch": 23.009489248767807, + "grad_norm": 0.01974065601825714, + "learning_rate": 2.3082200436909858e-05, + "loss": 0.0349, + "step": 107920 + }, + { + "epoch": 23.009543411146616, + "grad_norm": 0.0020848456770181656, + "learning_rate": 2.3079191415864764e-05, + "loss": 0.0587, + "step": 107930 + }, + { + "epoch": 23.00959757352543, + "grad_norm": 0.014966660179197788, + "learning_rate": 2.307618239481967e-05, + "loss": 0.0171, + "step": 107940 + }, + { + "epoch": 23.009651735904242, + "grad_norm": 0.7523822784423828, + "learning_rate": 2.307317337377458e-05, + "loss": 0.1003, + "step": 107950 + }, + { + "epoch": 23.009705898283052, + "grad_norm": 0.03198444843292236, + "learning_rate": 2.3070164352729483e-05, + "loss": 0.013, + "step": 107960 + }, + { + "epoch": 23.009760060661865, + "grad_norm": 0.002931856317445636, + "learning_rate": 2.306715533168439e-05, + "loss": 0.1349, + "step": 107970 + }, + { + "epoch": 23.009814223040674, + "grad_norm": 6.088014125823975, + "learning_rate": 2.30641463106393e-05, + "loss": 0.0914, + "step": 107980 + }, + { + "epoch": 23.009868385419487, + "grad_norm": 6.482632637023926, + "learning_rate": 2.3061137289594205e-05, + "loss": 0.0789, + "step": 107990 + }, + { + "epoch": 23.0099225477983, + "grad_norm": 0.0034518109168857336, + "learning_rate": 2.305812826854911e-05, + "loss": 0.0466, + 
"step": 108000 + }, + { + "epoch": 23.00997671017711, + "grad_norm": 1.360408902168274, + "learning_rate": 2.3055119247504017e-05, + "loss": 0.0821, + "step": 108010 + }, + { + "epoch": 23.010030872555923, + "grad_norm": 0.2619033753871918, + "learning_rate": 2.3052110226458927e-05, + "loss": 0.0177, + "step": 108020 + }, + { + "epoch": 23.010085034934733, + "grad_norm": 0.34039631485939026, + "learning_rate": 2.304910120541383e-05, + "loss": 0.0758, + "step": 108030 + }, + { + "epoch": 23.010139197313546, + "grad_norm": 0.003615249413996935, + "learning_rate": 2.304609218436874e-05, + "loss": 0.0183, + "step": 108040 + }, + { + "epoch": 23.01019335969236, + "grad_norm": 1.4407404661178589, + "learning_rate": 2.3043083163323646e-05, + "loss": 0.0243, + "step": 108050 + }, + { + "epoch": 23.010247522071168, + "grad_norm": 0.5162412524223328, + "learning_rate": 2.3040074142278552e-05, + "loss": 0.0559, + "step": 108060 + }, + { + "epoch": 23.01030168444998, + "grad_norm": 0.06741423159837723, + "learning_rate": 2.303706512123346e-05, + "loss": 0.0035, + "step": 108070 + }, + { + "epoch": 23.010355846828794, + "grad_norm": 4.411358833312988, + "learning_rate": 2.3034056100188368e-05, + "loss": 0.096, + "step": 108080 + }, + { + "epoch": 23.010410009207604, + "grad_norm": 0.018513265997171402, + "learning_rate": 2.303104707914327e-05, + "loss": 0.0525, + "step": 108090 + }, + { + "epoch": 23.010464171586417, + "grad_norm": 0.002237963955849409, + "learning_rate": 2.302803805809818e-05, + "loss": 0.0316, + "step": 108100 + }, + { + "epoch": 23.010518333965226, + "grad_norm": 0.8834067583084106, + "learning_rate": 2.3025029037053087e-05, + "loss": 0.0174, + "step": 108110 + }, + { + "epoch": 23.01057249634404, + "grad_norm": 0.014752371236681938, + "learning_rate": 2.3022020016007993e-05, + "loss": 0.0759, + "step": 108120 + }, + { + "epoch": 23.010626658722853, + "grad_norm": 0.0025859582237899303, + "learning_rate": 2.30190109949629e-05, + "loss": 0.0526, + "step": 108130 + }, + { + "epoch": 23.010680821101662, + "grad_norm": 0.052897777408361435, + "learning_rate": 2.3016001973917806e-05, + "loss": 0.0587, + "step": 108140 + }, + { + "epoch": 23.010734983480475, + "grad_norm": 1.206848382949829, + "learning_rate": 2.3012992952872715e-05, + "loss": 0.0219, + "step": 108150 + }, + { + "epoch": 23.010789145859285, + "grad_norm": 0.09674908965826035, + "learning_rate": 2.3009983931827618e-05, + "loss": 0.047, + "step": 108160 + }, + { + "epoch": 23.010843308238098, + "grad_norm": 0.0031556342728435993, + "learning_rate": 2.3006974910782528e-05, + "loss": 0.0474, + "step": 108170 + }, + { + "epoch": 23.01089747061691, + "grad_norm": 0.0025248234160244465, + "learning_rate": 2.3003965889737434e-05, + "loss": 0.0265, + "step": 108180 + }, + { + "epoch": 23.01095163299572, + "grad_norm": 0.020822111517190933, + "learning_rate": 2.300095686869234e-05, + "loss": 0.0033, + "step": 108190 + }, + { + "epoch": 23.011005795374533, + "grad_norm": 0.012370612472295761, + "learning_rate": 2.2997947847647247e-05, + "loss": 0.02, + "step": 108200 + }, + { + "epoch": 23.011059957753343, + "grad_norm": 0.26396915316581726, + "learning_rate": 2.2994938826602156e-05, + "loss": 0.0153, + "step": 108210 + }, + { + "epoch": 23.011114120132156, + "grad_norm": 0.0020566366147249937, + "learning_rate": 2.299192980555706e-05, + "loss": 0.0073, + "step": 108220 + }, + { + "epoch": 23.01116828251097, + "grad_norm": 1.937742829322815, + "learning_rate": 2.298892078451197e-05, + "loss": 0.0392, + "step": 108230 + }, + { + 
"epoch": 23.01122244488978, + "grad_norm": 0.9859521389007568, + "learning_rate": 2.2985911763466875e-05, + "loss": 0.0303, + "step": 108240 + }, + { + "epoch": 23.01127660726859, + "grad_norm": 0.5567569136619568, + "learning_rate": 2.298290274242178e-05, + "loss": 0.1671, + "step": 108250 + }, + { + "epoch": 23.011330769647405, + "grad_norm": 0.026919974014163017, + "learning_rate": 2.2979893721376688e-05, + "loss": 0.0785, + "step": 108260 + }, + { + "epoch": 23.011384932026214, + "grad_norm": 0.0027508896309882402, + "learning_rate": 2.2976884700331594e-05, + "loss": 0.0239, + "step": 108270 + }, + { + "epoch": 23.011439094405027, + "grad_norm": 0.013742206618189812, + "learning_rate": 2.2973875679286504e-05, + "loss": 0.037, + "step": 108280 + }, + { + "epoch": 23.011493256783837, + "grad_norm": 0.006370102055370808, + "learning_rate": 2.2970866658241407e-05, + "loss": 0.0958, + "step": 108290 + }, + { + "epoch": 23.01154741916265, + "grad_norm": 0.02891266904771328, + "learning_rate": 2.2967857637196316e-05, + "loss": 0.039, + "step": 108300 + }, + { + "epoch": 23.011601581541463, + "grad_norm": 0.37432971596717834, + "learning_rate": 2.2964848616151223e-05, + "loss": 0.0089, + "step": 108310 + }, + { + "epoch": 23.011655743920272, + "grad_norm": 2.096123695373535, + "learning_rate": 2.296183959510613e-05, + "loss": 0.0611, + "step": 108320 + }, + { + "epoch": 23.011709906299085, + "grad_norm": 0.0056579806841909885, + "learning_rate": 2.2958830574061035e-05, + "loss": 0.0498, + "step": 108330 + }, + { + "epoch": 23.011764068677895, + "grad_norm": 0.0025667808949947357, + "learning_rate": 2.2955821553015945e-05, + "loss": 0.0532, + "step": 108340 + }, + { + "epoch": 23.011818231056708, + "grad_norm": 2.1035993099212646, + "learning_rate": 2.2952812531970848e-05, + "loss": 0.0371, + "step": 108350 + }, + { + "epoch": 23.01187239343552, + "grad_norm": 0.020929310470819473, + "learning_rate": 2.2949803510925757e-05, + "loss": 0.0819, + "step": 108360 + }, + { + "epoch": 23.01192655581433, + "grad_norm": 0.5339466333389282, + "learning_rate": 2.2946794489880664e-05, + "loss": 0.0062, + "step": 108370 + }, + { + "epoch": 23.011980718193144, + "grad_norm": 1.4886442422866821, + "learning_rate": 2.294378546883557e-05, + "loss": 0.1626, + "step": 108380 + }, + { + "epoch": 23.012034880571953, + "grad_norm": 0.04362432658672333, + "learning_rate": 2.2940776447790476e-05, + "loss": 0.0219, + "step": 108390 + }, + { + "epoch": 23.012089042950766, + "grad_norm": 0.004103220067918301, + "learning_rate": 2.2937767426745382e-05, + "loss": 0.0294, + "step": 108400 + }, + { + "epoch": 23.01214320532958, + "grad_norm": 0.10270003229379654, + "learning_rate": 2.2934758405700292e-05, + "loss": 0.023, + "step": 108410 + }, + { + "epoch": 23.01219736770839, + "grad_norm": 1.087166666984558, + "learning_rate": 2.2931749384655195e-05, + "loss": 0.0466, + "step": 108420 + }, + { + "epoch": 23.0122515300872, + "grad_norm": 2.0392062664031982, + "learning_rate": 2.2928740363610105e-05, + "loss": 0.0414, + "step": 108430 + }, + { + "epoch": 23.012305692466015, + "grad_norm": 4.123697280883789, + "learning_rate": 2.292573134256501e-05, + "loss": 0.0253, + "step": 108440 + }, + { + "epoch": 23.012359854844824, + "grad_norm": 0.002548819174990058, + "learning_rate": 2.2922722321519917e-05, + "loss": 0.0066, + "step": 108450 + }, + { + "epoch": 23.012414017223637, + "grad_norm": 0.8896001577377319, + "learning_rate": 2.2919713300474823e-05, + "loss": 0.0623, + "step": 108460 + }, + { + "epoch": 23.012468179602447, 
+ "grad_norm": 0.16899749636650085, + "learning_rate": 2.2916704279429733e-05, + "loss": 0.0049, + "step": 108470 + }, + { + "epoch": 23.01252234198126, + "grad_norm": 0.07422491908073425, + "learning_rate": 2.2913695258384636e-05, + "loss": 0.0599, + "step": 108480 + }, + { + "epoch": 23.012576504360073, + "grad_norm": 1.305389642715454, + "learning_rate": 2.2910686237339546e-05, + "loss": 0.0494, + "step": 108490 + }, + { + "epoch": 23.012630666738882, + "grad_norm": 0.40249964594841003, + "learning_rate": 2.2907677216294452e-05, + "loss": 0.0288, + "step": 108500 + }, + { + "epoch": 23.012684829117696, + "grad_norm": 0.07292588800191879, + "learning_rate": 2.2904668195249358e-05, + "loss": 0.0253, + "step": 108510 + }, + { + "epoch": 23.012738991496505, + "grad_norm": 0.0016034236177802086, + "learning_rate": 2.2901659174204265e-05, + "loss": 0.0155, + "step": 108520 + }, + { + "epoch": 23.012793153875318, + "grad_norm": 0.002184196375310421, + "learning_rate": 2.2898650153159174e-05, + "loss": 0.0007, + "step": 108530 + }, + { + "epoch": 23.01284731625413, + "grad_norm": 3.517063617706299, + "learning_rate": 2.289564113211408e-05, + "loss": 0.1, + "step": 108540 + }, + { + "epoch": 23.01290147863294, + "grad_norm": 0.44575127959251404, + "learning_rate": 2.2892632111068983e-05, + "loss": 0.1189, + "step": 108550 + }, + { + "epoch": 23.012955641011754, + "grad_norm": 0.0017112080240622163, + "learning_rate": 2.2889623090023893e-05, + "loss": 0.044, + "step": 108560 + }, + { + "epoch": 23.013009803390563, + "grad_norm": 0.0036389646120369434, + "learning_rate": 2.28866140689788e-05, + "loss": 0.0005, + "step": 108570 + }, + { + "epoch": 23.013063965769376, + "grad_norm": 0.030652200803160667, + "learning_rate": 2.2883605047933706e-05, + "loss": 0.0028, + "step": 108580 + }, + { + "epoch": 23.01311812814819, + "grad_norm": 0.35553744435310364, + "learning_rate": 2.2880596026888612e-05, + "loss": 0.0632, + "step": 108590 + }, + { + "epoch": 23.013172290527, + "grad_norm": 0.0019094781018793583, + "learning_rate": 2.287758700584352e-05, + "loss": 0.0205, + "step": 108600 + }, + { + "epoch": 23.013226452905812, + "grad_norm": 0.0018535699928179383, + "learning_rate": 2.2874577984798424e-05, + "loss": 0.03, + "step": 108610 + }, + { + "epoch": 23.013280615284625, + "grad_norm": 0.759688675403595, + "learning_rate": 2.2871568963753334e-05, + "loss": 0.0778, + "step": 108620 + }, + { + "epoch": 23.013334777663434, + "grad_norm": 0.05926753953099251, + "learning_rate": 2.286855994270824e-05, + "loss": 0.0032, + "step": 108630 + }, + { + "epoch": 23.013388940042248, + "grad_norm": 0.35321545600891113, + "learning_rate": 2.2865550921663147e-05, + "loss": 0.0026, + "step": 108640 + }, + { + "epoch": 23.013443102421057, + "grad_norm": 0.0890275165438652, + "learning_rate": 2.2862541900618053e-05, + "loss": 0.0065, + "step": 108650 + }, + { + "epoch": 23.01349726479987, + "grad_norm": 0.25778356194496155, + "learning_rate": 2.2859532879572963e-05, + "loss": 0.0202, + "step": 108660 + }, + { + "epoch": 23.013551427178683, + "grad_norm": 0.002683732658624649, + "learning_rate": 2.285652385852787e-05, + "loss": 0.084, + "step": 108670 + }, + { + "epoch": 23.013605589557493, + "grad_norm": 0.0020269276574254036, + "learning_rate": 2.2853514837482775e-05, + "loss": 0.0733, + "step": 108680 + }, + { + "epoch": 23.013659751936306, + "grad_norm": 0.002802109345793724, + "learning_rate": 2.285050581643768e-05, + "loss": 0.1484, + "step": 108690 + }, + { + "epoch": 23.013713914315115, + "grad_norm": 
0.03161894902586937, + "learning_rate": 2.2847496795392588e-05, + "loss": 0.0066, + "step": 108700 + }, + { + "epoch": 23.01376807669393, + "grad_norm": 0.012780475430190563, + "learning_rate": 2.2844487774347494e-05, + "loss": 0.1229, + "step": 108710 + }, + { + "epoch": 23.01382223907274, + "grad_norm": 4.369467258453369, + "learning_rate": 2.28414787533024e-05, + "loss": 0.1153, + "step": 108720 + }, + { + "epoch": 23.01387640145155, + "grad_norm": 0.042584240436553955, + "learning_rate": 2.283846973225731e-05, + "loss": 0.0479, + "step": 108730 + }, + { + "epoch": 23.013930563830364, + "grad_norm": 0.00610000267624855, + "learning_rate": 2.2835460711212213e-05, + "loss": 0.0605, + "step": 108740 + }, + { + "epoch": 23.013984726209173, + "grad_norm": 0.24006026983261108, + "learning_rate": 2.2832451690167122e-05, + "loss": 0.0132, + "step": 108750 + }, + { + "epoch": 23.014038888587987, + "grad_norm": 0.04077494889497757, + "learning_rate": 2.282944266912203e-05, + "loss": 0.0052, + "step": 108760 + }, + { + "epoch": 23.0140930509668, + "grad_norm": 0.08478086441755295, + "learning_rate": 2.2826433648076935e-05, + "loss": 0.0021, + "step": 108770 + }, + { + "epoch": 23.01414721334561, + "grad_norm": 4.44708776473999, + "learning_rate": 2.282342462703184e-05, + "loss": 0.0427, + "step": 108780 + }, + { + "epoch": 23.014201375724422, + "grad_norm": 0.5550386905670166, + "learning_rate": 2.282041560598675e-05, + "loss": 0.0577, + "step": 108790 + }, + { + "epoch": 23.014255538103235, + "grad_norm": 1.8946664333343506, + "learning_rate": 2.2817406584941657e-05, + "loss": 0.019, + "step": 108800 + }, + { + "epoch": 23.014309700482045, + "grad_norm": 0.002224050695076585, + "learning_rate": 2.2814397563896563e-05, + "loss": 0.0347, + "step": 108810 + }, + { + "epoch": 23.014363862860858, + "grad_norm": 0.002970845904201269, + "learning_rate": 2.281138854285147e-05, + "loss": 0.114, + "step": 108820 + }, + { + "epoch": 23.014418025239667, + "grad_norm": 0.031377434730529785, + "learning_rate": 2.280837952180638e-05, + "loss": 0.0054, + "step": 108830 + }, + { + "epoch": 23.01447218761848, + "grad_norm": 0.1139596551656723, + "learning_rate": 2.2805370500761282e-05, + "loss": 0.1882, + "step": 108840 + }, + { + "epoch": 23.014526349997293, + "grad_norm": 0.7017090320587158, + "learning_rate": 2.280236147971619e-05, + "loss": 0.0322, + "step": 108850 + }, + { + "epoch": 23.014580512376103, + "grad_norm": 1.6412125825881958, + "learning_rate": 2.2799352458671098e-05, + "loss": 0.0364, + "step": 108860 + }, + { + "epoch": 23.014634674754916, + "grad_norm": 0.0015986788785085082, + "learning_rate": 2.2796343437626e-05, + "loss": 0.0455, + "step": 108870 + }, + { + "epoch": 23.014688837133725, + "grad_norm": 0.0020841818768531084, + "learning_rate": 2.279333441658091e-05, + "loss": 0.0132, + "step": 108880 + }, + { + "epoch": 23.01474299951254, + "grad_norm": 0.007820411585271358, + "learning_rate": 2.2790325395535817e-05, + "loss": 0.0588, + "step": 108890 + }, + { + "epoch": 23.01479716189135, + "grad_norm": 0.0014625760959461331, + "learning_rate": 2.2787316374490723e-05, + "loss": 0.0003, + "step": 108900 + }, + { + "epoch": 23.01485132427016, + "grad_norm": 2.9646501541137695, + "learning_rate": 2.278430735344563e-05, + "loss": 0.1328, + "step": 108910 + }, + { + "epoch": 23.014905486648974, + "grad_norm": 0.00592816062271595, + "learning_rate": 2.278129833240054e-05, + "loss": 0.0124, + "step": 108920 + }, + { + "epoch": 23.014959649027784, + "grad_norm": 0.041994668543338776, + 
"learning_rate": 2.2778289311355445e-05, + "loss": 0.0556, + "step": 108930 + }, + { + "epoch": 23.015013811406597, + "grad_norm": 0.5196550488471985, + "learning_rate": 2.2775280290310352e-05, + "loss": 0.0719, + "step": 108940 + }, + { + "epoch": 23.01506797378541, + "grad_norm": 0.044539231806993484, + "learning_rate": 2.2772271269265258e-05, + "loss": 0.0583, + "step": 108950 + }, + { + "epoch": 23.01512213616422, + "grad_norm": 0.039260383695364, + "learning_rate": 2.2769262248220168e-05, + "loss": 0.0376, + "step": 108960 + }, + { + "epoch": 23.015176298543032, + "grad_norm": 5.773038864135742, + "learning_rate": 2.276625322717507e-05, + "loss": 0.0507, + "step": 108970 + }, + { + "epoch": 23.015230460921845, + "grad_norm": 0.0318470261991024, + "learning_rate": 2.276324420612998e-05, + "loss": 0.005, + "step": 108980 + }, + { + "epoch": 23.015284623300655, + "grad_norm": 0.8348110318183899, + "learning_rate": 2.2760235185084887e-05, + "loss": 0.0538, + "step": 108990 + }, + { + "epoch": 23.015338785679468, + "grad_norm": 0.001560355070978403, + "learning_rate": 2.275722616403979e-05, + "loss": 0.0041, + "step": 109000 + }, + { + "epoch": 23.015392948058278, + "grad_norm": 0.7488173842430115, + "learning_rate": 2.27542171429947e-05, + "loss": 0.0119, + "step": 109010 + }, + { + "epoch": 23.01544711043709, + "grad_norm": 0.0015211139107123017, + "learning_rate": 2.2751208121949605e-05, + "loss": 0.0049, + "step": 109020 + }, + { + "epoch": 23.015501272815904, + "grad_norm": 0.015825394541025162, + "learning_rate": 2.274819910090451e-05, + "loss": 0.002, + "step": 109030 + }, + { + "epoch": 23.015555435194713, + "grad_norm": 0.003058956703171134, + "learning_rate": 2.2745190079859418e-05, + "loss": 0.0573, + "step": 109040 + }, + { + "epoch": 23.015609597573526, + "grad_norm": 0.0012878509005531669, + "learning_rate": 2.2742181058814328e-05, + "loss": 0.0949, + "step": 109050 + }, + { + "epoch": 23.015663759952336, + "grad_norm": 11.231524467468262, + "learning_rate": 2.2739172037769234e-05, + "loss": 0.0735, + "step": 109060 + }, + { + "epoch": 23.01571792233115, + "grad_norm": 0.642257034778595, + "learning_rate": 2.273616301672414e-05, + "loss": 0.0346, + "step": 109070 + }, + { + "epoch": 23.015772084709962, + "grad_norm": 0.141128808259964, + "learning_rate": 2.2733153995679046e-05, + "loss": 0.0014, + "step": 109080 + }, + { + "epoch": 23.01582624708877, + "grad_norm": 0.0013461158378049731, + "learning_rate": 2.2730144974633956e-05, + "loss": 0.0957, + "step": 109090 + }, + { + "epoch": 23.015880409467584, + "grad_norm": 0.009240097366273403, + "learning_rate": 2.272713595358886e-05, + "loss": 0.0038, + "step": 109100 + }, + { + "epoch": 23.015934571846394, + "grad_norm": 0.408531129360199, + "learning_rate": 2.272412693254377e-05, + "loss": 0.0063, + "step": 109110 + }, + { + "epoch": 23.015988734225207, + "grad_norm": 0.0017900121165439487, + "learning_rate": 2.2721117911498675e-05, + "loss": 0.1052, + "step": 109120 + }, + { + "epoch": 23.01604289660402, + "grad_norm": 0.051534503698349, + "learning_rate": 2.271810889045358e-05, + "loss": 0.0275, + "step": 109130 + }, + { + "epoch": 23.01609705898283, + "grad_norm": 0.01482758205384016, + "learning_rate": 2.2715099869408487e-05, + "loss": 0.0443, + "step": 109140 + }, + { + "epoch": 23.016151221361643, + "grad_norm": 0.0014122455613687634, + "learning_rate": 2.2712090848363394e-05, + "loss": 0.0998, + "step": 109150 + }, + { + "epoch": 23.016205383740452, + "grad_norm": 0.010940708220005035, + "learning_rate": 
2.27090818273183e-05, + "loss": 0.0597, + "step": 109160 + }, + { + "epoch": 23.016259546119265, + "grad_norm": 5.237302303314209, + "learning_rate": 2.2706072806273206e-05, + "loss": 0.0718, + "step": 109170 + }, + { + "epoch": 23.01631370849808, + "grad_norm": 0.7473018765449524, + "learning_rate": 2.2703063785228116e-05, + "loss": 0.1034, + "step": 109180 + }, + { + "epoch": 23.016367870876888, + "grad_norm": 1.5403443574905396, + "learning_rate": 2.2700054764183022e-05, + "loss": 0.0414, + "step": 109190 + }, + { + "epoch": 23.0164220332557, + "grad_norm": 0.13858570158481598, + "learning_rate": 2.269704574313793e-05, + "loss": 0.0689, + "step": 109200 + }, + { + "epoch": 23.016476195634514, + "grad_norm": 1.1137278079986572, + "learning_rate": 2.2694036722092835e-05, + "loss": 0.0176, + "step": 109210 + }, + { + "epoch": 23.016530358013323, + "grad_norm": 0.001426874427124858, + "learning_rate": 2.2691027701047744e-05, + "loss": 0.0999, + "step": 109220 + }, + { + "epoch": 23.016584520392136, + "grad_norm": 0.0461258739233017, + "learning_rate": 2.2688018680002647e-05, + "loss": 0.059, + "step": 109230 + }, + { + "epoch": 23.016638682770946, + "grad_norm": 0.018950844183564186, + "learning_rate": 2.2685009658957557e-05, + "loss": 0.0784, + "step": 109240 + }, + { + "epoch": 23.01669284514976, + "grad_norm": 0.4104173183441162, + "learning_rate": 2.2682000637912463e-05, + "loss": 0.0792, + "step": 109250 + }, + { + "epoch": 23.016747007528572, + "grad_norm": 0.14235280454158783, + "learning_rate": 2.267899161686737e-05, + "loss": 0.0654, + "step": 109260 + }, + { + "epoch": 23.01680116990738, + "grad_norm": 0.0021463672164827585, + "learning_rate": 2.2675982595822276e-05, + "loss": 0.0214, + "step": 109270 + }, + { + "epoch": 23.016855332286195, + "grad_norm": 0.26051411032676697, + "learning_rate": 2.2672973574777185e-05, + "loss": 0.0278, + "step": 109280 + }, + { + "epoch": 23.016909494665004, + "grad_norm": 0.0063493866473436356, + "learning_rate": 2.266996455373209e-05, + "loss": 0.0214, + "step": 109290 + }, + { + "epoch": 23.016963657043817, + "grad_norm": 0.04431222379207611, + "learning_rate": 2.2666955532686995e-05, + "loss": 0.0211, + "step": 109300 + }, + { + "epoch": 23.01701781942263, + "grad_norm": 0.03435364365577698, + "learning_rate": 2.2663946511641904e-05, + "loss": 0.1098, + "step": 109310 + }, + { + "epoch": 23.01707198180144, + "grad_norm": 1.9329434633255005, + "learning_rate": 2.266093749059681e-05, + "loss": 0.0779, + "step": 109320 + }, + { + "epoch": 23.017126144180253, + "grad_norm": 0.6141281127929688, + "learning_rate": 2.2657928469551717e-05, + "loss": 0.0289, + "step": 109330 + }, + { + "epoch": 23.017180306559062, + "grad_norm": 0.040615882724523544, + "learning_rate": 2.2654919448506623e-05, + "loss": 0.0992, + "step": 109340 + }, + { + "epoch": 23.017234468937875, + "grad_norm": 0.33777084946632385, + "learning_rate": 2.2651910427461533e-05, + "loss": 0.0881, + "step": 109350 + }, + { + "epoch": 23.01728863131669, + "grad_norm": 1.2684558629989624, + "learning_rate": 2.2648901406416436e-05, + "loss": 0.037, + "step": 109360 + }, + { + "epoch": 23.017342793695498, + "grad_norm": 0.003449785290285945, + "learning_rate": 2.2645892385371345e-05, + "loss": 0.0096, + "step": 109370 + }, + { + "epoch": 23.01739695607431, + "grad_norm": 0.08862300217151642, + "learning_rate": 2.264288336432625e-05, + "loss": 0.0668, + "step": 109380 + }, + { + "epoch": 23.017451118453124, + "grad_norm": 0.004541020840406418, + "learning_rate": 2.2639874343281158e-05, + 
"loss": 0.0646, + "step": 109390 + }, + { + "epoch": 23.017505280831934, + "grad_norm": 1.077667474746704, + "learning_rate": 2.2636865322236064e-05, + "loss": 0.0731, + "step": 109400 + }, + { + "epoch": 23.017559443210747, + "grad_norm": 0.0014753531431779265, + "learning_rate": 2.2633856301190974e-05, + "loss": 0.0377, + "step": 109410 + }, + { + "epoch": 23.017613605589556, + "grad_norm": 3.4067745208740234, + "learning_rate": 2.2630847280145877e-05, + "loss": 0.0357, + "step": 109420 + }, + { + "epoch": 23.01766776796837, + "grad_norm": 0.33849093317985535, + "learning_rate": 2.2627838259100786e-05, + "loss": 0.0063, + "step": 109430 + }, + { + "epoch": 23.017721930347182, + "grad_norm": 0.0040078856982290745, + "learning_rate": 2.2624829238055693e-05, + "loss": 0.0209, + "step": 109440 + }, + { + "epoch": 23.017776092725992, + "grad_norm": 0.02924336865544319, + "learning_rate": 2.26218202170106e-05, + "loss": 0.0709, + "step": 109450 + }, + { + "epoch": 23.017830255104805, + "grad_norm": 0.0013836001744493842, + "learning_rate": 2.2618811195965505e-05, + "loss": 0.0363, + "step": 109460 + }, + { + "epoch": 23.017884417483614, + "grad_norm": 14.090242385864258, + "learning_rate": 2.261580217492041e-05, + "loss": 0.1709, + "step": 109470 + }, + { + "epoch": 23.017938579862427, + "grad_norm": 0.0016446085646748543, + "learning_rate": 2.261279315387532e-05, + "loss": 0.0628, + "step": 109480 + }, + { + "epoch": 23.01799274224124, + "grad_norm": 0.9046530723571777, + "learning_rate": 2.2609784132830224e-05, + "loss": 0.0153, + "step": 109490 + }, + { + "epoch": 23.01804690462005, + "grad_norm": 0.011651691980659962, + "learning_rate": 2.2606775111785134e-05, + "loss": 0.0194, + "step": 109500 + }, + { + "epoch": 23.018101066998863, + "grad_norm": 0.0017326134257018566, + "learning_rate": 2.260376609074004e-05, + "loss": 0.0328, + "step": 109510 + }, + { + "epoch": 23.018155229377673, + "grad_norm": 0.3825751841068268, + "learning_rate": 2.2600757069694946e-05, + "loss": 0.0601, + "step": 109520 + }, + { + "epoch": 23.018209391756486, + "grad_norm": 0.0026076429057866335, + "learning_rate": 2.2597748048649852e-05, + "loss": 0.0261, + "step": 109530 + }, + { + "epoch": 23.0182635541353, + "grad_norm": 0.003075118875131011, + "learning_rate": 2.2594739027604762e-05, + "loss": 0.0215, + "step": 109540 + }, + { + "epoch": 23.018317716514108, + "grad_norm": 0.5668923854827881, + "learning_rate": 2.2591730006559665e-05, + "loss": 0.0434, + "step": 109550 + }, + { + "epoch": 23.01837187889292, + "grad_norm": 0.013508020900189877, + "learning_rate": 2.2588720985514575e-05, + "loss": 0.0592, + "step": 109560 + }, + { + "epoch": 23.018426041271734, + "grad_norm": 3.4831948280334473, + "learning_rate": 2.258571196446948e-05, + "loss": 0.0487, + "step": 109570 + }, + { + "epoch": 23.018480203650544, + "grad_norm": 0.3021121025085449, + "learning_rate": 2.2582702943424387e-05, + "loss": 0.0638, + "step": 109580 + }, + { + "epoch": 23.018534366029357, + "grad_norm": 0.046850237995386124, + "learning_rate": 2.2579693922379293e-05, + "loss": 0.0539, + "step": 109590 + }, + { + "epoch": 23.018588528408166, + "grad_norm": 0.004908174276351929, + "learning_rate": 2.25766849013342e-05, + "loss": 0.0322, + "step": 109600 + }, + { + "epoch": 23.01864269078698, + "grad_norm": 0.0015625102678313851, + "learning_rate": 2.257367588028911e-05, + "loss": 0.0103, + "step": 109610 + }, + { + "epoch": 23.018696853165793, + "grad_norm": 0.0015302716055884957, + "learning_rate": 2.2570666859244012e-05, + "loss": 0.0418, 
+ "step": 109620 + }, + { + "epoch": 23.018751015544602, + "grad_norm": 7.716817378997803, + "learning_rate": 2.2567657838198922e-05, + "loss": 0.0607, + "step": 109630 + }, + { + "epoch": 23.018805177923415, + "grad_norm": 0.003343066433444619, + "learning_rate": 2.2564648817153828e-05, + "loss": 0.0474, + "step": 109640 + }, + { + "epoch": 23.018859340302225, + "grad_norm": 0.4503149092197418, + "learning_rate": 2.2561639796108735e-05, + "loss": 0.0269, + "step": 109650 + }, + { + "epoch": 23.018913502681038, + "grad_norm": 0.04575883224606514, + "learning_rate": 2.255863077506364e-05, + "loss": 0.0706, + "step": 109660 + }, + { + "epoch": 23.01896766505985, + "grad_norm": 5.221747875213623, + "learning_rate": 2.255562175401855e-05, + "loss": 0.0359, + "step": 109670 + }, + { + "epoch": 23.01902182743866, + "grad_norm": 0.0013368194922804832, + "learning_rate": 2.2552612732973453e-05, + "loss": 0.0121, + "step": 109680 + }, + { + "epoch": 23.019075989817473, + "grad_norm": 0.2558926045894623, + "learning_rate": 2.2549603711928363e-05, + "loss": 0.0705, + "step": 109690 + }, + { + "epoch": 23.019130152196283, + "grad_norm": 0.7592337727546692, + "learning_rate": 2.254659469088327e-05, + "loss": 0.0077, + "step": 109700 + }, + { + "epoch": 23.019184314575096, + "grad_norm": 0.0014895459171384573, + "learning_rate": 2.2543585669838176e-05, + "loss": 0.0507, + "step": 109710 + }, + { + "epoch": 23.01923847695391, + "grad_norm": 0.0035152300260961056, + "learning_rate": 2.2540576648793082e-05, + "loss": 0.0331, + "step": 109720 + }, + { + "epoch": 23.01929263933272, + "grad_norm": 0.3926680386066437, + "learning_rate": 2.253756762774799e-05, + "loss": 0.117, + "step": 109730 + }, + { + "epoch": 23.01934680171153, + "grad_norm": 0.9230680465698242, + "learning_rate": 2.2534558606702898e-05, + "loss": 0.0467, + "step": 109740 + }, + { + "epoch": 23.019400964090345, + "grad_norm": 1.796796202659607, + "learning_rate": 2.25315495856578e-05, + "loss": 0.0746, + "step": 109750 + }, + { + "epoch": 23.019455126469154, + "grad_norm": 0.8085466623306274, + "learning_rate": 2.252854056461271e-05, + "loss": 0.0745, + "step": 109760 + }, + { + "epoch": 23.019509288847967, + "grad_norm": 0.003586292266845703, + "learning_rate": 2.2525531543567617e-05, + "loss": 0.0516, + "step": 109770 + }, + { + "epoch": 23.019563451226777, + "grad_norm": 0.001728438655845821, + "learning_rate": 2.2522522522522523e-05, + "loss": 0.049, + "step": 109780 + }, + { + "epoch": 23.01961761360559, + "grad_norm": 0.007303797639906406, + "learning_rate": 2.251951350147743e-05, + "loss": 0.0064, + "step": 109790 + }, + { + "epoch": 23.019671775984403, + "grad_norm": 0.0014582716394215822, + "learning_rate": 2.251650448043234e-05, + "loss": 0.0663, + "step": 109800 + }, + { + "epoch": 23.019725938363212, + "grad_norm": 0.0016021004412323236, + "learning_rate": 2.251349545938724e-05, + "loss": 0.0505, + "step": 109810 + }, + { + "epoch": 23.019780100742025, + "grad_norm": 0.0021869984921067953, + "learning_rate": 2.251048643834215e-05, + "loss": 0.0222, + "step": 109820 + }, + { + "epoch": 23.019834263120835, + "grad_norm": 0.001874241279438138, + "learning_rate": 2.2507477417297058e-05, + "loss": 0.1021, + "step": 109830 + }, + { + "epoch": 23.019888425499648, + "grad_norm": 1.4152352809906006, + "learning_rate": 2.2504468396251964e-05, + "loss": 0.0238, + "step": 109840 + }, + { + "epoch": 23.01994258787846, + "grad_norm": 0.0016587832942605019, + "learning_rate": 2.250145937520687e-05, + "loss": 0.0297, + "step": 109850 + }, + { + 
"epoch": 23.01999675025727, + "grad_norm": 3.8172767162323, + "learning_rate": 2.249845035416178e-05, + "loss": 0.0851, + "step": 109860 + }, + { + "epoch": 23.020050912636083, + "grad_norm": 0.008875020779669285, + "learning_rate": 2.2495441333116686e-05, + "loss": 0.0772, + "step": 109870 + }, + { + "epoch": 23.020105075014893, + "grad_norm": 0.664154589176178, + "learning_rate": 2.2492432312071592e-05, + "loss": 0.0083, + "step": 109880 + }, + { + "epoch": 23.020159237393706, + "grad_norm": 0.001957580214366317, + "learning_rate": 2.24894232910265e-05, + "loss": 0.125, + "step": 109890 + }, + { + "epoch": 23.02021339977252, + "grad_norm": 0.08743668347597122, + "learning_rate": 2.2486414269981405e-05, + "loss": 0.0429, + "step": 109900 + }, + { + "epoch": 23.02026756215133, + "grad_norm": 0.0025530799757689238, + "learning_rate": 2.248340524893631e-05, + "loss": 0.0359, + "step": 109910 + }, + { + "epoch": 23.02032172453014, + "grad_norm": 0.022540252655744553, + "learning_rate": 2.2480396227891217e-05, + "loss": 0.0219, + "step": 109920 + }, + { + "epoch": 23.020375886908955, + "grad_norm": 0.0033973543904721737, + "learning_rate": 2.2477387206846127e-05, + "loss": 0.0272, + "step": 109930 + }, + { + "epoch": 23.020430049287764, + "grad_norm": 0.002694562543183565, + "learning_rate": 2.247437818580103e-05, + "loss": 0.0143, + "step": 109940 + }, + { + "epoch": 23.020484211666577, + "grad_norm": 0.2754248380661011, + "learning_rate": 2.247136916475594e-05, + "loss": 0.0098, + "step": 109950 + }, + { + "epoch": 23.020538374045387, + "grad_norm": 0.05262177810072899, + "learning_rate": 2.2468360143710846e-05, + "loss": 0.1414, + "step": 109960 + }, + { + "epoch": 23.0205925364242, + "grad_norm": 0.007335825357586145, + "learning_rate": 2.2465351122665752e-05, + "loss": 0.0007, + "step": 109970 + }, + { + "epoch": 23.020646698803013, + "grad_norm": 0.09248040616512299, + "learning_rate": 2.246234210162066e-05, + "loss": 0.1022, + "step": 109980 + }, + { + "epoch": 23.020700861181822, + "grad_norm": 0.4912175238132477, + "learning_rate": 2.2459333080575568e-05, + "loss": 0.0412, + "step": 109990 + }, + { + "epoch": 23.020755023560636, + "grad_norm": 0.08750426024198532, + "learning_rate": 2.2456324059530474e-05, + "loss": 0.0772, + "step": 110000 + }, + { + "epoch": 23.020809185939445, + "grad_norm": 0.004158172756433487, + "learning_rate": 2.245331503848538e-05, + "loss": 0.012, + "step": 110010 + }, + { + "epoch": 23.020863348318258, + "grad_norm": 0.0034582708030939102, + "learning_rate": 2.2450306017440287e-05, + "loss": 0.0995, + "step": 110020 + }, + { + "epoch": 23.02091751069707, + "grad_norm": 0.0576632022857666, + "learning_rate": 2.2447296996395197e-05, + "loss": 0.0534, + "step": 110030 + }, + { + "epoch": 23.02097167307588, + "grad_norm": 0.002949067624285817, + "learning_rate": 2.24442879753501e-05, + "loss": 0.0319, + "step": 110040 + }, + { + "epoch": 23.021025835454694, + "grad_norm": 0.004493208136409521, + "learning_rate": 2.2441278954305006e-05, + "loss": 0.0963, + "step": 110050 + }, + { + "epoch": 23.021079997833503, + "grad_norm": 0.00286171305924654, + "learning_rate": 2.2438269933259915e-05, + "loss": 0.009, + "step": 110060 + }, + { + "epoch": 23.021134160212316, + "grad_norm": 0.03608626499772072, + "learning_rate": 2.243526091221482e-05, + "loss": 0.0154, + "step": 110070 + }, + { + "epoch": 23.02118832259113, + "grad_norm": 0.033307310193777084, + "learning_rate": 2.2432251891169728e-05, + "loss": 0.0114, + "step": 110080 + }, + { + "epoch": 23.02124248496994, + 
"grad_norm": 0.3865022361278534, + "learning_rate": 2.2429242870124634e-05, + "loss": 0.0198, + "step": 110090 + }, + { + "epoch": 23.021296647348752, + "grad_norm": 0.00750707508996129, + "learning_rate": 2.242623384907954e-05, + "loss": 0.0217, + "step": 110100 + }, + { + "epoch": 23.021350809727565, + "grad_norm": 0.005016239359974861, + "learning_rate": 2.2423224828034447e-05, + "loss": 0.0599, + "step": 110110 + }, + { + "epoch": 23.021404972106374, + "grad_norm": 10.023118019104004, + "learning_rate": 2.2420215806989357e-05, + "loss": 0.0873, + "step": 110120 + }, + { + "epoch": 23.021459134485188, + "grad_norm": 0.5135244727134705, + "learning_rate": 2.2417206785944263e-05, + "loss": 0.064, + "step": 110130 + }, + { + "epoch": 23.021513296863997, + "grad_norm": 0.40394607186317444, + "learning_rate": 2.241419776489917e-05, + "loss": 0.0199, + "step": 110140 + }, + { + "epoch": 23.02156745924281, + "grad_norm": 0.004303930327296257, + "learning_rate": 2.2411188743854075e-05, + "loss": 0.0004, + "step": 110150 + }, + { + "epoch": 23.021621621621623, + "grad_norm": 0.44368526339530945, + "learning_rate": 2.2408179722808985e-05, + "loss": 0.0105, + "step": 110160 + }, + { + "epoch": 23.021675784000433, + "grad_norm": 0.14435087144374847, + "learning_rate": 2.2405170701763888e-05, + "loss": 0.1031, + "step": 110170 + }, + { + "epoch": 23.021729946379246, + "grad_norm": 4.33536434173584, + "learning_rate": 2.2402161680718798e-05, + "loss": 0.0411, + "step": 110180 + }, + { + "epoch": 23.021784108758055, + "grad_norm": 0.004341425374150276, + "learning_rate": 2.2399152659673704e-05, + "loss": 0.0456, + "step": 110190 + }, + { + "epoch": 23.02183827113687, + "grad_norm": 0.0046515092253685, + "learning_rate": 2.2396143638628607e-05, + "loss": 0.1431, + "step": 110200 + }, + { + "epoch": 23.02189243351568, + "grad_norm": 0.6995670199394226, + "learning_rate": 2.2393134617583516e-05, + "loss": 0.0297, + "step": 110210 + }, + { + "epoch": 23.02194659589449, + "grad_norm": 0.004579803440719843, + "learning_rate": 2.2390125596538423e-05, + "loss": 0.0121, + "step": 110220 + }, + { + "epoch": 23.022000758273304, + "grad_norm": 0.406965970993042, + "learning_rate": 2.238711657549333e-05, + "loss": 0.1419, + "step": 110230 + }, + { + "epoch": 23.022054920652113, + "grad_norm": 0.005787709262222052, + "learning_rate": 2.2384107554448235e-05, + "loss": 0.0741, + "step": 110240 + }, + { + "epoch": 23.022109083030927, + "grad_norm": 0.15476664900779724, + "learning_rate": 2.2381098533403145e-05, + "loss": 0.0069, + "step": 110250 + }, + { + "epoch": 23.02216324540974, + "grad_norm": 0.03846681863069534, + "learning_rate": 2.237808951235805e-05, + "loss": 0.0572, + "step": 110260 + }, + { + "epoch": 23.02221740778855, + "grad_norm": 0.04914871230721474, + "learning_rate": 2.2375080491312957e-05, + "loss": 0.0192, + "step": 110270 + }, + { + "epoch": 23.022271570167362, + "grad_norm": 0.14201460778713226, + "learning_rate": 2.2372071470267864e-05, + "loss": 0.0681, + "step": 110280 + }, + { + "epoch": 23.02232573254617, + "grad_norm": 0.02447066828608513, + "learning_rate": 2.2369062449222773e-05, + "loss": 0.0182, + "step": 110290 + }, + { + "epoch": 23.022379894924985, + "grad_norm": 0.033369701355695724, + "learning_rate": 2.2366053428177676e-05, + "loss": 0.0081, + "step": 110300 + }, + { + "epoch": 23.022434057303798, + "grad_norm": 0.004005941562354565, + "learning_rate": 2.2363044407132586e-05, + "loss": 0.0659, + "step": 110310 + }, + { + "epoch": 23.022488219682607, + "grad_norm": 
0.022863110527396202, + "learning_rate": 2.2360035386087492e-05, + "loss": 0.0241, + "step": 110320 + }, + { + "epoch": 23.02254238206142, + "grad_norm": 0.003380579175427556, + "learning_rate": 2.23570263650424e-05, + "loss": 0.0637, + "step": 110330 + }, + { + "epoch": 23.022596544440233, + "grad_norm": 0.003278894117102027, + "learning_rate": 2.2354017343997305e-05, + "loss": 0.0046, + "step": 110340 + }, + { + "epoch": 23.022650706819043, + "grad_norm": 0.0442531444132328, + "learning_rate": 2.235100832295221e-05, + "loss": 0.0225, + "step": 110350 + }, + { + "epoch": 23.022704869197856, + "grad_norm": 0.4903753697872162, + "learning_rate": 2.2347999301907117e-05, + "loss": 0.0271, + "step": 110360 + }, + { + "epoch": 23.022759031576665, + "grad_norm": 0.01454420667141676, + "learning_rate": 2.2344990280862024e-05, + "loss": 0.0277, + "step": 110370 + }, + { + "epoch": 23.02281319395548, + "grad_norm": 0.03019084595143795, + "learning_rate": 2.2341981259816933e-05, + "loss": 0.0195, + "step": 110380 + }, + { + "epoch": 23.02286735633429, + "grad_norm": 0.002177407266572118, + "learning_rate": 2.233897223877184e-05, + "loss": 0.0192, + "step": 110390 + }, + { + "epoch": 23.0229215187131, + "grad_norm": 0.3994889259338379, + "learning_rate": 2.2335963217726746e-05, + "loss": 0.0484, + "step": 110400 + }, + { + "epoch": 23.022975681091914, + "grad_norm": 0.03232041001319885, + "learning_rate": 2.2332954196681652e-05, + "loss": 0.0321, + "step": 110410 + }, + { + "epoch": 23.023029843470724, + "grad_norm": 0.039896659553050995, + "learning_rate": 2.2329945175636562e-05, + "loss": 0.004, + "step": 110420 + }, + { + "epoch": 23.023084005849537, + "grad_norm": 0.003179857973009348, + "learning_rate": 2.2326936154591465e-05, + "loss": 0.0071, + "step": 110430 + }, + { + "epoch": 23.02313816822835, + "grad_norm": 0.00710945064201951, + "learning_rate": 2.2323927133546374e-05, + "loss": 0.048, + "step": 110440 + }, + { + "epoch": 23.02319233060716, + "grad_norm": 0.01620737463235855, + "learning_rate": 2.232091811250128e-05, + "loss": 0.0194, + "step": 110450 + }, + { + "epoch": 23.023246492985972, + "grad_norm": 0.17111614346504211, + "learning_rate": 2.2317909091456187e-05, + "loss": 0.0666, + "step": 110460 + }, + { + "epoch": 23.023300655364782, + "grad_norm": 0.0026062247343361378, + "learning_rate": 2.2314900070411093e-05, + "loss": 0.0087, + "step": 110470 + }, + { + "epoch": 23.023354817743595, + "grad_norm": 0.0019194483757019043, + "learning_rate": 2.2311891049366003e-05, + "loss": 0.1241, + "step": 110480 + }, + { + "epoch": 23.023408980122408, + "grad_norm": 0.011794270016252995, + "learning_rate": 2.2308882028320906e-05, + "loss": 0.0176, + "step": 110490 + }, + { + "epoch": 23.023463142501218, + "grad_norm": 0.002049787435680628, + "learning_rate": 2.2305873007275812e-05, + "loss": 0.0497, + "step": 110500 + }, + { + "epoch": 23.02351730488003, + "grad_norm": 0.02932817116379738, + "learning_rate": 2.230286398623072e-05, + "loss": 0.0792, + "step": 110510 + }, + { + "epoch": 23.023571467258844, + "grad_norm": 0.017193911597132683, + "learning_rate": 2.2299854965185628e-05, + "loss": 0.0196, + "step": 110520 + }, + { + "epoch": 23.023625629637653, + "grad_norm": 4.936026573181152, + "learning_rate": 2.2296845944140534e-05, + "loss": 0.097, + "step": 110530 + }, + { + "epoch": 23.023679792016466, + "grad_norm": 0.19254648685455322, + "learning_rate": 2.229383692309544e-05, + "loss": 0.006, + "step": 110540 + }, + { + "epoch": 23.023733954395276, + "grad_norm": 0.027692364528775215, + 
"learning_rate": 2.229082790205035e-05, + "loss": 0.0667, + "step": 110550 + }, + { + "epoch": 23.02378811677409, + "grad_norm": 0.07584787905216217, + "learning_rate": 2.2287818881005253e-05, + "loss": 0.0715, + "step": 110560 + }, + { + "epoch": 23.023842279152902, + "grad_norm": 0.01276309322565794, + "learning_rate": 2.2284809859960163e-05, + "loss": 0.1191, + "step": 110570 + }, + { + "epoch": 23.02389644153171, + "grad_norm": 0.05792001634836197, + "learning_rate": 2.228180083891507e-05, + "loss": 0.0842, + "step": 110580 + }, + { + "epoch": 23.023950603910524, + "grad_norm": 0.022614412009716034, + "learning_rate": 2.2278791817869975e-05, + "loss": 0.0289, + "step": 110590 + }, + { + "epoch": 23.024004766289334, + "grad_norm": 0.8450888395309448, + "learning_rate": 2.227578279682488e-05, + "loss": 0.0629, + "step": 110600 + }, + { + "epoch": 23.024058928668147, + "grad_norm": 0.008173815906047821, + "learning_rate": 2.227277377577979e-05, + "loss": 0.0678, + "step": 110610 + }, + { + "epoch": 23.02411309104696, + "grad_norm": 0.03830474615097046, + "learning_rate": 2.2269764754734694e-05, + "loss": 0.0891, + "step": 110620 + }, + { + "epoch": 23.02416725342577, + "grad_norm": 1.7958028316497803, + "learning_rate": 2.2266755733689604e-05, + "loss": 0.0883, + "step": 110630 + }, + { + "epoch": 23.024221415804583, + "grad_norm": 4.5831451416015625, + "learning_rate": 2.226374671264451e-05, + "loss": 0.206, + "step": 110640 + }, + { + "epoch": 23.024275578183392, + "grad_norm": 0.00278969993814826, + "learning_rate": 2.2260737691599416e-05, + "loss": 0.0142, + "step": 110650 + }, + { + "epoch": 23.024329740562205, + "grad_norm": 0.20044898986816406, + "learning_rate": 2.2257728670554322e-05, + "loss": 0.0796, + "step": 110660 + }, + { + "epoch": 23.024383902941018, + "grad_norm": 0.008062638342380524, + "learning_rate": 2.225471964950923e-05, + "loss": 0.0252, + "step": 110670 + }, + { + "epoch": 23.024438065319828, + "grad_norm": 0.0028706889133900404, + "learning_rate": 2.225171062846414e-05, + "loss": 0.0841, + "step": 110680 + }, + { + "epoch": 23.02449222769864, + "grad_norm": 1.161927580833435, + "learning_rate": 2.224870160741904e-05, + "loss": 0.0312, + "step": 110690 + }, + { + "epoch": 23.024546390077454, + "grad_norm": 0.002191300503909588, + "learning_rate": 2.224569258637395e-05, + "loss": 0.0093, + "step": 110700 + }, + { + "epoch": 23.024600552456263, + "grad_norm": 1.5013246536254883, + "learning_rate": 2.2242683565328857e-05, + "loss": 0.0251, + "step": 110710 + }, + { + "epoch": 23.024654714835076, + "grad_norm": 0.10156980901956558, + "learning_rate": 2.2239674544283764e-05, + "loss": 0.0028, + "step": 110720 + }, + { + "epoch": 23.024708877213886, + "grad_norm": 0.7276570200920105, + "learning_rate": 2.223666552323867e-05, + "loss": 0.0841, + "step": 110730 + }, + { + "epoch": 23.0247630395927, + "grad_norm": 0.04791213944554329, + "learning_rate": 2.223365650219358e-05, + "loss": 0.0621, + "step": 110740 + }, + { + "epoch": 23.024817201971512, + "grad_norm": 0.4380984604358673, + "learning_rate": 2.2230647481148482e-05, + "loss": 0.0043, + "step": 110750 + }, + { + "epoch": 23.02487136435032, + "grad_norm": 1.2551544904708862, + "learning_rate": 2.2227638460103392e-05, + "loss": 0.0183, + "step": 110760 + }, + { + "epoch": 23.024925526729135, + "grad_norm": 0.843268871307373, + "learning_rate": 2.2224629439058298e-05, + "loss": 0.0118, + "step": 110770 + }, + { + "epoch": 23.024979689107944, + "grad_norm": 0.4589537978172302, + "learning_rate": 
2.2221620418013205e-05, + "loss": 0.0304, + "step": 110780 + }, + { + "epoch": 23.02500135405947, + "eval_accuracy": 0.8520574787720444, + "eval_loss": 0.8359410762786865, + "eval_runtime": 117.9258, + "eval_samples_per_second": 25.965, + "eval_steps_per_second": 3.248, + "step": 110784 + }, + { + "epoch": 24.000032497427288, + "grad_norm": 0.07585441321134567, + "learning_rate": 2.221861139696811e-05, + "loss": 0.0214, + "step": 110790 + }, + { + "epoch": 24.000086659806097, + "grad_norm": 0.4770742952823639, + "learning_rate": 2.2215602375923017e-05, + "loss": 0.0308, + "step": 110800 + }, + { + "epoch": 24.00014082218491, + "grad_norm": 0.9943885207176208, + "learning_rate": 2.2212593354877927e-05, + "loss": 0.0926, + "step": 110810 + }, + { + "epoch": 24.000194984563723, + "grad_norm": 0.03635498508810997, + "learning_rate": 2.220958433383283e-05, + "loss": 0.0316, + "step": 110820 + }, + { + "epoch": 24.000249146942533, + "grad_norm": 0.004886207636445761, + "learning_rate": 2.220657531278774e-05, + "loss": 0.0041, + "step": 110830 + }, + { + "epoch": 24.000303309321346, + "grad_norm": 0.002839869586750865, + "learning_rate": 2.2203566291742646e-05, + "loss": 0.0858, + "step": 110840 + }, + { + "epoch": 24.000357471700156, + "grad_norm": 0.028832079842686653, + "learning_rate": 2.2200557270697552e-05, + "loss": 0.051, + "step": 110850 + }, + { + "epoch": 24.00041163407897, + "grad_norm": 0.001850611763074994, + "learning_rate": 2.2197548249652458e-05, + "loss": 0.0641, + "step": 110860 + }, + { + "epoch": 24.00046579645778, + "grad_norm": 3.273740530014038, + "learning_rate": 2.2194539228607368e-05, + "loss": 0.0744, + "step": 110870 + }, + { + "epoch": 24.00051995883659, + "grad_norm": 0.0334111787378788, + "learning_rate": 2.219153020756227e-05, + "loss": 0.0156, + "step": 110880 + }, + { + "epoch": 24.000574121215404, + "grad_norm": 0.0029080237727612257, + "learning_rate": 2.218852118651718e-05, + "loss": 0.0333, + "step": 110890 + }, + { + "epoch": 24.000628283594214, + "grad_norm": 0.0028274809010326862, + "learning_rate": 2.2185512165472087e-05, + "loss": 0.0893, + "step": 110900 + }, + { + "epoch": 24.000682445973027, + "grad_norm": 0.0019424142083153129, + "learning_rate": 2.2182503144426993e-05, + "loss": 0.0199, + "step": 110910 + }, + { + "epoch": 24.00073660835184, + "grad_norm": 0.0017743465723469853, + "learning_rate": 2.21794941233819e-05, + "loss": 0.0529, + "step": 110920 + }, + { + "epoch": 24.00079077073065, + "grad_norm": 0.007195489481091499, + "learning_rate": 2.217648510233681e-05, + "loss": 0.0154, + "step": 110930 + }, + { + "epoch": 24.000844933109462, + "grad_norm": 0.001777478028088808, + "learning_rate": 2.2173476081291715e-05, + "loss": 0.0336, + "step": 110940 + }, + { + "epoch": 24.000899095488275, + "grad_norm": 0.07716754823923111, + "learning_rate": 2.2170467060246618e-05, + "loss": 0.0292, + "step": 110950 + }, + { + "epoch": 24.000953257867085, + "grad_norm": 2.403637647628784, + "learning_rate": 2.2167458039201528e-05, + "loss": 0.062, + "step": 110960 + }, + { + "epoch": 24.001007420245898, + "grad_norm": 0.0018868072656914592, + "learning_rate": 2.2164449018156434e-05, + "loss": 0.0306, + "step": 110970 + }, + { + "epoch": 24.001061582624708, + "grad_norm": 0.05761241912841797, + "learning_rate": 2.216143999711134e-05, + "loss": 0.0216, + "step": 110980 + }, + { + "epoch": 24.00111574500352, + "grad_norm": 0.004428625572472811, + "learning_rate": 2.2158430976066246e-05, + "loss": 0.0019, + "step": 110990 + }, + { + "epoch": 24.001169907382334, 
+ "grad_norm": 0.001814002636820078, + "learning_rate": 2.2155421955021156e-05, + "loss": 0.0375, + "step": 111000 + }, + { + "epoch": 24.001224069761143, + "grad_norm": 0.0025265715084969997, + "learning_rate": 2.215241293397606e-05, + "loss": 0.0027, + "step": 111010 + }, + { + "epoch": 24.001278232139956, + "grad_norm": 0.0016898313770070672, + "learning_rate": 2.214940391293097e-05, + "loss": 0.0534, + "step": 111020 + }, + { + "epoch": 24.001332394518766, + "grad_norm": 1.2885794639587402, + "learning_rate": 2.2146394891885875e-05, + "loss": 0.0802, + "step": 111030 + }, + { + "epoch": 24.00138655689758, + "grad_norm": 0.14835429191589355, + "learning_rate": 2.214338587084078e-05, + "loss": 0.0021, + "step": 111040 + }, + { + "epoch": 24.001440719276392, + "grad_norm": 0.002206089673563838, + "learning_rate": 2.2140376849795688e-05, + "loss": 0.0374, + "step": 111050 + }, + { + "epoch": 24.0014948816552, + "grad_norm": 0.04110646992921829, + "learning_rate": 2.2137367828750597e-05, + "loss": 0.0113, + "step": 111060 + }, + { + "epoch": 24.001549044034014, + "grad_norm": 0.08224540203809738, + "learning_rate": 2.2134358807705503e-05, + "loss": 0.0585, + "step": 111070 + }, + { + "epoch": 24.001603206412824, + "grad_norm": 0.02651745267212391, + "learning_rate": 2.213134978666041e-05, + "loss": 0.0381, + "step": 111080 + }, + { + "epoch": 24.001657368791637, + "grad_norm": 0.43020710349082947, + "learning_rate": 2.2128340765615316e-05, + "loss": 0.0566, + "step": 111090 + }, + { + "epoch": 24.00171153117045, + "grad_norm": 0.001662922091782093, + "learning_rate": 2.2125331744570222e-05, + "loss": 0.0064, + "step": 111100 + }, + { + "epoch": 24.00176569354926, + "grad_norm": 0.8190876841545105, + "learning_rate": 2.212232272352513e-05, + "loss": 0.0133, + "step": 111110 + }, + { + "epoch": 24.001819855928073, + "grad_norm": 0.001734757679514587, + "learning_rate": 2.2119313702480035e-05, + "loss": 0.0068, + "step": 111120 + }, + { + "epoch": 24.001874018306886, + "grad_norm": 0.7229350805282593, + "learning_rate": 2.2116304681434944e-05, + "loss": 0.0367, + "step": 111130 + }, + { + "epoch": 24.001928180685695, + "grad_norm": 0.0017582509899511933, + "learning_rate": 2.2113295660389847e-05, + "loss": 0.0231, + "step": 111140 + }, + { + "epoch": 24.00198234306451, + "grad_norm": 0.10728736221790314, + "learning_rate": 2.2110286639344757e-05, + "loss": 0.1446, + "step": 111150 + }, + { + "epoch": 24.002036505443318, + "grad_norm": 1.443547248840332, + "learning_rate": 2.2107277618299663e-05, + "loss": 0.0395, + "step": 111160 + }, + { + "epoch": 24.00209066782213, + "grad_norm": 0.10625575482845306, + "learning_rate": 2.210426859725457e-05, + "loss": 0.0242, + "step": 111170 + }, + { + "epoch": 24.002144830200944, + "grad_norm": 0.0016696929233148694, + "learning_rate": 2.2101259576209476e-05, + "loss": 0.0028, + "step": 111180 + }, + { + "epoch": 24.002198992579753, + "grad_norm": 0.9923835396766663, + "learning_rate": 2.2098250555164386e-05, + "loss": 0.0522, + "step": 111190 + }, + { + "epoch": 24.002253154958566, + "grad_norm": 0.0016274447552859783, + "learning_rate": 2.2095241534119292e-05, + "loss": 0.0482, + "step": 111200 + }, + { + "epoch": 24.002307317337376, + "grad_norm": 0.03850216791033745, + "learning_rate": 2.2092232513074198e-05, + "loss": 0.1076, + "step": 111210 + }, + { + "epoch": 24.00236147971619, + "grad_norm": 0.0017668181098997593, + "learning_rate": 2.2089223492029104e-05, + "loss": 0.0101, + "step": 111220 + }, + { + "epoch": 24.002415642095002, + "grad_norm": 
0.0028626942075788975, + "learning_rate": 2.2086214470984014e-05, + "loss": 0.0578, + "step": 111230 + }, + { + "epoch": 24.00246980447381, + "grad_norm": 0.0022502285428345203, + "learning_rate": 2.2083205449938917e-05, + "loss": 0.0307, + "step": 111240 + }, + { + "epoch": 24.002523966852625, + "grad_norm": 0.4125281572341919, + "learning_rate": 2.2080196428893823e-05, + "loss": 0.0044, + "step": 111250 + }, + { + "epoch": 24.002578129231434, + "grad_norm": 0.4889199435710907, + "learning_rate": 2.2077187407848733e-05, + "loss": 0.0484, + "step": 111260 + }, + { + "epoch": 24.002632291610247, + "grad_norm": 4.783664226531982, + "learning_rate": 2.2074178386803636e-05, + "loss": 0.125, + "step": 111270 + }, + { + "epoch": 24.00268645398906, + "grad_norm": 0.0030894423834979534, + "learning_rate": 2.2071169365758545e-05, + "loss": 0.0099, + "step": 111280 + }, + { + "epoch": 24.00274061636787, + "grad_norm": 0.3297369182109833, + "learning_rate": 2.206816034471345e-05, + "loss": 0.0111, + "step": 111290 + }, + { + "epoch": 24.002794778746683, + "grad_norm": 0.043815918266773224, + "learning_rate": 2.2065151323668358e-05, + "loss": 0.0662, + "step": 111300 + }, + { + "epoch": 24.002848941125496, + "grad_norm": 0.002555149607360363, + "learning_rate": 2.2062142302623264e-05, + "loss": 0.1142, + "step": 111310 + }, + { + "epoch": 24.002903103504305, + "grad_norm": 0.006752930581569672, + "learning_rate": 2.2059133281578174e-05, + "loss": 0.0252, + "step": 111320 + }, + { + "epoch": 24.00295726588312, + "grad_norm": 0.660577654838562, + "learning_rate": 2.205612426053308e-05, + "loss": 0.0428, + "step": 111330 + }, + { + "epoch": 24.003011428261928, + "grad_norm": 0.005358751397579908, + "learning_rate": 2.2053115239487986e-05, + "loss": 0.0487, + "step": 111340 + }, + { + "epoch": 24.00306559064074, + "grad_norm": 0.010510887950658798, + "learning_rate": 2.2050106218442893e-05, + "loss": 0.1487, + "step": 111350 + }, + { + "epoch": 24.003119753019554, + "grad_norm": 0.006694409064948559, + "learning_rate": 2.2047097197397802e-05, + "loss": 0.0151, + "step": 111360 + }, + { + "epoch": 24.003173915398364, + "grad_norm": 1.3920259475708008, + "learning_rate": 2.2044088176352705e-05, + "loss": 0.0251, + "step": 111370 + }, + { + "epoch": 24.003228077777177, + "grad_norm": 2.627652883529663, + "learning_rate": 2.2041079155307615e-05, + "loss": 0.032, + "step": 111380 + }, + { + "epoch": 24.003282240155986, + "grad_norm": 0.7483208179473877, + "learning_rate": 2.203807013426252e-05, + "loss": 0.0574, + "step": 111390 + }, + { + "epoch": 24.0033364025348, + "grad_norm": 0.0056890747509896755, + "learning_rate": 2.2035061113217424e-05, + "loss": 0.0999, + "step": 111400 + }, + { + "epoch": 24.003390564913612, + "grad_norm": 0.00518557196483016, + "learning_rate": 2.2032052092172334e-05, + "loss": 0.0291, + "step": 111410 + }, + { + "epoch": 24.003444727292422, + "grad_norm": 0.01707303524017334, + "learning_rate": 2.202904307112724e-05, + "loss": 0.0288, + "step": 111420 + }, + { + "epoch": 24.003498889671235, + "grad_norm": 3.1541476249694824, + "learning_rate": 2.2026034050082146e-05, + "loss": 0.0556, + "step": 111430 + }, + { + "epoch": 24.003553052050044, + "grad_norm": 0.004536714870482683, + "learning_rate": 2.2023025029037053e-05, + "loss": 0.0805, + "step": 111440 + }, + { + "epoch": 24.003607214428857, + "grad_norm": 0.006246338598430157, + "learning_rate": 2.2020016007991962e-05, + "loss": 0.0332, + "step": 111450 + }, + { + "epoch": 24.00366137680767, + "grad_norm": 0.01866777241230011, + 
"learning_rate": 2.201700698694687e-05, + "loss": 0.0364, + "step": 111460 + }, + { + "epoch": 24.00371553918648, + "grad_norm": 0.015183445066213608, + "learning_rate": 2.2013997965901775e-05, + "loss": 0.0889, + "step": 111470 + }, + { + "epoch": 24.003769701565293, + "grad_norm": 1.1704528331756592, + "learning_rate": 2.201098894485668e-05, + "loss": 0.0507, + "step": 111480 + }, + { + "epoch": 24.003823863944106, + "grad_norm": 0.3146785497665405, + "learning_rate": 2.200797992381159e-05, + "loss": 0.0224, + "step": 111490 + }, + { + "epoch": 24.003878026322916, + "grad_norm": 4.251434326171875, + "learning_rate": 2.2004970902766494e-05, + "loss": 0.0768, + "step": 111500 + }, + { + "epoch": 24.00393218870173, + "grad_norm": 0.0031059011816978455, + "learning_rate": 2.2001961881721403e-05, + "loss": 0.0025, + "step": 111510 + }, + { + "epoch": 24.00398635108054, + "grad_norm": 0.0035660432185977697, + "learning_rate": 2.199895286067631e-05, + "loss": 0.1511, + "step": 111520 + }, + { + "epoch": 24.00404051345935, + "grad_norm": 1.5535625219345093, + "learning_rate": 2.1995943839631216e-05, + "loss": 0.182, + "step": 111530 + }, + { + "epoch": 24.004094675838164, + "grad_norm": 0.00486708153039217, + "learning_rate": 2.1992934818586122e-05, + "loss": 0.0371, + "step": 111540 + }, + { + "epoch": 24.004148838216974, + "grad_norm": 0.002669681329280138, + "learning_rate": 2.198992579754103e-05, + "loss": 0.1072, + "step": 111550 + }, + { + "epoch": 24.004203000595787, + "grad_norm": 0.012497237883508205, + "learning_rate": 2.1986916776495935e-05, + "loss": 0.0329, + "step": 111560 + }, + { + "epoch": 24.004257162974596, + "grad_norm": 0.0623018853366375, + "learning_rate": 2.198390775545084e-05, + "loss": 0.0175, + "step": 111570 + }, + { + "epoch": 24.00431132535341, + "grad_norm": 0.009411044418811798, + "learning_rate": 2.198089873440575e-05, + "loss": 0.0692, + "step": 111580 + }, + { + "epoch": 24.004365487732223, + "grad_norm": 2.3691766262054443, + "learning_rate": 2.1977889713360657e-05, + "loss": 0.0274, + "step": 111590 + }, + { + "epoch": 24.004419650111032, + "grad_norm": 0.09628459811210632, + "learning_rate": 2.1974880692315563e-05, + "loss": 0.0512, + "step": 111600 + }, + { + "epoch": 24.004473812489845, + "grad_norm": 0.0035679806023836136, + "learning_rate": 2.197187167127047e-05, + "loss": 0.0374, + "step": 111610 + }, + { + "epoch": 24.004527974868655, + "grad_norm": 0.015196415595710278, + "learning_rate": 2.196886265022538e-05, + "loss": 0.0119, + "step": 111620 + }, + { + "epoch": 24.004582137247468, + "grad_norm": 0.002580702770501375, + "learning_rate": 2.1965853629180282e-05, + "loss": 0.0084, + "step": 111630 + }, + { + "epoch": 24.00463629962628, + "grad_norm": 0.006398055702447891, + "learning_rate": 2.196284460813519e-05, + "loss": 0.0825, + "step": 111640 + }, + { + "epoch": 24.00469046200509, + "grad_norm": 0.14297406375408173, + "learning_rate": 2.1959835587090098e-05, + "loss": 0.0209, + "step": 111650 + }, + { + "epoch": 24.004744624383903, + "grad_norm": 2.185192108154297, + "learning_rate": 2.1956826566045004e-05, + "loss": 0.1206, + "step": 111660 + }, + { + "epoch": 24.004798786762716, + "grad_norm": 0.7189053297042847, + "learning_rate": 2.195381754499991e-05, + "loss": 0.1123, + "step": 111670 + }, + { + "epoch": 24.004852949141526, + "grad_norm": 0.1731422394514084, + "learning_rate": 2.195080852395482e-05, + "loss": 0.0366, + "step": 111680 + }, + { + "epoch": 24.00490711152034, + "grad_norm": 0.002271047793328762, + "learning_rate": 
2.1947799502909723e-05, + "loss": 0.0703, + "step": 111690 + }, + { + "epoch": 24.00496127389915, + "grad_norm": 0.08084417134523392, + "learning_rate": 2.194479048186463e-05, + "loss": 0.0036, + "step": 111700 + }, + { + "epoch": 24.00501543627796, + "grad_norm": 0.2321203052997589, + "learning_rate": 2.194178146081954e-05, + "loss": 0.002, + "step": 111710 + }, + { + "epoch": 24.005069598656775, + "grad_norm": 0.06806837767362595, + "learning_rate": 2.1938772439774445e-05, + "loss": 0.0039, + "step": 111720 + }, + { + "epoch": 24.005123761035584, + "grad_norm": 0.07963258773088455, + "learning_rate": 2.193576341872935e-05, + "loss": 0.0523, + "step": 111730 + }, + { + "epoch": 24.005177923414397, + "grad_norm": 0.03867797553539276, + "learning_rate": 2.1932754397684258e-05, + "loss": 0.0636, + "step": 111740 + }, + { + "epoch": 24.005232085793207, + "grad_norm": 0.05099062994122505, + "learning_rate": 2.1929745376639167e-05, + "loss": 0.0423, + "step": 111750 + }, + { + "epoch": 24.00528624817202, + "grad_norm": 4.0521416664123535, + "learning_rate": 2.192673635559407e-05, + "loss": 0.08, + "step": 111760 + }, + { + "epoch": 24.005340410550833, + "grad_norm": 0.0018800209509208798, + "learning_rate": 2.192372733454898e-05, + "loss": 0.0346, + "step": 111770 + }, + { + "epoch": 24.005394572929642, + "grad_norm": 0.0019774283282458782, + "learning_rate": 2.1920718313503886e-05, + "loss": 0.0484, + "step": 111780 + }, + { + "epoch": 24.005448735308455, + "grad_norm": 0.3575263023376465, + "learning_rate": 2.1917709292458792e-05, + "loss": 0.0698, + "step": 111790 + }, + { + "epoch": 24.005502897687265, + "grad_norm": 0.00528734503313899, + "learning_rate": 2.19147002714137e-05, + "loss": 0.0109, + "step": 111800 + }, + { + "epoch": 24.005557060066078, + "grad_norm": 0.045310329645872116, + "learning_rate": 2.191169125036861e-05, + "loss": 0.0035, + "step": 111810 + }, + { + "epoch": 24.00561122244489, + "grad_norm": 0.002014393452554941, + "learning_rate": 2.190868222932351e-05, + "loss": 0.0035, + "step": 111820 + }, + { + "epoch": 24.0056653848237, + "grad_norm": 0.03327310085296631, + "learning_rate": 2.190567320827842e-05, + "loss": 0.0582, + "step": 111830 + }, + { + "epoch": 24.005719547202514, + "grad_norm": 0.8096393942832947, + "learning_rate": 2.1902664187233327e-05, + "loss": 0.0702, + "step": 111840 + }, + { + "epoch": 24.005773709581323, + "grad_norm": 2.017202615737915, + "learning_rate": 2.1899655166188234e-05, + "loss": 0.0451, + "step": 111850 + }, + { + "epoch": 24.005827871960136, + "grad_norm": 0.050680749118328094, + "learning_rate": 2.189664614514314e-05, + "loss": 0.0145, + "step": 111860 + }, + { + "epoch": 24.00588203433895, + "grad_norm": 0.20123402774333954, + "learning_rate": 2.1893637124098046e-05, + "loss": 0.0093, + "step": 111870 + }, + { + "epoch": 24.00593619671776, + "grad_norm": 0.0019466442754492164, + "learning_rate": 2.1890628103052956e-05, + "loss": 0.0966, + "step": 111880 + }, + { + "epoch": 24.00599035909657, + "grad_norm": 0.001929813646711409, + "learning_rate": 2.188761908200786e-05, + "loss": 0.0181, + "step": 111890 + }, + { + "epoch": 24.006044521475385, + "grad_norm": 0.08883125334978104, + "learning_rate": 2.1884610060962768e-05, + "loss": 0.0439, + "step": 111900 + }, + { + "epoch": 24.006098683854194, + "grad_norm": 0.001953336875885725, + "learning_rate": 2.1881601039917675e-05, + "loss": 0.1429, + "step": 111910 + }, + { + "epoch": 24.006152846233007, + "grad_norm": 0.0031604738906025887, + "learning_rate": 2.187859201887258e-05, + 
"loss": 0.0398, + "step": 111920 + }, + { + "epoch": 24.006207008611817, + "grad_norm": 0.067296102643013, + "learning_rate": 2.1875582997827487e-05, + "loss": 0.0097, + "step": 111930 + }, + { + "epoch": 24.00626117099063, + "grad_norm": 1.971771240234375, + "learning_rate": 2.1872573976782397e-05, + "loss": 0.0639, + "step": 111940 + }, + { + "epoch": 24.006315333369443, + "grad_norm": 0.013068148866295815, + "learning_rate": 2.18695649557373e-05, + "loss": 0.1164, + "step": 111950 + }, + { + "epoch": 24.006369495748253, + "grad_norm": 0.16443957388401031, + "learning_rate": 2.186655593469221e-05, + "loss": 0.0097, + "step": 111960 + }, + { + "epoch": 24.006423658127066, + "grad_norm": 0.06441991031169891, + "learning_rate": 2.1863546913647116e-05, + "loss": 0.0573, + "step": 111970 + }, + { + "epoch": 24.006477820505875, + "grad_norm": 0.03532743453979492, + "learning_rate": 2.1860537892602022e-05, + "loss": 0.013, + "step": 111980 + }, + { + "epoch": 24.006531982884688, + "grad_norm": 0.0023915371857583523, + "learning_rate": 2.1857528871556928e-05, + "loss": 0.0241, + "step": 111990 + }, + { + "epoch": 24.0065861452635, + "grad_norm": 0.0021641310304403305, + "learning_rate": 2.1854519850511834e-05, + "loss": 0.0204, + "step": 112000 + }, + { + "epoch": 24.00664030764231, + "grad_norm": 0.00878352764993906, + "learning_rate": 2.1851510829466744e-05, + "loss": 0.0018, + "step": 112010 + }, + { + "epoch": 24.006694470021124, + "grad_norm": 0.012749203480780125, + "learning_rate": 2.1848501808421647e-05, + "loss": 0.1296, + "step": 112020 + }, + { + "epoch": 24.006748632399933, + "grad_norm": 0.006166004575788975, + "learning_rate": 2.1845492787376557e-05, + "loss": 0.0259, + "step": 112030 + }, + { + "epoch": 24.006802794778746, + "grad_norm": 0.007040008902549744, + "learning_rate": 2.1842483766331463e-05, + "loss": 0.0795, + "step": 112040 + }, + { + "epoch": 24.00685695715756, + "grad_norm": 0.01157635822892189, + "learning_rate": 2.183947474528637e-05, + "loss": 0.0418, + "step": 112050 + }, + { + "epoch": 24.00691111953637, + "grad_norm": 0.12010690569877625, + "learning_rate": 2.1836465724241275e-05, + "loss": 0.0455, + "step": 112060 + }, + { + "epoch": 24.006965281915182, + "grad_norm": 0.7071412801742554, + "learning_rate": 2.1833456703196185e-05, + "loss": 0.0034, + "step": 112070 + }, + { + "epoch": 24.007019444293995, + "grad_norm": 0.0500498004257679, + "learning_rate": 2.1830447682151088e-05, + "loss": 0.0319, + "step": 112080 + }, + { + "epoch": 24.007073606672805, + "grad_norm": 0.002845443319529295, + "learning_rate": 2.1827438661105998e-05, + "loss": 0.0472, + "step": 112090 + }, + { + "epoch": 24.007127769051618, + "grad_norm": 0.0023323274217545986, + "learning_rate": 2.1824429640060904e-05, + "loss": 0.0002, + "step": 112100 + }, + { + "epoch": 24.007181931430427, + "grad_norm": 0.13917839527130127, + "learning_rate": 2.182142061901581e-05, + "loss": 0.0679, + "step": 112110 + }, + { + "epoch": 24.00723609380924, + "grad_norm": 0.0021323116961866617, + "learning_rate": 2.1818411597970716e-05, + "loss": 0.0218, + "step": 112120 + }, + { + "epoch": 24.007290256188053, + "grad_norm": 0.007603156846016645, + "learning_rate": 2.1815402576925626e-05, + "loss": 0.0194, + "step": 112130 + }, + { + "epoch": 24.007344418566863, + "grad_norm": 0.002734437817707658, + "learning_rate": 2.1812393555880532e-05, + "loss": 0.056, + "step": 112140 + }, + { + "epoch": 24.007398580945676, + "grad_norm": 0.07753966003656387, + "learning_rate": 2.1809384534835435e-05, + "loss": 0.0169, + 
"step": 112150 + }, + { + "epoch": 24.007452743324485, + "grad_norm": 0.11693260818719864, + "learning_rate": 2.1806375513790345e-05, + "loss": 0.0619, + "step": 112160 + }, + { + "epoch": 24.0075069057033, + "grad_norm": 0.13556896150112152, + "learning_rate": 2.180336649274525e-05, + "loss": 0.0021, + "step": 112170 + }, + { + "epoch": 24.00756106808211, + "grad_norm": 0.012735159136354923, + "learning_rate": 2.1800357471700158e-05, + "loss": 0.0166, + "step": 112180 + }, + { + "epoch": 24.00761523046092, + "grad_norm": 0.30104488134384155, + "learning_rate": 2.1797348450655064e-05, + "loss": 0.0035, + "step": 112190 + }, + { + "epoch": 24.007669392839734, + "grad_norm": 0.00535821495577693, + "learning_rate": 2.1794339429609973e-05, + "loss": 0.0185, + "step": 112200 + }, + { + "epoch": 24.007723555218544, + "grad_norm": 5.387190341949463, + "learning_rate": 2.1791330408564876e-05, + "loss": 0.0265, + "step": 112210 + }, + { + "epoch": 24.007777717597357, + "grad_norm": 0.021093761548399925, + "learning_rate": 2.1788321387519786e-05, + "loss": 0.0002, + "step": 112220 + }, + { + "epoch": 24.00783187997617, + "grad_norm": 0.0018830144545063376, + "learning_rate": 2.1785312366474692e-05, + "loss": 0.0062, + "step": 112230 + }, + { + "epoch": 24.00788604235498, + "grad_norm": 2.3598744869232178, + "learning_rate": 2.17823033454296e-05, + "loss": 0.0308, + "step": 112240 + }, + { + "epoch": 24.007940204733792, + "grad_norm": 0.0017068401211872697, + "learning_rate": 2.1779294324384505e-05, + "loss": 0.0748, + "step": 112250 + }, + { + "epoch": 24.007994367112605, + "grad_norm": 0.3013836443424225, + "learning_rate": 2.1776285303339415e-05, + "loss": 0.095, + "step": 112260 + }, + { + "epoch": 24.008048529491415, + "grad_norm": 0.011094007641077042, + "learning_rate": 2.177327628229432e-05, + "loss": 0.0375, + "step": 112270 + }, + { + "epoch": 24.008102691870228, + "grad_norm": 0.0017685301136225462, + "learning_rate": 2.1770267261249227e-05, + "loss": 0.0116, + "step": 112280 + }, + { + "epoch": 24.008156854249037, + "grad_norm": 0.018376044929027557, + "learning_rate": 2.1767258240204133e-05, + "loss": 0.0776, + "step": 112290 + }, + { + "epoch": 24.00821101662785, + "grad_norm": 0.5960006713867188, + "learning_rate": 2.176424921915904e-05, + "loss": 0.0456, + "step": 112300 + }, + { + "epoch": 24.008265179006663, + "grad_norm": 0.004247572273015976, + "learning_rate": 2.1761240198113946e-05, + "loss": 0.0487, + "step": 112310 + }, + { + "epoch": 24.008319341385473, + "grad_norm": 0.001917716464959085, + "learning_rate": 2.1758231177068852e-05, + "loss": 0.0308, + "step": 112320 + }, + { + "epoch": 24.008373503764286, + "grad_norm": 0.005920063238590956, + "learning_rate": 2.1755222156023762e-05, + "loss": 0.0926, + "step": 112330 + }, + { + "epoch": 24.008427666143096, + "grad_norm": 0.05688656121492386, + "learning_rate": 2.1752213134978665e-05, + "loss": 0.0025, + "step": 112340 + }, + { + "epoch": 24.00848182852191, + "grad_norm": 0.10815165936946869, + "learning_rate": 2.1749204113933574e-05, + "loss": 0.0022, + "step": 112350 + }, + { + "epoch": 24.00853599090072, + "grad_norm": 0.019306911155581474, + "learning_rate": 2.174619509288848e-05, + "loss": 0.1534, + "step": 112360 + }, + { + "epoch": 24.00859015327953, + "grad_norm": 0.001712639001198113, + "learning_rate": 2.1743186071843387e-05, + "loss": 0.0167, + "step": 112370 + }, + { + "epoch": 24.008644315658344, + "grad_norm": 1.168717622756958, + "learning_rate": 2.1740177050798293e-05, + "loss": 0.0736, + "step": 112380 + }, + 
{ + "epoch": 24.008698478037154, + "grad_norm": 0.7969895005226135, + "learning_rate": 2.1737168029753203e-05, + "loss": 0.1676, + "step": 112390 + }, + { + "epoch": 24.008752640415967, + "grad_norm": 0.07338393479585648, + "learning_rate": 2.173415900870811e-05, + "loss": 0.0785, + "step": 112400 + }, + { + "epoch": 24.00880680279478, + "grad_norm": 0.0015848815673962235, + "learning_rate": 2.1731149987663015e-05, + "loss": 0.0109, + "step": 112410 + }, + { + "epoch": 24.00886096517359, + "grad_norm": 0.0019151680171489716, + "learning_rate": 2.172814096661792e-05, + "loss": 0.0538, + "step": 112420 + }, + { + "epoch": 24.008915127552402, + "grad_norm": 0.001672297716140747, + "learning_rate": 2.172513194557283e-05, + "loss": 0.0264, + "step": 112430 + }, + { + "epoch": 24.008969289931215, + "grad_norm": 0.0017784201772883534, + "learning_rate": 2.1722122924527734e-05, + "loss": 0.0643, + "step": 112440 + }, + { + "epoch": 24.009023452310025, + "grad_norm": 0.0036648998502641916, + "learning_rate": 2.171911390348264e-05, + "loss": 0.0091, + "step": 112450 + }, + { + "epoch": 24.009077614688838, + "grad_norm": 0.001890056999400258, + "learning_rate": 2.171610488243755e-05, + "loss": 0.0274, + "step": 112460 + }, + { + "epoch": 24.009131777067648, + "grad_norm": 0.002496580360457301, + "learning_rate": 2.1713095861392453e-05, + "loss": 0.0166, + "step": 112470 + }, + { + "epoch": 24.00918593944646, + "grad_norm": 0.5152420997619629, + "learning_rate": 2.1710086840347363e-05, + "loss": 0.0069, + "step": 112480 + }, + { + "epoch": 24.009240101825274, + "grad_norm": 0.28250423073768616, + "learning_rate": 2.170707781930227e-05, + "loss": 0.054, + "step": 112490 + }, + { + "epoch": 24.009294264204083, + "grad_norm": 0.012072592042386532, + "learning_rate": 2.1704068798257175e-05, + "loss": 0.0211, + "step": 112500 + }, + { + "epoch": 24.009348426582896, + "grad_norm": 0.0017611944349482656, + "learning_rate": 2.170105977721208e-05, + "loss": 0.0351, + "step": 112510 + }, + { + "epoch": 24.009402588961706, + "grad_norm": 0.0019188442965969443, + "learning_rate": 2.169805075616699e-05, + "loss": 0.064, + "step": 112520 + }, + { + "epoch": 24.00945675134052, + "grad_norm": 0.04418163374066353, + "learning_rate": 2.1695041735121897e-05, + "loss": 0.0532, + "step": 112530 + }, + { + "epoch": 24.009510913719332, + "grad_norm": 0.0015643625520169735, + "learning_rate": 2.1692032714076804e-05, + "loss": 0.0082, + "step": 112540 + }, + { + "epoch": 24.00956507609814, + "grad_norm": 0.0017962065758183599, + "learning_rate": 2.168902369303171e-05, + "loss": 0.0085, + "step": 112550 + }, + { + "epoch": 24.009619238476954, + "grad_norm": 5.016520977020264, + "learning_rate": 2.168601467198662e-05, + "loss": 0.0587, + "step": 112560 + }, + { + "epoch": 24.009673400855764, + "grad_norm": 0.0015533455880358815, + "learning_rate": 2.1683005650941523e-05, + "loss": 0.1082, + "step": 112570 + }, + { + "epoch": 24.009727563234577, + "grad_norm": 0.2732621133327484, + "learning_rate": 2.1679996629896432e-05, + "loss": 0.0163, + "step": 112580 + }, + { + "epoch": 24.00978172561339, + "grad_norm": 0.001471085473895073, + "learning_rate": 2.167698760885134e-05, + "loss": 0.0101, + "step": 112590 + }, + { + "epoch": 24.0098358879922, + "grad_norm": 0.0016785203479230404, + "learning_rate": 2.167397858780624e-05, + "loss": 0.0035, + "step": 112600 + }, + { + "epoch": 24.009890050371013, + "grad_norm": 0.025169605389237404, + "learning_rate": 2.167096956676115e-05, + "loss": 0.0231, + "step": 112610 + }, + { + "epoch": 
24.009944212749826, + "grad_norm": 0.0013882674975320697, + "learning_rate": 2.1667960545716057e-05, + "loss": 0.037, + "step": 112620 + }, + { + "epoch": 24.009998375128635, + "grad_norm": 0.13531243801116943, + "learning_rate": 2.1664951524670964e-05, + "loss": 0.0183, + "step": 112630 + }, + { + "epoch": 24.01005253750745, + "grad_norm": 0.11568548530340195, + "learning_rate": 2.166194250362587e-05, + "loss": 0.0545, + "step": 112640 + }, + { + "epoch": 24.010106699886258, + "grad_norm": 0.21896880865097046, + "learning_rate": 2.165893348258078e-05, + "loss": 0.0866, + "step": 112650 + }, + { + "epoch": 24.01016086226507, + "grad_norm": 0.35188502073287964, + "learning_rate": 2.1655924461535686e-05, + "loss": 0.0938, + "step": 112660 + }, + { + "epoch": 24.010215024643884, + "grad_norm": 0.0014496732037514448, + "learning_rate": 2.1652915440490592e-05, + "loss": 0.0202, + "step": 112670 + }, + { + "epoch": 24.010269187022693, + "grad_norm": 0.018309298902750015, + "learning_rate": 2.16499064194455e-05, + "loss": 0.0056, + "step": 112680 + }, + { + "epoch": 24.010323349401506, + "grad_norm": 0.0018515611300244927, + "learning_rate": 2.1646897398400408e-05, + "loss": 0.0682, + "step": 112690 + }, + { + "epoch": 24.010377511780316, + "grad_norm": 3.596996784210205, + "learning_rate": 2.164388837735531e-05, + "loss": 0.0938, + "step": 112700 + }, + { + "epoch": 24.01043167415913, + "grad_norm": 0.0013744211755692959, + "learning_rate": 2.164087935631022e-05, + "loss": 0.0165, + "step": 112710 + }, + { + "epoch": 24.010485836537942, + "grad_norm": 0.0019622347317636013, + "learning_rate": 2.1637870335265127e-05, + "loss": 0.0505, + "step": 112720 + }, + { + "epoch": 24.01053999891675, + "grad_norm": 0.00144346640445292, + "learning_rate": 2.1634861314220033e-05, + "loss": 0.0428, + "step": 112730 + }, + { + "epoch": 24.010594161295565, + "grad_norm": 0.118797168135643, + "learning_rate": 2.163185229317494e-05, + "loss": 0.0858, + "step": 112740 + }, + { + "epoch": 24.010648323674374, + "grad_norm": 0.023842332884669304, + "learning_rate": 2.1628843272129846e-05, + "loss": 0.0071, + "step": 112750 + }, + { + "epoch": 24.010702486053187, + "grad_norm": 4.373809814453125, + "learning_rate": 2.1625834251084752e-05, + "loss": 0.0883, + "step": 112760 + }, + { + "epoch": 24.010756648432, + "grad_norm": 0.1146487295627594, + "learning_rate": 2.1622825230039658e-05, + "loss": 0.0364, + "step": 112770 + }, + { + "epoch": 24.01081081081081, + "grad_norm": 0.0015546065988019109, + "learning_rate": 2.1619816208994568e-05, + "loss": 0.0123, + "step": 112780 + }, + { + "epoch": 24.010864973189623, + "grad_norm": 1.3037118911743164, + "learning_rate": 2.1616807187949474e-05, + "loss": 0.0881, + "step": 112790 + }, + { + "epoch": 24.010919135568436, + "grad_norm": 0.2178073525428772, + "learning_rate": 2.161379816690438e-05, + "loss": 0.0222, + "step": 112800 + }, + { + "epoch": 24.010973297947245, + "grad_norm": 0.001531708985567093, + "learning_rate": 2.1610789145859287e-05, + "loss": 0.0834, + "step": 112810 + }, + { + "epoch": 24.01102746032606, + "grad_norm": 16.60877799987793, + "learning_rate": 2.1607780124814196e-05, + "loss": 0.1646, + "step": 112820 + }, + { + "epoch": 24.011081622704868, + "grad_norm": 16.609025955200195, + "learning_rate": 2.16047711037691e-05, + "loss": 0.0162, + "step": 112830 + }, + { + "epoch": 24.01113578508368, + "grad_norm": 0.0030234025325626135, + "learning_rate": 2.160176208272401e-05, + "loss": 0.0524, + "step": 112840 + }, + { + "epoch": 24.011189947462494, + 
"grad_norm": 0.03615400940179825, + "learning_rate": 2.1598753061678915e-05, + "loss": 0.1245, + "step": 112850 + }, + { + "epoch": 24.011244109841304, + "grad_norm": 0.002030319534242153, + "learning_rate": 2.159574404063382e-05, + "loss": 0.0687, + "step": 112860 + }, + { + "epoch": 24.011298272220117, + "grad_norm": 0.07465808093547821, + "learning_rate": 2.1592735019588728e-05, + "loss": 0.0342, + "step": 112870 + }, + { + "epoch": 24.011352434598926, + "grad_norm": 0.2867990732192993, + "learning_rate": 2.1589725998543637e-05, + "loss": 0.0398, + "step": 112880 + }, + { + "epoch": 24.01140659697774, + "grad_norm": 0.04339825361967087, + "learning_rate": 2.158671697749854e-05, + "loss": 0.0439, + "step": 112890 + }, + { + "epoch": 24.011460759356552, + "grad_norm": 0.1657474786043167, + "learning_rate": 2.1583707956453447e-05, + "loss": 0.014, + "step": 112900 + }, + { + "epoch": 24.011514921735362, + "grad_norm": 0.037248365581035614, + "learning_rate": 2.1580698935408356e-05, + "loss": 0.0061, + "step": 112910 + }, + { + "epoch": 24.011569084114175, + "grad_norm": 0.0018316766945645213, + "learning_rate": 2.1577689914363263e-05, + "loss": 0.0568, + "step": 112920 + }, + { + "epoch": 24.011623246492984, + "grad_norm": 0.41635459661483765, + "learning_rate": 2.157468089331817e-05, + "loss": 0.072, + "step": 112930 + }, + { + "epoch": 24.011677408871797, + "grad_norm": 0.008700650185346603, + "learning_rate": 2.1571671872273075e-05, + "loss": 0.0051, + "step": 112940 + }, + { + "epoch": 24.01173157125061, + "grad_norm": 6.8607001304626465, + "learning_rate": 2.1568662851227985e-05, + "loss": 0.1353, + "step": 112950 + }, + { + "epoch": 24.01178573362942, + "grad_norm": 0.0027496078982949257, + "learning_rate": 2.1565653830182888e-05, + "loss": 0.0006, + "step": 112960 + }, + { + "epoch": 24.011839896008233, + "grad_norm": 0.0044270846992731094, + "learning_rate": 2.1562644809137797e-05, + "loss": 0.0398, + "step": 112970 + }, + { + "epoch": 24.011894058387043, + "grad_norm": 0.08997154235839844, + "learning_rate": 2.1559635788092704e-05, + "loss": 0.0657, + "step": 112980 + }, + { + "epoch": 24.011948220765856, + "grad_norm": 0.2241232842206955, + "learning_rate": 2.155662676704761e-05, + "loss": 0.0191, + "step": 112990 + }, + { + "epoch": 24.01200238314467, + "grad_norm": 0.010795065201818943, + "learning_rate": 2.1553617746002516e-05, + "loss": 0.0345, + "step": 113000 + }, + { + "epoch": 24.01205654552348, + "grad_norm": 0.7933324575424194, + "learning_rate": 2.1550608724957426e-05, + "loss": 0.0338, + "step": 113010 + }, + { + "epoch": 24.01211070790229, + "grad_norm": 1.8673243522644043, + "learning_rate": 2.154759970391233e-05, + "loss": 0.09, + "step": 113020 + }, + { + "epoch": 24.012164870281104, + "grad_norm": 0.0018553482368588448, + "learning_rate": 2.1544590682867238e-05, + "loss": 0.0038, + "step": 113030 + }, + { + "epoch": 24.012219032659914, + "grad_norm": 0.0017748440150171518, + "learning_rate": 2.1541581661822145e-05, + "loss": 0.0748, + "step": 113040 + }, + { + "epoch": 24.012273195038727, + "grad_norm": 0.029561182484030724, + "learning_rate": 2.153857264077705e-05, + "loss": 0.0764, + "step": 113050 + }, + { + "epoch": 24.012327357417536, + "grad_norm": 0.00259788753464818, + "learning_rate": 2.1535563619731957e-05, + "loss": 0.0219, + "step": 113060 + }, + { + "epoch": 24.01238151979635, + "grad_norm": 0.025048965588212013, + "learning_rate": 2.1532554598686863e-05, + "loss": 0.0842, + "step": 113070 + }, + { + "epoch": 24.012435682175163, + "grad_norm": 
0.06957130134105682, + "learning_rate": 2.1529545577641773e-05, + "loss": 0.023, + "step": 113080 + }, + { + "epoch": 24.012489844553972, + "grad_norm": 0.0054787565022706985, + "learning_rate": 2.1526536556596676e-05, + "loss": 0.0327, + "step": 113090 + }, + { + "epoch": 24.012544006932785, + "grad_norm": 0.003187913680449128, + "learning_rate": 2.1523527535551586e-05, + "loss": 0.0396, + "step": 113100 + }, + { + "epoch": 24.012598169311595, + "grad_norm": 2.254503011703491, + "learning_rate": 2.1520518514506492e-05, + "loss": 0.032, + "step": 113110 + }, + { + "epoch": 24.012652331690408, + "grad_norm": 0.0020638799760490656, + "learning_rate": 2.1517509493461398e-05, + "loss": 0.02, + "step": 113120 + }, + { + "epoch": 24.01270649406922, + "grad_norm": 0.6656591892242432, + "learning_rate": 2.1514500472416304e-05, + "loss": 0.0378, + "step": 113130 + }, + { + "epoch": 24.01276065644803, + "grad_norm": 0.1809844672679901, + "learning_rate": 2.1511491451371214e-05, + "loss": 0.0727, + "step": 113140 + }, + { + "epoch": 24.012814818826843, + "grad_norm": 0.010689768940210342, + "learning_rate": 2.1508482430326117e-05, + "loss": 0.0841, + "step": 113150 + }, + { + "epoch": 24.012868981205653, + "grad_norm": 0.0018098365981131792, + "learning_rate": 2.1505473409281027e-05, + "loss": 0.0178, + "step": 113160 + }, + { + "epoch": 24.012923143584466, + "grad_norm": 2.664243698120117, + "learning_rate": 2.1502464388235933e-05, + "loss": 0.1038, + "step": 113170 + }, + { + "epoch": 24.01297730596328, + "grad_norm": 0.020989201962947845, + "learning_rate": 2.149945536719084e-05, + "loss": 0.0085, + "step": 113180 + }, + { + "epoch": 24.01303146834209, + "grad_norm": 0.0030354689806699753, + "learning_rate": 2.1496446346145745e-05, + "loss": 0.0331, + "step": 113190 + }, + { + "epoch": 24.0130856307209, + "grad_norm": 0.1395062953233719, + "learning_rate": 2.1493437325100652e-05, + "loss": 0.0234, + "step": 113200 + }, + { + "epoch": 24.013139793099715, + "grad_norm": 0.044860485941171646, + "learning_rate": 2.149042830405556e-05, + "loss": 0.0254, + "step": 113210 + }, + { + "epoch": 24.013193955478524, + "grad_norm": 1.1462453603744507, + "learning_rate": 2.1487419283010464e-05, + "loss": 0.0539, + "step": 113220 + }, + { + "epoch": 24.013248117857337, + "grad_norm": 0.0016114816535264254, + "learning_rate": 2.1484410261965374e-05, + "loss": 0.0768, + "step": 113230 + }, + { + "epoch": 24.013302280236147, + "grad_norm": 0.04635956510901451, + "learning_rate": 2.148140124092028e-05, + "loss": 0.0568, + "step": 113240 + }, + { + "epoch": 24.01335644261496, + "grad_norm": 0.487678587436676, + "learning_rate": 2.1478392219875187e-05, + "loss": 0.053, + "step": 113250 + }, + { + "epoch": 24.013410604993773, + "grad_norm": 0.0017760992050170898, + "learning_rate": 2.1475383198830093e-05, + "loss": 0.0333, + "step": 113260 + }, + { + "epoch": 24.013464767372582, + "grad_norm": 0.0015582715859636664, + "learning_rate": 2.1472374177785002e-05, + "loss": 0.0153, + "step": 113270 + }, + { + "epoch": 24.013518929751395, + "grad_norm": 2.4364283084869385, + "learning_rate": 2.1469365156739905e-05, + "loss": 0.1056, + "step": 113280 + }, + { + "epoch": 24.013573092130205, + "grad_norm": 0.0020405235700309277, + "learning_rate": 2.1466356135694815e-05, + "loss": 0.0226, + "step": 113290 + }, + { + "epoch": 24.013627254509018, + "grad_norm": 0.0019794534891843796, + "learning_rate": 2.146334711464972e-05, + "loss": 0.0136, + "step": 113300 + }, + { + "epoch": 24.01368141688783, + "grad_norm": 
0.06363284587860107, + "learning_rate": 2.1460338093604628e-05, + "loss": 0.0284, + "step": 113310 + }, + { + "epoch": 24.01373557926664, + "grad_norm": 0.5616014003753662, + "learning_rate": 2.1457329072559534e-05, + "loss": 0.0247, + "step": 113320 + }, + { + "epoch": 24.013789741645454, + "grad_norm": 0.022248700261116028, + "learning_rate": 2.1454320051514443e-05, + "loss": 0.0479, + "step": 113330 + }, + { + "epoch": 24.013843904024263, + "grad_norm": 0.0016042458591982722, + "learning_rate": 2.145131103046935e-05, + "loss": 0.0266, + "step": 113340 + }, + { + "epoch": 24.013898066403076, + "grad_norm": 0.04210544750094414, + "learning_rate": 2.1448302009424253e-05, + "loss": 0.0216, + "step": 113350 + }, + { + "epoch": 24.01395222878189, + "grad_norm": 0.3754367530345917, + "learning_rate": 2.1445292988379162e-05, + "loss": 0.0654, + "step": 113360 + }, + { + "epoch": 24.0140063911607, + "grad_norm": 0.2041029930114746, + "learning_rate": 2.144228396733407e-05, + "loss": 0.0484, + "step": 113370 + }, + { + "epoch": 24.01406055353951, + "grad_norm": 0.0572352297604084, + "learning_rate": 2.1439274946288975e-05, + "loss": 0.0688, + "step": 113380 + }, + { + "epoch": 24.014114715918325, + "grad_norm": 0.0026086363941431046, + "learning_rate": 2.143626592524388e-05, + "loss": 0.0432, + "step": 113390 + }, + { + "epoch": 24.014168878297134, + "grad_norm": 1.0183104276657104, + "learning_rate": 2.143325690419879e-05, + "loss": 0.0162, + "step": 113400 + }, + { + "epoch": 24.014223040675947, + "grad_norm": 0.0026129563339054585, + "learning_rate": 2.1430247883153694e-05, + "loss": 0.0007, + "step": 113410 + }, + { + "epoch": 24.014277203054757, + "grad_norm": 0.0017758746398612857, + "learning_rate": 2.1427238862108603e-05, + "loss": 0.0339, + "step": 113420 + }, + { + "epoch": 24.01433136543357, + "grad_norm": 0.0023856579791754484, + "learning_rate": 2.142422984106351e-05, + "loss": 0.0039, + "step": 113430 + }, + { + "epoch": 24.014385527812383, + "grad_norm": 0.0013530384749174118, + "learning_rate": 2.1421220820018416e-05, + "loss": 0.0417, + "step": 113440 + }, + { + "epoch": 24.014439690191193, + "grad_norm": 0.001812460832297802, + "learning_rate": 2.1418211798973322e-05, + "loss": 0.0076, + "step": 113450 + }, + { + "epoch": 24.014493852570006, + "grad_norm": 0.08437223732471466, + "learning_rate": 2.1415202777928232e-05, + "loss": 0.0363, + "step": 113460 + }, + { + "epoch": 24.014548014948815, + "grad_norm": 0.0013915997697040439, + "learning_rate": 2.1412193756883138e-05, + "loss": 0.0033, + "step": 113470 + }, + { + "epoch": 24.014602177327628, + "grad_norm": 0.0013687012251466513, + "learning_rate": 2.1409184735838044e-05, + "loss": 0.1933, + "step": 113480 + }, + { + "epoch": 24.01465633970644, + "grad_norm": 0.7441906332969666, + "learning_rate": 2.140617571479295e-05, + "loss": 0.0634, + "step": 113490 + }, + { + "epoch": 24.01471050208525, + "grad_norm": 0.023599838837981224, + "learning_rate": 2.1403166693747857e-05, + "loss": 0.0069, + "step": 113500 + }, + { + "epoch": 24.014764664464064, + "grad_norm": 6.938642978668213, + "learning_rate": 2.1400157672702763e-05, + "loss": 0.0064, + "step": 113510 + }, + { + "epoch": 24.014818826842873, + "grad_norm": 0.003211065661162138, + "learning_rate": 2.139714865165767e-05, + "loss": 0.0062, + "step": 113520 + }, + { + "epoch": 24.014872989221686, + "grad_norm": 0.02511332742869854, + "learning_rate": 2.139413963061258e-05, + "loss": 0.0001, + "step": 113530 + }, + { + "epoch": 24.0149271516005, + "grad_norm": 
0.003675167914479971, + "learning_rate": 2.1391130609567482e-05, + "loss": 0.1482, + "step": 113540 + }, + { + "epoch": 24.01498131397931, + "grad_norm": 0.17067790031433105, + "learning_rate": 2.138812158852239e-05, + "loss": 0.0109, + "step": 113550 + }, + { + "epoch": 24.015035476358122, + "grad_norm": 0.45375728607177734, + "learning_rate": 2.1385112567477298e-05, + "loss": 0.0392, + "step": 113560 + }, + { + "epoch": 24.015089638736935, + "grad_norm": 0.0014812394510954618, + "learning_rate": 2.1382103546432204e-05, + "loss": 0.0122, + "step": 113570 + }, + { + "epoch": 24.015143801115745, + "grad_norm": 0.0013781000161543489, + "learning_rate": 2.137909452538711e-05, + "loss": 0.0864, + "step": 113580 + }, + { + "epoch": 24.015197963494558, + "grad_norm": 0.09229139983654022, + "learning_rate": 2.137608550434202e-05, + "loss": 0.0181, + "step": 113590 + }, + { + "epoch": 24.015252125873367, + "grad_norm": 0.07597949355840683, + "learning_rate": 2.1373076483296926e-05, + "loss": 0.0403, + "step": 113600 + }, + { + "epoch": 24.01530628825218, + "grad_norm": 0.07705762982368469, + "learning_rate": 2.1370067462251833e-05, + "loss": 0.0004, + "step": 113610 + }, + { + "epoch": 24.015360450630993, + "grad_norm": 0.0015343059785664082, + "learning_rate": 2.136705844120674e-05, + "loss": 0.0353, + "step": 113620 + }, + { + "epoch": 24.015414613009803, + "grad_norm": 0.18222694098949432, + "learning_rate": 2.136404942016165e-05, + "loss": 0.0393, + "step": 113630 + }, + { + "epoch": 24.015468775388616, + "grad_norm": 14.577596664428711, + "learning_rate": 2.136104039911655e-05, + "loss": 0.0761, + "step": 113640 + }, + { + "epoch": 24.015522937767425, + "grad_norm": 0.04568137601017952, + "learning_rate": 2.1358031378071458e-05, + "loss": 0.0652, + "step": 113650 + }, + { + "epoch": 24.01557710014624, + "grad_norm": 0.0014516324736177921, + "learning_rate": 2.1355022357026367e-05, + "loss": 0.1029, + "step": 113660 + }, + { + "epoch": 24.01563126252505, + "grad_norm": 0.046152692288160324, + "learning_rate": 2.135201333598127e-05, + "loss": 0.0037, + "step": 113670 + }, + { + "epoch": 24.01568542490386, + "grad_norm": 0.00147419108543545, + "learning_rate": 2.134900431493618e-05, + "loss": 0.0662, + "step": 113680 + }, + { + "epoch": 24.015739587282674, + "grad_norm": 3.139159917831421, + "learning_rate": 2.1345995293891086e-05, + "loss": 0.0768, + "step": 113690 + }, + { + "epoch": 24.015793749661484, + "grad_norm": 0.003473025979474187, + "learning_rate": 2.1342986272845993e-05, + "loss": 0.012, + "step": 113700 + }, + { + "epoch": 24.015847912040297, + "grad_norm": 0.0015986269572749734, + "learning_rate": 2.13399772518009e-05, + "loss": 0.0511, + "step": 113710 + }, + { + "epoch": 24.01590207441911, + "grad_norm": 2.446255922317505, + "learning_rate": 2.133696823075581e-05, + "loss": 0.0592, + "step": 113720 + }, + { + "epoch": 24.01595623679792, + "grad_norm": 0.03139108419418335, + "learning_rate": 2.1333959209710715e-05, + "loss": 0.0346, + "step": 113730 + }, + { + "epoch": 24.016010399176732, + "grad_norm": 0.0016689434414729476, + "learning_rate": 2.133095018866562e-05, + "loss": 0.1352, + "step": 113740 + }, + { + "epoch": 24.016064561555545, + "grad_norm": 0.0025955173186957836, + "learning_rate": 2.1327941167620527e-05, + "loss": 0.0335, + "step": 113750 + }, + { + "epoch": 24.016118723934355, + "grad_norm": 0.03900570049881935, + "learning_rate": 2.1324932146575437e-05, + "loss": 0.0452, + "step": 113760 + }, + { + "epoch": 24.016172886313168, + "grad_norm": 
0.0018046335317194462, + "learning_rate": 2.132192312553034e-05, + "loss": 0.0549, + "step": 113770 + }, + { + "epoch": 24.016227048691977, + "grad_norm": 0.8266103863716125, + "learning_rate": 2.131891410448525e-05, + "loss": 0.0456, + "step": 113780 + }, + { + "epoch": 24.01628121107079, + "grad_norm": 7.276651382446289, + "learning_rate": 2.1315905083440156e-05, + "loss": 0.0214, + "step": 113790 + }, + { + "epoch": 24.016335373449603, + "grad_norm": 0.04057473689317703, + "learning_rate": 2.131289606239506e-05, + "loss": 0.0356, + "step": 113800 + }, + { + "epoch": 24.016389535828413, + "grad_norm": 0.01816633529961109, + "learning_rate": 2.130988704134997e-05, + "loss": 0.0019, + "step": 113810 + }, + { + "epoch": 24.016443698207226, + "grad_norm": 0.5858601331710815, + "learning_rate": 2.1306878020304875e-05, + "loss": 0.0042, + "step": 113820 + }, + { + "epoch": 24.016497860586036, + "grad_norm": 0.8483256101608276, + "learning_rate": 2.130386899925978e-05, + "loss": 0.0171, + "step": 113830 + }, + { + "epoch": 24.01655202296485, + "grad_norm": 0.0631588026881218, + "learning_rate": 2.1300859978214687e-05, + "loss": 0.1041, + "step": 113840 + }, + { + "epoch": 24.01660618534366, + "grad_norm": 1.2945460081100464, + "learning_rate": 2.1297850957169597e-05, + "loss": 0.0689, + "step": 113850 + }, + { + "epoch": 24.01666034772247, + "grad_norm": 0.0017774795414879918, + "learning_rate": 2.1294841936124503e-05, + "loss": 0.0625, + "step": 113860 + }, + { + "epoch": 24.016714510101284, + "grad_norm": 0.0016165822744369507, + "learning_rate": 2.129183291507941e-05, + "loss": 0.011, + "step": 113870 + }, + { + "epoch": 24.016768672480094, + "grad_norm": 0.06884371489286423, + "learning_rate": 2.1288823894034316e-05, + "loss": 0.0089, + "step": 113880 + }, + { + "epoch": 24.016822834858907, + "grad_norm": 0.0776732861995697, + "learning_rate": 2.1285814872989225e-05, + "loss": 0.0423, + "step": 113890 + }, + { + "epoch": 24.01687699723772, + "grad_norm": 0.06016451120376587, + "learning_rate": 2.1282805851944128e-05, + "loss": 0.0581, + "step": 113900 + }, + { + "epoch": 24.01693115961653, + "grad_norm": 0.0016860498581081629, + "learning_rate": 2.1279796830899038e-05, + "loss": 0.0618, + "step": 113910 + }, + { + "epoch": 24.016985321995342, + "grad_norm": 0.6701891422271729, + "learning_rate": 2.1276787809853944e-05, + "loss": 0.0191, + "step": 113920 + }, + { + "epoch": 24.017039484374155, + "grad_norm": 0.0016131805023178458, + "learning_rate": 2.127377878880885e-05, + "loss": 0.0013, + "step": 113930 + }, + { + "epoch": 24.017093646752965, + "grad_norm": 0.3032415211200714, + "learning_rate": 2.1270769767763757e-05, + "loss": 0.0026, + "step": 113940 + }, + { + "epoch": 24.017147809131778, + "grad_norm": 15.064041137695312, + "learning_rate": 2.1267760746718663e-05, + "loss": 0.2454, + "step": 113950 + }, + { + "epoch": 24.017201971510588, + "grad_norm": 0.03130391240119934, + "learning_rate": 2.126475172567357e-05, + "loss": 0.0331, + "step": 113960 + }, + { + "epoch": 24.0172561338894, + "grad_norm": 0.34014376997947693, + "learning_rate": 2.1261742704628476e-05, + "loss": 0.0872, + "step": 113970 + }, + { + "epoch": 24.017310296268214, + "grad_norm": 0.0017121119890362024, + "learning_rate": 2.1258733683583385e-05, + "loss": 0.0453, + "step": 113980 + }, + { + "epoch": 24.017364458647023, + "grad_norm": 0.002029063180088997, + "learning_rate": 2.125572466253829e-05, + "loss": 0.0501, + "step": 113990 + }, + { + "epoch": 24.017418621025836, + "grad_norm": 0.003881208598613739, + 
"learning_rate": 2.1252715641493198e-05, + "loss": 0.0011, + "step": 114000 + }, + { + "epoch": 24.017472783404646, + "grad_norm": 0.4466160237789154, + "learning_rate": 2.1249706620448104e-05, + "loss": 0.0669, + "step": 114010 + }, + { + "epoch": 24.01752694578346, + "grad_norm": 0.0056477803736925125, + "learning_rate": 2.1246697599403014e-05, + "loss": 0.0197, + "step": 114020 + }, + { + "epoch": 24.017581108162272, + "grad_norm": 0.0016898814355954528, + "learning_rate": 2.1243688578357917e-05, + "loss": 0.1148, + "step": 114030 + }, + { + "epoch": 24.01763527054108, + "grad_norm": 1.8342667818069458, + "learning_rate": 2.1240679557312826e-05, + "loss": 0.1027, + "step": 114040 + }, + { + "epoch": 24.017689432919894, + "grad_norm": 0.0032722039613872766, + "learning_rate": 2.1237670536267733e-05, + "loss": 0.0567, + "step": 114050 + }, + { + "epoch": 24.017743595298704, + "grad_norm": 1.4358357191085815, + "learning_rate": 2.123466151522264e-05, + "loss": 0.0733, + "step": 114060 + }, + { + "epoch": 24.017797757677517, + "grad_norm": 0.7617146968841553, + "learning_rate": 2.1231652494177545e-05, + "loss": 0.0318, + "step": 114070 + }, + { + "epoch": 24.01785192005633, + "grad_norm": 0.788500189781189, + "learning_rate": 2.122864347313245e-05, + "loss": 0.0858, + "step": 114080 + }, + { + "epoch": 24.01790608243514, + "grad_norm": 0.69354248046875, + "learning_rate": 2.1225634452087358e-05, + "loss": 0.0384, + "step": 114090 + }, + { + "epoch": 24.017960244813953, + "grad_norm": 0.6829826831817627, + "learning_rate": 2.1222625431042264e-05, + "loss": 0.1137, + "step": 114100 + }, + { + "epoch": 24.018014407192762, + "grad_norm": 0.0018541066674515605, + "learning_rate": 2.1219616409997174e-05, + "loss": 0.0529, + "step": 114110 + }, + { + "epoch": 24.018068569571575, + "grad_norm": 0.0037660757079720497, + "learning_rate": 2.121660738895208e-05, + "loss": 0.0378, + "step": 114120 + }, + { + "epoch": 24.01812273195039, + "grad_norm": 0.0026226083282381296, + "learning_rate": 2.1213598367906986e-05, + "loss": 0.0552, + "step": 114130 + }, + { + "epoch": 24.018176894329198, + "grad_norm": 0.005522522144019604, + "learning_rate": 2.1210589346861892e-05, + "loss": 0.0475, + "step": 114140 + }, + { + "epoch": 24.01823105670801, + "grad_norm": 0.012818469665944576, + "learning_rate": 2.1207580325816802e-05, + "loss": 0.036, + "step": 114150 + }, + { + "epoch": 24.018285219086824, + "grad_norm": 1.2338309288024902, + "learning_rate": 2.1204571304771705e-05, + "loss": 0.1286, + "step": 114160 + }, + { + "epoch": 24.018339381465633, + "grad_norm": 6.268895149230957, + "learning_rate": 2.1201562283726615e-05, + "loss": 0.093, + "step": 114170 + }, + { + "epoch": 24.018393543844446, + "grad_norm": 0.23401525616645813, + "learning_rate": 2.119855326268152e-05, + "loss": 0.0389, + "step": 114180 + }, + { + "epoch": 24.018447706223256, + "grad_norm": 0.0018627078970894217, + "learning_rate": 2.1195544241636427e-05, + "loss": 0.0526, + "step": 114190 + }, + { + "epoch": 24.01850186860207, + "grad_norm": 0.0056606680154800415, + "learning_rate": 2.1192535220591333e-05, + "loss": 0.0292, + "step": 114200 + }, + { + "epoch": 24.018556030980882, + "grad_norm": 0.9065867066383362, + "learning_rate": 2.1189526199546243e-05, + "loss": 0.0104, + "step": 114210 + }, + { + "epoch": 24.01861019335969, + "grad_norm": 0.014158060774207115, + "learning_rate": 2.1186517178501146e-05, + "loss": 0.0584, + "step": 114220 + }, + { + "epoch": 24.018664355738505, + "grad_norm": 0.013771045953035355, + "learning_rate": 
2.1183508157456052e-05, + "loss": 0.0131, + "step": 114230 + }, + { + "epoch": 24.018718518117314, + "grad_norm": 0.001953363651409745, + "learning_rate": 2.1180499136410962e-05, + "loss": 0.012, + "step": 114240 + }, + { + "epoch": 24.018772680496127, + "grad_norm": 0.8317484259605408, + "learning_rate": 2.1177490115365868e-05, + "loss": 0.0621, + "step": 114250 + }, + { + "epoch": 24.01882684287494, + "grad_norm": 0.002819872461259365, + "learning_rate": 2.1174481094320774e-05, + "loss": 0.0688, + "step": 114260 + }, + { + "epoch": 24.01888100525375, + "grad_norm": 1.595841884613037, + "learning_rate": 2.117147207327568e-05, + "loss": 0.0403, + "step": 114270 + }, + { + "epoch": 24.018935167632563, + "grad_norm": 2.3176050186157227, + "learning_rate": 2.116846305223059e-05, + "loss": 0.1215, + "step": 114280 + }, + { + "epoch": 24.018989330011372, + "grad_norm": 0.0030866956803947687, + "learning_rate": 2.1165454031185493e-05, + "loss": 0.0133, + "step": 114290 + }, + { + "epoch": 24.019043492390185, + "grad_norm": 1.8573122024536133, + "learning_rate": 2.1162445010140403e-05, + "loss": 0.0606, + "step": 114300 + }, + { + "epoch": 24.019097654769, + "grad_norm": 11.588638305664062, + "learning_rate": 2.115943598909531e-05, + "loss": 0.1309, + "step": 114310 + }, + { + "epoch": 24.019151817147808, + "grad_norm": 0.0064269620925188065, + "learning_rate": 2.1156426968050215e-05, + "loss": 0.0044, + "step": 114320 + }, + { + "epoch": 24.01920597952662, + "grad_norm": 0.001987087307497859, + "learning_rate": 2.1153417947005122e-05, + "loss": 0.0225, + "step": 114330 + }, + { + "epoch": 24.019260141905434, + "grad_norm": 2.071107864379883, + "learning_rate": 2.115040892596003e-05, + "loss": 0.0197, + "step": 114340 + }, + { + "epoch": 24.019314304284244, + "grad_norm": 15.294261932373047, + "learning_rate": 2.1147399904914934e-05, + "loss": 0.024, + "step": 114350 + }, + { + "epoch": 24.019368466663057, + "grad_norm": 0.0018683391390368342, + "learning_rate": 2.1144390883869844e-05, + "loss": 0.0411, + "step": 114360 + }, + { + "epoch": 24.019422629041866, + "grad_norm": 0.0019173587206751108, + "learning_rate": 2.114138186282475e-05, + "loss": 0.0142, + "step": 114370 + }, + { + "epoch": 24.01947679142068, + "grad_norm": 0.5444108247756958, + "learning_rate": 2.1138372841779657e-05, + "loss": 0.025, + "step": 114380 + }, + { + "epoch": 24.019530953799492, + "grad_norm": 0.6401435136795044, + "learning_rate": 2.1135363820734563e-05, + "loss": 0.025, + "step": 114390 + }, + { + "epoch": 24.019585116178302, + "grad_norm": 0.002444078680127859, + "learning_rate": 2.113235479968947e-05, + "loss": 0.0224, + "step": 114400 + }, + { + "epoch": 24.019639278557115, + "grad_norm": 4.192163467407227, + "learning_rate": 2.112934577864438e-05, + "loss": 0.1008, + "step": 114410 + }, + { + "epoch": 24.019693440935924, + "grad_norm": 4.019739627838135, + "learning_rate": 2.112633675759928e-05, + "loss": 0.1424, + "step": 114420 + }, + { + "epoch": 24.019747603314737, + "grad_norm": 0.008965604938566685, + "learning_rate": 2.112332773655419e-05, + "loss": 0.0605, + "step": 114430 + }, + { + "epoch": 24.01980176569355, + "grad_norm": 0.5344670414924622, + "learning_rate": 2.1120318715509098e-05, + "loss": 0.0314, + "step": 114440 + }, + { + "epoch": 24.01985592807236, + "grad_norm": 1.0643422603607178, + "learning_rate": 2.1117309694464004e-05, + "loss": 0.0698, + "step": 114450 + }, + { + "epoch": 24.019910090451173, + "grad_norm": 0.001746906666085124, + "learning_rate": 2.111430067341891e-05, + "loss": 
0.0498, + "step": 114460 + }, + { + "epoch": 24.019964252829983, + "grad_norm": 0.04114500805735588, + "learning_rate": 2.111129165237382e-05, + "loss": 0.0221, + "step": 114470 + }, + { + "epoch": 24.020018415208796, + "grad_norm": 0.4604531526565552, + "learning_rate": 2.1108282631328723e-05, + "loss": 0.0601, + "step": 114480 + }, + { + "epoch": 24.02007257758761, + "grad_norm": 0.48603716492652893, + "learning_rate": 2.1105273610283632e-05, + "loss": 0.0089, + "step": 114490 + }, + { + "epoch": 24.02012673996642, + "grad_norm": 0.2637609541416168, + "learning_rate": 2.110226458923854e-05, + "loss": 0.0084, + "step": 114500 + }, + { + "epoch": 24.02018090234523, + "grad_norm": 0.0031044562347233295, + "learning_rate": 2.1099255568193445e-05, + "loss": 0.0862, + "step": 114510 + }, + { + "epoch": 24.020235064724044, + "grad_norm": 0.0023622168228030205, + "learning_rate": 2.109624654714835e-05, + "loss": 0.0284, + "step": 114520 + }, + { + "epoch": 24.020289227102854, + "grad_norm": 0.5251624584197998, + "learning_rate": 2.1093237526103257e-05, + "loss": 0.0043, + "step": 114530 + }, + { + "epoch": 24.020343389481667, + "grad_norm": 0.8267257213592529, + "learning_rate": 2.1090228505058167e-05, + "loss": 0.0448, + "step": 114540 + }, + { + "epoch": 24.020397551860476, + "grad_norm": 1.407288908958435, + "learning_rate": 2.108721948401307e-05, + "loss": 0.0587, + "step": 114550 + }, + { + "epoch": 24.02045171423929, + "grad_norm": 0.6468743681907654, + "learning_rate": 2.108421046296798e-05, + "loss": 0.0186, + "step": 114560 + }, + { + "epoch": 24.020505876618103, + "grad_norm": 5.667914867401123, + "learning_rate": 2.1081201441922886e-05, + "loss": 0.0652, + "step": 114570 + }, + { + "epoch": 24.020560038996912, + "grad_norm": 0.32104915380477905, + "learning_rate": 2.1078192420877792e-05, + "loss": 0.0144, + "step": 114580 + }, + { + "epoch": 24.020614201375725, + "grad_norm": 0.32238179445266724, + "learning_rate": 2.10751833998327e-05, + "loss": 0.097, + "step": 114590 + }, + { + "epoch": 24.020668363754535, + "grad_norm": 0.6518173217773438, + "learning_rate": 2.1072174378787608e-05, + "loss": 0.034, + "step": 114600 + }, + { + "epoch": 24.020722526133348, + "grad_norm": 0.004997950047254562, + "learning_rate": 2.106916535774251e-05, + "loss": 0.044, + "step": 114610 + }, + { + "epoch": 24.02077668851216, + "grad_norm": 0.002755149034783244, + "learning_rate": 2.106615633669742e-05, + "loss": 0.0717, + "step": 114620 + }, + { + "epoch": 24.02083085089097, + "grad_norm": 1.2324559688568115, + "learning_rate": 2.1063147315652327e-05, + "loss": 0.0218, + "step": 114630 + }, + { + "epoch": 24.020885013269783, + "grad_norm": 0.002756106900051236, + "learning_rate": 2.1060138294607233e-05, + "loss": 0.041, + "step": 114640 + }, + { + "epoch": 24.020939175648593, + "grad_norm": 1.4787490367889404, + "learning_rate": 2.105712927356214e-05, + "loss": 0.0347, + "step": 114650 + }, + { + "epoch": 24.020993338027406, + "grad_norm": 0.1129300519824028, + "learning_rate": 2.105412025251705e-05, + "loss": 0.0137, + "step": 114660 + }, + { + "epoch": 24.02104750040622, + "grad_norm": 0.0016627744771540165, + "learning_rate": 2.1051111231471955e-05, + "loss": 0.0233, + "step": 114670 + }, + { + "epoch": 24.02110166278503, + "grad_norm": 0.002058436395600438, + "learning_rate": 2.104810221042686e-05, + "loss": 0.0151, + "step": 114680 + }, + { + "epoch": 24.02115582516384, + "grad_norm": 0.0016180307138711214, + "learning_rate": 2.1045093189381768e-05, + "loss": 0.095, + "step": 114690 + }, + { + 
"epoch": 24.021209987542655, + "grad_norm": 0.6992740631103516, + "learning_rate": 2.1042084168336674e-05, + "loss": 0.045, + "step": 114700 + }, + { + "epoch": 24.021264149921464, + "grad_norm": 0.4696643650531769, + "learning_rate": 2.103907514729158e-05, + "loss": 0.0065, + "step": 114710 + }, + { + "epoch": 24.021318312300277, + "grad_norm": 0.0030444511212408543, + "learning_rate": 2.1036066126246487e-05, + "loss": 0.0103, + "step": 114720 + }, + { + "epoch": 24.021372474679087, + "grad_norm": 0.8816163539886475, + "learning_rate": 2.1033057105201396e-05, + "loss": 0.0365, + "step": 114730 + }, + { + "epoch": 24.0214266370579, + "grad_norm": 0.0019355040276423097, + "learning_rate": 2.10300480841563e-05, + "loss": 0.0519, + "step": 114740 + }, + { + "epoch": 24.021480799436713, + "grad_norm": 0.028397196903824806, + "learning_rate": 2.102703906311121e-05, + "loss": 0.1061, + "step": 114750 + }, + { + "epoch": 24.021534961815522, + "grad_norm": 0.002074834192171693, + "learning_rate": 2.1024030042066115e-05, + "loss": 0.0721, + "step": 114760 + }, + { + "epoch": 24.021589124194335, + "grad_norm": 0.0041052671149373055, + "learning_rate": 2.102102102102102e-05, + "loss": 0.0182, + "step": 114770 + }, + { + "epoch": 24.021643286573145, + "grad_norm": 5.342196941375732, + "learning_rate": 2.1018011999975928e-05, + "loss": 0.1251, + "step": 114780 + }, + { + "epoch": 24.021697448951958, + "grad_norm": 0.0017699397867545485, + "learning_rate": 2.1015002978930838e-05, + "loss": 0.0956, + "step": 114790 + }, + { + "epoch": 24.02175161133077, + "grad_norm": 0.19486401975154877, + "learning_rate": 2.1011993957885744e-05, + "loss": 0.0493, + "step": 114800 + }, + { + "epoch": 24.02180577370958, + "grad_norm": 4.601468086242676, + "learning_rate": 2.100898493684065e-05, + "loss": 0.008, + "step": 114810 + }, + { + "epoch": 24.021859936088394, + "grad_norm": 0.0017622018931433558, + "learning_rate": 2.1005975915795556e-05, + "loss": 0.0436, + "step": 114820 + }, + { + "epoch": 24.021914098467203, + "grad_norm": 0.4911084473133087, + "learning_rate": 2.1002966894750463e-05, + "loss": 0.055, + "step": 114830 + }, + { + "epoch": 24.021968260846016, + "grad_norm": 0.14357589185237885, + "learning_rate": 2.099995787370537e-05, + "loss": 0.0239, + "step": 114840 + }, + { + "epoch": 24.02202242322483, + "grad_norm": 0.0015136526199057698, + "learning_rate": 2.0996948852660275e-05, + "loss": 0.0016, + "step": 114850 + }, + { + "epoch": 24.02207658560364, + "grad_norm": 0.0015095031121745706, + "learning_rate": 2.0993939831615185e-05, + "loss": 0.0588, + "step": 114860 + }, + { + "epoch": 24.02213074798245, + "grad_norm": 0.0017028532456606627, + "learning_rate": 2.0990930810570088e-05, + "loss": 0.085, + "step": 114870 + }, + { + "epoch": 24.022184910361265, + "grad_norm": 0.0028853563126176596, + "learning_rate": 2.0987921789524997e-05, + "loss": 0.0049, + "step": 114880 + }, + { + "epoch": 24.022239072740074, + "grad_norm": 0.29979026317596436, + "learning_rate": 2.0984912768479904e-05, + "loss": 0.0572, + "step": 114890 + }, + { + "epoch": 24.022293235118887, + "grad_norm": 1.7953213453292847, + "learning_rate": 2.098190374743481e-05, + "loss": 0.1474, + "step": 114900 + }, + { + "epoch": 24.022347397497697, + "grad_norm": 0.12882277369499207, + "learning_rate": 2.0978894726389716e-05, + "loss": 0.0173, + "step": 114910 + }, + { + "epoch": 24.02240155987651, + "grad_norm": 0.4890916049480438, + "learning_rate": 2.0975885705344626e-05, + "loss": 0.0271, + "step": 114920 + }, + { + "epoch": 
24.022455722255323, + "grad_norm": 0.04649034142494202, + "learning_rate": 2.0972876684299532e-05, + "loss": 0.0109, + "step": 114930 + }, + { + "epoch": 24.022509884634133, + "grad_norm": 0.0026871790178120136, + "learning_rate": 2.096986766325444e-05, + "loss": 0.0535, + "step": 114940 + }, + { + "epoch": 24.022564047012946, + "grad_norm": 0.003981476183980703, + "learning_rate": 2.0966858642209345e-05, + "loss": 0.0135, + "step": 114950 + }, + { + "epoch": 24.022618209391755, + "grad_norm": 0.0016438240418210626, + "learning_rate": 2.0963849621164254e-05, + "loss": 0.0109, + "step": 114960 + }, + { + "epoch": 24.022672371770568, + "grad_norm": 0.008538289926946163, + "learning_rate": 2.0960840600119157e-05, + "loss": 0.0842, + "step": 114970 + }, + { + "epoch": 24.02272653414938, + "grad_norm": 0.8367528319358826, + "learning_rate": 2.0957831579074064e-05, + "loss": 0.0483, + "step": 114980 + }, + { + "epoch": 24.02278069652819, + "grad_norm": 0.3568626642227173, + "learning_rate": 2.0954822558028973e-05, + "loss": 0.037, + "step": 114990 + }, + { + "epoch": 24.022834858907004, + "grad_norm": 0.026030227541923523, + "learning_rate": 2.0951813536983876e-05, + "loss": 0.0264, + "step": 115000 + }, + { + "epoch": 24.022889021285813, + "grad_norm": 3.4285895824432373, + "learning_rate": 2.0948804515938786e-05, + "loss": 0.1005, + "step": 115010 + }, + { + "epoch": 24.022943183664626, + "grad_norm": 0.0015997630544006824, + "learning_rate": 2.0945795494893692e-05, + "loss": 0.0508, + "step": 115020 + }, + { + "epoch": 24.02299734604344, + "grad_norm": 0.0017382076475769281, + "learning_rate": 2.0942786473848598e-05, + "loss": 0.0948, + "step": 115030 + }, + { + "epoch": 24.02305150842225, + "grad_norm": 0.6508333086967468, + "learning_rate": 2.0939777452803505e-05, + "loss": 0.0618, + "step": 115040 + }, + { + "epoch": 24.023105670801062, + "grad_norm": 1.170750379562378, + "learning_rate": 2.0936768431758414e-05, + "loss": 0.0441, + "step": 115050 + }, + { + "epoch": 24.023159833179875, + "grad_norm": 0.5147772431373596, + "learning_rate": 2.093375941071332e-05, + "loss": 0.0242, + "step": 115060 + }, + { + "epoch": 24.023213995558685, + "grad_norm": 0.34230512380599976, + "learning_rate": 2.0930750389668227e-05, + "loss": 0.0057, + "step": 115070 + }, + { + "epoch": 24.023268157937498, + "grad_norm": 0.0025542585644870996, + "learning_rate": 2.0927741368623133e-05, + "loss": 0.025, + "step": 115080 + }, + { + "epoch": 24.023322320316307, + "grad_norm": 0.0017186686163768172, + "learning_rate": 2.0924732347578043e-05, + "loss": 0.0302, + "step": 115090 + }, + { + "epoch": 24.02337648269512, + "grad_norm": 0.001516475691460073, + "learning_rate": 2.0921723326532946e-05, + "loss": 0.0062, + "step": 115100 + }, + { + "epoch": 24.023430645073933, + "grad_norm": 0.11400994658470154, + "learning_rate": 2.0918714305487855e-05, + "loss": 0.0242, + "step": 115110 + }, + { + "epoch": 24.023484807452743, + "grad_norm": 0.028375733643770218, + "learning_rate": 2.091570528444276e-05, + "loss": 0.0526, + "step": 115120 + }, + { + "epoch": 24.023538969831556, + "grad_norm": 0.004559054970741272, + "learning_rate": 2.0912696263397664e-05, + "loss": 0.0408, + "step": 115130 + }, + { + "epoch": 24.023593132210365, + "grad_norm": 0.026611240580677986, + "learning_rate": 2.0909687242352574e-05, + "loss": 0.0429, + "step": 115140 + }, + { + "epoch": 24.02364729458918, + "grad_norm": 0.027209745720028877, + "learning_rate": 2.090667822130748e-05, + "loss": 0.0414, + "step": 115150 + }, + { + "epoch": 
24.02370145696799, + "grad_norm": 0.04753221943974495, + "learning_rate": 2.0903669200262387e-05, + "loss": 0.1875, + "step": 115160 + }, + { + "epoch": 24.0237556193468, + "grad_norm": 0.7363863587379456, + "learning_rate": 2.0900660179217293e-05, + "loss": 0.0403, + "step": 115170 + }, + { + "epoch": 24.023809781725614, + "grad_norm": 0.002433965913951397, + "learning_rate": 2.0897651158172203e-05, + "loss": 0.018, + "step": 115180 + }, + { + "epoch": 24.023863944104423, + "grad_norm": 0.02420033887028694, + "learning_rate": 2.089464213712711e-05, + "loss": 0.0007, + "step": 115190 + }, + { + "epoch": 24.023918106483237, + "grad_norm": 0.0025097408797591925, + "learning_rate": 2.0891633116082015e-05, + "loss": 0.0064, + "step": 115200 + }, + { + "epoch": 24.02397226886205, + "grad_norm": 0.0016470117261633277, + "learning_rate": 2.088862409503692e-05, + "loss": 0.0212, + "step": 115210 + }, + { + "epoch": 24.02402643124086, + "grad_norm": 0.0016112062148749828, + "learning_rate": 2.088561507399183e-05, + "loss": 0.025, + "step": 115220 + }, + { + "epoch": 24.024080593619672, + "grad_norm": 0.0016201878897845745, + "learning_rate": 2.0882606052946734e-05, + "loss": 0.0334, + "step": 115230 + }, + { + "epoch": 24.02413475599848, + "grad_norm": 0.0016379792941734195, + "learning_rate": 2.0879597031901644e-05, + "loss": 0.0706, + "step": 115240 + }, + { + "epoch": 24.024188918377295, + "grad_norm": 0.03780272230505943, + "learning_rate": 2.087658801085655e-05, + "loss": 0.0321, + "step": 115250 + }, + { + "epoch": 24.024243080756108, + "grad_norm": 23.75926399230957, + "learning_rate": 2.0873578989811456e-05, + "loss": 0.04, + "step": 115260 + }, + { + "epoch": 24.024297243134917, + "grad_norm": 0.5393921136856079, + "learning_rate": 2.0870569968766362e-05, + "loss": 0.0523, + "step": 115270 + }, + { + "epoch": 24.02435140551373, + "grad_norm": 0.001638697343878448, + "learning_rate": 2.086756094772127e-05, + "loss": 0.0073, + "step": 115280 + }, + { + "epoch": 24.024405567892543, + "grad_norm": 0.0024580948520451784, + "learning_rate": 2.0864551926676175e-05, + "loss": 0.0086, + "step": 115290 + }, + { + "epoch": 24.024459730271353, + "grad_norm": 0.005714569706469774, + "learning_rate": 2.086154290563108e-05, + "loss": 0.0594, + "step": 115300 + }, + { + "epoch": 24.024513892650166, + "grad_norm": 0.06382538378238678, + "learning_rate": 2.085853388458599e-05, + "loss": 0.0819, + "step": 115310 + }, + { + "epoch": 24.024568055028976, + "grad_norm": 3.786346673965454, + "learning_rate": 2.0855524863540897e-05, + "loss": 0.1243, + "step": 115320 + }, + { + "epoch": 24.02462221740779, + "grad_norm": 0.006543222349137068, + "learning_rate": 2.0852515842495803e-05, + "loss": 0.0489, + "step": 115330 + }, + { + "epoch": 24.0246763797866, + "grad_norm": 0.0016057253815233707, + "learning_rate": 2.084950682145071e-05, + "loss": 0.0246, + "step": 115340 + }, + { + "epoch": 24.02473054216541, + "grad_norm": 0.5080811977386475, + "learning_rate": 2.084649780040562e-05, + "loss": 0.0075, + "step": 115350 + }, + { + "epoch": 24.024784704544224, + "grad_norm": 0.015281065367162228, + "learning_rate": 2.0843488779360522e-05, + "loss": 0.0246, + "step": 115360 + }, + { + "epoch": 24.024838866923034, + "grad_norm": 0.03563612326979637, + "learning_rate": 2.0840479758315432e-05, + "loss": 0.0406, + "step": 115370 + }, + { + "epoch": 24.024893029301847, + "grad_norm": 0.8707848787307739, + "learning_rate": 2.0837470737270338e-05, + "loss": 0.0251, + "step": 115380 + }, + { + "epoch": 24.02494719168066, + 
"grad_norm": 0.0015996730653569102, + "learning_rate": 2.0834461716225244e-05, + "loss": 0.0013, + "step": 115390 + }, + { + "epoch": 24.02500135405947, + "grad_norm": 0.0280962772667408, + "learning_rate": 2.083145269518015e-05, + "loss": 0.0089, + "step": 115400 + }, + { + "epoch": 24.02500135405947, + "eval_accuracy": 0.8233180927498367, + "eval_loss": 0.9719455242156982, + "eval_runtime": 117.2169, + "eval_samples_per_second": 26.123, + "eval_steps_per_second": 3.267, + "step": 115400 + }, + { + "epoch": 25.000054162378813, + "grad_norm": 0.003449298907071352, + "learning_rate": 2.082844367413506e-05, + "loss": 0.006, + "step": 115410 + }, + { + "epoch": 25.000108324757623, + "grad_norm": 0.0023926456924527884, + "learning_rate": 2.0825434653089963e-05, + "loss": 0.0893, + "step": 115420 + }, + { + "epoch": 25.000162487136436, + "grad_norm": 0.006884499918669462, + "learning_rate": 2.082242563204487e-05, + "loss": 0.0172, + "step": 115430 + }, + { + "epoch": 25.000216649515245, + "grad_norm": 0.031112387776374817, + "learning_rate": 2.081941661099978e-05, + "loss": 0.0429, + "step": 115440 + }, + { + "epoch": 25.000270811894058, + "grad_norm": 0.029018225148320198, + "learning_rate": 2.0816407589954686e-05, + "loss": 0.0272, + "step": 115450 + }, + { + "epoch": 25.00032497427287, + "grad_norm": 0.5980083346366882, + "learning_rate": 2.0813398568909592e-05, + "loss": 0.0114, + "step": 115460 + }, + { + "epoch": 25.00037913665168, + "grad_norm": 3.523651599884033, + "learning_rate": 2.0810389547864498e-05, + "loss": 0.1123, + "step": 115470 + }, + { + "epoch": 25.000433299030494, + "grad_norm": 0.0015247914707288146, + "learning_rate": 2.0807380526819408e-05, + "loss": 0.0005, + "step": 115480 + }, + { + "epoch": 25.000487461409303, + "grad_norm": 0.4326915144920349, + "learning_rate": 2.080437150577431e-05, + "loss": 0.0133, + "step": 115490 + }, + { + "epoch": 25.000541623788116, + "grad_norm": 0.0026793775614351034, + "learning_rate": 2.080136248472922e-05, + "loss": 0.0226, + "step": 115500 + }, + { + "epoch": 25.00059578616693, + "grad_norm": 5.561033725738525, + "learning_rate": 2.0798353463684127e-05, + "loss": 0.0502, + "step": 115510 + }, + { + "epoch": 25.00064994854574, + "grad_norm": 1.4584808349609375, + "learning_rate": 2.0795344442639033e-05, + "loss": 0.0718, + "step": 115520 + }, + { + "epoch": 25.000704110924552, + "grad_norm": 0.06071177124977112, + "learning_rate": 2.079233542159394e-05, + "loss": 0.0048, + "step": 115530 + }, + { + "epoch": 25.000758273303365, + "grad_norm": 1.9636989831924438, + "learning_rate": 2.078932640054885e-05, + "loss": 0.0303, + "step": 115540 + }, + { + "epoch": 25.000812435682175, + "grad_norm": 0.0022294677328318357, + "learning_rate": 2.078631737950375e-05, + "loss": 0.1015, + "step": 115550 + }, + { + "epoch": 25.000866598060988, + "grad_norm": 4.17563009262085, + "learning_rate": 2.078330835845866e-05, + "loss": 0.0802, + "step": 115560 + }, + { + "epoch": 25.000920760439797, + "grad_norm": 0.0015578687889501452, + "learning_rate": 2.0780299337413568e-05, + "loss": 0.0177, + "step": 115570 + }, + { + "epoch": 25.00097492281861, + "grad_norm": 0.06154852733016014, + "learning_rate": 2.0777290316368474e-05, + "loss": 0.0154, + "step": 115580 + }, + { + "epoch": 25.001029085197423, + "grad_norm": 0.2816171646118164, + "learning_rate": 2.077428129532338e-05, + "loss": 0.0509, + "step": 115590 + }, + { + "epoch": 25.001083247576233, + "grad_norm": 0.0016745250904932618, + "learning_rate": 2.0771272274278286e-05, + "loss": 0.0019, + "step": 
115600 + }, + { + "epoch": 25.001137409955046, + "grad_norm": 1.5950102806091309, + "learning_rate": 2.0768263253233196e-05, + "loss": 0.0534, + "step": 115610 + }, + { + "epoch": 25.001191572333855, + "grad_norm": 0.0027018103282898664, + "learning_rate": 2.07652542321881e-05, + "loss": 0.0009, + "step": 115620 + }, + { + "epoch": 25.00124573471267, + "grad_norm": 1.3294475078582764, + "learning_rate": 2.076224521114301e-05, + "loss": 0.0794, + "step": 115630 + }, + { + "epoch": 25.00129989709148, + "grad_norm": 0.18511667847633362, + "learning_rate": 2.0759236190097915e-05, + "loss": 0.0295, + "step": 115640 + }, + { + "epoch": 25.00135405947029, + "grad_norm": 0.04323244467377663, + "learning_rate": 2.075622716905282e-05, + "loss": 0.0151, + "step": 115650 + }, + { + "epoch": 25.001408221849104, + "grad_norm": 0.0015625805826857686, + "learning_rate": 2.0753218148007727e-05, + "loss": 0.023, + "step": 115660 + }, + { + "epoch": 25.001462384227914, + "grad_norm": 0.001489792368374765, + "learning_rate": 2.0750209126962637e-05, + "loss": 0.0572, + "step": 115670 + }, + { + "epoch": 25.001516546606727, + "grad_norm": 2.072737216949463, + "learning_rate": 2.074720010591754e-05, + "loss": 0.0273, + "step": 115680 + }, + { + "epoch": 25.00157070898554, + "grad_norm": 0.0455583781003952, + "learning_rate": 2.074419108487245e-05, + "loss": 0.0168, + "step": 115690 + }, + { + "epoch": 25.00162487136435, + "grad_norm": 0.0020988094620406628, + "learning_rate": 2.0741182063827356e-05, + "loss": 0.0258, + "step": 115700 + }, + { + "epoch": 25.001679033743162, + "grad_norm": 1.784401774406433, + "learning_rate": 2.0738173042782262e-05, + "loss": 0.0259, + "step": 115710 + }, + { + "epoch": 25.001733196121975, + "grad_norm": 0.003067219629883766, + "learning_rate": 2.073516402173717e-05, + "loss": 0.0013, + "step": 115720 + }, + { + "epoch": 25.001787358500785, + "grad_norm": 0.8103523254394531, + "learning_rate": 2.0732155000692075e-05, + "loss": 0.0104, + "step": 115730 + }, + { + "epoch": 25.001841520879598, + "grad_norm": 0.0022320058196783066, + "learning_rate": 2.0729145979646984e-05, + "loss": 0.0052, + "step": 115740 + }, + { + "epoch": 25.001895683258407, + "grad_norm": 6.02981424331665, + "learning_rate": 2.0726136958601887e-05, + "loss": 0.0747, + "step": 115750 + }, + { + "epoch": 25.00194984563722, + "grad_norm": 0.0013195184292271733, + "learning_rate": 2.0723127937556797e-05, + "loss": 0.087, + "step": 115760 + }, + { + "epoch": 25.002004008016034, + "grad_norm": 0.0014928510645404458, + "learning_rate": 2.0720118916511703e-05, + "loss": 0.0504, + "step": 115770 + }, + { + "epoch": 25.002058170394843, + "grad_norm": 0.44356569647789, + "learning_rate": 2.071710989546661e-05, + "loss": 0.0085, + "step": 115780 + }, + { + "epoch": 25.002112332773656, + "grad_norm": 0.0013263659784570336, + "learning_rate": 2.0714100874421516e-05, + "loss": 0.0022, + "step": 115790 + }, + { + "epoch": 25.002166495152466, + "grad_norm": 12.754256248474121, + "learning_rate": 2.0711091853376425e-05, + "loss": 0.0557, + "step": 115800 + }, + { + "epoch": 25.00222065753128, + "grad_norm": 0.06693080067634583, + "learning_rate": 2.070808283233133e-05, + "loss": 0.0755, + "step": 115810 + }, + { + "epoch": 25.00227481991009, + "grad_norm": 0.2870968282222748, + "learning_rate": 2.0705073811286238e-05, + "loss": 0.0045, + "step": 115820 + }, + { + "epoch": 25.0023289822889, + "grad_norm": 0.4411396384239197, + "learning_rate": 2.0702064790241144e-05, + "loss": 0.0073, + "step": 115830 + }, + { + "epoch": 
25.002383144667714, + "grad_norm": 14.19367504119873, + "learning_rate": 2.069905576919605e-05, + "loss": 0.02, + "step": 115840 + }, + { + "epoch": 25.002437307046524, + "grad_norm": 0.0012286662822589278, + "learning_rate": 2.0696046748150957e-05, + "loss": 0.0479, + "step": 115850 + }, + { + "epoch": 25.002491469425337, + "grad_norm": 0.1824190318584442, + "learning_rate": 2.0693037727105866e-05, + "loss": 0.088, + "step": 115860 + }, + { + "epoch": 25.00254563180415, + "grad_norm": 0.012149419635534286, + "learning_rate": 2.0690028706060773e-05, + "loss": 0.016, + "step": 115870 + }, + { + "epoch": 25.00259979418296, + "grad_norm": 0.46333107352256775, + "learning_rate": 2.0687019685015676e-05, + "loss": 0.0611, + "step": 115880 + }, + { + "epoch": 25.002653956561772, + "grad_norm": 0.1912345588207245, + "learning_rate": 2.0684010663970585e-05, + "loss": 0.0497, + "step": 115890 + }, + { + "epoch": 25.002708118940586, + "grad_norm": 1.3774163722991943, + "learning_rate": 2.068100164292549e-05, + "loss": 0.0241, + "step": 115900 + }, + { + "epoch": 25.002762281319395, + "grad_norm": 7.925605297088623, + "learning_rate": 2.0677992621880398e-05, + "loss": 0.0422, + "step": 115910 + }, + { + "epoch": 25.002816443698208, + "grad_norm": 0.0012473409296944737, + "learning_rate": 2.0674983600835304e-05, + "loss": 0.0259, + "step": 115920 + }, + { + "epoch": 25.002870606077018, + "grad_norm": 0.0013604495907202363, + "learning_rate": 2.0671974579790214e-05, + "loss": 0.0931, + "step": 115930 + }, + { + "epoch": 25.00292476845583, + "grad_norm": 0.001260675024241209, + "learning_rate": 2.0668965558745117e-05, + "loss": 0.0205, + "step": 115940 + }, + { + "epoch": 25.002978930834644, + "grad_norm": 0.0015071420930325985, + "learning_rate": 2.0665956537700026e-05, + "loss": 0.0077, + "step": 115950 + }, + { + "epoch": 25.003033093213453, + "grad_norm": 0.20262789726257324, + "learning_rate": 2.0662947516654933e-05, + "loss": 0.0262, + "step": 115960 + }, + { + "epoch": 25.003087255592266, + "grad_norm": 0.056554652750492096, + "learning_rate": 2.065993849560984e-05, + "loss": 0.0653, + "step": 115970 + }, + { + "epoch": 25.003141417971076, + "grad_norm": 1.2413557767868042, + "learning_rate": 2.0656929474564745e-05, + "loss": 0.0531, + "step": 115980 + }, + { + "epoch": 25.00319558034989, + "grad_norm": 0.5695541501045227, + "learning_rate": 2.0653920453519655e-05, + "loss": 0.0422, + "step": 115990 + }, + { + "epoch": 25.003249742728702, + "grad_norm": 0.0013394098496064544, + "learning_rate": 2.065091143247456e-05, + "loss": 0.0273, + "step": 116000 + }, + { + "epoch": 25.00330390510751, + "grad_norm": 0.03422505408525467, + "learning_rate": 2.0647902411429467e-05, + "loss": 0.0176, + "step": 116010 + }, + { + "epoch": 25.003358067486325, + "grad_norm": 0.07794050127267838, + "learning_rate": 2.0644893390384374e-05, + "loss": 0.0056, + "step": 116020 + }, + { + "epoch": 25.003412229865134, + "grad_norm": 0.0030429232865571976, + "learning_rate": 2.064188436933928e-05, + "loss": 0.0423, + "step": 116030 + }, + { + "epoch": 25.003466392243947, + "grad_norm": 0.1562911868095398, + "learning_rate": 2.0638875348294186e-05, + "loss": 0.1888, + "step": 116040 + }, + { + "epoch": 25.00352055462276, + "grad_norm": 0.14623956382274628, + "learning_rate": 2.0635866327249092e-05, + "loss": 0.0072, + "step": 116050 + }, + { + "epoch": 25.00357471700157, + "grad_norm": 0.0013476061867550015, + "learning_rate": 2.0632857306204002e-05, + "loss": 0.0007, + "step": 116060 + }, + { + "epoch": 25.003628879380383, + 
"grad_norm": 0.0016989611322060227, + "learning_rate": 2.0629848285158905e-05, + "loss": 0.0015, + "step": 116070 + }, + { + "epoch": 25.003683041759196, + "grad_norm": 0.0014064416754990816, + "learning_rate": 2.0626839264113815e-05, + "loss": 0.013, + "step": 116080 + }, + { + "epoch": 25.003737204138005, + "grad_norm": 0.02954924665391445, + "learning_rate": 2.062383024306872e-05, + "loss": 0.0928, + "step": 116090 + }, + { + "epoch": 25.00379136651682, + "grad_norm": 0.06095561012625694, + "learning_rate": 2.0620821222023627e-05, + "loss": 0.0483, + "step": 116100 + }, + { + "epoch": 25.003845528895628, + "grad_norm": 1.7515220642089844, + "learning_rate": 2.0617812200978534e-05, + "loss": 0.0242, + "step": 116110 + }, + { + "epoch": 25.00389969127444, + "grad_norm": 0.026747917756438255, + "learning_rate": 2.0614803179933443e-05, + "loss": 0.0095, + "step": 116120 + }, + { + "epoch": 25.003953853653254, + "grad_norm": 0.026276273652911186, + "learning_rate": 2.061179415888835e-05, + "loss": 0.0074, + "step": 116130 + }, + { + "epoch": 25.004008016032063, + "grad_norm": 0.023856300860643387, + "learning_rate": 2.0608785137843256e-05, + "loss": 0.0551, + "step": 116140 + }, + { + "epoch": 25.004062178410877, + "grad_norm": 0.0013574601616710424, + "learning_rate": 2.0605776116798162e-05, + "loss": 0.0591, + "step": 116150 + }, + { + "epoch": 25.004116340789686, + "grad_norm": 7.961254596710205, + "learning_rate": 2.0602767095753068e-05, + "loss": 0.2509, + "step": 116160 + }, + { + "epoch": 25.0041705031685, + "grad_norm": 0.3232858180999756, + "learning_rate": 2.0599758074707975e-05, + "loss": 0.0446, + "step": 116170 + }, + { + "epoch": 25.004224665547312, + "grad_norm": 0.004731547087430954, + "learning_rate": 2.059674905366288e-05, + "loss": 0.0016, + "step": 116180 + }, + { + "epoch": 25.00427882792612, + "grad_norm": 0.051236141473054886, + "learning_rate": 2.059374003261779e-05, + "loss": 0.0078, + "step": 116190 + }, + { + "epoch": 25.004332990304935, + "grad_norm": 0.1092943623661995, + "learning_rate": 2.0590731011572693e-05, + "loss": 0.0627, + "step": 116200 + }, + { + "epoch": 25.004387152683744, + "grad_norm": 0.0622544102370739, + "learning_rate": 2.0587721990527603e-05, + "loss": 0.0392, + "step": 116210 + }, + { + "epoch": 25.004441315062557, + "grad_norm": 0.014308014884591103, + "learning_rate": 2.058471296948251e-05, + "loss": 0.0176, + "step": 116220 + }, + { + "epoch": 25.00449547744137, + "grad_norm": 0.005426339339464903, + "learning_rate": 2.0581703948437416e-05, + "loss": 0.0653, + "step": 116230 + }, + { + "epoch": 25.00454963982018, + "grad_norm": 2.8798370361328125, + "learning_rate": 2.0578694927392322e-05, + "loss": 0.0419, + "step": 116240 + }, + { + "epoch": 25.004603802198993, + "grad_norm": 0.25897687673568726, + "learning_rate": 2.057568590634723e-05, + "loss": 0.0467, + "step": 116250 + }, + { + "epoch": 25.004657964577806, + "grad_norm": 0.2952876389026642, + "learning_rate": 2.0572676885302138e-05, + "loss": 0.0065, + "step": 116260 + }, + { + "epoch": 25.004712126956615, + "grad_norm": 0.005018536001443863, + "learning_rate": 2.0569667864257044e-05, + "loss": 0.0777, + "step": 116270 + }, + { + "epoch": 25.00476628933543, + "grad_norm": 1.884772539138794, + "learning_rate": 2.056665884321195e-05, + "loss": 0.0629, + "step": 116280 + }, + { + "epoch": 25.004820451714238, + "grad_norm": 0.0040660821832716465, + "learning_rate": 2.0563649822166857e-05, + "loss": 0.0266, + "step": 116290 + }, + { + "epoch": 25.00487461409305, + "grad_norm": 
0.037138450890779495, + "learning_rate": 2.0560640801121763e-05, + "loss": 0.0167, + "step": 116300 + }, + { + "epoch": 25.004928776471864, + "grad_norm": 0.42696577310562134, + "learning_rate": 2.0557631780076673e-05, + "loss": 0.0089, + "step": 116310 + }, + { + "epoch": 25.004982938850674, + "grad_norm": 22.857545852661133, + "learning_rate": 2.055462275903158e-05, + "loss": 0.0736, + "step": 116320 + }, + { + "epoch": 25.005037101229487, + "grad_norm": 0.003005865029990673, + "learning_rate": 2.0551613737986482e-05, + "loss": 0.102, + "step": 116330 + }, + { + "epoch": 25.005091263608296, + "grad_norm": 0.003970982972532511, + "learning_rate": 2.054860471694139e-05, + "loss": 0.1574, + "step": 116340 + }, + { + "epoch": 25.00514542598711, + "grad_norm": 0.004778135102242231, + "learning_rate": 2.0545595695896298e-05, + "loss": 0.0258, + "step": 116350 + }, + { + "epoch": 25.005199588365922, + "grad_norm": 2.6798739433288574, + "learning_rate": 2.0542586674851204e-05, + "loss": 0.1169, + "step": 116360 + }, + { + "epoch": 25.005253750744732, + "grad_norm": 0.730757474899292, + "learning_rate": 2.053957765380611e-05, + "loss": 0.0787, + "step": 116370 + }, + { + "epoch": 25.005307913123545, + "grad_norm": 0.004551961086690426, + "learning_rate": 2.053656863276102e-05, + "loss": 0.0027, + "step": 116380 + }, + { + "epoch": 25.005362075502354, + "grad_norm": 0.0019536553882062435, + "learning_rate": 2.0533559611715926e-05, + "loss": 0.026, + "step": 116390 + }, + { + "epoch": 25.005416237881168, + "grad_norm": 0.005826347507536411, + "learning_rate": 2.0530550590670832e-05, + "loss": 0.0439, + "step": 116400 + }, + { + "epoch": 25.00547040025998, + "grad_norm": 0.08343426138162613, + "learning_rate": 2.052754156962574e-05, + "loss": 0.0203, + "step": 116410 + }, + { + "epoch": 25.00552456263879, + "grad_norm": 0.0486268475651741, + "learning_rate": 2.0524532548580645e-05, + "loss": 0.0148, + "step": 116420 + }, + { + "epoch": 25.005578725017603, + "grad_norm": 0.0024981186725199223, + "learning_rate": 2.052152352753555e-05, + "loss": 0.0673, + "step": 116430 + }, + { + "epoch": 25.005632887396416, + "grad_norm": 0.15173058211803436, + "learning_rate": 2.051851450649046e-05, + "loss": 0.0371, + "step": 116440 + }, + { + "epoch": 25.005687049775226, + "grad_norm": 0.002070331946015358, + "learning_rate": 2.0515505485445367e-05, + "loss": 0.019, + "step": 116450 + }, + { + "epoch": 25.00574121215404, + "grad_norm": 2.772873878479004, + "learning_rate": 2.0512496464400273e-05, + "loss": 0.1472, + "step": 116460 + }, + { + "epoch": 25.00579537453285, + "grad_norm": 0.03158621862530708, + "learning_rate": 2.050948744335518e-05, + "loss": 0.01, + "step": 116470 + }, + { + "epoch": 25.00584953691166, + "grad_norm": 0.005666679702699184, + "learning_rate": 2.0506478422310086e-05, + "loss": 0.0388, + "step": 116480 + }, + { + "epoch": 25.005903699290474, + "grad_norm": 0.011976417154073715, + "learning_rate": 2.0503469401264992e-05, + "loss": 0.0153, + "step": 116490 + }, + { + "epoch": 25.005957861669284, + "grad_norm": 0.023639677092432976, + "learning_rate": 2.05004603802199e-05, + "loss": 0.0817, + "step": 116500 + }, + { + "epoch": 25.006012024048097, + "grad_norm": 0.003296501934528351, + "learning_rate": 2.0497451359174808e-05, + "loss": 0.0011, + "step": 116510 + }, + { + "epoch": 25.006066186426906, + "grad_norm": 0.4871305525302887, + "learning_rate": 2.0494442338129714e-05, + "loss": 0.0341, + "step": 116520 + }, + { + "epoch": 25.00612034880572, + "grad_norm": 0.0019287425093352795, + 
"learning_rate": 2.049143331708462e-05, + "loss": 0.0022, + "step": 116530 + }, + { + "epoch": 25.006174511184533, + "grad_norm": 0.0032583444844931364, + "learning_rate": 2.0488424296039527e-05, + "loss": 0.0161, + "step": 116540 + }, + { + "epoch": 25.006228673563342, + "grad_norm": 11.029531478881836, + "learning_rate": 2.0485415274994437e-05, + "loss": 0.1192, + "step": 116550 + }, + { + "epoch": 25.006282835942155, + "grad_norm": 4.782392978668213, + "learning_rate": 2.048240625394934e-05, + "loss": 0.0692, + "step": 116560 + }, + { + "epoch": 25.006336998320965, + "grad_norm": 0.0020452174358069897, + "learning_rate": 2.047939723290425e-05, + "loss": 0.0431, + "step": 116570 + }, + { + "epoch": 25.006391160699778, + "grad_norm": 0.09118261933326721, + "learning_rate": 2.0476388211859156e-05, + "loss": 0.0192, + "step": 116580 + }, + { + "epoch": 25.00644532307859, + "grad_norm": 0.0219107735902071, + "learning_rate": 2.0473379190814062e-05, + "loss": 0.1099, + "step": 116590 + }, + { + "epoch": 25.0064994854574, + "grad_norm": 0.001687818206846714, + "learning_rate": 2.0470370169768968e-05, + "loss": 0.0016, + "step": 116600 + }, + { + "epoch": 25.006553647836213, + "grad_norm": 0.0030010768678039312, + "learning_rate": 2.0467361148723878e-05, + "loss": 0.0489, + "step": 116610 + }, + { + "epoch": 25.006607810215023, + "grad_norm": 16.5167293548584, + "learning_rate": 2.046435212767878e-05, + "loss": 0.0192, + "step": 116620 + }, + { + "epoch": 25.006661972593836, + "grad_norm": 0.021808501332998276, + "learning_rate": 2.0461343106633687e-05, + "loss": 0.1555, + "step": 116630 + }, + { + "epoch": 25.00671613497265, + "grad_norm": 0.02591378428041935, + "learning_rate": 2.0458334085588597e-05, + "loss": 0.0664, + "step": 116640 + }, + { + "epoch": 25.00677029735146, + "grad_norm": 0.0024309682194143534, + "learning_rate": 2.0455325064543503e-05, + "loss": 0.0024, + "step": 116650 + }, + { + "epoch": 25.00682445973027, + "grad_norm": 0.13789673149585724, + "learning_rate": 2.045231604349841e-05, + "loss": 0.0408, + "step": 116660 + }, + { + "epoch": 25.006878622109085, + "grad_norm": 0.3692687451839447, + "learning_rate": 2.0449307022453315e-05, + "loss": 0.0117, + "step": 116670 + }, + { + "epoch": 25.006932784487894, + "grad_norm": 0.2577459216117859, + "learning_rate": 2.0446298001408225e-05, + "loss": 0.1007, + "step": 116680 + }, + { + "epoch": 25.006986946866707, + "grad_norm": 1.384235143661499, + "learning_rate": 2.0443288980363128e-05, + "loss": 0.0422, + "step": 116690 + }, + { + "epoch": 25.007041109245517, + "grad_norm": 0.0018833541544154286, + "learning_rate": 2.0440279959318038e-05, + "loss": 0.0735, + "step": 116700 + }, + { + "epoch": 25.00709527162433, + "grad_norm": 0.0020650839433073997, + "learning_rate": 2.0437270938272944e-05, + "loss": 0.0078, + "step": 116710 + }, + { + "epoch": 25.007149434003143, + "grad_norm": 0.5639738440513611, + "learning_rate": 2.043426191722785e-05, + "loss": 0.0355, + "step": 116720 + }, + { + "epoch": 25.007203596381952, + "grad_norm": 1.8224393129348755, + "learning_rate": 2.0431252896182756e-05, + "loss": 0.0453, + "step": 116730 + }, + { + "epoch": 25.007257758760765, + "grad_norm": 0.09256459772586823, + "learning_rate": 2.0428243875137666e-05, + "loss": 0.0351, + "step": 116740 + }, + { + "epoch": 25.007311921139575, + "grad_norm": 4.970856189727783, + "learning_rate": 2.042523485409257e-05, + "loss": 0.0519, + "step": 116750 + }, + { + "epoch": 25.007366083518388, + "grad_norm": 0.0024962234310805798, + "learning_rate": 
2.042222583304748e-05, + "loss": 0.0007, + "step": 116760 + }, + { + "epoch": 25.0074202458972, + "grad_norm": 0.6926066279411316, + "learning_rate": 2.0419216812002385e-05, + "loss": 0.0571, + "step": 116770 + }, + { + "epoch": 25.00747440827601, + "grad_norm": 0.0031462267506867647, + "learning_rate": 2.041620779095729e-05, + "loss": 0.0585, + "step": 116780 + }, + { + "epoch": 25.007528570654824, + "grad_norm": 0.002109949942678213, + "learning_rate": 2.0413198769912197e-05, + "loss": 0.0148, + "step": 116790 + }, + { + "epoch": 25.007582733033633, + "grad_norm": 1.5214594602584839, + "learning_rate": 2.0410189748867104e-05, + "loss": 0.0252, + "step": 116800 + }, + { + "epoch": 25.007636895412446, + "grad_norm": 0.06772293150424957, + "learning_rate": 2.0407180727822013e-05, + "loss": 0.0645, + "step": 116810 + }, + { + "epoch": 25.00769105779126, + "grad_norm": 0.005410561803728342, + "learning_rate": 2.0404171706776916e-05, + "loss": 0.0938, + "step": 116820 + }, + { + "epoch": 25.00774522017007, + "grad_norm": 0.9319505095481873, + "learning_rate": 2.0401162685731826e-05, + "loss": 0.0135, + "step": 116830 + }, + { + "epoch": 25.007799382548882, + "grad_norm": 3.233227014541626, + "learning_rate": 2.0398153664686732e-05, + "loss": 0.05, + "step": 116840 + }, + { + "epoch": 25.007853544927695, + "grad_norm": 0.002485871547833085, + "learning_rate": 2.039514464364164e-05, + "loss": 0.0037, + "step": 116850 + }, + { + "epoch": 25.007907707306504, + "grad_norm": 0.001969998935237527, + "learning_rate": 2.0392135622596545e-05, + "loss": 0.0569, + "step": 116860 + }, + { + "epoch": 25.007961869685317, + "grad_norm": 0.0020555320661514997, + "learning_rate": 2.0389126601551454e-05, + "loss": 0.0407, + "step": 116870 + }, + { + "epoch": 25.008016032064127, + "grad_norm": 0.013657491654157639, + "learning_rate": 2.0386117580506357e-05, + "loss": 0.0192, + "step": 116880 + }, + { + "epoch": 25.00807019444294, + "grad_norm": 0.00173793057911098, + "learning_rate": 2.0383108559461267e-05, + "loss": 0.0108, + "step": 116890 + }, + { + "epoch": 25.008124356821753, + "grad_norm": 0.0016119448700919747, + "learning_rate": 2.0380099538416173e-05, + "loss": 0.0022, + "step": 116900 + }, + { + "epoch": 25.008178519200563, + "grad_norm": 0.026171578094363213, + "learning_rate": 2.037709051737108e-05, + "loss": 0.0625, + "step": 116910 + }, + { + "epoch": 25.008232681579376, + "grad_norm": 6.097987651824951, + "learning_rate": 2.0374081496325986e-05, + "loss": 0.091, + "step": 116920 + }, + { + "epoch": 25.008286843958185, + "grad_norm": 0.005160744767636061, + "learning_rate": 2.0371072475280892e-05, + "loss": 0.0107, + "step": 116930 + }, + { + "epoch": 25.008341006336998, + "grad_norm": 0.0023468926083296537, + "learning_rate": 2.0368063454235802e-05, + "loss": 0.0678, + "step": 116940 + }, + { + "epoch": 25.00839516871581, + "grad_norm": 0.32702040672302246, + "learning_rate": 2.0365054433190705e-05, + "loss": 0.007, + "step": 116950 + }, + { + "epoch": 25.00844933109462, + "grad_norm": 0.3671294152736664, + "learning_rate": 2.0362045412145614e-05, + "loss": 0.0399, + "step": 116960 + }, + { + "epoch": 25.008503493473434, + "grad_norm": 0.03527054563164711, + "learning_rate": 2.035903639110052e-05, + "loss": 0.0281, + "step": 116970 + }, + { + "epoch": 25.008557655852243, + "grad_norm": 0.35439619421958923, + "learning_rate": 2.0356027370055427e-05, + "loss": 0.0981, + "step": 116980 + }, + { + "epoch": 25.008611818231056, + "grad_norm": 0.5092580914497375, + "learning_rate": 2.0353018349010333e-05, 
+ "loss": 0.0411, + "step": 116990 + }, + { + "epoch": 25.00866598060987, + "grad_norm": 0.0018846935126930475, + "learning_rate": 2.0350009327965243e-05, + "loss": 0.0404, + "step": 117000 + }, + { + "epoch": 25.00872014298868, + "grad_norm": 0.001659418805502355, + "learning_rate": 2.0347000306920146e-05, + "loss": 0.0182, + "step": 117010 + }, + { + "epoch": 25.008774305367492, + "grad_norm": 0.0581243634223938, + "learning_rate": 2.0343991285875055e-05, + "loss": 0.0068, + "step": 117020 + }, + { + "epoch": 25.008828467746305, + "grad_norm": 0.05574354529380798, + "learning_rate": 2.034098226482996e-05, + "loss": 0.0058, + "step": 117030 + }, + { + "epoch": 25.008882630125115, + "grad_norm": 0.7461893558502197, + "learning_rate": 2.0337973243784868e-05, + "loss": 0.0071, + "step": 117040 + }, + { + "epoch": 25.008936792503928, + "grad_norm": 0.017642151564359665, + "learning_rate": 2.0334964222739774e-05, + "loss": 0.112, + "step": 117050 + }, + { + "epoch": 25.008990954882737, + "grad_norm": 0.052885979413986206, + "learning_rate": 2.0331955201694684e-05, + "loss": 0.0057, + "step": 117060 + }, + { + "epoch": 25.00904511726155, + "grad_norm": 0.002565631177276373, + "learning_rate": 2.032894618064959e-05, + "loss": 0.0516, + "step": 117070 + }, + { + "epoch": 25.009099279640363, + "grad_norm": 0.0012351294280961156, + "learning_rate": 2.0325937159604493e-05, + "loss": 0.003, + "step": 117080 + }, + { + "epoch": 25.009153442019173, + "grad_norm": 0.0012380360858514905, + "learning_rate": 2.0322928138559403e-05, + "loss": 0.011, + "step": 117090 + }, + { + "epoch": 25.009207604397986, + "grad_norm": 0.4011366367340088, + "learning_rate": 2.031991911751431e-05, + "loss": 0.0563, + "step": 117100 + }, + { + "epoch": 25.009261766776795, + "grad_norm": 0.5526049137115479, + "learning_rate": 2.0316910096469215e-05, + "loss": 0.0523, + "step": 117110 + }, + { + "epoch": 25.00931592915561, + "grad_norm": 0.0012648925185203552, + "learning_rate": 2.031390107542412e-05, + "loss": 0.1121, + "step": 117120 + }, + { + "epoch": 25.00937009153442, + "grad_norm": 0.002781914547085762, + "learning_rate": 2.031089205437903e-05, + "loss": 0.0754, + "step": 117130 + }, + { + "epoch": 25.00942425391323, + "grad_norm": 0.0014561269199475646, + "learning_rate": 2.0307883033333934e-05, + "loss": 0.0343, + "step": 117140 + }, + { + "epoch": 25.009478416292044, + "grad_norm": 0.029591161757707596, + "learning_rate": 2.0304874012288844e-05, + "loss": 0.0281, + "step": 117150 + }, + { + "epoch": 25.009532578670854, + "grad_norm": 0.2626762390136719, + "learning_rate": 2.030186499124375e-05, + "loss": 0.0766, + "step": 117160 + }, + { + "epoch": 25.009586741049667, + "grad_norm": 0.0015076969284564257, + "learning_rate": 2.0298855970198656e-05, + "loss": 0.0379, + "step": 117170 + }, + { + "epoch": 25.00964090342848, + "grad_norm": 0.0027103819884359837, + "learning_rate": 2.0295846949153563e-05, + "loss": 0.0467, + "step": 117180 + }, + { + "epoch": 25.00969506580729, + "grad_norm": 0.0019345565233379602, + "learning_rate": 2.0292837928108472e-05, + "loss": 0.0145, + "step": 117190 + }, + { + "epoch": 25.009749228186102, + "grad_norm": 0.0015179517213255167, + "learning_rate": 2.028982890706338e-05, + "loss": 0.0917, + "step": 117200 + }, + { + "epoch": 25.009803390564915, + "grad_norm": 1.0009301900863647, + "learning_rate": 2.0286819886018285e-05, + "loss": 0.1867, + "step": 117210 + }, + { + "epoch": 25.009857552943725, + "grad_norm": 0.0040560378693044186, + "learning_rate": 2.028381086497319e-05, + "loss": 
0.0812, + "step": 117220 + }, + { + "epoch": 25.009911715322538, + "grad_norm": 1.014934778213501, + "learning_rate": 2.0280801843928097e-05, + "loss": 0.0237, + "step": 117230 + }, + { + "epoch": 25.009965877701347, + "grad_norm": 0.03501765429973602, + "learning_rate": 2.0277792822883004e-05, + "loss": 0.0617, + "step": 117240 + }, + { + "epoch": 25.01002004008016, + "grad_norm": 2.370514154434204, + "learning_rate": 2.027478380183791e-05, + "loss": 0.0637, + "step": 117250 + }, + { + "epoch": 25.010074202458974, + "grad_norm": 0.0037135002203285694, + "learning_rate": 2.027177478079282e-05, + "loss": 0.0571, + "step": 117260 + }, + { + "epoch": 25.010128364837783, + "grad_norm": 0.08029929548501968, + "learning_rate": 2.0268765759747722e-05, + "loss": 0.0012, + "step": 117270 + }, + { + "epoch": 25.010182527216596, + "grad_norm": 0.0017665585037320852, + "learning_rate": 2.0265756738702632e-05, + "loss": 0.0679, + "step": 117280 + }, + { + "epoch": 25.010236689595406, + "grad_norm": 0.0024082299787551165, + "learning_rate": 2.0262747717657538e-05, + "loss": 0.0206, + "step": 117290 + }, + { + "epoch": 25.01029085197422, + "grad_norm": 0.10853516310453415, + "learning_rate": 2.0259738696612445e-05, + "loss": 0.0185, + "step": 117300 + }, + { + "epoch": 25.01034501435303, + "grad_norm": 1.2557419538497925, + "learning_rate": 2.025672967556735e-05, + "loss": 0.0612, + "step": 117310 + }, + { + "epoch": 25.01039917673184, + "grad_norm": 0.0033852451015263796, + "learning_rate": 2.025372065452226e-05, + "loss": 0.084, + "step": 117320 + }, + { + "epoch": 25.010453339110654, + "grad_norm": 0.0018619972979649901, + "learning_rate": 2.0250711633477167e-05, + "loss": 0.0002, + "step": 117330 + }, + { + "epoch": 25.010507501489464, + "grad_norm": 0.0019362970488145947, + "learning_rate": 2.0247702612432073e-05, + "loss": 0.0074, + "step": 117340 + }, + { + "epoch": 25.010561663868277, + "grad_norm": 0.0024201234336942434, + "learning_rate": 2.024469359138698e-05, + "loss": 0.1294, + "step": 117350 + }, + { + "epoch": 25.01061582624709, + "grad_norm": 0.7262639403343201, + "learning_rate": 2.0241684570341886e-05, + "loss": 0.007, + "step": 117360 + }, + { + "epoch": 25.0106699886259, + "grad_norm": 0.002860352164134383, + "learning_rate": 2.0238675549296792e-05, + "loss": 0.0077, + "step": 117370 + }, + { + "epoch": 25.010724151004712, + "grad_norm": 0.012225419282913208, + "learning_rate": 2.0235666528251698e-05, + "loss": 0.0232, + "step": 117380 + }, + { + "epoch": 25.010778313383526, + "grad_norm": 0.31300580501556396, + "learning_rate": 2.0232657507206608e-05, + "loss": 0.0554, + "step": 117390 + }, + { + "epoch": 25.010832475762335, + "grad_norm": 0.05582312121987343, + "learning_rate": 2.022964848616151e-05, + "loss": 0.0021, + "step": 117400 + }, + { + "epoch": 25.010886638141148, + "grad_norm": 0.012642626650631428, + "learning_rate": 2.022663946511642e-05, + "loss": 0.0427, + "step": 117410 + }, + { + "epoch": 25.010940800519958, + "grad_norm": 0.009491493925452232, + "learning_rate": 2.0223630444071327e-05, + "loss": 0.0588, + "step": 117420 + }, + { + "epoch": 25.01099496289877, + "grad_norm": 0.013007014989852905, + "learning_rate": 2.0220621423026233e-05, + "loss": 0.0312, + "step": 117430 + }, + { + "epoch": 25.011049125277584, + "grad_norm": 0.002990500535815954, + "learning_rate": 2.021761240198114e-05, + "loss": 0.0243, + "step": 117440 + }, + { + "epoch": 25.011103287656393, + "grad_norm": 0.005447554402053356, + "learning_rate": 2.021460338093605e-05, + "loss": 0.0021, + 
"step": 117450 + }, + { + "epoch": 25.011157450035206, + "grad_norm": 0.17918366193771362, + "learning_rate": 2.0211594359890955e-05, + "loss": 0.0686, + "step": 117460 + }, + { + "epoch": 25.011211612414016, + "grad_norm": 0.001988678239285946, + "learning_rate": 2.020858533884586e-05, + "loss": 0.0449, + "step": 117470 + }, + { + "epoch": 25.01126577479283, + "grad_norm": 0.3336198031902313, + "learning_rate": 2.0205576317800768e-05, + "loss": 0.0051, + "step": 117480 + }, + { + "epoch": 25.011319937171642, + "grad_norm": 0.0019091063877567649, + "learning_rate": 2.0202567296755674e-05, + "loss": 0.0206, + "step": 117490 + }, + { + "epoch": 25.01137409955045, + "grad_norm": 3.2618701457977295, + "learning_rate": 2.019955827571058e-05, + "loss": 0.0979, + "step": 117500 + }, + { + "epoch": 25.011428261929264, + "grad_norm": 0.22918733954429626, + "learning_rate": 2.019654925466549e-05, + "loss": 0.0761, + "step": 117510 + }, + { + "epoch": 25.011482424308074, + "grad_norm": 0.046208977699279785, + "learning_rate": 2.0193540233620396e-05, + "loss": 0.0263, + "step": 117520 + }, + { + "epoch": 25.011536586686887, + "grad_norm": 0.0018080492736771703, + "learning_rate": 2.01905312125753e-05, + "loss": 0.0258, + "step": 117530 + }, + { + "epoch": 25.0115907490657, + "grad_norm": 0.012697304598987103, + "learning_rate": 2.018752219153021e-05, + "loss": 0.0265, + "step": 117540 + }, + { + "epoch": 25.01164491144451, + "grad_norm": 0.15950413048267365, + "learning_rate": 2.0184513170485115e-05, + "loss": 0.0324, + "step": 117550 + }, + { + "epoch": 25.011699073823323, + "grad_norm": 0.001769058988429606, + "learning_rate": 2.018150414944002e-05, + "loss": 0.014, + "step": 117560 + }, + { + "epoch": 25.011753236202136, + "grad_norm": 0.0014617352280765772, + "learning_rate": 2.0178495128394928e-05, + "loss": 0.0022, + "step": 117570 + }, + { + "epoch": 25.011807398580945, + "grad_norm": 0.0014664912596344948, + "learning_rate": 2.0175486107349837e-05, + "loss": 0.0413, + "step": 117580 + }, + { + "epoch": 25.01186156095976, + "grad_norm": 0.0020372269209474325, + "learning_rate": 2.0172477086304743e-05, + "loss": 0.1357, + "step": 117590 + }, + { + "epoch": 25.011915723338568, + "grad_norm": 0.003227871609851718, + "learning_rate": 2.016946806525965e-05, + "loss": 0.0633, + "step": 117600 + }, + { + "epoch": 25.01196988571738, + "grad_norm": 0.0030845573637634516, + "learning_rate": 2.0166459044214556e-05, + "loss": 0.0233, + "step": 117610 + }, + { + "epoch": 25.012024048096194, + "grad_norm": 0.0379854217171669, + "learning_rate": 2.0163450023169462e-05, + "loss": 0.064, + "step": 117620 + }, + { + "epoch": 25.012078210475003, + "grad_norm": 0.0026066170539706945, + "learning_rate": 2.016044100212437e-05, + "loss": 0.1739, + "step": 117630 + }, + { + "epoch": 25.012132372853817, + "grad_norm": 2.8309223651885986, + "learning_rate": 2.0157431981079278e-05, + "loss": 0.0823, + "step": 117640 + }, + { + "epoch": 25.012186535232626, + "grad_norm": 0.012220818549394608, + "learning_rate": 2.0154422960034185e-05, + "loss": 0.0264, + "step": 117650 + }, + { + "epoch": 25.01224069761144, + "grad_norm": 0.006089567672461271, + "learning_rate": 2.015141393898909e-05, + "loss": 0.1666, + "step": 117660 + }, + { + "epoch": 25.012294859990252, + "grad_norm": 0.0036512906663119793, + "learning_rate": 2.0148404917943997e-05, + "loss": 0.1203, + "step": 117670 + }, + { + "epoch": 25.01234902236906, + "grad_norm": 0.3774498999118805, + "learning_rate": 2.0145395896898903e-05, + "loss": 0.0095, + "step": 117680 
+ }, + { + "epoch": 25.012403184747875, + "grad_norm": 0.29129552841186523, + "learning_rate": 2.014238687585381e-05, + "loss": 0.0057, + "step": 117690 + }, + { + "epoch": 25.012457347126684, + "grad_norm": 0.0019272994250059128, + "learning_rate": 2.0139377854808716e-05, + "loss": 0.1264, + "step": 117700 + }, + { + "epoch": 25.012511509505497, + "grad_norm": 0.022619886323809624, + "learning_rate": 2.0136368833763626e-05, + "loss": 0.0375, + "step": 117710 + }, + { + "epoch": 25.01256567188431, + "grad_norm": 0.0021981613244861364, + "learning_rate": 2.0133359812718532e-05, + "loss": 0.0597, + "step": 117720 + }, + { + "epoch": 25.01261983426312, + "grad_norm": 1.3216830492019653, + "learning_rate": 2.0130350791673438e-05, + "loss": 0.0843, + "step": 117730 + }, + { + "epoch": 25.012673996641933, + "grad_norm": 3.815887689590454, + "learning_rate": 2.0127341770628344e-05, + "loss": 0.0869, + "step": 117740 + }, + { + "epoch": 25.012728159020742, + "grad_norm": 0.9164071083068848, + "learning_rate": 2.012433274958325e-05, + "loss": 0.1102, + "step": 117750 + }, + { + "epoch": 25.012782321399555, + "grad_norm": 0.6766653060913086, + "learning_rate": 2.0121323728538157e-05, + "loss": 0.0133, + "step": 117760 + }, + { + "epoch": 25.01283648377837, + "grad_norm": 0.008435067720711231, + "learning_rate": 2.0118314707493067e-05, + "loss": 0.0085, + "step": 117770 + }, + { + "epoch": 25.012890646157178, + "grad_norm": 0.002017831429839134, + "learning_rate": 2.0115305686447973e-05, + "loss": 0.0217, + "step": 117780 + }, + { + "epoch": 25.01294480853599, + "grad_norm": 0.9888148307800293, + "learning_rate": 2.011229666540288e-05, + "loss": 0.072, + "step": 117790 + }, + { + "epoch": 25.012998970914804, + "grad_norm": 0.002335419412702322, + "learning_rate": 2.0109287644357785e-05, + "loss": 0.1293, + "step": 117800 + }, + { + "epoch": 25.013053133293614, + "grad_norm": 0.005796991288661957, + "learning_rate": 2.0106278623312695e-05, + "loss": 0.0309, + "step": 117810 + }, + { + "epoch": 25.013107295672427, + "grad_norm": 0.0021472107619047165, + "learning_rate": 2.0103269602267598e-05, + "loss": 0.047, + "step": 117820 + }, + { + "epoch": 25.013161458051236, + "grad_norm": 0.05574481561779976, + "learning_rate": 2.0100260581222504e-05, + "loss": 0.0267, + "step": 117830 + }, + { + "epoch": 25.01321562043005, + "grad_norm": 0.004749855492264032, + "learning_rate": 2.0097251560177414e-05, + "loss": 0.0082, + "step": 117840 + }, + { + "epoch": 25.013269782808862, + "grad_norm": 0.5394225120544434, + "learning_rate": 2.009424253913232e-05, + "loss": 0.1003, + "step": 117850 + }, + { + "epoch": 25.013323945187672, + "grad_norm": 0.6414756774902344, + "learning_rate": 2.0091233518087226e-05, + "loss": 0.039, + "step": 117860 + }, + { + "epoch": 25.013378107566485, + "grad_norm": 0.07339762896299362, + "learning_rate": 2.0088224497042133e-05, + "loss": 0.0107, + "step": 117870 + }, + { + "epoch": 25.013432269945294, + "grad_norm": 0.48701977729797363, + "learning_rate": 2.008521547599704e-05, + "loss": 0.1151, + "step": 117880 + }, + { + "epoch": 25.013486432324108, + "grad_norm": 0.26714104413986206, + "learning_rate": 2.0082206454951945e-05, + "loss": 0.0287, + "step": 117890 + }, + { + "epoch": 25.01354059470292, + "grad_norm": 3.4314939975738525, + "learning_rate": 2.0079197433906855e-05, + "loss": 0.0673, + "step": 117900 + }, + { + "epoch": 25.01359475708173, + "grad_norm": 0.002843158319592476, + "learning_rate": 2.007618841286176e-05, + "loss": 0.0013, + "step": 117910 + }, + { + "epoch": 
25.013648919460543, + "grad_norm": 0.0018119424348697066, + "learning_rate": 2.0073179391816667e-05, + "loss": 0.0877, + "step": 117920 + }, + { + "epoch": 25.013703081839353, + "grad_norm": 0.527621328830719, + "learning_rate": 2.0070170370771574e-05, + "loss": 0.0584, + "step": 117930 + }, + { + "epoch": 25.013757244218166, + "grad_norm": 0.14626474678516388, + "learning_rate": 2.0067161349726483e-05, + "loss": 0.0346, + "step": 117940 + }, + { + "epoch": 25.01381140659698, + "grad_norm": 0.002653121016919613, + "learning_rate": 2.0064152328681386e-05, + "loss": 0.0603, + "step": 117950 + }, + { + "epoch": 25.01386556897579, + "grad_norm": 0.07948555052280426, + "learning_rate": 2.0061143307636296e-05, + "loss": 0.0641, + "step": 117960 + }, + { + "epoch": 25.0139197313546, + "grad_norm": 0.0018296531634405255, + "learning_rate": 2.0058134286591202e-05, + "loss": 0.0048, + "step": 117970 + }, + { + "epoch": 25.013973893733414, + "grad_norm": 0.11602866649627686, + "learning_rate": 2.005512526554611e-05, + "loss": 0.0547, + "step": 117980 + }, + { + "epoch": 25.014028056112224, + "grad_norm": 0.0018196346936747432, + "learning_rate": 2.0052116244501015e-05, + "loss": 0.0483, + "step": 117990 + }, + { + "epoch": 25.014082218491037, + "grad_norm": 0.04719286412000656, + "learning_rate": 2.004910722345592e-05, + "loss": 0.0284, + "step": 118000 + }, + { + "epoch": 25.014136380869846, + "grad_norm": 2.282381772994995, + "learning_rate": 2.0046098202410827e-05, + "loss": 0.0419, + "step": 118010 + }, + { + "epoch": 25.01419054324866, + "grad_norm": 1.4036351442337036, + "learning_rate": 2.0043089181365734e-05, + "loss": 0.1018, + "step": 118020 + }, + { + "epoch": 25.014244705627473, + "grad_norm": 0.3976576328277588, + "learning_rate": 2.0040080160320643e-05, + "loss": 0.0043, + "step": 118030 + }, + { + "epoch": 25.014298868006282, + "grad_norm": 0.00600038468837738, + "learning_rate": 2.003707113927555e-05, + "loss": 0.0006, + "step": 118040 + }, + { + "epoch": 25.014353030385095, + "grad_norm": 0.002066517947241664, + "learning_rate": 2.0034062118230456e-05, + "loss": 0.0047, + "step": 118050 + }, + { + "epoch": 25.014407192763905, + "grad_norm": 0.0019481913186609745, + "learning_rate": 2.0031053097185362e-05, + "loss": 0.0367, + "step": 118060 + }, + { + "epoch": 25.014461355142718, + "grad_norm": 1.5857652425765991, + "learning_rate": 2.0028044076140272e-05, + "loss": 0.0114, + "step": 118070 + }, + { + "epoch": 25.01451551752153, + "grad_norm": 0.017755458131432533, + "learning_rate": 2.0025035055095175e-05, + "loss": 0.0182, + "step": 118080 + }, + { + "epoch": 25.01456967990034, + "grad_norm": 0.09352610260248184, + "learning_rate": 2.0022026034050084e-05, + "loss": 0.058, + "step": 118090 + }, + { + "epoch": 25.014623842279153, + "grad_norm": 0.10160966962575912, + "learning_rate": 2.001901701300499e-05, + "loss": 0.0267, + "step": 118100 + }, + { + "epoch": 25.014678004657963, + "grad_norm": 0.03142967447638512, + "learning_rate": 2.0016007991959897e-05, + "loss": 0.0068, + "step": 118110 + }, + { + "epoch": 25.014732167036776, + "grad_norm": 0.0680447518825531, + "learning_rate": 2.0012998970914803e-05, + "loss": 0.0015, + "step": 118120 + }, + { + "epoch": 25.01478632941559, + "grad_norm": 0.002472824649885297, + "learning_rate": 2.000998994986971e-05, + "loss": 0.0517, + "step": 118130 + }, + { + "epoch": 25.0148404917944, + "grad_norm": 0.001608413178473711, + "learning_rate": 2.000698092882462e-05, + "loss": 0.0296, + "step": 118140 + }, + { + "epoch": 25.01489465417321, + 
"grad_norm": 0.5085532069206238, + "learning_rate": 2.0003971907779522e-05, + "loss": 0.005, + "step": 118150 + }, + { + "epoch": 25.014948816552025, + "grad_norm": 0.010648422874510288, + "learning_rate": 2.000096288673443e-05, + "loss": 0.0312, + "step": 118160 + }, + { + "epoch": 25.015002978930834, + "grad_norm": 0.0022200699895620346, + "learning_rate": 1.9997953865689338e-05, + "loss": 0.0358, + "step": 118170 + }, + { + "epoch": 25.015057141309647, + "grad_norm": 0.0015890432987362146, + "learning_rate": 1.9994944844644244e-05, + "loss": 0.0686, + "step": 118180 + }, + { + "epoch": 25.015111303688457, + "grad_norm": 0.0019876437727361917, + "learning_rate": 1.999193582359915e-05, + "loss": 0.0616, + "step": 118190 + }, + { + "epoch": 25.01516546606727, + "grad_norm": 0.0728658139705658, + "learning_rate": 1.998892680255406e-05, + "loss": 0.0547, + "step": 118200 + }, + { + "epoch": 25.015219628446083, + "grad_norm": 5.055369853973389, + "learning_rate": 1.9985917781508963e-05, + "loss": 0.0746, + "step": 118210 + }, + { + "epoch": 25.015273790824892, + "grad_norm": 0.08730310201644897, + "learning_rate": 1.9982908760463873e-05, + "loss": 0.0199, + "step": 118220 + }, + { + "epoch": 25.015327953203705, + "grad_norm": 0.05597221851348877, + "learning_rate": 1.997989973941878e-05, + "loss": 0.0245, + "step": 118230 + }, + { + "epoch": 25.015382115582515, + "grad_norm": 1.3476238250732422, + "learning_rate": 1.9976890718373685e-05, + "loss": 0.0799, + "step": 118240 + }, + { + "epoch": 25.015436277961328, + "grad_norm": 0.6554756760597229, + "learning_rate": 1.997388169732859e-05, + "loss": 0.0099, + "step": 118250 + }, + { + "epoch": 25.01549044034014, + "grad_norm": 0.7846980690956116, + "learning_rate": 1.99708726762835e-05, + "loss": 0.0797, + "step": 118260 + }, + { + "epoch": 25.01554460271895, + "grad_norm": 0.37961116433143616, + "learning_rate": 1.9967863655238407e-05, + "loss": 0.0564, + "step": 118270 + }, + { + "epoch": 25.015598765097764, + "grad_norm": 0.0015157945454120636, + "learning_rate": 1.996485463419331e-05, + "loss": 0.0137, + "step": 118280 + }, + { + "epoch": 25.015652927476573, + "grad_norm": 2.3959834575653076, + "learning_rate": 1.996184561314822e-05, + "loss": 0.0696, + "step": 118290 + }, + { + "epoch": 25.015707089855386, + "grad_norm": 0.00551934540271759, + "learning_rate": 1.9958836592103126e-05, + "loss": 0.0894, + "step": 118300 + }, + { + "epoch": 25.0157612522342, + "grad_norm": 0.05228593945503235, + "learning_rate": 1.9955827571058033e-05, + "loss": 0.0408, + "step": 118310 + }, + { + "epoch": 25.01581541461301, + "grad_norm": 0.006525001954287291, + "learning_rate": 1.995281855001294e-05, + "loss": 0.0009, + "step": 118320 + }, + { + "epoch": 25.015869576991822, + "grad_norm": 0.0015963177429512143, + "learning_rate": 1.994980952896785e-05, + "loss": 0.0173, + "step": 118330 + }, + { + "epoch": 25.015923739370635, + "grad_norm": 0.9755423665046692, + "learning_rate": 1.994680050792275e-05, + "loss": 0.0195, + "step": 118340 + }, + { + "epoch": 25.015977901749444, + "grad_norm": 0.5823934078216553, + "learning_rate": 1.994379148687766e-05, + "loss": 0.0699, + "step": 118350 + }, + { + "epoch": 25.016032064128257, + "grad_norm": 0.004628414288163185, + "learning_rate": 1.9940782465832567e-05, + "loss": 0.1054, + "step": 118360 + }, + { + "epoch": 25.016086226507067, + "grad_norm": 0.0016177367651835084, + "learning_rate": 1.9937773444787474e-05, + "loss": 0.0821, + "step": 118370 + }, + { + "epoch": 25.01614038888588, + "grad_norm": 
0.004816865548491478, + "learning_rate": 1.993476442374238e-05, + "loss": 0.0709, + "step": 118380 + }, + { + "epoch": 25.016194551264693, + "grad_norm": 0.02765801176428795, + "learning_rate": 1.993175540269729e-05, + "loss": 0.0544, + "step": 118390 + }, + { + "epoch": 25.016248713643503, + "grad_norm": 0.2592173218727112, + "learning_rate": 1.9928746381652196e-05, + "loss": 0.1322, + "step": 118400 + }, + { + "epoch": 25.016302876022316, + "grad_norm": 0.0033519959542900324, + "learning_rate": 1.9925737360607102e-05, + "loss": 0.0004, + "step": 118410 + }, + { + "epoch": 25.016357038401125, + "grad_norm": 1.2230371236801147, + "learning_rate": 1.992272833956201e-05, + "loss": 0.0769, + "step": 118420 + }, + { + "epoch": 25.016411200779938, + "grad_norm": 0.003871713764965534, + "learning_rate": 1.9919719318516915e-05, + "loss": 0.045, + "step": 118430 + }, + { + "epoch": 25.01646536315875, + "grad_norm": 0.0033495789393782616, + "learning_rate": 1.991671029747182e-05, + "loss": 0.0135, + "step": 118440 + }, + { + "epoch": 25.01651952553756, + "grad_norm": 1.1323070526123047, + "learning_rate": 1.9913701276426727e-05, + "loss": 0.0424, + "step": 118450 + }, + { + "epoch": 25.016573687916374, + "grad_norm": 1.5226364135742188, + "learning_rate": 1.9910692255381637e-05, + "loss": 0.1015, + "step": 118460 + }, + { + "epoch": 25.016627850295183, + "grad_norm": 1.174737811088562, + "learning_rate": 1.990768323433654e-05, + "loss": 0.175, + "step": 118470 + }, + { + "epoch": 25.016682012673996, + "grad_norm": 0.8462948203086853, + "learning_rate": 1.990467421329145e-05, + "loss": 0.0655, + "step": 118480 + }, + { + "epoch": 25.01673617505281, + "grad_norm": 0.006409842986613512, + "learning_rate": 1.9901665192246356e-05, + "loss": 0.0459, + "step": 118490 + }, + { + "epoch": 25.01679033743162, + "grad_norm": 0.13867603242397308, + "learning_rate": 1.9898656171201262e-05, + "loss": 0.0612, + "step": 118500 + }, + { + "epoch": 25.016844499810432, + "grad_norm": 0.028599530458450317, + "learning_rate": 1.9895647150156168e-05, + "loss": 0.03, + "step": 118510 + }, + { + "epoch": 25.016898662189245, + "grad_norm": 0.0666658878326416, + "learning_rate": 1.9892638129111078e-05, + "loss": 0.0433, + "step": 118520 + }, + { + "epoch": 25.016952824568055, + "grad_norm": 0.003012597793713212, + "learning_rate": 1.9889629108065984e-05, + "loss": 0.0872, + "step": 118530 + }, + { + "epoch": 25.017006986946868, + "grad_norm": 0.056165218353271484, + "learning_rate": 1.988662008702089e-05, + "loss": 0.0007, + "step": 118540 + }, + { + "epoch": 25.017061149325677, + "grad_norm": 1.3358759880065918, + "learning_rate": 1.9883611065975797e-05, + "loss": 0.028, + "step": 118550 + }, + { + "epoch": 25.01711531170449, + "grad_norm": 0.06905084103345871, + "learning_rate": 1.9880602044930703e-05, + "loss": 0.0466, + "step": 118560 + }, + { + "epoch": 25.017169474083303, + "grad_norm": 0.002693149261176586, + "learning_rate": 1.987759302388561e-05, + "loss": 0.0295, + "step": 118570 + }, + { + "epoch": 25.017223636462113, + "grad_norm": 0.8105577826499939, + "learning_rate": 1.9874584002840515e-05, + "loss": 0.0082, + "step": 118580 + }, + { + "epoch": 25.017277798840926, + "grad_norm": 0.0024034467060118914, + "learning_rate": 1.9871574981795425e-05, + "loss": 0.021, + "step": 118590 + }, + { + "epoch": 25.017331961219735, + "grad_norm": 0.9060829281806946, + "learning_rate": 1.9868565960750328e-05, + "loss": 0.0521, + "step": 118600 + }, + { + "epoch": 25.01738612359855, + "grad_norm": 0.6814110279083252, + 
"learning_rate": 1.9865556939705238e-05, + "loss": 0.0683, + "step": 118610 + }, + { + "epoch": 25.01744028597736, + "grad_norm": 0.2449735403060913, + "learning_rate": 1.9862547918660144e-05, + "loss": 0.0208, + "step": 118620 + }, + { + "epoch": 25.01749444835617, + "grad_norm": 0.7310232520103455, + "learning_rate": 1.985953889761505e-05, + "loss": 0.0201, + "step": 118630 + }, + { + "epoch": 25.017548610734984, + "grad_norm": 4.48527717590332, + "learning_rate": 1.9856529876569957e-05, + "loss": 0.0727, + "step": 118640 + }, + { + "epoch": 25.017602773113794, + "grad_norm": 0.0015661492943763733, + "learning_rate": 1.9853520855524866e-05, + "loss": 0.0053, + "step": 118650 + }, + { + "epoch": 25.017656935492607, + "grad_norm": 0.012369215488433838, + "learning_rate": 1.9850511834479772e-05, + "loss": 0.0346, + "step": 118660 + }, + { + "epoch": 25.01771109787142, + "grad_norm": 0.005996950902044773, + "learning_rate": 1.984750281343468e-05, + "loss": 0.0382, + "step": 118670 + }, + { + "epoch": 25.01776526025023, + "grad_norm": 0.00167924037668854, + "learning_rate": 1.9844493792389585e-05, + "loss": 0.0559, + "step": 118680 + }, + { + "epoch": 25.017819422629042, + "grad_norm": 0.001607400132343173, + "learning_rate": 1.984148477134449e-05, + "loss": 0.0627, + "step": 118690 + }, + { + "epoch": 25.017873585007855, + "grad_norm": 0.0017254980048164725, + "learning_rate": 1.9838475750299398e-05, + "loss": 0.0356, + "step": 118700 + }, + { + "epoch": 25.017927747386665, + "grad_norm": 0.013310772366821766, + "learning_rate": 1.9835466729254307e-05, + "loss": 0.0147, + "step": 118710 + }, + { + "epoch": 25.017981909765478, + "grad_norm": 0.003257519332692027, + "learning_rate": 1.9832457708209214e-05, + "loss": 0.0126, + "step": 118720 + }, + { + "epoch": 25.018036072144287, + "grad_norm": 0.001671939273364842, + "learning_rate": 1.9829448687164116e-05, + "loss": 0.0265, + "step": 118730 + }, + { + "epoch": 25.0180902345231, + "grad_norm": 0.0020697296131402254, + "learning_rate": 1.9826439666119026e-05, + "loss": 0.0032, + "step": 118740 + }, + { + "epoch": 25.018144396901913, + "grad_norm": 0.16130076348781586, + "learning_rate": 1.9823430645073932e-05, + "loss": 0.0035, + "step": 118750 + }, + { + "epoch": 25.018198559280723, + "grad_norm": 0.36302685737609863, + "learning_rate": 1.982042162402884e-05, + "loss": 0.0058, + "step": 118760 + }, + { + "epoch": 25.018252721659536, + "grad_norm": 0.0016730626812204719, + "learning_rate": 1.9817412602983745e-05, + "loss": 0.1158, + "step": 118770 + }, + { + "epoch": 25.018306884038346, + "grad_norm": 3.844447135925293, + "learning_rate": 1.9814403581938655e-05, + "loss": 0.1089, + "step": 118780 + }, + { + "epoch": 25.01836104641716, + "grad_norm": 0.0075867269188165665, + "learning_rate": 1.981139456089356e-05, + "loss": 0.0542, + "step": 118790 + }, + { + "epoch": 25.01841520879597, + "grad_norm": 0.0120740607380867, + "learning_rate": 1.9808385539848467e-05, + "loss": 0.0859, + "step": 118800 + }, + { + "epoch": 25.01846937117478, + "grad_norm": 0.0019167213467881083, + "learning_rate": 1.9805376518803373e-05, + "loss": 0.0568, + "step": 118810 + }, + { + "epoch": 25.018523533553594, + "grad_norm": 0.6645863652229309, + "learning_rate": 1.980236749775828e-05, + "loss": 0.1066, + "step": 118820 + }, + { + "epoch": 25.018577695932404, + "grad_norm": 0.022307928651571274, + "learning_rate": 1.9799358476713186e-05, + "loss": 0.063, + "step": 118830 + }, + { + "epoch": 25.018631858311217, + "grad_norm": 1.2452245950698853, + "learning_rate": 
1.9796349455668096e-05, + "loss": 0.091, + "step": 118840 + }, + { + "epoch": 25.01868602069003, + "grad_norm": 0.008710517548024654, + "learning_rate": 1.9793340434623002e-05, + "loss": 0.0103, + "step": 118850 + }, + { + "epoch": 25.01874018306884, + "grad_norm": 0.2739175856113434, + "learning_rate": 1.9790331413577908e-05, + "loss": 0.0338, + "step": 118860 + }, + { + "epoch": 25.018794345447652, + "grad_norm": 0.004167754668742418, + "learning_rate": 1.9787322392532814e-05, + "loss": 0.0934, + "step": 118870 + }, + { + "epoch": 25.018848507826462, + "grad_norm": 0.027362050488591194, + "learning_rate": 1.978431337148772e-05, + "loss": 0.0439, + "step": 118880 + }, + { + "epoch": 25.018902670205275, + "grad_norm": 0.44684866070747375, + "learning_rate": 1.9781304350442627e-05, + "loss": 0.0034, + "step": 118890 + }, + { + "epoch": 25.018956832584088, + "grad_norm": 2.664895534515381, + "learning_rate": 1.9778295329397533e-05, + "loss": 0.0527, + "step": 118900 + }, + { + "epoch": 25.019010994962898, + "grad_norm": 0.9061908721923828, + "learning_rate": 1.9775286308352443e-05, + "loss": 0.0667, + "step": 118910 + }, + { + "epoch": 25.01906515734171, + "grad_norm": 0.07502548396587372, + "learning_rate": 1.977227728730735e-05, + "loss": 0.0243, + "step": 118920 + }, + { + "epoch": 25.019119319720524, + "grad_norm": 0.026552977040410042, + "learning_rate": 1.9769268266262255e-05, + "loss": 0.008, + "step": 118930 + }, + { + "epoch": 25.019173482099333, + "grad_norm": 1.8165169954299927, + "learning_rate": 1.9766259245217162e-05, + "loss": 0.1003, + "step": 118940 + }, + { + "epoch": 25.019227644478146, + "grad_norm": 1.3697984218597412, + "learning_rate": 1.9763250224172068e-05, + "loss": 0.0478, + "step": 118950 + }, + { + "epoch": 25.019281806856956, + "grad_norm": 0.24258023500442505, + "learning_rate": 1.9760241203126974e-05, + "loss": 0.0088, + "step": 118960 + }, + { + "epoch": 25.01933596923577, + "grad_norm": 0.005863790865987539, + "learning_rate": 1.9757232182081884e-05, + "loss": 0.1424, + "step": 118970 + }, + { + "epoch": 25.019390131614582, + "grad_norm": 0.5378403663635254, + "learning_rate": 1.975422316103679e-05, + "loss": 0.0343, + "step": 118980 + }, + { + "epoch": 25.01944429399339, + "grad_norm": 0.002798407571390271, + "learning_rate": 1.9751214139991696e-05, + "loss": 0.0267, + "step": 118990 + }, + { + "epoch": 25.019498456372204, + "grad_norm": 0.015593525022268295, + "learning_rate": 1.9748205118946603e-05, + "loss": 0.0395, + "step": 119000 + }, + { + "epoch": 25.019552618751014, + "grad_norm": 1.0012056827545166, + "learning_rate": 1.9745196097901512e-05, + "loss": 0.0433, + "step": 119010 + }, + { + "epoch": 25.019606781129827, + "grad_norm": 0.46078652143478394, + "learning_rate": 1.9742187076856415e-05, + "loss": 0.0184, + "step": 119020 + }, + { + "epoch": 25.01966094350864, + "grad_norm": 0.2176724225282669, + "learning_rate": 1.973917805581132e-05, + "loss": 0.0808, + "step": 119030 + }, + { + "epoch": 25.01971510588745, + "grad_norm": 0.0039899894036352634, + "learning_rate": 1.973616903476623e-05, + "loss": 0.0797, + "step": 119040 + }, + { + "epoch": 25.019769268266263, + "grad_norm": 0.0015125555219128728, + "learning_rate": 1.9733160013721138e-05, + "loss": 0.0576, + "step": 119050 + }, + { + "epoch": 25.019823430645072, + "grad_norm": 0.8612082600593567, + "learning_rate": 1.9730150992676044e-05, + "loss": 0.1175, + "step": 119060 + }, + { + "epoch": 25.019877593023885, + "grad_norm": 1.121279001235962, + "learning_rate": 1.972714197163095e-05, + 
"loss": 0.0178, + "step": 119070 + }, + { + "epoch": 25.0199317554027, + "grad_norm": 0.002204443560913205, + "learning_rate": 1.9724132950585856e-05, + "loss": 0.0693, + "step": 119080 + }, + { + "epoch": 25.019985917781508, + "grad_norm": 0.001550523447804153, + "learning_rate": 1.9721123929540763e-05, + "loss": 0.0016, + "step": 119090 + }, + { + "epoch": 25.02004008016032, + "grad_norm": 0.11759502440690994, + "learning_rate": 1.9718114908495672e-05, + "loss": 0.0866, + "step": 119100 + }, + { + "epoch": 25.020094242539134, + "grad_norm": 0.001587366801686585, + "learning_rate": 1.971510588745058e-05, + "loss": 0.0771, + "step": 119110 + }, + { + "epoch": 25.020148404917943, + "grad_norm": 0.0015981417382135987, + "learning_rate": 1.9712096866405485e-05, + "loss": 0.0381, + "step": 119120 + }, + { + "epoch": 25.020202567296757, + "grad_norm": 0.0015271971933543682, + "learning_rate": 1.970908784536039e-05, + "loss": 0.0239, + "step": 119130 + }, + { + "epoch": 25.020256729675566, + "grad_norm": 0.001957438886165619, + "learning_rate": 1.97060788243153e-05, + "loss": 0.1404, + "step": 119140 + }, + { + "epoch": 25.02031089205438, + "grad_norm": 0.06166454404592514, + "learning_rate": 1.9703069803270204e-05, + "loss": 0.0173, + "step": 119150 + }, + { + "epoch": 25.020365054433192, + "grad_norm": 0.9094252586364746, + "learning_rate": 1.9700060782225113e-05, + "loss": 0.0381, + "step": 119160 + }, + { + "epoch": 25.020419216812, + "grad_norm": 0.003923508338630199, + "learning_rate": 1.969705176118002e-05, + "loss": 0.0467, + "step": 119170 + }, + { + "epoch": 25.020473379190815, + "grad_norm": 0.060390569269657135, + "learning_rate": 1.9694042740134926e-05, + "loss": 0.1375, + "step": 119180 + }, + { + "epoch": 25.020527541569624, + "grad_norm": 0.0015554772689938545, + "learning_rate": 1.9691033719089832e-05, + "loss": 0.0021, + "step": 119190 + }, + { + "epoch": 25.020581703948437, + "grad_norm": 0.09574049711227417, + "learning_rate": 1.968802469804474e-05, + "loss": 0.003, + "step": 119200 + }, + { + "epoch": 25.02063586632725, + "grad_norm": 0.016166122630238533, + "learning_rate": 1.9685015676999645e-05, + "loss": 0.0222, + "step": 119210 + }, + { + "epoch": 25.02069002870606, + "grad_norm": 0.09750299155712128, + "learning_rate": 1.968200665595455e-05, + "loss": 0.0718, + "step": 119220 + }, + { + "epoch": 25.020744191084873, + "grad_norm": 0.658517599105835, + "learning_rate": 1.967899763490946e-05, + "loss": 0.0834, + "step": 119230 + }, + { + "epoch": 25.020798353463682, + "grad_norm": 0.03279784694314003, + "learning_rate": 1.9675988613864367e-05, + "loss": 0.0152, + "step": 119240 + }, + { + "epoch": 25.020852515842495, + "grad_norm": 0.6982927918434143, + "learning_rate": 1.9672979592819273e-05, + "loss": 0.0158, + "step": 119250 + }, + { + "epoch": 25.02090667822131, + "grad_norm": 0.13870398700237274, + "learning_rate": 1.966997057177418e-05, + "loss": 0.0253, + "step": 119260 + }, + { + "epoch": 25.020960840600118, + "grad_norm": 0.0019268121104687452, + "learning_rate": 1.966696155072909e-05, + "loss": 0.0532, + "step": 119270 + }, + { + "epoch": 25.02101500297893, + "grad_norm": 0.012191904708743095, + "learning_rate": 1.9663952529683992e-05, + "loss": 0.038, + "step": 119280 + }, + { + "epoch": 25.021069165357744, + "grad_norm": 0.03768107667565346, + "learning_rate": 1.96609435086389e-05, + "loss": 0.0365, + "step": 119290 + }, + { + "epoch": 25.021123327736554, + "grad_norm": 0.0019224986899644136, + "learning_rate": 1.9657934487593808e-05, + "loss": 0.0036, + 
"step": 119300 + }, + { + "epoch": 25.021177490115367, + "grad_norm": 0.06194322928786278, + "learning_rate": 1.9654925466548714e-05, + "loss": 0.06, + "step": 119310 + }, + { + "epoch": 25.021231652494176, + "grad_norm": 0.0020062518306076527, + "learning_rate": 1.965191644550362e-05, + "loss": 0.0878, + "step": 119320 + }, + { + "epoch": 25.02128581487299, + "grad_norm": 0.06972154229879379, + "learning_rate": 1.9648907424458527e-05, + "loss": 0.0225, + "step": 119330 + }, + { + "epoch": 25.021339977251802, + "grad_norm": 0.17001540958881378, + "learning_rate": 1.9645898403413433e-05, + "loss": 0.0366, + "step": 119340 + }, + { + "epoch": 25.021394139630612, + "grad_norm": 0.08944077044725418, + "learning_rate": 1.964288938236834e-05, + "loss": 0.0031, + "step": 119350 + }, + { + "epoch": 25.021448302009425, + "grad_norm": 1.0611728429794312, + "learning_rate": 1.963988036132325e-05, + "loss": 0.011, + "step": 119360 + }, + { + "epoch": 25.021502464388234, + "grad_norm": 0.0035284385085105896, + "learning_rate": 1.9636871340278155e-05, + "loss": 0.0894, + "step": 119370 + }, + { + "epoch": 25.021556626767048, + "grad_norm": 0.7146709561347961, + "learning_rate": 1.963386231923306e-05, + "loss": 0.0224, + "step": 119380 + }, + { + "epoch": 25.02161078914586, + "grad_norm": 0.38601869344711304, + "learning_rate": 1.9630853298187968e-05, + "loss": 0.0209, + "step": 119390 + }, + { + "epoch": 25.02166495152467, + "grad_norm": 0.0013895619194954634, + "learning_rate": 1.9627844277142877e-05, + "loss": 0.0354, + "step": 119400 + }, + { + "epoch": 25.021719113903483, + "grad_norm": 0.002285062801092863, + "learning_rate": 1.962483525609778e-05, + "loss": 0.0081, + "step": 119410 + }, + { + "epoch": 25.021773276282293, + "grad_norm": 0.021343538537621498, + "learning_rate": 1.962182623505269e-05, + "loss": 0.0888, + "step": 119420 + }, + { + "epoch": 25.021827438661106, + "grad_norm": 0.05633361265063286, + "learning_rate": 1.9618817214007596e-05, + "loss": 0.0687, + "step": 119430 + }, + { + "epoch": 25.02188160103992, + "grad_norm": 0.01579630933701992, + "learning_rate": 1.9615808192962503e-05, + "loss": 0.007, + "step": 119440 + }, + { + "epoch": 25.02193576341873, + "grad_norm": 0.006937259808182716, + "learning_rate": 1.961279917191741e-05, + "loss": 0.0489, + "step": 119450 + }, + { + "epoch": 25.02198992579754, + "grad_norm": 0.00617119949311018, + "learning_rate": 1.960979015087232e-05, + "loss": 0.0299, + "step": 119460 + }, + { + "epoch": 25.022044088176354, + "grad_norm": 0.023323988541960716, + "learning_rate": 1.960678112982722e-05, + "loss": 0.0037, + "step": 119470 + }, + { + "epoch": 25.022098250555164, + "grad_norm": 0.3772840201854706, + "learning_rate": 1.9603772108782128e-05, + "loss": 0.0082, + "step": 119480 + }, + { + "epoch": 25.022152412933977, + "grad_norm": 0.0025137518532574177, + "learning_rate": 1.9600763087737037e-05, + "loss": 0.0646, + "step": 119490 + }, + { + "epoch": 25.022206575312786, + "grad_norm": 0.06330598890781403, + "learning_rate": 1.9597754066691944e-05, + "loss": 0.1286, + "step": 119500 + }, + { + "epoch": 25.0222607376916, + "grad_norm": 0.12041895091533661, + "learning_rate": 1.959474504564685e-05, + "loss": 0.068, + "step": 119510 + }, + { + "epoch": 25.022314900070413, + "grad_norm": 0.001461703795939684, + "learning_rate": 1.9591736024601756e-05, + "loss": 0.0801, + "step": 119520 + }, + { + "epoch": 25.022369062449222, + "grad_norm": 27.003854751586914, + "learning_rate": 1.9588727003556666e-05, + "loss": 0.2096, + "step": 119530 + }, + { + 
"epoch": 25.022423224828035, + "grad_norm": 0.008309829980134964, + "learning_rate": 1.958571798251157e-05, + "loss": 0.0079, + "step": 119540 + }, + { + "epoch": 25.022477387206845, + "grad_norm": 0.03602304309606552, + "learning_rate": 1.958270896146648e-05, + "loss": 0.065, + "step": 119550 + }, + { + "epoch": 25.022531549585658, + "grad_norm": 0.0014871911844238639, + "learning_rate": 1.9579699940421385e-05, + "loss": 0.0269, + "step": 119560 + }, + { + "epoch": 25.02258571196447, + "grad_norm": 0.381795734167099, + "learning_rate": 1.957669091937629e-05, + "loss": 0.0253, + "step": 119570 + }, + { + "epoch": 25.02263987434328, + "grad_norm": 2.1854960918426514, + "learning_rate": 1.9573681898331197e-05, + "loss": 0.0447, + "step": 119580 + }, + { + "epoch": 25.022694036722093, + "grad_norm": 0.002531003672629595, + "learning_rate": 1.9570672877286107e-05, + "loss": 0.009, + "step": 119590 + }, + { + "epoch": 25.022748199100903, + "grad_norm": 0.28858810663223267, + "learning_rate": 1.956766385624101e-05, + "loss": 0.0495, + "step": 119600 + }, + { + "epoch": 25.022802361479716, + "grad_norm": 0.32300978899002075, + "learning_rate": 1.956465483519592e-05, + "loss": 0.0616, + "step": 119610 + }, + { + "epoch": 25.02285652385853, + "grad_norm": 0.0014218998840078712, + "learning_rate": 1.9561645814150826e-05, + "loss": 0.0205, + "step": 119620 + }, + { + "epoch": 25.02291068623734, + "grad_norm": 0.21046747267246246, + "learning_rate": 1.9558636793105732e-05, + "loss": 0.046, + "step": 119630 + }, + { + "epoch": 25.02296484861615, + "grad_norm": 0.001519920420832932, + "learning_rate": 1.9555627772060638e-05, + "loss": 0.028, + "step": 119640 + }, + { + "epoch": 25.023019010994965, + "grad_norm": 0.001594011322595179, + "learning_rate": 1.9552618751015544e-05, + "loss": 0.0633, + "step": 119650 + }, + { + "epoch": 25.023073173373774, + "grad_norm": 0.0013966577826067805, + "learning_rate": 1.9549609729970454e-05, + "loss": 0.0187, + "step": 119660 + }, + { + "epoch": 25.023127335752587, + "grad_norm": 0.011486641131341457, + "learning_rate": 1.9546600708925357e-05, + "loss": 0.0834, + "step": 119670 + }, + { + "epoch": 25.023181498131397, + "grad_norm": 0.07977431267499924, + "learning_rate": 1.9543591687880267e-05, + "loss": 0.0236, + "step": 119680 + }, + { + "epoch": 25.02323566051021, + "grad_norm": 0.0020973554346710443, + "learning_rate": 1.9540582666835173e-05, + "loss": 0.0016, + "step": 119690 + }, + { + "epoch": 25.023289822889023, + "grad_norm": 0.04764346778392792, + "learning_rate": 1.953757364579008e-05, + "loss": 0.0148, + "step": 119700 + }, + { + "epoch": 25.023343985267832, + "grad_norm": 4.3737640380859375, + "learning_rate": 1.9534564624744986e-05, + "loss": 0.1191, + "step": 119710 + }, + { + "epoch": 25.023398147646645, + "grad_norm": 0.0023205005563795567, + "learning_rate": 1.9531555603699895e-05, + "loss": 0.0002, + "step": 119720 + }, + { + "epoch": 25.023452310025455, + "grad_norm": 0.0031815131660550833, + "learning_rate": 1.9528546582654798e-05, + "loss": 0.0037, + "step": 119730 + }, + { + "epoch": 25.023506472404268, + "grad_norm": 0.0028773238882422447, + "learning_rate": 1.9525537561609708e-05, + "loss": 0.0287, + "step": 119740 + }, + { + "epoch": 25.02356063478308, + "grad_norm": 0.0014809396816417575, + "learning_rate": 1.9522528540564614e-05, + "loss": 0.0035, + "step": 119750 + }, + { + "epoch": 25.02361479716189, + "grad_norm": 0.0014060345711186528, + "learning_rate": 1.951951951951952e-05, + "loss": 0.0138, + "step": 119760 + }, + { + "epoch": 
25.023668959540704, + "grad_norm": 0.3974057734012604, + "learning_rate": 1.9516510498474427e-05, + "loss": 0.0447, + "step": 119770 + }, + { + "epoch": 25.023723121919513, + "grad_norm": 0.05507676303386688, + "learning_rate": 1.9513501477429333e-05, + "loss": 0.0417, + "step": 119780 + }, + { + "epoch": 25.023777284298326, + "grad_norm": 0.0040383851155638695, + "learning_rate": 1.9510492456384242e-05, + "loss": 0.0786, + "step": 119790 + }, + { + "epoch": 25.02383144667714, + "grad_norm": 0.16863687336444855, + "learning_rate": 1.9507483435339145e-05, + "loss": 0.0498, + "step": 119800 + }, + { + "epoch": 25.02388560905595, + "grad_norm": 0.0013535460457205772, + "learning_rate": 1.9504474414294055e-05, + "loss": 0.0451, + "step": 119810 + }, + { + "epoch": 25.023939771434762, + "grad_norm": 0.0014539392432197928, + "learning_rate": 1.950146539324896e-05, + "loss": 0.0539, + "step": 119820 + }, + { + "epoch": 25.023993933813575, + "grad_norm": 0.005315342452377081, + "learning_rate": 1.9498456372203868e-05, + "loss": 0.0678, + "step": 119830 + }, + { + "epoch": 25.024048096192384, + "grad_norm": 0.026141205802559853, + "learning_rate": 1.9495447351158774e-05, + "loss": 0.0107, + "step": 119840 + }, + { + "epoch": 25.024102258571197, + "grad_norm": 1.9739551544189453, + "learning_rate": 1.9492438330113684e-05, + "loss": 0.1071, + "step": 119850 + }, + { + "epoch": 25.024156420950007, + "grad_norm": 0.015783481299877167, + "learning_rate": 1.948942930906859e-05, + "loss": 0.0516, + "step": 119860 + }, + { + "epoch": 25.02421058332882, + "grad_norm": 0.19641055166721344, + "learning_rate": 1.9486420288023496e-05, + "loss": 0.0214, + "step": 119870 + }, + { + "epoch": 25.024264745707633, + "grad_norm": 0.022098548710346222, + "learning_rate": 1.9483411266978402e-05, + "loss": 0.1021, + "step": 119880 + }, + { + "epoch": 25.024318908086443, + "grad_norm": 0.66741544008255, + "learning_rate": 1.948040224593331e-05, + "loss": 0.0424, + "step": 119890 + }, + { + "epoch": 25.024373070465256, + "grad_norm": 0.0037524006329476833, + "learning_rate": 1.9477393224888215e-05, + "loss": 0.0031, + "step": 119900 + }, + { + "epoch": 25.024427232844065, + "grad_norm": 0.0016268679173663259, + "learning_rate": 1.947438420384312e-05, + "loss": 0.0125, + "step": 119910 + }, + { + "epoch": 25.024481395222878, + "grad_norm": 0.6981022357940674, + "learning_rate": 1.947137518279803e-05, + "loss": 0.058, + "step": 119920 + }, + { + "epoch": 25.02453555760169, + "grad_norm": 0.0013053779257461429, + "learning_rate": 1.9468366161752934e-05, + "loss": 0.0194, + "step": 119930 + }, + { + "epoch": 25.0245897199805, + "grad_norm": 0.0013349363580346107, + "learning_rate": 1.9465357140707843e-05, + "loss": 0.0002, + "step": 119940 + }, + { + "epoch": 25.024643882359314, + "grad_norm": 0.5427045822143555, + "learning_rate": 1.946234811966275e-05, + "loss": 0.0283, + "step": 119950 + }, + { + "epoch": 25.024698044738123, + "grad_norm": 0.0012914224062114954, + "learning_rate": 1.9459339098617656e-05, + "loss": 0.0024, + "step": 119960 + }, + { + "epoch": 25.024752207116936, + "grad_norm": 0.0013642063131555915, + "learning_rate": 1.9456330077572562e-05, + "loss": 0.0734, + "step": 119970 + }, + { + "epoch": 25.02480636949575, + "grad_norm": 4.47000789642334, + "learning_rate": 1.9453321056527472e-05, + "loss": 0.2045, + "step": 119980 + }, + { + "epoch": 25.02486053187456, + "grad_norm": 0.04421629384160042, + "learning_rate": 1.9450312035482378e-05, + "loss": 0.0342, + "step": 119990 + }, + { + "epoch": 
25.024914694253372, + "grad_norm": 0.0023104590363800526, + "learning_rate": 1.9447303014437284e-05, + "loss": 0.0081, + "step": 120000 + }, + { + "epoch": 25.02496885663218, + "grad_norm": 1.8701322078704834, + "learning_rate": 1.944429399339219e-05, + "loss": 0.0371, + "step": 120010 + }, + { + "epoch": 25.02500135405947, + "eval_accuracy": 0.8380143696930111, + "eval_loss": 0.9091816544532776, + "eval_runtime": 116.099, + "eval_samples_per_second": 26.374, + "eval_steps_per_second": 3.299, + "step": 120016 + }, + { + "epoch": 26.000021664951525, + "grad_norm": 0.0037733863573521376, + "learning_rate": 1.9441284972347097e-05, + "loss": 0.0402, + "step": 120020 + }, + { + "epoch": 26.000075827330335, + "grad_norm": 0.6188281774520874, + "learning_rate": 1.9438275951302003e-05, + "loss": 0.0068, + "step": 120030 + }, + { + "epoch": 26.000129989709148, + "grad_norm": 0.06209612637758255, + "learning_rate": 1.9435266930256913e-05, + "loss": 0.0968, + "step": 120040 + }, + { + "epoch": 26.00018415208796, + "grad_norm": 21.603683471679688, + "learning_rate": 1.943225790921182e-05, + "loss": 0.0593, + "step": 120050 + }, + { + "epoch": 26.00023831446677, + "grad_norm": 0.6104966402053833, + "learning_rate": 1.9429248888166722e-05, + "loss": 0.0365, + "step": 120060 + }, + { + "epoch": 26.000292476845583, + "grad_norm": 0.004316407721489668, + "learning_rate": 1.9426239867121632e-05, + "loss": 0.0101, + "step": 120070 + }, + { + "epoch": 26.000346639224396, + "grad_norm": 0.00829527247697115, + "learning_rate": 1.9423230846076538e-05, + "loss": 0.0429, + "step": 120080 + }, + { + "epoch": 26.000400801603206, + "grad_norm": 0.07106726616621017, + "learning_rate": 1.9420221825031444e-05, + "loss": 0.0354, + "step": 120090 + }, + { + "epoch": 26.00045496398202, + "grad_norm": 1.6481728553771973, + "learning_rate": 1.941721280398635e-05, + "loss": 0.0683, + "step": 120100 + }, + { + "epoch": 26.00050912636083, + "grad_norm": 1.3653291463851929, + "learning_rate": 1.941420378294126e-05, + "loss": 0.0754, + "step": 120110 + }, + { + "epoch": 26.00056328873964, + "grad_norm": 0.7805411219596863, + "learning_rate": 1.9411194761896166e-05, + "loss": 0.0261, + "step": 120120 + }, + { + "epoch": 26.000617451118455, + "grad_norm": 0.0017019161023199558, + "learning_rate": 1.9408185740851073e-05, + "loss": 0.052, + "step": 120130 + }, + { + "epoch": 26.000671613497264, + "grad_norm": 0.038972679525613785, + "learning_rate": 1.940517671980598e-05, + "loss": 0.042, + "step": 120140 + }, + { + "epoch": 26.000725775876077, + "grad_norm": 0.022427551448345184, + "learning_rate": 1.9402167698760885e-05, + "loss": 0.0645, + "step": 120150 + }, + { + "epoch": 26.000779938254887, + "grad_norm": 0.6271002292633057, + "learning_rate": 1.939915867771579e-05, + "loss": 0.0213, + "step": 120160 + }, + { + "epoch": 26.0008341006337, + "grad_norm": 0.31771355867385864, + "learning_rate": 1.93961496566707e-05, + "loss": 0.0163, + "step": 120170 + }, + { + "epoch": 26.000888263012513, + "grad_norm": 0.003038607304915786, + "learning_rate": 1.9393140635625608e-05, + "loss": 0.0294, + "step": 120180 + }, + { + "epoch": 26.000942425391322, + "grad_norm": 0.10291565954685211, + "learning_rate": 1.9390131614580514e-05, + "loss": 0.1081, + "step": 120190 + }, + { + "epoch": 26.000996587770135, + "grad_norm": 0.0014406084083020687, + "learning_rate": 1.938712259353542e-05, + "loss": 0.0421, + "step": 120200 + }, + { + "epoch": 26.001050750148945, + "grad_norm": 0.0017418927745893598, + "learning_rate": 1.9384113572490326e-05, + 
"loss": 0.0084, + "step": 120210 + }, + { + "epoch": 26.001104912527758, + "grad_norm": 0.042176902294158936, + "learning_rate": 1.9381104551445233e-05, + "loss": 0.0085, + "step": 120220 + }, + { + "epoch": 26.00115907490657, + "grad_norm": 0.13935993611812592, + "learning_rate": 1.937809553040014e-05, + "loss": 0.0069, + "step": 120230 + }, + { + "epoch": 26.00121323728538, + "grad_norm": 0.08002486079931259, + "learning_rate": 1.937508650935505e-05, + "loss": 0.0435, + "step": 120240 + }, + { + "epoch": 26.001267399664194, + "grad_norm": 0.684719443321228, + "learning_rate": 1.9372077488309955e-05, + "loss": 0.0569, + "step": 120250 + }, + { + "epoch": 26.001321562043003, + "grad_norm": 0.3519047200679779, + "learning_rate": 1.936906846726486e-05, + "loss": 0.0369, + "step": 120260 + }, + { + "epoch": 26.001375724421816, + "grad_norm": 3.5720033645629883, + "learning_rate": 1.9366059446219767e-05, + "loss": 0.0247, + "step": 120270 + }, + { + "epoch": 26.00142988680063, + "grad_norm": 0.3869733214378357, + "learning_rate": 1.9363050425174674e-05, + "loss": 0.0167, + "step": 120280 + }, + { + "epoch": 26.00148404917944, + "grad_norm": 0.0052352165803313255, + "learning_rate": 1.936004140412958e-05, + "loss": 0.0218, + "step": 120290 + }, + { + "epoch": 26.001538211558252, + "grad_norm": 0.00184723106212914, + "learning_rate": 1.935703238308449e-05, + "loss": 0.0947, + "step": 120300 + }, + { + "epoch": 26.001592373937065, + "grad_norm": 0.001182342180982232, + "learning_rate": 1.9354023362039396e-05, + "loss": 0.0072, + "step": 120310 + }, + { + "epoch": 26.001646536315874, + "grad_norm": 0.11313663423061371, + "learning_rate": 1.9351014340994302e-05, + "loss": 0.0099, + "step": 120320 + }, + { + "epoch": 26.001700698694687, + "grad_norm": 0.8849897384643555, + "learning_rate": 1.934800531994921e-05, + "loss": 0.0076, + "step": 120330 + }, + { + "epoch": 26.001754861073497, + "grad_norm": 0.004019620828330517, + "learning_rate": 1.9344996298904118e-05, + "loss": 0.0888, + "step": 120340 + }, + { + "epoch": 26.00180902345231, + "grad_norm": 0.008128361776471138, + "learning_rate": 1.934198727785902e-05, + "loss": 0.0292, + "step": 120350 + }, + { + "epoch": 26.001863185831123, + "grad_norm": 0.08026524633169174, + "learning_rate": 1.9338978256813927e-05, + "loss": 0.0391, + "step": 120360 + }, + { + "epoch": 26.001917348209933, + "grad_norm": 6.321882724761963, + "learning_rate": 1.9335969235768837e-05, + "loss": 0.1435, + "step": 120370 + }, + { + "epoch": 26.001971510588746, + "grad_norm": 5.693256378173828, + "learning_rate": 1.9332960214723743e-05, + "loss": 0.13, + "step": 120380 + }, + { + "epoch": 26.002025672967555, + "grad_norm": 0.0015555284917354584, + "learning_rate": 1.932995119367865e-05, + "loss": 0.1114, + "step": 120390 + }, + { + "epoch": 26.00207983534637, + "grad_norm": 0.0013878491008654237, + "learning_rate": 1.9326942172633556e-05, + "loss": 0.1061, + "step": 120400 + }, + { + "epoch": 26.00213399772518, + "grad_norm": 1.9418047666549683, + "learning_rate": 1.9323933151588462e-05, + "loss": 0.0334, + "step": 120410 + }, + { + "epoch": 26.00218816010399, + "grad_norm": 0.11974826455116272, + "learning_rate": 1.9320924130543368e-05, + "loss": 0.0362, + "step": 120420 + }, + { + "epoch": 26.002242322482804, + "grad_norm": 0.02648170478641987, + "learning_rate": 1.9317915109498278e-05, + "loss": 0.0158, + "step": 120430 + }, + { + "epoch": 26.002296484861613, + "grad_norm": 1.7538340091705322, + "learning_rate": 1.9314906088453184e-05, + "loss": 0.0411, + "step": 120440 
+ }, + { + "epoch": 26.002350647240426, + "grad_norm": 0.0034822302404791117, + "learning_rate": 1.931189706740809e-05, + "loss": 0.1273, + "step": 120450 + }, + { + "epoch": 26.00240480961924, + "grad_norm": 0.01003744825720787, + "learning_rate": 1.9308888046362997e-05, + "loss": 0.0266, + "step": 120460 + }, + { + "epoch": 26.00245897199805, + "grad_norm": 0.0013452019775286317, + "learning_rate": 1.9305879025317906e-05, + "loss": 0.0534, + "step": 120470 + }, + { + "epoch": 26.002513134376862, + "grad_norm": 0.06274498999118805, + "learning_rate": 1.930287000427281e-05, + "loss": 0.0012, + "step": 120480 + }, + { + "epoch": 26.002567296755675, + "grad_norm": 0.0022924321237951517, + "learning_rate": 1.929986098322772e-05, + "loss": 0.0296, + "step": 120490 + }, + { + "epoch": 26.002621459134485, + "grad_norm": 0.0013312585651874542, + "learning_rate": 1.9296851962182625e-05, + "loss": 0.0075, + "step": 120500 + }, + { + "epoch": 26.002675621513298, + "grad_norm": 0.0014422426465898752, + "learning_rate": 1.929384294113753e-05, + "loss": 0.0558, + "step": 120510 + }, + { + "epoch": 26.002729783892107, + "grad_norm": 0.09651674330234528, + "learning_rate": 1.9290833920092438e-05, + "loss": 0.0156, + "step": 120520 + }, + { + "epoch": 26.00278394627092, + "grad_norm": 0.0014249543892219663, + "learning_rate": 1.9287824899047344e-05, + "loss": 0.0346, + "step": 120530 + }, + { + "epoch": 26.002838108649733, + "grad_norm": 0.6643553972244263, + "learning_rate": 1.928481587800225e-05, + "loss": 0.0238, + "step": 120540 + }, + { + "epoch": 26.002892271028543, + "grad_norm": 0.8564016222953796, + "learning_rate": 1.9281806856957157e-05, + "loss": 0.0042, + "step": 120550 + }, + { + "epoch": 26.002946433407356, + "grad_norm": 0.027644554153084755, + "learning_rate": 1.9278797835912066e-05, + "loss": 0.1173, + "step": 120560 + }, + { + "epoch": 26.003000595786165, + "grad_norm": 0.0013059693155810237, + "learning_rate": 1.9275788814866973e-05, + "loss": 0.0412, + "step": 120570 + }, + { + "epoch": 26.00305475816498, + "grad_norm": 0.611481249332428, + "learning_rate": 1.927277979382188e-05, + "loss": 0.0557, + "step": 120580 + }, + { + "epoch": 26.00310892054379, + "grad_norm": 0.0025878548622131348, + "learning_rate": 1.9269770772776785e-05, + "loss": 0.0025, + "step": 120590 + }, + { + "epoch": 26.0031630829226, + "grad_norm": 0.0029885382391512394, + "learning_rate": 1.9266761751731695e-05, + "loss": 0.0438, + "step": 120600 + }, + { + "epoch": 26.003217245301414, + "grad_norm": 0.0028872182592749596, + "learning_rate": 1.9263752730686598e-05, + "loss": 0.0319, + "step": 120610 + }, + { + "epoch": 26.003271407680224, + "grad_norm": 0.8426011204719543, + "learning_rate": 1.9260743709641507e-05, + "loss": 0.0333, + "step": 120620 + }, + { + "epoch": 26.003325570059037, + "grad_norm": 2.9865102767944336, + "learning_rate": 1.9257734688596414e-05, + "loss": 0.0386, + "step": 120630 + }, + { + "epoch": 26.00337973243785, + "grad_norm": 0.009964648634195328, + "learning_rate": 1.925472566755132e-05, + "loss": 0.0484, + "step": 120640 + }, + { + "epoch": 26.00343389481666, + "grad_norm": 0.0013512871228158474, + "learning_rate": 1.9251716646506226e-05, + "loss": 0.0885, + "step": 120650 + }, + { + "epoch": 26.003488057195472, + "grad_norm": 0.03280460089445114, + "learning_rate": 1.9248707625461132e-05, + "loss": 0.0145, + "step": 120660 + }, + { + "epoch": 26.003542219574285, + "grad_norm": 0.0631202831864357, + "learning_rate": 1.924569860441604e-05, + "loss": 0.1785, + "step": 120670 + }, + { + 
"epoch": 26.003596381953095, + "grad_norm": 1.1534584760665894, + "learning_rate": 1.9242689583370945e-05, + "loss": 0.0493, + "step": 120680 + }, + { + "epoch": 26.003650544331908, + "grad_norm": 0.5715038180351257, + "learning_rate": 1.9239680562325855e-05, + "loss": 0.031, + "step": 120690 + }, + { + "epoch": 26.003704706710717, + "grad_norm": 0.45151379704475403, + "learning_rate": 1.923667154128076e-05, + "loss": 0.0517, + "step": 120700 + }, + { + "epoch": 26.00375886908953, + "grad_norm": 0.014353998005390167, + "learning_rate": 1.9233662520235667e-05, + "loss": 0.0039, + "step": 120710 + }, + { + "epoch": 26.003813031468344, + "grad_norm": 0.0015354444039985538, + "learning_rate": 1.9230653499190573e-05, + "loss": 0.041, + "step": 120720 + }, + { + "epoch": 26.003867193847153, + "grad_norm": 0.0017994341906160116, + "learning_rate": 1.9227644478145483e-05, + "loss": 0.0151, + "step": 120730 + }, + { + "epoch": 26.003921356225966, + "grad_norm": 0.03973192349076271, + "learning_rate": 1.9224635457100386e-05, + "loss": 0.0349, + "step": 120740 + }, + { + "epoch": 26.003975518604776, + "grad_norm": 0.001224817126058042, + "learning_rate": 1.9221626436055296e-05, + "loss": 0.0559, + "step": 120750 + }, + { + "epoch": 26.00402968098359, + "grad_norm": 2.123209238052368, + "learning_rate": 1.9218617415010202e-05, + "loss": 0.0329, + "step": 120760 + }, + { + "epoch": 26.0040838433624, + "grad_norm": 0.015162691473960876, + "learning_rate": 1.9215608393965108e-05, + "loss": 0.046, + "step": 120770 + }, + { + "epoch": 26.00413800574121, + "grad_norm": 0.7393739223480225, + "learning_rate": 1.9212599372920014e-05, + "loss": 0.107, + "step": 120780 + }, + { + "epoch": 26.004192168120024, + "grad_norm": 0.6776723861694336, + "learning_rate": 1.9209590351874924e-05, + "loss": 0.043, + "step": 120790 + }, + { + "epoch": 26.004246330498834, + "grad_norm": 0.6531484127044678, + "learning_rate": 1.9206581330829827e-05, + "loss": 0.0681, + "step": 120800 + }, + { + "epoch": 26.004300492877647, + "grad_norm": 0.0012345677241683006, + "learning_rate": 1.9203572309784733e-05, + "loss": 0.0063, + "step": 120810 + }, + { + "epoch": 26.00435465525646, + "grad_norm": 0.0015364508144557476, + "learning_rate": 1.9200563288739643e-05, + "loss": 0.0382, + "step": 120820 + }, + { + "epoch": 26.00440881763527, + "grad_norm": 9.717782974243164, + "learning_rate": 1.919755426769455e-05, + "loss": 0.1525, + "step": 120830 + }, + { + "epoch": 26.004462980014083, + "grad_norm": 0.1817285567522049, + "learning_rate": 1.9194545246649456e-05, + "loss": 0.0394, + "step": 120840 + }, + { + "epoch": 26.004517142392896, + "grad_norm": 0.015563402324914932, + "learning_rate": 1.9191536225604362e-05, + "loss": 0.0132, + "step": 120850 + }, + { + "epoch": 26.004571304771705, + "grad_norm": 0.06622754782438278, + "learning_rate": 1.918852720455927e-05, + "loss": 0.0172, + "step": 120860 + }, + { + "epoch": 26.004625467150518, + "grad_norm": 0.012971136718988419, + "learning_rate": 1.9185518183514174e-05, + "loss": 0.0021, + "step": 120870 + }, + { + "epoch": 26.004679629529328, + "grad_norm": 33.91977310180664, + "learning_rate": 1.9182509162469084e-05, + "loss": 0.0857, + "step": 120880 + }, + { + "epoch": 26.00473379190814, + "grad_norm": 0.005570549517869949, + "learning_rate": 1.917950014142399e-05, + "loss": 0.0557, + "step": 120890 + }, + { + "epoch": 26.004787954286954, + "grad_norm": 0.01552284974604845, + "learning_rate": 1.9176491120378897e-05, + "loss": 0.0013, + "step": 120900 + }, + { + "epoch": 
26.004842116665763, + "grad_norm": 0.0011926741572096944, + "learning_rate": 1.9173482099333803e-05, + "loss": 0.0447, + "step": 120910 + }, + { + "epoch": 26.004896279044576, + "grad_norm": 0.008841443806886673, + "learning_rate": 1.9170473078288713e-05, + "loss": 0.046, + "step": 120920 + }, + { + "epoch": 26.004950441423386, + "grad_norm": 0.07337965816259384, + "learning_rate": 1.9167464057243615e-05, + "loss": 0.0311, + "step": 120930 + }, + { + "epoch": 26.0050046038022, + "grad_norm": 0.024767572060227394, + "learning_rate": 1.9164455036198525e-05, + "loss": 0.0826, + "step": 120940 + }, + { + "epoch": 26.005058766181012, + "grad_norm": 0.0011218269355595112, + "learning_rate": 1.916144601515343e-05, + "loss": 0.0508, + "step": 120950 + }, + { + "epoch": 26.00511292855982, + "grad_norm": 0.002187693491578102, + "learning_rate": 1.9158436994108338e-05, + "loss": 0.0142, + "step": 120960 + }, + { + "epoch": 26.005167090938635, + "grad_norm": 0.005044154357165098, + "learning_rate": 1.9155427973063244e-05, + "loss": 0.0124, + "step": 120970 + }, + { + "epoch": 26.005221253317444, + "grad_norm": 0.001191894756630063, + "learning_rate": 1.915241895201815e-05, + "loss": 0.0666, + "step": 120980 + }, + { + "epoch": 26.005275415696257, + "grad_norm": 0.007139263208955526, + "learning_rate": 1.914940993097306e-05, + "loss": 0.1161, + "step": 120990 + }, + { + "epoch": 26.00532957807507, + "grad_norm": 0.019365647807717323, + "learning_rate": 1.9146400909927963e-05, + "loss": 0.0047, + "step": 121000 + }, + { + "epoch": 26.00538374045388, + "grad_norm": 0.5083245635032654, + "learning_rate": 1.9143391888882872e-05, + "loss": 0.0172, + "step": 121010 + }, + { + "epoch": 26.005437902832693, + "grad_norm": 0.019506286829710007, + "learning_rate": 1.914038286783778e-05, + "loss": 0.0005, + "step": 121020 + }, + { + "epoch": 26.005492065211506, + "grad_norm": 0.018701601773500443, + "learning_rate": 1.9137373846792685e-05, + "loss": 0.0063, + "step": 121030 + }, + { + "epoch": 26.005546227590315, + "grad_norm": 0.0034879327286034822, + "learning_rate": 1.913436482574759e-05, + "loss": 0.0621, + "step": 121040 + }, + { + "epoch": 26.00560038996913, + "grad_norm": 0.015263632871210575, + "learning_rate": 1.91313558047025e-05, + "loss": 0.0034, + "step": 121050 + }, + { + "epoch": 26.005654552347938, + "grad_norm": 0.006282101850956678, + "learning_rate": 1.9128346783657404e-05, + "loss": 0.0218, + "step": 121060 + }, + { + "epoch": 26.00570871472675, + "grad_norm": 0.002954622730612755, + "learning_rate": 1.9125337762612313e-05, + "loss": 0.0133, + "step": 121070 + }, + { + "epoch": 26.005762877105564, + "grad_norm": 0.0010554592590779066, + "learning_rate": 1.912232874156722e-05, + "loss": 0.0005, + "step": 121080 + }, + { + "epoch": 26.005817039484374, + "grad_norm": 0.02793450653553009, + "learning_rate": 1.9119319720522126e-05, + "loss": 0.0163, + "step": 121090 + }, + { + "epoch": 26.005871201863187, + "grad_norm": 0.7300714254379272, + "learning_rate": 1.9116310699477032e-05, + "loss": 0.205, + "step": 121100 + }, + { + "epoch": 26.005925364241996, + "grad_norm": 0.0443769171833992, + "learning_rate": 1.911330167843194e-05, + "loss": 0.0181, + "step": 121110 + }, + { + "epoch": 26.00597952662081, + "grad_norm": 0.018496019765734673, + "learning_rate": 1.9110292657386848e-05, + "loss": 0.0034, + "step": 121120 + }, + { + "epoch": 26.006033688999622, + "grad_norm": 0.009591144509613514, + "learning_rate": 1.910728363634175e-05, + "loss": 0.0324, + "step": 121130 + }, + { + "epoch": 
26.00608785137843, + "grad_norm": 0.009449910372495651, + "learning_rate": 1.910427461529666e-05, + "loss": 0.0637, + "step": 121140 + }, + { + "epoch": 26.006142013757245, + "grad_norm": 0.09460920095443726, + "learning_rate": 1.9101265594251567e-05, + "loss": 0.0219, + "step": 121150 + }, + { + "epoch": 26.006196176136054, + "grad_norm": 0.0011349758133292198, + "learning_rate": 1.9098256573206473e-05, + "loss": 0.1026, + "step": 121160 + }, + { + "epoch": 26.006250338514867, + "grad_norm": 4.0093913078308105, + "learning_rate": 1.909524755216138e-05, + "loss": 0.0793, + "step": 121170 + }, + { + "epoch": 26.00630450089368, + "grad_norm": 0.0012467693304643035, + "learning_rate": 1.909223853111629e-05, + "loss": 0.0238, + "step": 121180 + }, + { + "epoch": 26.00635866327249, + "grad_norm": 0.01889365166425705, + "learning_rate": 1.9089229510071192e-05, + "loss": 0.0061, + "step": 121190 + }, + { + "epoch": 26.006412825651303, + "grad_norm": 0.01532716117799282, + "learning_rate": 1.9086220489026102e-05, + "loss": 0.0005, + "step": 121200 + }, + { + "epoch": 26.006466988030116, + "grad_norm": 0.08293505012989044, + "learning_rate": 1.9083211467981008e-05, + "loss": 0.0579, + "step": 121210 + }, + { + "epoch": 26.006521150408926, + "grad_norm": 0.001135774189606309, + "learning_rate": 1.9080202446935914e-05, + "loss": 0.0189, + "step": 121220 + }, + { + "epoch": 26.00657531278774, + "grad_norm": 0.027924925088882446, + "learning_rate": 1.907719342589082e-05, + "loss": 0.0068, + "step": 121230 + }, + { + "epoch": 26.006629475166548, + "grad_norm": 0.041508615016937256, + "learning_rate": 1.907418440484573e-05, + "loss": 0.1167, + "step": 121240 + }, + { + "epoch": 26.00668363754536, + "grad_norm": 0.00114825286436826, + "learning_rate": 1.9071175383800637e-05, + "loss": 0.0671, + "step": 121250 + }, + { + "epoch": 26.006737799924174, + "grad_norm": 0.0011476099025458097, + "learning_rate": 1.906816636275554e-05, + "loss": 0.0392, + "step": 121260 + }, + { + "epoch": 26.006791962302984, + "grad_norm": 2.5063893795013428, + "learning_rate": 1.906515734171045e-05, + "loss": 0.1001, + "step": 121270 + }, + { + "epoch": 26.006846124681797, + "grad_norm": 0.6905075907707214, + "learning_rate": 1.9062148320665355e-05, + "loss": 0.0242, + "step": 121280 + }, + { + "epoch": 26.006900287060606, + "grad_norm": 4.471081733703613, + "learning_rate": 1.905913929962026e-05, + "loss": 0.0644, + "step": 121290 + }, + { + "epoch": 26.00695444943942, + "grad_norm": 0.09533371776342392, + "learning_rate": 1.9056130278575168e-05, + "loss": 0.0643, + "step": 121300 + }, + { + "epoch": 26.007008611818232, + "grad_norm": 0.0012480730656534433, + "learning_rate": 1.9053121257530078e-05, + "loss": 0.0452, + "step": 121310 + }, + { + "epoch": 26.007062774197042, + "grad_norm": 0.1418069750070572, + "learning_rate": 1.905011223648498e-05, + "loss": 0.0056, + "step": 121320 + }, + { + "epoch": 26.007116936575855, + "grad_norm": 0.1106337159872055, + "learning_rate": 1.904710321543989e-05, + "loss": 0.1355, + "step": 121330 + }, + { + "epoch": 26.007171098954665, + "grad_norm": 0.12779730558395386, + "learning_rate": 1.9044094194394796e-05, + "loss": 0.0146, + "step": 121340 + }, + { + "epoch": 26.007225261333478, + "grad_norm": 0.5332517027854919, + "learning_rate": 1.9041085173349703e-05, + "loss": 0.1042, + "step": 121350 + }, + { + "epoch": 26.00727942371229, + "grad_norm": 2.5557680130004883, + "learning_rate": 1.903807615230461e-05, + "loss": 0.074, + "step": 121360 + }, + { + "epoch": 26.0073335860911, + 
"grad_norm": 0.0041114469058811665, + "learning_rate": 1.903506713125952e-05, + "loss": 0.0106, + "step": 121370 + }, + { + "epoch": 26.007387748469913, + "grad_norm": 4.1263651847839355, + "learning_rate": 1.9032058110214425e-05, + "loss": 0.0456, + "step": 121380 + }, + { + "epoch": 26.007441910848726, + "grad_norm": 0.08357115834951401, + "learning_rate": 1.902904908916933e-05, + "loss": 0.1454, + "step": 121390 + }, + { + "epoch": 26.007496073227536, + "grad_norm": 4.8033037185668945, + "learning_rate": 1.9026040068124237e-05, + "loss": 0.061, + "step": 121400 + }, + { + "epoch": 26.00755023560635, + "grad_norm": 0.4338551461696625, + "learning_rate": 1.9023031047079144e-05, + "loss": 0.0531, + "step": 121410 + }, + { + "epoch": 26.00760439798516, + "grad_norm": 0.026538310572504997, + "learning_rate": 1.902002202603405e-05, + "loss": 0.0263, + "step": 121420 + }, + { + "epoch": 26.00765856036397, + "grad_norm": 0.22653983533382416, + "learning_rate": 1.9017013004988956e-05, + "loss": 0.1636, + "step": 121430 + }, + { + "epoch": 26.007712722742784, + "grad_norm": 0.4477836489677429, + "learning_rate": 1.9014003983943866e-05, + "loss": 0.0412, + "step": 121440 + }, + { + "epoch": 26.007766885121594, + "grad_norm": 0.0015400816919282079, + "learning_rate": 1.901099496289877e-05, + "loss": 0.0954, + "step": 121450 + }, + { + "epoch": 26.007821047500407, + "grad_norm": 0.00218988792039454, + "learning_rate": 1.900798594185368e-05, + "loss": 0.0869, + "step": 121460 + }, + { + "epoch": 26.007875209879217, + "grad_norm": 0.0016972462181001902, + "learning_rate": 1.9004976920808585e-05, + "loss": 0.0135, + "step": 121470 + }, + { + "epoch": 26.00792937225803, + "grad_norm": 0.0037985097151249647, + "learning_rate": 1.900196789976349e-05, + "loss": 0.0318, + "step": 121480 + }, + { + "epoch": 26.007983534636843, + "grad_norm": 0.1265009343624115, + "learning_rate": 1.8998958878718397e-05, + "loss": 0.001, + "step": 121490 + }, + { + "epoch": 26.008037697015652, + "grad_norm": 0.04043367877602577, + "learning_rate": 1.8995949857673307e-05, + "loss": 0.1164, + "step": 121500 + }, + { + "epoch": 26.008091859394465, + "grad_norm": 0.015577365644276142, + "learning_rate": 1.8992940836628213e-05, + "loss": 0.0376, + "step": 121510 + }, + { + "epoch": 26.008146021773275, + "grad_norm": 0.0015165581135079265, + "learning_rate": 1.898993181558312e-05, + "loss": 0.0262, + "step": 121520 + }, + { + "epoch": 26.008200184152088, + "grad_norm": 0.4210056662559509, + "learning_rate": 1.8986922794538026e-05, + "loss": 0.0239, + "step": 121530 + }, + { + "epoch": 26.0082543465309, + "grad_norm": 0.0018377025844529271, + "learning_rate": 1.8983913773492935e-05, + "loss": 0.0427, + "step": 121540 + }, + { + "epoch": 26.00830850890971, + "grad_norm": 0.0843539834022522, + "learning_rate": 1.8980904752447838e-05, + "loss": 0.0321, + "step": 121550 + }, + { + "epoch": 26.008362671288523, + "grad_norm": 0.006711521651595831, + "learning_rate": 1.8977895731402745e-05, + "loss": 0.1372, + "step": 121560 + }, + { + "epoch": 26.008416833667333, + "grad_norm": 0.4677336812019348, + "learning_rate": 1.8974886710357654e-05, + "loss": 0.1128, + "step": 121570 + }, + { + "epoch": 26.008470996046146, + "grad_norm": 1.7371479272842407, + "learning_rate": 1.897187768931256e-05, + "loss": 0.0248, + "step": 121580 + }, + { + "epoch": 26.00852515842496, + "grad_norm": 0.5493689179420471, + "learning_rate": 1.8968868668267467e-05, + "loss": 0.005, + "step": 121590 + }, + { + "epoch": 26.00857932080377, + "grad_norm": 
0.00523462425917387, + "learning_rate": 1.8965859647222373e-05, + "loss": 0.1002, + "step": 121600 + }, + { + "epoch": 26.00863348318258, + "grad_norm": 0.001474746037274599, + "learning_rate": 1.896285062617728e-05, + "loss": 0.0051, + "step": 121610 + }, + { + "epoch": 26.008687645561395, + "grad_norm": 0.0017027012072503567, + "learning_rate": 1.8959841605132186e-05, + "loss": 0.0239, + "step": 121620 + }, + { + "epoch": 26.008741807940204, + "grad_norm": 0.12931348383426666, + "learning_rate": 1.8956832584087095e-05, + "loss": 0.0075, + "step": 121630 + }, + { + "epoch": 26.008795970319017, + "grad_norm": 0.03173345699906349, + "learning_rate": 1.8953823563042e-05, + "loss": 0.0221, + "step": 121640 + }, + { + "epoch": 26.008850132697827, + "grad_norm": 0.003464675508439541, + "learning_rate": 1.8950814541996908e-05, + "loss": 0.0163, + "step": 121650 + }, + { + "epoch": 26.00890429507664, + "grad_norm": 0.0012660215143114328, + "learning_rate": 1.8947805520951814e-05, + "loss": 0.0289, + "step": 121660 + }, + { + "epoch": 26.008958457455453, + "grad_norm": 0.0018709675641730428, + "learning_rate": 1.8944796499906724e-05, + "loss": 0.037, + "step": 121670 + }, + { + "epoch": 26.009012619834262, + "grad_norm": 0.010791799984872341, + "learning_rate": 1.8941787478861627e-05, + "loss": 0.0162, + "step": 121680 + }, + { + "epoch": 26.009066782213075, + "grad_norm": 0.001298688817769289, + "learning_rate": 1.8938778457816536e-05, + "loss": 0.03, + "step": 121690 + }, + { + "epoch": 26.009120944591885, + "grad_norm": 0.001245391322299838, + "learning_rate": 1.8935769436771443e-05, + "loss": 0.0336, + "step": 121700 + }, + { + "epoch": 26.009175106970698, + "grad_norm": 0.1174963042140007, + "learning_rate": 1.893276041572635e-05, + "loss": 0.0009, + "step": 121710 + }, + { + "epoch": 26.00922926934951, + "grad_norm": 0.0014106081798672676, + "learning_rate": 1.8929751394681255e-05, + "loss": 0.0491, + "step": 121720 + }, + { + "epoch": 26.00928343172832, + "grad_norm": 0.021538490429520607, + "learning_rate": 1.892674237363616e-05, + "loss": 0.1033, + "step": 121730 + }, + { + "epoch": 26.009337594107134, + "grad_norm": 0.03522901609539986, + "learning_rate": 1.8923733352591068e-05, + "loss": 0.0007, + "step": 121740 + }, + { + "epoch": 26.009391756485943, + "grad_norm": 0.9573174118995667, + "learning_rate": 1.8920724331545974e-05, + "loss": 0.1145, + "step": 121750 + }, + { + "epoch": 26.009445918864756, + "grad_norm": 1.531088948249817, + "learning_rate": 1.8917715310500884e-05, + "loss": 0.0224, + "step": 121760 + }, + { + "epoch": 26.00950008124357, + "grad_norm": 3.327139377593994, + "learning_rate": 1.891470628945579e-05, + "loss": 0.085, + "step": 121770 + }, + { + "epoch": 26.00955424362238, + "grad_norm": 0.0025625259149819613, + "learning_rate": 1.8911697268410696e-05, + "loss": 0.0566, + "step": 121780 + }, + { + "epoch": 26.009608406001192, + "grad_norm": 0.6444591283798218, + "learning_rate": 1.8908688247365602e-05, + "loss": 0.0467, + "step": 121790 + }, + { + "epoch": 26.009662568380005, + "grad_norm": 0.07904085516929626, + "learning_rate": 1.8905679226320512e-05, + "loss": 0.0128, + "step": 121800 + }, + { + "epoch": 26.009716730758814, + "grad_norm": 0.9610477685928345, + "learning_rate": 1.8902670205275415e-05, + "loss": 0.0254, + "step": 121810 + }, + { + "epoch": 26.009770893137627, + "grad_norm": 0.0029260192532092333, + "learning_rate": 1.8899661184230325e-05, + "loss": 0.0928, + "step": 121820 + }, + { + "epoch": 26.009825055516437, + "grad_norm": 0.2284097522497177, 
+ "learning_rate": 1.889665216318523e-05, + "loss": 0.0291, + "step": 121830 + }, + { + "epoch": 26.00987921789525, + "grad_norm": 0.0015398085815832019, + "learning_rate": 1.8893643142140137e-05, + "loss": 0.0842, + "step": 121840 + }, + { + "epoch": 26.009933380274063, + "grad_norm": 0.0031750774942338467, + "learning_rate": 1.8890634121095043e-05, + "loss": 0.0673, + "step": 121850 + }, + { + "epoch": 26.009987542652873, + "grad_norm": 0.0019708790350705385, + "learning_rate": 1.888762510004995e-05, + "loss": 0.0117, + "step": 121860 + }, + { + "epoch": 26.010041705031686, + "grad_norm": 0.005420094821602106, + "learning_rate": 1.8884616079004856e-05, + "loss": 0.0014, + "step": 121870 + }, + { + "epoch": 26.010095867410495, + "grad_norm": 0.005480438470840454, + "learning_rate": 1.8881607057959762e-05, + "loss": 0.0658, + "step": 121880 + }, + { + "epoch": 26.01015002978931, + "grad_norm": 0.008247398771345615, + "learning_rate": 1.8878598036914672e-05, + "loss": 0.0132, + "step": 121890 + }, + { + "epoch": 26.01020419216812, + "grad_norm": 3.60406494140625, + "learning_rate": 1.8875589015869578e-05, + "loss": 0.0672, + "step": 121900 + }, + { + "epoch": 26.01025835454693, + "grad_norm": 0.0023483778350055218, + "learning_rate": 1.8872579994824485e-05, + "loss": 0.0062, + "step": 121910 + }, + { + "epoch": 26.010312516925744, + "grad_norm": 0.7860305309295654, + "learning_rate": 1.886957097377939e-05, + "loss": 0.1165, + "step": 121920 + }, + { + "epoch": 26.010366679304553, + "grad_norm": 0.0014210027875378728, + "learning_rate": 1.88665619527343e-05, + "loss": 0.0087, + "step": 121930 + }, + { + "epoch": 26.010420841683366, + "grad_norm": 0.0026632293593138456, + "learning_rate": 1.8863552931689203e-05, + "loss": 0.0066, + "step": 121940 + }, + { + "epoch": 26.01047500406218, + "grad_norm": 0.004680311307311058, + "learning_rate": 1.8860543910644113e-05, + "loss": 0.011, + "step": 121950 + }, + { + "epoch": 26.01052916644099, + "grad_norm": 0.01377585157752037, + "learning_rate": 1.885753488959902e-05, + "loss": 0.0078, + "step": 121960 + }, + { + "epoch": 26.010583328819802, + "grad_norm": 0.012766721658408642, + "learning_rate": 1.8854525868553926e-05, + "loss": 0.03, + "step": 121970 + }, + { + "epoch": 26.010637491198615, + "grad_norm": 0.009137849323451519, + "learning_rate": 1.8851516847508832e-05, + "loss": 0.0361, + "step": 121980 + }, + { + "epoch": 26.010691653577425, + "grad_norm": 0.2879354953765869, + "learning_rate": 1.884850782646374e-05, + "loss": 0.0503, + "step": 121990 + }, + { + "epoch": 26.010745815956238, + "grad_norm": 0.1181088536977768, + "learning_rate": 1.8845498805418644e-05, + "loss": 0.0239, + "step": 122000 + }, + { + "epoch": 26.010799978335047, + "grad_norm": 2.02061128616333, + "learning_rate": 1.884248978437355e-05, + "loss": 0.0945, + "step": 122010 + }, + { + "epoch": 26.01085414071386, + "grad_norm": 0.36346498131752014, + "learning_rate": 1.883948076332846e-05, + "loss": 0.0848, + "step": 122020 + }, + { + "epoch": 26.010908303092673, + "grad_norm": 0.001286715967580676, + "learning_rate": 1.8836471742283367e-05, + "loss": 0.0002, + "step": 122030 + }, + { + "epoch": 26.010962465471483, + "grad_norm": 0.0014621360460296273, + "learning_rate": 1.8833462721238273e-05, + "loss": 0.0362, + "step": 122040 + }, + { + "epoch": 26.011016627850296, + "grad_norm": 0.048671118915081024, + "learning_rate": 1.883045370019318e-05, + "loss": 0.0101, + "step": 122050 + }, + { + "epoch": 26.011070790229105, + "grad_norm": 0.00887388177216053, + "learning_rate": 
1.882744467914809e-05, + "loss": 0.1262, + "step": 122060 + }, + { + "epoch": 26.01112495260792, + "grad_norm": 0.004456190392374992, + "learning_rate": 1.882443565810299e-05, + "loss": 0.033, + "step": 122070 + }, + { + "epoch": 26.01117911498673, + "grad_norm": 0.1918564736843109, + "learning_rate": 1.88214266370579e-05, + "loss": 0.0036, + "step": 122080 + }, + { + "epoch": 26.01123327736554, + "grad_norm": 1.1970689296722412, + "learning_rate": 1.8818417616012808e-05, + "loss": 0.0551, + "step": 122090 + }, + { + "epoch": 26.011287439744354, + "grad_norm": 0.06349761039018631, + "learning_rate": 1.8815408594967714e-05, + "loss": 0.0668, + "step": 122100 + }, + { + "epoch": 26.011341602123164, + "grad_norm": 0.438160240650177, + "learning_rate": 1.881239957392262e-05, + "loss": 0.0319, + "step": 122110 + }, + { + "epoch": 26.011395764501977, + "grad_norm": 0.003210645169019699, + "learning_rate": 1.880939055287753e-05, + "loss": 0.0305, + "step": 122120 + }, + { + "epoch": 26.01144992688079, + "grad_norm": 0.0014478371012955904, + "learning_rate": 1.8806381531832433e-05, + "loss": 0.0059, + "step": 122130 + }, + { + "epoch": 26.0115040892596, + "grad_norm": 0.101414754986763, + "learning_rate": 1.8803372510787342e-05, + "loss": 0.1562, + "step": 122140 + }, + { + "epoch": 26.011558251638412, + "grad_norm": 3.3728976249694824, + "learning_rate": 1.880036348974225e-05, + "loss": 0.1487, + "step": 122150 + }, + { + "epoch": 26.011612414017225, + "grad_norm": 0.9543789625167847, + "learning_rate": 1.8797354468697155e-05, + "loss": 0.0372, + "step": 122160 + }, + { + "epoch": 26.011666576396035, + "grad_norm": 0.007071264088153839, + "learning_rate": 1.879434544765206e-05, + "loss": 0.0212, + "step": 122170 + }, + { + "epoch": 26.011720738774848, + "grad_norm": 0.05390681326389313, + "learning_rate": 1.8791336426606967e-05, + "loss": 0.0118, + "step": 122180 + }, + { + "epoch": 26.011774901153657, + "grad_norm": 0.3772726356983185, + "learning_rate": 1.8788327405561877e-05, + "loss": 0.0793, + "step": 122190 + }, + { + "epoch": 26.01182906353247, + "grad_norm": 0.016773059964179993, + "learning_rate": 1.878531838451678e-05, + "loss": 0.0219, + "step": 122200 + }, + { + "epoch": 26.011883225911284, + "grad_norm": 0.04453694075345993, + "learning_rate": 1.878230936347169e-05, + "loss": 0.0284, + "step": 122210 + }, + { + "epoch": 26.011937388290093, + "grad_norm": 0.002967548556625843, + "learning_rate": 1.8779300342426596e-05, + "loss": 0.0589, + "step": 122220 + }, + { + "epoch": 26.011991550668906, + "grad_norm": 0.002158809220418334, + "learning_rate": 1.8776291321381502e-05, + "loss": 0.0007, + "step": 122230 + }, + { + "epoch": 26.012045713047716, + "grad_norm": 4.326767444610596, + "learning_rate": 1.877328230033641e-05, + "loss": 0.1802, + "step": 122240 + }, + { + "epoch": 26.01209987542653, + "grad_norm": 1.6155822277069092, + "learning_rate": 1.8770273279291318e-05, + "loss": 0.0604, + "step": 122250 + }, + { + "epoch": 26.01215403780534, + "grad_norm": 1.0317466259002686, + "learning_rate": 1.876726425824622e-05, + "loss": 0.0356, + "step": 122260 + }, + { + "epoch": 26.01220820018415, + "grad_norm": 0.07006890326738358, + "learning_rate": 1.876425523720113e-05, + "loss": 0.089, + "step": 122270 + }, + { + "epoch": 26.012262362562964, + "grad_norm": 0.060222506523132324, + "learning_rate": 1.8761246216156037e-05, + "loss": 0.041, + "step": 122280 + }, + { + "epoch": 26.012316524941774, + "grad_norm": 0.001525085885077715, + "learning_rate": 1.8758237195110943e-05, + "loss": 0.0454, 
+ "step": 122290 + }, + { + "epoch": 26.012370687320587, + "grad_norm": 0.9887785911560059, + "learning_rate": 1.875522817406585e-05, + "loss": 0.0523, + "step": 122300 + }, + { + "epoch": 26.0124248496994, + "grad_norm": 0.2918573021888733, + "learning_rate": 1.8752219153020756e-05, + "loss": 0.0182, + "step": 122310 + }, + { + "epoch": 26.01247901207821, + "grad_norm": 0.20474480092525482, + "learning_rate": 1.8749210131975665e-05, + "loss": 0.0827, + "step": 122320 + }, + { + "epoch": 26.012533174457023, + "grad_norm": 0.008355697616934776, + "learning_rate": 1.874620111093057e-05, + "loss": 0.0203, + "step": 122330 + }, + { + "epoch": 26.012587336835836, + "grad_norm": 0.0014820047654211521, + "learning_rate": 1.8743192089885478e-05, + "loss": 0.0085, + "step": 122340 + }, + { + "epoch": 26.012641499214645, + "grad_norm": 0.007547496352344751, + "learning_rate": 1.8740183068840384e-05, + "loss": 0.0188, + "step": 122350 + }, + { + "epoch": 26.012695661593458, + "grad_norm": 0.0014652932295575738, + "learning_rate": 1.873717404779529e-05, + "loss": 0.0294, + "step": 122360 + }, + { + "epoch": 26.012749823972268, + "grad_norm": 0.016896793618798256, + "learning_rate": 1.8734165026750197e-05, + "loss": 0.0882, + "step": 122370 + }, + { + "epoch": 26.01280398635108, + "grad_norm": 3.6000194549560547, + "learning_rate": 1.8731156005705107e-05, + "loss": 0.0522, + "step": 122380 + }, + { + "epoch": 26.012858148729894, + "grad_norm": 0.0015015414683148265, + "learning_rate": 1.872814698466001e-05, + "loss": 0.0082, + "step": 122390 + }, + { + "epoch": 26.012912311108703, + "grad_norm": 2.58707594871521, + "learning_rate": 1.872513796361492e-05, + "loss": 0.0469, + "step": 122400 + }, + { + "epoch": 26.012966473487516, + "grad_norm": 0.06545374542474747, + "learning_rate": 1.8722128942569825e-05, + "loss": 0.0534, + "step": 122410 + }, + { + "epoch": 26.013020635866326, + "grad_norm": 0.04088418185710907, + "learning_rate": 1.871911992152473e-05, + "loss": 0.0162, + "step": 122420 + }, + { + "epoch": 26.01307479824514, + "grad_norm": 4.547693729400635, + "learning_rate": 1.8716110900479638e-05, + "loss": 0.1103, + "step": 122430 + }, + { + "epoch": 26.013128960623952, + "grad_norm": 0.0015159383183345199, + "learning_rate": 1.8713101879434548e-05, + "loss": 0.0332, + "step": 122440 + }, + { + "epoch": 26.01318312300276, + "grad_norm": 0.6775871515274048, + "learning_rate": 1.8710092858389454e-05, + "loss": 0.0572, + "step": 122450 + }, + { + "epoch": 26.013237285381575, + "grad_norm": 2.9602179527282715, + "learning_rate": 1.8707083837344357e-05, + "loss": 0.0489, + "step": 122460 + }, + { + "epoch": 26.013291447760384, + "grad_norm": 0.047192152589559555, + "learning_rate": 1.8704074816299266e-05, + "loss": 0.1349, + "step": 122470 + }, + { + "epoch": 26.013345610139197, + "grad_norm": 0.2700527012348175, + "learning_rate": 1.8701065795254173e-05, + "loss": 0.0146, + "step": 122480 + }, + { + "epoch": 26.01339977251801, + "grad_norm": 1.4118843078613281, + "learning_rate": 1.869805677420908e-05, + "loss": 0.0986, + "step": 122490 + }, + { + "epoch": 26.01345393489682, + "grad_norm": 0.1113165020942688, + "learning_rate": 1.8695047753163985e-05, + "loss": 0.0501, + "step": 122500 + }, + { + "epoch": 26.013508097275633, + "grad_norm": 2.016409397125244, + "learning_rate": 1.8692038732118895e-05, + "loss": 0.0675, + "step": 122510 + }, + { + "epoch": 26.013562259654446, + "grad_norm": 0.31062886118888855, + "learning_rate": 1.8689029711073798e-05, + "loss": 0.0147, + "step": 122520 + }, + { + 
"epoch": 26.013616422033255, + "grad_norm": 0.46617069840431213, + "learning_rate": 1.8686020690028707e-05, + "loss": 0.058, + "step": 122530 + }, + { + "epoch": 26.01367058441207, + "grad_norm": 0.1619178056716919, + "learning_rate": 1.8683011668983614e-05, + "loss": 0.0186, + "step": 122540 + }, + { + "epoch": 26.013724746790878, + "grad_norm": 0.00142099114600569, + "learning_rate": 1.868000264793852e-05, + "loss": 0.0055, + "step": 122550 + }, + { + "epoch": 26.01377890916969, + "grad_norm": 0.001414826954714954, + "learning_rate": 1.8676993626893426e-05, + "loss": 0.0875, + "step": 122560 + }, + { + "epoch": 26.013833071548504, + "grad_norm": 6.863112926483154, + "learning_rate": 1.8673984605848336e-05, + "loss": 0.0295, + "step": 122570 + }, + { + "epoch": 26.013887233927314, + "grad_norm": 0.33824315667152405, + "learning_rate": 1.8670975584803242e-05, + "loss": 0.0045, + "step": 122580 + }, + { + "epoch": 26.013941396306127, + "grad_norm": 0.0038999568205326796, + "learning_rate": 1.866796656375815e-05, + "loss": 0.0392, + "step": 122590 + }, + { + "epoch": 26.013995558684936, + "grad_norm": 0.05523681640625, + "learning_rate": 1.8664957542713055e-05, + "loss": 0.0068, + "step": 122600 + }, + { + "epoch": 26.01404972106375, + "grad_norm": 0.11081261932849884, + "learning_rate": 1.866194852166796e-05, + "loss": 0.002, + "step": 122610 + }, + { + "epoch": 26.014103883442562, + "grad_norm": 0.012348136864602566, + "learning_rate": 1.8658939500622867e-05, + "loss": 0.0234, + "step": 122620 + }, + { + "epoch": 26.01415804582137, + "grad_norm": 1.8305130004882812, + "learning_rate": 1.8655930479577774e-05, + "loss": 0.0534, + "step": 122630 + }, + { + "epoch": 26.014212208200185, + "grad_norm": 0.0017993164947256446, + "learning_rate": 1.8652921458532683e-05, + "loss": 0.1205, + "step": 122640 + }, + { + "epoch": 26.014266370578994, + "grad_norm": 0.0022245817817747593, + "learning_rate": 1.8649912437487586e-05, + "loss": 0.0816, + "step": 122650 + }, + { + "epoch": 26.014320532957807, + "grad_norm": 0.040482811629772186, + "learning_rate": 1.8646903416442496e-05, + "loss": 0.0073, + "step": 122660 + }, + { + "epoch": 26.01437469533662, + "grad_norm": 4.269340991973877, + "learning_rate": 1.8643894395397402e-05, + "loss": 0.1018, + "step": 122670 + }, + { + "epoch": 26.01442885771543, + "grad_norm": 0.0013676207745447755, + "learning_rate": 1.864088537435231e-05, + "loss": 0.0326, + "step": 122680 + }, + { + "epoch": 26.014483020094243, + "grad_norm": 0.0049654836766421795, + "learning_rate": 1.8637876353307215e-05, + "loss": 0.001, + "step": 122690 + }, + { + "epoch": 26.014537182473052, + "grad_norm": 0.006102011539041996, + "learning_rate": 1.8634867332262124e-05, + "loss": 0.1052, + "step": 122700 + }, + { + "epoch": 26.014591344851866, + "grad_norm": 0.002675245516002178, + "learning_rate": 1.863185831121703e-05, + "loss": 0.0047, + "step": 122710 + }, + { + "epoch": 26.01464550723068, + "grad_norm": 2.1287736892700195, + "learning_rate": 1.8628849290171937e-05, + "loss": 0.0715, + "step": 122720 + }, + { + "epoch": 26.014699669609488, + "grad_norm": 0.0014189559733495116, + "learning_rate": 1.8625840269126843e-05, + "loss": 0.0962, + "step": 122730 + }, + { + "epoch": 26.0147538319883, + "grad_norm": 0.0024380264803767204, + "learning_rate": 1.8622831248081753e-05, + "loss": 0.0062, + "step": 122740 + }, + { + "epoch": 26.014807994367114, + "grad_norm": 0.32594430446624756, + "learning_rate": 1.8619822227036656e-05, + "loss": 0.0037, + "step": 122750 + }, + { + "epoch": 
26.014862156745924, + "grad_norm": 0.16187143325805664, + "learning_rate": 1.8616813205991562e-05, + "loss": 0.0607, + "step": 122760 + }, + { + "epoch": 26.014916319124737, + "grad_norm": 1.8212074041366577, + "learning_rate": 1.861380418494647e-05, + "loss": 0.0309, + "step": 122770 + }, + { + "epoch": 26.014970481503546, + "grad_norm": 0.0014161781873553991, + "learning_rate": 1.8610795163901374e-05, + "loss": 0.0228, + "step": 122780 + }, + { + "epoch": 26.01502464388236, + "grad_norm": 0.07175455242395401, + "learning_rate": 1.8607786142856284e-05, + "loss": 0.0014, + "step": 122790 + }, + { + "epoch": 26.015078806261172, + "grad_norm": 0.002238101791590452, + "learning_rate": 1.860477712181119e-05, + "loss": 0.027, + "step": 122800 + }, + { + "epoch": 26.015132968639982, + "grad_norm": 0.002223566174507141, + "learning_rate": 1.8601768100766097e-05, + "loss": 0.0152, + "step": 122810 + }, + { + "epoch": 26.015187131018795, + "grad_norm": 0.005642581265419722, + "learning_rate": 1.8598759079721003e-05, + "loss": 0.0079, + "step": 122820 + }, + { + "epoch": 26.015241293397604, + "grad_norm": 0.0013469660189002752, + "learning_rate": 1.8595750058675913e-05, + "loss": 0.0825, + "step": 122830 + }, + { + "epoch": 26.015295455776418, + "grad_norm": 0.054427534341812134, + "learning_rate": 1.859274103763082e-05, + "loss": 0.0709, + "step": 122840 + }, + { + "epoch": 26.01534961815523, + "grad_norm": 0.001682883594185114, + "learning_rate": 1.8589732016585725e-05, + "loss": 0.0416, + "step": 122850 + }, + { + "epoch": 26.01540378053404, + "grad_norm": 0.08897803723812103, + "learning_rate": 1.858672299554063e-05, + "loss": 0.0286, + "step": 122860 + }, + { + "epoch": 26.015457942912853, + "grad_norm": 1.319754958152771, + "learning_rate": 1.858371397449554e-05, + "loss": 0.0487, + "step": 122870 + }, + { + "epoch": 26.015512105291663, + "grad_norm": 0.0014181681908667088, + "learning_rate": 1.8580704953450444e-05, + "loss": 0.0088, + "step": 122880 + }, + { + "epoch": 26.015566267670476, + "grad_norm": 0.2844350337982178, + "learning_rate": 1.8577695932405354e-05, + "loss": 0.0084, + "step": 122890 + }, + { + "epoch": 26.01562043004929, + "grad_norm": 2.189014196395874, + "learning_rate": 1.857468691136026e-05, + "loss": 0.0456, + "step": 122900 + }, + { + "epoch": 26.0156745924281, + "grad_norm": 0.002765498124063015, + "learning_rate": 1.8571677890315163e-05, + "loss": 0.0285, + "step": 122910 + }, + { + "epoch": 26.01572875480691, + "grad_norm": 0.9311849474906921, + "learning_rate": 1.8568668869270072e-05, + "loss": 0.013, + "step": 122920 + }, + { + "epoch": 26.015782917185724, + "grad_norm": 0.0016481529455631971, + "learning_rate": 1.856565984822498e-05, + "loss": 0.0086, + "step": 122930 + }, + { + "epoch": 26.015837079564534, + "grad_norm": 0.0012398913968354464, + "learning_rate": 1.8562650827179885e-05, + "loss": 0.0518, + "step": 122940 + }, + { + "epoch": 26.015891241943347, + "grad_norm": 0.0012227138504385948, + "learning_rate": 1.855964180613479e-05, + "loss": 0.0281, + "step": 122950 + }, + { + "epoch": 26.015945404322157, + "grad_norm": 0.02788795717060566, + "learning_rate": 1.85566327850897e-05, + "loss": 0.0247, + "step": 122960 + }, + { + "epoch": 26.01599956670097, + "grad_norm": 0.5822024345397949, + "learning_rate": 1.8553623764044607e-05, + "loss": 0.0077, + "step": 122970 + }, + { + "epoch": 26.016053729079783, + "grad_norm": 0.02666018344461918, + "learning_rate": 1.8550614742999513e-05, + "loss": 0.0155, + "step": 122980 + }, + { + "epoch": 26.016107891458592, + 
"grad_norm": 0.15712279081344604, + "learning_rate": 1.854760572195442e-05, + "loss": 0.0025, + "step": 122990 + }, + { + "epoch": 26.016162053837405, + "grad_norm": 1.3374642133712769, + "learning_rate": 1.854459670090933e-05, + "loss": 0.0203, + "step": 123000 + }, + { + "epoch": 26.016216216216215, + "grad_norm": 0.046583328396081924, + "learning_rate": 1.8541587679864232e-05, + "loss": 0.081, + "step": 123010 + }, + { + "epoch": 26.016270378595028, + "grad_norm": 0.0011781543726101518, + "learning_rate": 1.8538578658819142e-05, + "loss": 0.0636, + "step": 123020 + }, + { + "epoch": 26.01632454097384, + "grad_norm": 2.2198381423950195, + "learning_rate": 1.8535569637774048e-05, + "loss": 0.0873, + "step": 123030 + }, + { + "epoch": 26.01637870335265, + "grad_norm": 0.1468515247106552, + "learning_rate": 1.8532560616728955e-05, + "loss": 0.0474, + "step": 123040 + }, + { + "epoch": 26.016432865731463, + "grad_norm": 0.06985693424940109, + "learning_rate": 1.852955159568386e-05, + "loss": 0.1064, + "step": 123050 + }, + { + "epoch": 26.016487028110273, + "grad_norm": 0.1744094341993332, + "learning_rate": 1.8526542574638767e-05, + "loss": 0.141, + "step": 123060 + }, + { + "epoch": 26.016541190489086, + "grad_norm": 0.0013272527139633894, + "learning_rate": 1.8523533553593673e-05, + "loss": 0.1282, + "step": 123070 + }, + { + "epoch": 26.0165953528679, + "grad_norm": 2.823333978652954, + "learning_rate": 1.852052453254858e-05, + "loss": 0.0733, + "step": 123080 + }, + { + "epoch": 26.01664951524671, + "grad_norm": 0.0029111425392329693, + "learning_rate": 1.851751551150349e-05, + "loss": 0.0526, + "step": 123090 + }, + { + "epoch": 26.01670367762552, + "grad_norm": 0.3550756871700287, + "learning_rate": 1.8514506490458396e-05, + "loss": 0.0113, + "step": 123100 + }, + { + "epoch": 26.016757840004335, + "grad_norm": 0.0013201426481828094, + "learning_rate": 1.8511497469413302e-05, + "loss": 0.1076, + "step": 123110 + }, + { + "epoch": 26.016812002383144, + "grad_norm": 0.035384826362133026, + "learning_rate": 1.8508488448368208e-05, + "loss": 0.0029, + "step": 123120 + }, + { + "epoch": 26.016866164761957, + "grad_norm": 0.2801956534385681, + "learning_rate": 1.8505479427323118e-05, + "loss": 0.0242, + "step": 123130 + }, + { + "epoch": 26.016920327140767, + "grad_norm": 0.001812613569200039, + "learning_rate": 1.850247040627802e-05, + "loss": 0.1488, + "step": 123140 + }, + { + "epoch": 26.01697448951958, + "grad_norm": 0.0016750313807278872, + "learning_rate": 1.849946138523293e-05, + "loss": 0.0581, + "step": 123150 + }, + { + "epoch": 26.017028651898393, + "grad_norm": 0.0014702557818964124, + "learning_rate": 1.8496452364187837e-05, + "loss": 0.064, + "step": 123160 + }, + { + "epoch": 26.017082814277202, + "grad_norm": 0.00149097153916955, + "learning_rate": 1.8493443343142743e-05, + "loss": 0.0566, + "step": 123170 + }, + { + "epoch": 26.017136976656015, + "grad_norm": 1.0573662519454956, + "learning_rate": 1.849043432209765e-05, + "loss": 0.04, + "step": 123180 + }, + { + "epoch": 26.017191139034825, + "grad_norm": 0.8691235780715942, + "learning_rate": 1.848742530105256e-05, + "loss": 0.0169, + "step": 123190 + }, + { + "epoch": 26.017245301413638, + "grad_norm": 0.09717128425836563, + "learning_rate": 1.8484416280007462e-05, + "loss": 0.0024, + "step": 123200 + }, + { + "epoch": 26.01729946379245, + "grad_norm": 0.0014397836057469249, + "learning_rate": 1.8481407258962368e-05, + "loss": 0.0322, + "step": 123210 + }, + { + "epoch": 26.01735362617126, + "grad_norm": 
0.6617986559867859, + "learning_rate": 1.8478398237917278e-05, + "loss": 0.0569, + "step": 123220 + }, + { + "epoch": 26.017407788550074, + "grad_norm": 0.8871917724609375, + "learning_rate": 1.8475389216872184e-05, + "loss": 0.0232, + "step": 123230 + }, + { + "epoch": 26.017461950928883, + "grad_norm": 0.005464574322104454, + "learning_rate": 1.847238019582709e-05, + "loss": 0.028, + "step": 123240 + }, + { + "epoch": 26.017516113307696, + "grad_norm": 0.1688757836818695, + "learning_rate": 1.8469371174781996e-05, + "loss": 0.0029, + "step": 123250 + }, + { + "epoch": 26.01757027568651, + "grad_norm": 0.9351680874824524, + "learning_rate": 1.8466362153736906e-05, + "loss": 0.0307, + "step": 123260 + }, + { + "epoch": 26.01762443806532, + "grad_norm": 0.003560151904821396, + "learning_rate": 1.846335313269181e-05, + "loss": 0.0136, + "step": 123270 + }, + { + "epoch": 26.017678600444132, + "grad_norm": 0.0014262204058468342, + "learning_rate": 1.846034411164672e-05, + "loss": 0.0132, + "step": 123280 + }, + { + "epoch": 26.017732762822945, + "grad_norm": 0.0033280898351222277, + "learning_rate": 1.8457335090601625e-05, + "loss": 0.0322, + "step": 123290 + }, + { + "epoch": 26.017786925201754, + "grad_norm": 0.003230503061786294, + "learning_rate": 1.845432606955653e-05, + "loss": 0.0006, + "step": 123300 + }, + { + "epoch": 26.017841087580567, + "grad_norm": 0.8413687944412231, + "learning_rate": 1.8451317048511437e-05, + "loss": 0.0512, + "step": 123310 + }, + { + "epoch": 26.017895249959377, + "grad_norm": 0.0013255189405754209, + "learning_rate": 1.8448308027466347e-05, + "loss": 0.0641, + "step": 123320 + }, + { + "epoch": 26.01794941233819, + "grad_norm": 0.26937681436538696, + "learning_rate": 1.844529900642125e-05, + "loss": 0.0521, + "step": 123330 + }, + { + "epoch": 26.018003574717003, + "grad_norm": 0.0022907762322574854, + "learning_rate": 1.844228998537616e-05, + "loss": 0.0703, + "step": 123340 + }, + { + "epoch": 26.018057737095813, + "grad_norm": 0.5746240615844727, + "learning_rate": 1.8439280964331066e-05, + "loss": 0.0146, + "step": 123350 + }, + { + "epoch": 26.018111899474626, + "grad_norm": 0.0048826467245817184, + "learning_rate": 1.8436271943285972e-05, + "loss": 0.0368, + "step": 123360 + }, + { + "epoch": 26.018166061853435, + "grad_norm": 0.26541268825531006, + "learning_rate": 1.843326292224088e-05, + "loss": 0.017, + "step": 123370 + }, + { + "epoch": 26.01822022423225, + "grad_norm": 0.0028954080771654844, + "learning_rate": 1.8430253901195785e-05, + "loss": 0.0019, + "step": 123380 + }, + { + "epoch": 26.01827438661106, + "grad_norm": 0.07528308779001236, + "learning_rate": 1.8427244880150694e-05, + "loss": 0.0158, + "step": 123390 + }, + { + "epoch": 26.01832854898987, + "grad_norm": 0.004759546834975481, + "learning_rate": 1.8424235859105597e-05, + "loss": 0.0305, + "step": 123400 + }, + { + "epoch": 26.018382711368684, + "grad_norm": 0.0012480744626373053, + "learning_rate": 1.8421226838060507e-05, + "loss": 0.1008, + "step": 123410 + }, + { + "epoch": 26.018436873747493, + "grad_norm": 0.11900897324085236, + "learning_rate": 1.8418217817015413e-05, + "loss": 0.0257, + "step": 123420 + }, + { + "epoch": 26.018491036126306, + "grad_norm": 0.0037146322429180145, + "learning_rate": 1.841520879597032e-05, + "loss": 0.0382, + "step": 123430 + }, + { + "epoch": 26.01854519850512, + "grad_norm": 0.001362158334814012, + "learning_rate": 1.8412199774925226e-05, + "loss": 0.1342, + "step": 123440 + }, + { + "epoch": 26.01859936088393, + "grad_norm": 
0.11981400102376938, + "learning_rate": 1.8409190753880136e-05, + "loss": 0.0373, + "step": 123450 + }, + { + "epoch": 26.018653523262742, + "grad_norm": 0.0021587295923382044, + "learning_rate": 1.840618173283504e-05, + "loss": 0.0421, + "step": 123460 + }, + { + "epoch": 26.018707685641555, + "grad_norm": 0.00217383517883718, + "learning_rate": 1.8403172711789948e-05, + "loss": 0.0764, + "step": 123470 + }, + { + "epoch": 26.018761848020365, + "grad_norm": 0.00885560642927885, + "learning_rate": 1.8400163690744854e-05, + "loss": 0.0784, + "step": 123480 + }, + { + "epoch": 26.018816010399178, + "grad_norm": 0.0014098075916990638, + "learning_rate": 1.839715466969976e-05, + "loss": 0.0022, + "step": 123490 + }, + { + "epoch": 26.018870172777987, + "grad_norm": 0.0014142653672024608, + "learning_rate": 1.8394145648654667e-05, + "loss": 0.0277, + "step": 123500 + }, + { + "epoch": 26.0189243351568, + "grad_norm": 0.9785363674163818, + "learning_rate": 1.8391136627609573e-05, + "loss": 0.0204, + "step": 123510 + }, + { + "epoch": 26.018978497535613, + "grad_norm": 6.210361003875732, + "learning_rate": 1.8388127606564483e-05, + "loss": 0.1292, + "step": 123520 + }, + { + "epoch": 26.019032659914423, + "grad_norm": 0.0015556145226582885, + "learning_rate": 1.8385118585519386e-05, + "loss": 0.006, + "step": 123530 + }, + { + "epoch": 26.019086822293236, + "grad_norm": 0.001746003283187747, + "learning_rate": 1.8382109564474295e-05, + "loss": 0.0368, + "step": 123540 + }, + { + "epoch": 26.019140984672045, + "grad_norm": 0.002634116681292653, + "learning_rate": 1.83791005434292e-05, + "loss": 0.0713, + "step": 123550 + }, + { + "epoch": 26.01919514705086, + "grad_norm": 0.4939673840999603, + "learning_rate": 1.8376091522384108e-05, + "loss": 0.0034, + "step": 123560 + }, + { + "epoch": 26.01924930942967, + "grad_norm": 0.0018385484581813216, + "learning_rate": 1.8373082501339014e-05, + "loss": 0.142, + "step": 123570 + }, + { + "epoch": 26.01930347180848, + "grad_norm": 0.036168165504932404, + "learning_rate": 1.8370073480293924e-05, + "loss": 0.0014, + "step": 123580 + }, + { + "epoch": 26.019357634187294, + "grad_norm": 0.0017876139609143138, + "learning_rate": 1.8367064459248827e-05, + "loss": 0.0756, + "step": 123590 + }, + { + "epoch": 26.019411796566104, + "grad_norm": 0.8368721008300781, + "learning_rate": 1.8364055438203736e-05, + "loss": 0.0058, + "step": 123600 + }, + { + "epoch": 26.019465958944917, + "grad_norm": 2.7533457279205322, + "learning_rate": 1.8361046417158643e-05, + "loss": 0.0896, + "step": 123610 + }, + { + "epoch": 26.01952012132373, + "grad_norm": 0.0017985469894483685, + "learning_rate": 1.835803739611355e-05, + "loss": 0.0472, + "step": 123620 + }, + { + "epoch": 26.01957428370254, + "grad_norm": 0.0016911436105147004, + "learning_rate": 1.8355028375068455e-05, + "loss": 0.0515, + "step": 123630 + }, + { + "epoch": 26.019628446081352, + "grad_norm": 0.007985423319041729, + "learning_rate": 1.8352019354023365e-05, + "loss": 0.0056, + "step": 123640 + }, + { + "epoch": 26.019682608460165, + "grad_norm": 0.003614643355831504, + "learning_rate": 1.834901033297827e-05, + "loss": 0.0008, + "step": 123650 + }, + { + "epoch": 26.019736770838975, + "grad_norm": 0.004935542121529579, + "learning_rate": 1.8346001311933174e-05, + "loss": 0.0119, + "step": 123660 + }, + { + "epoch": 26.019790933217788, + "grad_norm": 0.04890884459018707, + "learning_rate": 1.8342992290888084e-05, + "loss": 0.0551, + "step": 123670 + }, + { + "epoch": 26.019845095596597, + "grad_norm": 
0.031405020505189896, + "learning_rate": 1.833998326984299e-05, + "loss": 0.0162, + "step": 123680 + }, + { + "epoch": 26.01989925797541, + "grad_norm": 0.001579337869770825, + "learning_rate": 1.8336974248797896e-05, + "loss": 0.0374, + "step": 123690 + }, + { + "epoch": 26.019953420354224, + "grad_norm": 17.833833694458008, + "learning_rate": 1.8333965227752803e-05, + "loss": 0.0739, + "step": 123700 + }, + { + "epoch": 26.020007582733033, + "grad_norm": 0.001532205380499363, + "learning_rate": 1.8330956206707712e-05, + "loss": 0.0045, + "step": 123710 + }, + { + "epoch": 26.020061745111846, + "grad_norm": 0.0028916450683027506, + "learning_rate": 1.8327947185662615e-05, + "loss": 0.0028, + "step": 123720 + }, + { + "epoch": 26.020115907490656, + "grad_norm": 0.03833632543683052, + "learning_rate": 1.8324938164617525e-05, + "loss": 0.005, + "step": 123730 + }, + { + "epoch": 26.02017006986947, + "grad_norm": 0.7574649453163147, + "learning_rate": 1.832192914357243e-05, + "loss": 0.034, + "step": 123740 + }, + { + "epoch": 26.02022423224828, + "grad_norm": 0.0027812300249934196, + "learning_rate": 1.8318920122527337e-05, + "loss": 0.0782, + "step": 123750 + }, + { + "epoch": 26.02027839462709, + "grad_norm": 0.0014150802744552493, + "learning_rate": 1.8315911101482244e-05, + "loss": 0.0125, + "step": 123760 + }, + { + "epoch": 26.020332557005904, + "grad_norm": 0.0013341064332053065, + "learning_rate": 1.8312902080437153e-05, + "loss": 0.1088, + "step": 123770 + }, + { + "epoch": 26.020386719384714, + "grad_norm": 0.6433883905410767, + "learning_rate": 1.830989305939206e-05, + "loss": 0.1533, + "step": 123780 + }, + { + "epoch": 26.020440881763527, + "grad_norm": 0.00335581973195076, + "learning_rate": 1.8306884038346966e-05, + "loss": 0.0472, + "step": 123790 + }, + { + "epoch": 26.02049504414234, + "grad_norm": 0.002829363802447915, + "learning_rate": 1.8303875017301872e-05, + "loss": 0.044, + "step": 123800 + }, + { + "epoch": 26.02054920652115, + "grad_norm": 0.191884383559227, + "learning_rate": 1.830086599625678e-05, + "loss": 0.0675, + "step": 123810 + }, + { + "epoch": 26.020603368899963, + "grad_norm": 0.10773047059774399, + "learning_rate": 1.8297856975211685e-05, + "loss": 0.073, + "step": 123820 + }, + { + "epoch": 26.020657531278772, + "grad_norm": 0.0015261570224538445, + "learning_rate": 1.829484795416659e-05, + "loss": 0.1736, + "step": 123830 + }, + { + "epoch": 26.020711693657585, + "grad_norm": 0.14124073088169098, + "learning_rate": 1.82918389331215e-05, + "loss": 0.0297, + "step": 123840 + }, + { + "epoch": 26.020765856036398, + "grad_norm": 0.056042131036520004, + "learning_rate": 1.8288829912076403e-05, + "loss": 0.0724, + "step": 123850 + }, + { + "epoch": 26.020820018415208, + "grad_norm": 0.0016947145340964198, + "learning_rate": 1.8285820891031313e-05, + "loss": 0.0128, + "step": 123860 + }, + { + "epoch": 26.02087418079402, + "grad_norm": 0.0015708435093984008, + "learning_rate": 1.828281186998622e-05, + "loss": 0.0153, + "step": 123870 + }, + { + "epoch": 26.020928343172834, + "grad_norm": 0.04502356797456741, + "learning_rate": 1.8279802848941126e-05, + "loss": 0.0084, + "step": 123880 + }, + { + "epoch": 26.020982505551643, + "grad_norm": 0.001527300919406116, + "learning_rate": 1.8276793827896032e-05, + "loss": 0.006, + "step": 123890 + }, + { + "epoch": 26.021036667930456, + "grad_norm": 0.01053001917898655, + "learning_rate": 1.827378480685094e-05, + "loss": 0.0408, + "step": 123900 + }, + { + "epoch": 26.021090830309266, + "grad_norm": 
0.02290721982717514, + "learning_rate": 1.8270775785805848e-05, + "loss": 0.0762, + "step": 123910 + }, + { + "epoch": 26.02114499268808, + "grad_norm": 0.29984545707702637, + "learning_rate": 1.8267766764760754e-05, + "loss": 0.0734, + "step": 123920 + }, + { + "epoch": 26.021199155066892, + "grad_norm": 0.09261634945869446, + "learning_rate": 1.826475774371566e-05, + "loss": 0.0011, + "step": 123930 + }, + { + "epoch": 26.0212533174457, + "grad_norm": 0.0013854234712198377, + "learning_rate": 1.826174872267057e-05, + "loss": 0.0192, + "step": 123940 + }, + { + "epoch": 26.021307479824515, + "grad_norm": 0.0014275690773501992, + "learning_rate": 1.8258739701625473e-05, + "loss": 0.025, + "step": 123950 + }, + { + "epoch": 26.021361642203324, + "grad_norm": 0.04545685276389122, + "learning_rate": 1.825573068058038e-05, + "loss": 0.0648, + "step": 123960 + }, + { + "epoch": 26.021415804582137, + "grad_norm": 0.0014665537746623158, + "learning_rate": 1.825272165953529e-05, + "loss": 0.1964, + "step": 123970 + }, + { + "epoch": 26.02146996696095, + "grad_norm": 1.5148378610610962, + "learning_rate": 1.8249712638490192e-05, + "loss": 0.0525, + "step": 123980 + }, + { + "epoch": 26.02152412933976, + "grad_norm": 0.24772296845912933, + "learning_rate": 1.82467036174451e-05, + "loss": 0.0524, + "step": 123990 + }, + { + "epoch": 26.021578291718573, + "grad_norm": 0.0019349445356056094, + "learning_rate": 1.8243694596400008e-05, + "loss": 0.0842, + "step": 124000 + }, + { + "epoch": 26.021632454097382, + "grad_norm": 0.0033622640185058117, + "learning_rate": 1.8240685575354914e-05, + "loss": 0.0507, + "step": 124010 + }, + { + "epoch": 26.021686616476195, + "grad_norm": 0.030671915039420128, + "learning_rate": 1.823767655430982e-05, + "loss": 0.0068, + "step": 124020 + }, + { + "epoch": 26.02174077885501, + "grad_norm": 0.16853857040405273, + "learning_rate": 1.823466753326473e-05, + "loss": 0.0042, + "step": 124030 + }, + { + "epoch": 26.021794941233818, + "grad_norm": 3.31315541267395, + "learning_rate": 1.8231658512219636e-05, + "loss": 0.0501, + "step": 124040 + }, + { + "epoch": 26.02184910361263, + "grad_norm": 0.02773980423808098, + "learning_rate": 1.8228649491174542e-05, + "loss": 0.0442, + "step": 124050 + }, + { + "epoch": 26.021903265991444, + "grad_norm": 0.0015163787174969912, + "learning_rate": 1.822564047012945e-05, + "loss": 0.0066, + "step": 124060 + }, + { + "epoch": 26.021957428370253, + "grad_norm": 0.0016523312078788877, + "learning_rate": 1.822263144908436e-05, + "loss": 0.0012, + "step": 124070 + }, + { + "epoch": 26.022011590749067, + "grad_norm": 0.001628134516067803, + "learning_rate": 1.821962242803926e-05, + "loss": 0.1528, + "step": 124080 + }, + { + "epoch": 26.022065753127876, + "grad_norm": 0.5326830148696899, + "learning_rate": 1.821661340699417e-05, + "loss": 0.0131, + "step": 124090 + }, + { + "epoch": 26.02211991550669, + "grad_norm": 0.9620410203933716, + "learning_rate": 1.8213604385949077e-05, + "loss": 0.0146, + "step": 124100 + }, + { + "epoch": 26.022174077885502, + "grad_norm": 0.0636458694934845, + "learning_rate": 1.821059536490398e-05, + "loss": 0.0031, + "step": 124110 + }, + { + "epoch": 26.02222824026431, + "grad_norm": 0.0046465229243040085, + "learning_rate": 1.820758634385889e-05, + "loss": 0.0738, + "step": 124120 + }, + { + "epoch": 26.022282402643125, + "grad_norm": 0.003896500216796994, + "learning_rate": 1.8204577322813796e-05, + "loss": 0.001, + "step": 124130 + }, + { + "epoch": 26.022336565021934, + "grad_norm": 1.8900938034057617, + 
"learning_rate": 1.8201568301768702e-05, + "loss": 0.0272, + "step": 124140 + }, + { + "epoch": 26.022390727400747, + "grad_norm": 0.002293186727911234, + "learning_rate": 1.819855928072361e-05, + "loss": 0.033, + "step": 124150 + }, + { + "epoch": 26.02244488977956, + "grad_norm": 1.1368041038513184, + "learning_rate": 1.8195550259678518e-05, + "loss": 0.0124, + "step": 124160 + }, + { + "epoch": 26.02249905215837, + "grad_norm": 0.039811789989471436, + "learning_rate": 1.8192541238633425e-05, + "loss": 0.1106, + "step": 124170 + }, + { + "epoch": 26.022553214537183, + "grad_norm": 0.07483365386724472, + "learning_rate": 1.818953221758833e-05, + "loss": 0.1204, + "step": 124180 + }, + { + "epoch": 26.022607376915992, + "grad_norm": 3.2777788639068604, + "learning_rate": 1.8186523196543237e-05, + "loss": 0.0831, + "step": 124190 + }, + { + "epoch": 26.022661539294806, + "grad_norm": 0.004556517116725445, + "learning_rate": 1.8183514175498147e-05, + "loss": 0.0023, + "step": 124200 + }, + { + "epoch": 26.02271570167362, + "grad_norm": 0.0539836660027504, + "learning_rate": 1.818050515445305e-05, + "loss": 0.0118, + "step": 124210 + }, + { + "epoch": 26.022769864052428, + "grad_norm": 0.11220191419124603, + "learning_rate": 1.817749613340796e-05, + "loss": 0.0226, + "step": 124220 + }, + { + "epoch": 26.02282402643124, + "grad_norm": 0.00203847698867321, + "learning_rate": 1.8174487112362866e-05, + "loss": 0.0344, + "step": 124230 + }, + { + "epoch": 26.022878188810054, + "grad_norm": 0.0020133978687226772, + "learning_rate": 1.8171478091317772e-05, + "loss": 0.013, + "step": 124240 + }, + { + "epoch": 26.022932351188864, + "grad_norm": 4.432783603668213, + "learning_rate": 1.8168469070272678e-05, + "loss": 0.1213, + "step": 124250 + }, + { + "epoch": 26.022986513567677, + "grad_norm": 0.03832477331161499, + "learning_rate": 1.8165460049227584e-05, + "loss": 0.0075, + "step": 124260 + }, + { + "epoch": 26.023040675946486, + "grad_norm": 3.8460381031036377, + "learning_rate": 1.816245102818249e-05, + "loss": 0.0028, + "step": 124270 + }, + { + "epoch": 26.0230948383253, + "grad_norm": 0.005252056755125523, + "learning_rate": 1.8159442007137397e-05, + "loss": 0.0581, + "step": 124280 + }, + { + "epoch": 26.023149000704112, + "grad_norm": 0.006863261107355356, + "learning_rate": 1.8156432986092307e-05, + "loss": 0.0237, + "step": 124290 + }, + { + "epoch": 26.023203163082922, + "grad_norm": 0.004082258325070143, + "learning_rate": 1.8153423965047213e-05, + "loss": 0.0082, + "step": 124300 + }, + { + "epoch": 26.023257325461735, + "grad_norm": 2.875058650970459, + "learning_rate": 1.815041494400212e-05, + "loss": 0.0854, + "step": 124310 + }, + { + "epoch": 26.023311487840544, + "grad_norm": 0.10145943611860275, + "learning_rate": 1.8147405922957025e-05, + "loss": 0.0749, + "step": 124320 + }, + { + "epoch": 26.023365650219358, + "grad_norm": 0.13618837296962738, + "learning_rate": 1.8144396901911935e-05, + "loss": 0.0655, + "step": 124330 + }, + { + "epoch": 26.02341981259817, + "grad_norm": 1.196210265159607, + "learning_rate": 1.8141387880866838e-05, + "loss": 0.1296, + "step": 124340 + }, + { + "epoch": 26.02347397497698, + "grad_norm": 0.054659582674503326, + "learning_rate": 1.8138378859821748e-05, + "loss": 0.0398, + "step": 124350 + }, + { + "epoch": 26.023528137355793, + "grad_norm": 0.40607768297195435, + "learning_rate": 1.8135369838776654e-05, + "loss": 0.0081, + "step": 124360 + }, + { + "epoch": 26.023582299734603, + "grad_norm": 0.0017449320293962955, + "learning_rate": 
1.813236081773156e-05, + "loss": 0.0193, + "step": 124370 + }, + { + "epoch": 26.023636462113416, + "grad_norm": 0.0016879114555194974, + "learning_rate": 1.8129351796686466e-05, + "loss": 0.0174, + "step": 124380 + }, + { + "epoch": 26.02369062449223, + "grad_norm": 1.5540579557418823, + "learning_rate": 1.8126342775641376e-05, + "loss": 0.0887, + "step": 124390 + }, + { + "epoch": 26.02374478687104, + "grad_norm": 0.01030879095196724, + "learning_rate": 1.812333375459628e-05, + "loss": 0.0609, + "step": 124400 + }, + { + "epoch": 26.02379894924985, + "grad_norm": 2.185588836669922, + "learning_rate": 1.8120324733551185e-05, + "loss": 0.0426, + "step": 124410 + }, + { + "epoch": 26.023853111628664, + "grad_norm": 0.007145365700125694, + "learning_rate": 1.8117315712506095e-05, + "loss": 0.0261, + "step": 124420 + }, + { + "epoch": 26.023907274007474, + "grad_norm": 0.011901534162461758, + "learning_rate": 1.8114306691461e-05, + "loss": 0.0466, + "step": 124430 + }, + { + "epoch": 26.023961436386287, + "grad_norm": 0.01584041304886341, + "learning_rate": 1.8111297670415908e-05, + "loss": 0.0586, + "step": 124440 + }, + { + "epoch": 26.024015598765097, + "grad_norm": 0.0018194839358329773, + "learning_rate": 1.8108288649370814e-05, + "loss": 0.0038, + "step": 124450 + }, + { + "epoch": 26.02406976114391, + "grad_norm": 0.11401823163032532, + "learning_rate": 1.8105279628325723e-05, + "loss": 0.0211, + "step": 124460 + }, + { + "epoch": 26.024123923522723, + "grad_norm": 0.0019511909922584891, + "learning_rate": 1.8102270607280626e-05, + "loss": 0.0332, + "step": 124470 + }, + { + "epoch": 26.024178085901532, + "grad_norm": 0.02254406362771988, + "learning_rate": 1.8099261586235536e-05, + "loss": 0.0279, + "step": 124480 + }, + { + "epoch": 26.024232248280345, + "grad_norm": 0.008593284524977207, + "learning_rate": 1.8096252565190442e-05, + "loss": 0.0003, + "step": 124490 + }, + { + "epoch": 26.024286410659155, + "grad_norm": 0.0015233028680086136, + "learning_rate": 1.809324354414535e-05, + "loss": 0.0309, + "step": 124500 + }, + { + "epoch": 26.024340573037968, + "grad_norm": 0.1288481503725052, + "learning_rate": 1.8090234523100255e-05, + "loss": 0.0033, + "step": 124510 + }, + { + "epoch": 26.02439473541678, + "grad_norm": 0.005004085134714842, + "learning_rate": 1.8087225502055164e-05, + "loss": 0.0316, + "step": 124520 + }, + { + "epoch": 26.02444889779559, + "grad_norm": 0.14763472974300385, + "learning_rate": 1.8084216481010067e-05, + "loss": 0.0129, + "step": 124530 + }, + { + "epoch": 26.024503060174403, + "grad_norm": 0.03197422996163368, + "learning_rate": 1.8081207459964977e-05, + "loss": 0.0912, + "step": 124540 + }, + { + "epoch": 26.024557222553213, + "grad_norm": 0.029010480269789696, + "learning_rate": 1.8078198438919883e-05, + "loss": 0.0829, + "step": 124550 + }, + { + "epoch": 26.024611384932026, + "grad_norm": 0.08752969652414322, + "learning_rate": 1.807518941787479e-05, + "loss": 0.019, + "step": 124560 + }, + { + "epoch": 26.02466554731084, + "grad_norm": 0.4946966767311096, + "learning_rate": 1.8072180396829696e-05, + "loss": 0.0224, + "step": 124570 + }, + { + "epoch": 26.02471970968965, + "grad_norm": 0.00137722073122859, + "learning_rate": 1.8069171375784602e-05, + "loss": 0.0841, + "step": 124580 + }, + { + "epoch": 26.02477387206846, + "grad_norm": 0.11438951641321182, + "learning_rate": 1.8066162354739512e-05, + "loss": 0.0323, + "step": 124590 + }, + { + "epoch": 26.024828034447275, + "grad_norm": 0.2956278622150421, + "learning_rate": 
1.8063153333694415e-05, + "loss": 0.0158, + "step": 124600 + }, + { + "epoch": 26.024882196826084, + "grad_norm": 0.0014581719879060984, + "learning_rate": 1.8060144312649324e-05, + "loss": 0.0337, + "step": 124610 + }, + { + "epoch": 26.024936359204897, + "grad_norm": 0.009861872531473637, + "learning_rate": 1.805713529160423e-05, + "loss": 0.0069, + "step": 124620 + }, + { + "epoch": 26.024990521583707, + "grad_norm": 0.010790658183395863, + "learning_rate": 1.8054126270559137e-05, + "loss": 0.0278, + "step": 124630 + }, + { + "epoch": 26.02500135405947, + "eval_accuracy": 0.83736120182887, + "eval_loss": 0.9337897300720215, + "eval_runtime": 116.779, + "eval_samples_per_second": 26.22, + "eval_steps_per_second": 3.28, + "step": 124632 + }, + { + "epoch": 27.00004332990305, + "grad_norm": 1.2575223445892334, + "learning_rate": 1.8051117249514043e-05, + "loss": 0.0117, + "step": 124640 + }, + { + "epoch": 27.00009749228186, + "grad_norm": 0.014150821603834629, + "learning_rate": 1.8048108228468953e-05, + "loss": 0.0159, + "step": 124650 + }, + { + "epoch": 27.000151654660673, + "grad_norm": 0.016183409839868546, + "learning_rate": 1.8045099207423856e-05, + "loss": 0.0223, + "step": 124660 + }, + { + "epoch": 27.000205817039486, + "grad_norm": 0.021610332652926445, + "learning_rate": 1.8042090186378765e-05, + "loss": 0.004, + "step": 124670 + }, + { + "epoch": 27.000259979418296, + "grad_norm": 0.006700617726892233, + "learning_rate": 1.803908116533367e-05, + "loss": 0.0933, + "step": 124680 + }, + { + "epoch": 27.00031414179711, + "grad_norm": 1.2040599584579468, + "learning_rate": 1.8036072144288578e-05, + "loss": 0.0891, + "step": 124690 + }, + { + "epoch": 27.000368304175918, + "grad_norm": 5.291249752044678, + "learning_rate": 1.8033063123243484e-05, + "loss": 0.0469, + "step": 124700 + }, + { + "epoch": 27.00042246655473, + "grad_norm": 2.2785284519195557, + "learning_rate": 1.803005410219839e-05, + "loss": 0.0457, + "step": 124710 + }, + { + "epoch": 27.000476628933544, + "grad_norm": 0.04465630650520325, + "learning_rate": 1.80270450811533e-05, + "loss": 0.0097, + "step": 124720 + }, + { + "epoch": 27.000530791312354, + "grad_norm": 0.3750409185886383, + "learning_rate": 1.8024036060108203e-05, + "loss": 0.0399, + "step": 124730 + }, + { + "epoch": 27.000584953691167, + "grad_norm": 0.003293558955192566, + "learning_rate": 1.8021027039063113e-05, + "loss": 0.0114, + "step": 124740 + }, + { + "epoch": 27.000639116069976, + "grad_norm": 0.0013175776693969965, + "learning_rate": 1.801801801801802e-05, + "loss": 0.0019, + "step": 124750 + }, + { + "epoch": 27.00069327844879, + "grad_norm": 0.007763972971588373, + "learning_rate": 1.8015008996972925e-05, + "loss": 0.0157, + "step": 124760 + }, + { + "epoch": 27.000747440827602, + "grad_norm": 0.047577548772096634, + "learning_rate": 1.801199997592783e-05, + "loss": 0.0145, + "step": 124770 + }, + { + "epoch": 27.000801603206412, + "grad_norm": 0.0037777309771627188, + "learning_rate": 1.800899095488274e-05, + "loss": 0.0036, + "step": 124780 + }, + { + "epoch": 27.000855765585225, + "grad_norm": 0.535031259059906, + "learning_rate": 1.8005981933837644e-05, + "loss": 0.0148, + "step": 124790 + }, + { + "epoch": 27.000909927964035, + "grad_norm": 1.8911386728286743, + "learning_rate": 1.8002972912792554e-05, + "loss": 0.0318, + "step": 124800 + }, + { + "epoch": 27.000964090342848, + "grad_norm": 11.185466766357422, + "learning_rate": 1.799996389174746e-05, + "loss": 0.0872, + "step": 124810 + }, + { + "epoch": 27.00101825272166, + 
"grad_norm": 0.002061262959614396, + "learning_rate": 1.7996954870702366e-05, + "loss": 0.0044, + "step": 124820 + }, + { + "epoch": 27.00107241510047, + "grad_norm": 0.0024579628370702267, + "learning_rate": 1.7993945849657273e-05, + "loss": 0.1054, + "step": 124830 + }, + { + "epoch": 27.001126577479283, + "grad_norm": 0.0014437339268624783, + "learning_rate": 1.7990936828612182e-05, + "loss": 0.0414, + "step": 124840 + }, + { + "epoch": 27.001180739858096, + "grad_norm": 1.8006385564804077, + "learning_rate": 1.798792780756709e-05, + "loss": 0.0402, + "step": 124850 + }, + { + "epoch": 27.001234902236906, + "grad_norm": 0.006212329491972923, + "learning_rate": 1.798491878652199e-05, + "loss": 0.0195, + "step": 124860 + }, + { + "epoch": 27.00128906461572, + "grad_norm": 0.004507367964833975, + "learning_rate": 1.79819097654769e-05, + "loss": 0.0005, + "step": 124870 + }, + { + "epoch": 27.00134322699453, + "grad_norm": 1.3161736726760864, + "learning_rate": 1.7978900744431807e-05, + "loss": 0.0203, + "step": 124880 + }, + { + "epoch": 27.00139738937334, + "grad_norm": 1.621742606163025, + "learning_rate": 1.7975891723386714e-05, + "loss": 0.1176, + "step": 124890 + }, + { + "epoch": 27.001451551752155, + "grad_norm": 0.003550764173269272, + "learning_rate": 1.797288270234162e-05, + "loss": 0.0046, + "step": 124900 + }, + { + "epoch": 27.001505714130964, + "grad_norm": 0.025920560583472252, + "learning_rate": 1.796987368129653e-05, + "loss": 0.0033, + "step": 124910 + }, + { + "epoch": 27.001559876509777, + "grad_norm": 0.0014043031260371208, + "learning_rate": 1.7966864660251432e-05, + "loss": 0.0021, + "step": 124920 + }, + { + "epoch": 27.001614038888587, + "grad_norm": 0.4724108576774597, + "learning_rate": 1.7963855639206342e-05, + "loss": 0.0194, + "step": 124930 + }, + { + "epoch": 27.0016682012674, + "grad_norm": 0.03754027187824249, + "learning_rate": 1.796084661816125e-05, + "loss": 0.005, + "step": 124940 + }, + { + "epoch": 27.001722363646213, + "grad_norm": 5.634045600891113, + "learning_rate": 1.7957837597116155e-05, + "loss": 0.028, + "step": 124950 + }, + { + "epoch": 27.001776526025022, + "grad_norm": 0.0032256837002933025, + "learning_rate": 1.795482857607106e-05, + "loss": 0.0619, + "step": 124960 + }, + { + "epoch": 27.001830688403835, + "grad_norm": 0.019169336184859276, + "learning_rate": 1.795181955502597e-05, + "loss": 0.0236, + "step": 124970 + }, + { + "epoch": 27.001884850782645, + "grad_norm": 0.004087264649569988, + "learning_rate": 1.7948810533980877e-05, + "loss": 0.0583, + "step": 124980 + }, + { + "epoch": 27.001939013161458, + "grad_norm": 0.0022525631356984377, + "learning_rate": 1.7945801512935783e-05, + "loss": 0.0609, + "step": 124990 + }, + { + "epoch": 27.00199317554027, + "grad_norm": 0.4672706127166748, + "learning_rate": 1.794279249189069e-05, + "loss": 0.0451, + "step": 125000 + }, + { + "epoch": 27.00204733791908, + "grad_norm": 0.002564970636740327, + "learning_rate": 1.7939783470845596e-05, + "loss": 0.0201, + "step": 125010 + }, + { + "epoch": 27.002101500297893, + "grad_norm": 0.2579585313796997, + "learning_rate": 1.7936774449800502e-05, + "loss": 0.0413, + "step": 125020 + }, + { + "epoch": 27.002155662676707, + "grad_norm": 0.001695144921541214, + "learning_rate": 1.7933765428755408e-05, + "loss": 0.031, + "step": 125030 + }, + { + "epoch": 27.002209825055516, + "grad_norm": 7.408670902252197, + "learning_rate": 1.7930756407710318e-05, + "loss": 0.046, + "step": 125040 + }, + { + "epoch": 27.00226398743433, + "grad_norm": 
0.003897082759067416, + "learning_rate": 1.792774738666522e-05, + "loss": 0.0006, + "step": 125050 + }, + { + "epoch": 27.00231814981314, + "grad_norm": 0.006205215118825436, + "learning_rate": 1.792473836562013e-05, + "loss": 0.0121, + "step": 125060 + }, + { + "epoch": 27.00237231219195, + "grad_norm": 0.0013735266402363777, + "learning_rate": 1.7921729344575037e-05, + "loss": 0.032, + "step": 125070 + }, + { + "epoch": 27.002426474570765, + "grad_norm": 0.7234442234039307, + "learning_rate": 1.7918720323529943e-05, + "loss": 0.0427, + "step": 125080 + }, + { + "epoch": 27.002480636949574, + "grad_norm": 0.0012460026191547513, + "learning_rate": 1.791571130248485e-05, + "loss": 0.0193, + "step": 125090 + }, + { + "epoch": 27.002534799328387, + "grad_norm": 0.005710120312869549, + "learning_rate": 1.791270228143976e-05, + "loss": 0.0299, + "step": 125100 + }, + { + "epoch": 27.002588961707197, + "grad_norm": 0.00302384071983397, + "learning_rate": 1.7909693260394665e-05, + "loss": 0.0015, + "step": 125110 + }, + { + "epoch": 27.00264312408601, + "grad_norm": 0.0013482666108757257, + "learning_rate": 1.790668423934957e-05, + "loss": 0.0138, + "step": 125120 + }, + { + "epoch": 27.002697286464823, + "grad_norm": 0.0012357702944427729, + "learning_rate": 1.7903675218304478e-05, + "loss": 0.1379, + "step": 125130 + }, + { + "epoch": 27.002751448843632, + "grad_norm": 0.0038874465972185135, + "learning_rate": 1.7900666197259387e-05, + "loss": 0.0393, + "step": 125140 + }, + { + "epoch": 27.002805611222445, + "grad_norm": 0.001399896340444684, + "learning_rate": 1.789765717621429e-05, + "loss": 0.1121, + "step": 125150 + }, + { + "epoch": 27.002859773601255, + "grad_norm": 0.11052021384239197, + "learning_rate": 1.7894648155169197e-05, + "loss": 0.0152, + "step": 125160 + }, + { + "epoch": 27.002913935980068, + "grad_norm": 0.0014894906198605895, + "learning_rate": 1.7891639134124106e-05, + "loss": 0.0026, + "step": 125170 + }, + { + "epoch": 27.00296809835888, + "grad_norm": 0.007578251417726278, + "learning_rate": 1.788863011307901e-05, + "loss": 0.0565, + "step": 125180 + }, + { + "epoch": 27.00302226073769, + "grad_norm": 0.06834246963262558, + "learning_rate": 1.788562109203392e-05, + "loss": 0.0234, + "step": 125190 + }, + { + "epoch": 27.003076423116504, + "grad_norm": 0.03861774131655693, + "learning_rate": 1.7882612070988825e-05, + "loss": 0.0648, + "step": 125200 + }, + { + "epoch": 27.003130585495313, + "grad_norm": 0.05319063365459442, + "learning_rate": 1.787960304994373e-05, + "loss": 0.0369, + "step": 125210 + }, + { + "epoch": 27.003184747874126, + "grad_norm": 0.22364702820777893, + "learning_rate": 1.7876594028898638e-05, + "loss": 0.1197, + "step": 125220 + }, + { + "epoch": 27.00323891025294, + "grad_norm": 0.07381228357553482, + "learning_rate": 1.7873585007853547e-05, + "loss": 0.0664, + "step": 125230 + }, + { + "epoch": 27.00329307263175, + "grad_norm": 0.001496389857493341, + "learning_rate": 1.7870575986808454e-05, + "loss": 0.0993, + "step": 125240 + }, + { + "epoch": 27.003347235010562, + "grad_norm": 0.42116084694862366, + "learning_rate": 1.786756696576336e-05, + "loss": 0.069, + "step": 125250 + }, + { + "epoch": 27.003401397389375, + "grad_norm": 0.0021471241489052773, + "learning_rate": 1.7864557944718266e-05, + "loss": 0.0471, + "step": 125260 + }, + { + "epoch": 27.003455559768184, + "grad_norm": 0.6585800647735596, + "learning_rate": 1.7861548923673176e-05, + "loss": 0.0441, + "step": 125270 + }, + { + "epoch": 27.003509722146998, + "grad_norm": 
0.0014681974425911903, + "learning_rate": 1.785853990262808e-05, + "loss": 0.0227, + "step": 125280 + }, + { + "epoch": 27.003563884525807, + "grad_norm": 0.0017257914878427982, + "learning_rate": 1.7855530881582985e-05, + "loss": 0.0074, + "step": 125290 + }, + { + "epoch": 27.00361804690462, + "grad_norm": 0.0024011258501559496, + "learning_rate": 1.7852521860537895e-05, + "loss": 0.002, + "step": 125300 + }, + { + "epoch": 27.003672209283433, + "grad_norm": 0.05740256980061531, + "learning_rate": 1.7849512839492797e-05, + "loss": 0.0216, + "step": 125310 + }, + { + "epoch": 27.003726371662243, + "grad_norm": 0.0041962903924286366, + "learning_rate": 1.7846503818447707e-05, + "loss": 0.0707, + "step": 125320 + }, + { + "epoch": 27.003780534041056, + "grad_norm": 0.19647946953773499, + "learning_rate": 1.7843494797402613e-05, + "loss": 0.0398, + "step": 125330 + }, + { + "epoch": 27.003834696419865, + "grad_norm": 0.002362459432333708, + "learning_rate": 1.784048577635752e-05, + "loss": 0.0082, + "step": 125340 + }, + { + "epoch": 27.00388885879868, + "grad_norm": 0.1052156537771225, + "learning_rate": 1.7837476755312426e-05, + "loss": 0.016, + "step": 125350 + }, + { + "epoch": 27.00394302117749, + "grad_norm": 0.0013926177052780986, + "learning_rate": 1.7834467734267336e-05, + "loss": 0.0111, + "step": 125360 + }, + { + "epoch": 27.0039971835563, + "grad_norm": 0.0021923112217336893, + "learning_rate": 1.7831458713222242e-05, + "loss": 0.0292, + "step": 125370 + }, + { + "epoch": 27.004051345935114, + "grad_norm": 0.0023968603927642107, + "learning_rate": 1.7828449692177148e-05, + "loss": 0.0045, + "step": 125380 + }, + { + "epoch": 27.004105508313923, + "grad_norm": 0.0012279287911951542, + "learning_rate": 1.7825440671132054e-05, + "loss": 0.0416, + "step": 125390 + }, + { + "epoch": 27.004159670692736, + "grad_norm": 0.0013472671853378415, + "learning_rate": 1.7822431650086964e-05, + "loss": 0.0007, + "step": 125400 + }, + { + "epoch": 27.00421383307155, + "grad_norm": 0.079412080347538, + "learning_rate": 1.7819422629041867e-05, + "loss": 0.1666, + "step": 125410 + }, + { + "epoch": 27.00426799545036, + "grad_norm": 0.0013827694347128272, + "learning_rate": 1.7816413607996777e-05, + "loss": 0.0213, + "step": 125420 + }, + { + "epoch": 27.004322157829172, + "grad_norm": 0.587766170501709, + "learning_rate": 1.7813404586951683e-05, + "loss": 0.0753, + "step": 125430 + }, + { + "epoch": 27.004376320207985, + "grad_norm": 0.002770269988104701, + "learning_rate": 1.7810395565906586e-05, + "loss": 0.1229, + "step": 125440 + }, + { + "epoch": 27.004430482586795, + "grad_norm": 0.12722443044185638, + "learning_rate": 1.7807386544861495e-05, + "loss": 0.1208, + "step": 125450 + }, + { + "epoch": 27.004484644965608, + "grad_norm": 0.03468436375260353, + "learning_rate": 1.7804377523816402e-05, + "loss": 0.0095, + "step": 125460 + }, + { + "epoch": 27.004538807344417, + "grad_norm": 0.0029610542114824057, + "learning_rate": 1.7801368502771308e-05, + "loss": 0.0776, + "step": 125470 + }, + { + "epoch": 27.00459296972323, + "grad_norm": 0.0029845652170479298, + "learning_rate": 1.7798359481726214e-05, + "loss": 0.0589, + "step": 125480 + }, + { + "epoch": 27.004647132102043, + "grad_norm": 1.4792622327804565, + "learning_rate": 1.7795350460681124e-05, + "loss": 0.0484, + "step": 125490 + }, + { + "epoch": 27.004701294480853, + "grad_norm": 0.10013587027788162, + "learning_rate": 1.779234143963603e-05, + "loss": 0.0411, + "step": 125500 + }, + { + "epoch": 27.004755456859666, + "grad_norm": 
0.42707496881484985, + "learning_rate": 1.7789332418590937e-05, + "loss": 0.0404, + "step": 125510 + }, + { + "epoch": 27.004809619238475, + "grad_norm": 0.0023777480237185955, + "learning_rate": 1.7786323397545843e-05, + "loss": 0.0448, + "step": 125520 + }, + { + "epoch": 27.00486378161729, + "grad_norm": 0.3611290752887726, + "learning_rate": 1.7783314376500752e-05, + "loss": 0.024, + "step": 125530 + }, + { + "epoch": 27.0049179439961, + "grad_norm": 0.006243993062525988, + "learning_rate": 1.7780305355455655e-05, + "loss": 0.0589, + "step": 125540 + }, + { + "epoch": 27.00497210637491, + "grad_norm": 0.0016612421022728086, + "learning_rate": 1.7777296334410565e-05, + "loss": 0.0521, + "step": 125550 + }, + { + "epoch": 27.005026268753724, + "grad_norm": 3.734945058822632, + "learning_rate": 1.777428731336547e-05, + "loss": 0.0367, + "step": 125560 + }, + { + "epoch": 27.005080431132534, + "grad_norm": 0.001944120624102652, + "learning_rate": 1.7771278292320378e-05, + "loss": 0.0746, + "step": 125570 + }, + { + "epoch": 27.005134593511347, + "grad_norm": 0.6622650623321533, + "learning_rate": 1.7768269271275284e-05, + "loss": 0.0809, + "step": 125580 + }, + { + "epoch": 27.00518875589016, + "grad_norm": 0.0032920276280492544, + "learning_rate": 1.776526025023019e-05, + "loss": 0.0085, + "step": 125590 + }, + { + "epoch": 27.00524291826897, + "grad_norm": 1.074408769607544, + "learning_rate": 1.7762251229185096e-05, + "loss": 0.0597, + "step": 125600 + }, + { + "epoch": 27.005297080647782, + "grad_norm": 0.14848089218139648, + "learning_rate": 1.7759242208140003e-05, + "loss": 0.017, + "step": 125610 + }, + { + "epoch": 27.005351243026595, + "grad_norm": 0.5248625874519348, + "learning_rate": 1.7756233187094912e-05, + "loss": 0.1247, + "step": 125620 + }, + { + "epoch": 27.005405405405405, + "grad_norm": 0.18331730365753174, + "learning_rate": 1.775322416604982e-05, + "loss": 0.1034, + "step": 125630 + }, + { + "epoch": 27.005459567784218, + "grad_norm": 0.734413206577301, + "learning_rate": 1.7750215145004725e-05, + "loss": 0.0357, + "step": 125640 + }, + { + "epoch": 27.005513730163027, + "grad_norm": 0.0016913973959162831, + "learning_rate": 1.774720612395963e-05, + "loss": 0.0112, + "step": 125650 + }, + { + "epoch": 27.00556789254184, + "grad_norm": 0.005158542655408382, + "learning_rate": 1.774419710291454e-05, + "loss": 0.0731, + "step": 125660 + }, + { + "epoch": 27.005622054920654, + "grad_norm": 0.041208818554878235, + "learning_rate": 1.7741188081869444e-05, + "loss": 0.0749, + "step": 125670 + }, + { + "epoch": 27.005676217299463, + "grad_norm": 1.7464708089828491, + "learning_rate": 1.7738179060824353e-05, + "loss": 0.0577, + "step": 125680 + }, + { + "epoch": 27.005730379678276, + "grad_norm": 0.12166804075241089, + "learning_rate": 1.773517003977926e-05, + "loss": 0.0338, + "step": 125690 + }, + { + "epoch": 27.005784542057086, + "grad_norm": 0.0034190986771136522, + "learning_rate": 1.7732161018734166e-05, + "loss": 0.0391, + "step": 125700 + }, + { + "epoch": 27.0058387044359, + "grad_norm": 0.0017373042646795511, + "learning_rate": 1.7729151997689072e-05, + "loss": 0.0416, + "step": 125710 + }, + { + "epoch": 27.005892866814712, + "grad_norm": 0.6916327476501465, + "learning_rate": 1.7726142976643982e-05, + "loss": 0.0331, + "step": 125720 + }, + { + "epoch": 27.00594702919352, + "grad_norm": 0.0019468412501737475, + "learning_rate": 1.7723133955598885e-05, + "loss": 0.0413, + "step": 125730 + }, + { + "epoch": 27.006001191572334, + "grad_norm": 0.030559375882148743, + 
"learning_rate": 1.772012493455379e-05, + "loss": 0.0392, + "step": 125740 + }, + { + "epoch": 27.006055353951144, + "grad_norm": 0.040741290897130966, + "learning_rate": 1.77171159135087e-05, + "loss": 0.0451, + "step": 125750 + }, + { + "epoch": 27.006109516329957, + "grad_norm": 0.7324576377868652, + "learning_rate": 1.7714106892463607e-05, + "loss": 0.038, + "step": 125760 + }, + { + "epoch": 27.00616367870877, + "grad_norm": 0.001608371501788497, + "learning_rate": 1.7711097871418513e-05, + "loss": 0.0003, + "step": 125770 + }, + { + "epoch": 27.00621784108758, + "grad_norm": 0.001804795698262751, + "learning_rate": 1.770808885037342e-05, + "loss": 0.0043, + "step": 125780 + }, + { + "epoch": 27.006272003466393, + "grad_norm": 0.0014982480788603425, + "learning_rate": 1.770507982932833e-05, + "loss": 0.1355, + "step": 125790 + }, + { + "epoch": 27.006326165845206, + "grad_norm": 0.03909086436033249, + "learning_rate": 1.7702070808283232e-05, + "loss": 0.0163, + "step": 125800 + }, + { + "epoch": 27.006380328224015, + "grad_norm": 0.0014828500570729375, + "learning_rate": 1.769906178723814e-05, + "loss": 0.0228, + "step": 125810 + }, + { + "epoch": 27.006434490602828, + "grad_norm": 0.0014736471930518746, + "learning_rate": 1.7696052766193048e-05, + "loss": 0.0857, + "step": 125820 + }, + { + "epoch": 27.006488652981638, + "grad_norm": 0.7693319320678711, + "learning_rate": 1.7693043745147954e-05, + "loss": 0.0301, + "step": 125830 + }, + { + "epoch": 27.00654281536045, + "grad_norm": 1.48642897605896, + "learning_rate": 1.769003472410286e-05, + "loss": 0.0743, + "step": 125840 + }, + { + "epoch": 27.006596977739264, + "grad_norm": 1.692260980606079, + "learning_rate": 1.768702570305777e-05, + "loss": 0.0647, + "step": 125850 + }, + { + "epoch": 27.006651140118073, + "grad_norm": 0.0015535579295828938, + "learning_rate": 1.7684016682012673e-05, + "loss": 0.0851, + "step": 125860 + }, + { + "epoch": 27.006705302496886, + "grad_norm": 0.0015533469850197434, + "learning_rate": 1.7681007660967583e-05, + "loss": 0.0203, + "step": 125870 + }, + { + "epoch": 27.006759464875696, + "grad_norm": 0.44601088762283325, + "learning_rate": 1.767799863992249e-05, + "loss": 0.0502, + "step": 125880 + }, + { + "epoch": 27.00681362725451, + "grad_norm": 0.0014696103753522038, + "learning_rate": 1.7674989618877395e-05, + "loss": 0.0338, + "step": 125890 + }, + { + "epoch": 27.006867789633322, + "grad_norm": 0.0015142600750550628, + "learning_rate": 1.76719805978323e-05, + "loss": 0.0322, + "step": 125900 + }, + { + "epoch": 27.00692195201213, + "grad_norm": 0.0013953439192846417, + "learning_rate": 1.7668971576787208e-05, + "loss": 0.0148, + "step": 125910 + }, + { + "epoch": 27.006976114390945, + "grad_norm": 0.0014696724247187376, + "learning_rate": 1.7665962555742117e-05, + "loss": 0.0329, + "step": 125920 + }, + { + "epoch": 27.007030276769754, + "grad_norm": 0.003546111285686493, + "learning_rate": 1.766295353469702e-05, + "loss": 0.066, + "step": 125930 + }, + { + "epoch": 27.007084439148567, + "grad_norm": 0.38334283232688904, + "learning_rate": 1.765994451365193e-05, + "loss": 0.0287, + "step": 125940 + }, + { + "epoch": 27.00713860152738, + "grad_norm": 1.269890546798706, + "learning_rate": 1.7656935492606836e-05, + "loss": 0.0221, + "step": 125950 + }, + { + "epoch": 27.00719276390619, + "grad_norm": 0.4014417231082916, + "learning_rate": 1.7653926471561743e-05, + "loss": 0.0168, + "step": 125960 + }, + { + "epoch": 27.007246926285003, + "grad_norm": 0.002022276632487774, + "learning_rate": 
1.765091745051665e-05, + "loss": 0.011, + "step": 125970 + }, + { + "epoch": 27.007301088663816, + "grad_norm": 3.1467840671539307, + "learning_rate": 1.764790842947156e-05, + "loss": 0.0492, + "step": 125980 + }, + { + "epoch": 27.007355251042625, + "grad_norm": 0.0013198264641687274, + "learning_rate": 1.764489940842646e-05, + "loss": 0.0015, + "step": 125990 + }, + { + "epoch": 27.00740941342144, + "grad_norm": 0.0013675946975126863, + "learning_rate": 1.764189038738137e-05, + "loss": 0.1107, + "step": 126000 + }, + { + "epoch": 27.007463575800248, + "grad_norm": 2.0945675373077393, + "learning_rate": 1.7638881366336277e-05, + "loss": 0.0763, + "step": 126010 + }, + { + "epoch": 27.00751773817906, + "grad_norm": 0.8491575717926025, + "learning_rate": 1.7635872345291184e-05, + "loss": 0.0403, + "step": 126020 + }, + { + "epoch": 27.007571900557874, + "grad_norm": 0.7740633487701416, + "learning_rate": 1.763286332424609e-05, + "loss": 0.0538, + "step": 126030 + }, + { + "epoch": 27.007626062936684, + "grad_norm": 0.001326893805526197, + "learning_rate": 1.7629854303200996e-05, + "loss": 0.0115, + "step": 126040 + }, + { + "epoch": 27.007680225315497, + "grad_norm": 0.0020018417853862047, + "learning_rate": 1.7626845282155906e-05, + "loss": 0.0051, + "step": 126050 + }, + { + "epoch": 27.007734387694306, + "grad_norm": 0.0013361009769141674, + "learning_rate": 1.762383626111081e-05, + "loss": 0.0357, + "step": 126060 + }, + { + "epoch": 27.00778855007312, + "grad_norm": 0.36327406764030457, + "learning_rate": 1.762082724006572e-05, + "loss": 0.0157, + "step": 126070 + }, + { + "epoch": 27.007842712451932, + "grad_norm": 0.0012361612170934677, + "learning_rate": 1.7617818219020625e-05, + "loss": 0.0926, + "step": 126080 + }, + { + "epoch": 27.00789687483074, + "grad_norm": 3.2155184745788574, + "learning_rate": 1.761480919797553e-05, + "loss": 0.0541, + "step": 126090 + }, + { + "epoch": 27.007951037209555, + "grad_norm": 0.0014298747992143035, + "learning_rate": 1.7611800176930437e-05, + "loss": 0.046, + "step": 126100 + }, + { + "epoch": 27.008005199588364, + "grad_norm": 0.011748144403100014, + "learning_rate": 1.7608791155885347e-05, + "loss": 0.1345, + "step": 126110 + }, + { + "epoch": 27.008059361967177, + "grad_norm": 0.0921768769621849, + "learning_rate": 1.760578213484025e-05, + "loss": 0.027, + "step": 126120 + }, + { + "epoch": 27.00811352434599, + "grad_norm": 0.666801393032074, + "learning_rate": 1.760277311379516e-05, + "loss": 0.0699, + "step": 126130 + }, + { + "epoch": 27.0081676867248, + "grad_norm": 0.034809816628694534, + "learning_rate": 1.7599764092750066e-05, + "loss": 0.007, + "step": 126140 + }, + { + "epoch": 27.008221849103613, + "grad_norm": 0.030400699004530907, + "learning_rate": 1.7596755071704972e-05, + "loss": 0.0006, + "step": 126150 + }, + { + "epoch": 27.008276011482426, + "grad_norm": 2.597208023071289, + "learning_rate": 1.7593746050659878e-05, + "loss": 0.0385, + "step": 126160 + }, + { + "epoch": 27.008330173861236, + "grad_norm": 0.0014639680739492178, + "learning_rate": 1.7590737029614788e-05, + "loss": 0.0518, + "step": 126170 + }, + { + "epoch": 27.00838433624005, + "grad_norm": 0.004489482846111059, + "learning_rate": 1.7587728008569694e-05, + "loss": 0.0984, + "step": 126180 + }, + { + "epoch": 27.008438498618858, + "grad_norm": 0.03677300736308098, + "learning_rate": 1.7584718987524597e-05, + "loss": 0.0551, + "step": 126190 + }, + { + "epoch": 27.00849266099767, + "grad_norm": 0.003592003835365176, + "learning_rate": 1.7581709966479507e-05, + 
"loss": 0.0463, + "step": 126200 + }, + { + "epoch": 27.008546823376484, + "grad_norm": 0.002333789598196745, + "learning_rate": 1.7578700945434413e-05, + "loss": 0.036, + "step": 126210 + }, + { + "epoch": 27.008600985755294, + "grad_norm": 0.0022708107717335224, + "learning_rate": 1.757569192438932e-05, + "loss": 0.0138, + "step": 126220 + }, + { + "epoch": 27.008655148134107, + "grad_norm": 0.5908771753311157, + "learning_rate": 1.7572682903344226e-05, + "loss": 0.041, + "step": 126230 + }, + { + "epoch": 27.008709310512916, + "grad_norm": 0.001608318299986422, + "learning_rate": 1.7569673882299135e-05, + "loss": 0.0038, + "step": 126240 + }, + { + "epoch": 27.00876347289173, + "grad_norm": 0.0027127498760819435, + "learning_rate": 1.7566664861254038e-05, + "loss": 0.0206, + "step": 126250 + }, + { + "epoch": 27.008817635270542, + "grad_norm": 0.0015979703748598695, + "learning_rate": 1.7563655840208948e-05, + "loss": 0.0343, + "step": 126260 + }, + { + "epoch": 27.008871797649352, + "grad_norm": 0.80031818151474, + "learning_rate": 1.7560646819163854e-05, + "loss": 0.0446, + "step": 126270 + }, + { + "epoch": 27.008925960028165, + "grad_norm": 0.0022250425536185503, + "learning_rate": 1.755763779811876e-05, + "loss": 0.0001, + "step": 126280 + }, + { + "epoch": 27.008980122406975, + "grad_norm": 3.280229091644287, + "learning_rate": 1.7554628777073667e-05, + "loss": 0.2985, + "step": 126290 + }, + { + "epoch": 27.009034284785788, + "grad_norm": 0.0038828810211271048, + "learning_rate": 1.7551619756028576e-05, + "loss": 0.0363, + "step": 126300 + }, + { + "epoch": 27.0090884471646, + "grad_norm": 1.5397158861160278, + "learning_rate": 1.7548610734983483e-05, + "loss": 0.0149, + "step": 126310 + }, + { + "epoch": 27.00914260954341, + "grad_norm": 0.04514249786734581, + "learning_rate": 1.754560171393839e-05, + "loss": 0.023, + "step": 126320 + }, + { + "epoch": 27.009196771922223, + "grad_norm": 1.2488698959350586, + "learning_rate": 1.7542592692893295e-05, + "loss": 0.0273, + "step": 126330 + }, + { + "epoch": 27.009250934301033, + "grad_norm": 0.0037169107235968113, + "learning_rate": 1.75395836718482e-05, + "loss": 0.0333, + "step": 126340 + }, + { + "epoch": 27.009305096679846, + "grad_norm": 0.06787066161632538, + "learning_rate": 1.7536574650803108e-05, + "loss": 0.0316, + "step": 126350 + }, + { + "epoch": 27.00935925905866, + "grad_norm": 0.2328677773475647, + "learning_rate": 1.7533565629758014e-05, + "loss": 0.0683, + "step": 126360 + }, + { + "epoch": 27.00941342143747, + "grad_norm": 0.002403853926807642, + "learning_rate": 1.7530556608712924e-05, + "loss": 0.0128, + "step": 126370 + }, + { + "epoch": 27.00946758381628, + "grad_norm": 0.5479315519332886, + "learning_rate": 1.7527547587667826e-05, + "loss": 0.0127, + "step": 126380 + }, + { + "epoch": 27.009521746195094, + "grad_norm": 0.04222961142659187, + "learning_rate": 1.7524538566622736e-05, + "loss": 0.0378, + "step": 126390 + }, + { + "epoch": 27.009575908573904, + "grad_norm": 0.008586465381085873, + "learning_rate": 1.7521529545577642e-05, + "loss": 0.065, + "step": 126400 + }, + { + "epoch": 27.009630070952717, + "grad_norm": 0.0013541447697207332, + "learning_rate": 1.751852052453255e-05, + "loss": 0.0331, + "step": 126410 + }, + { + "epoch": 27.009684233331527, + "grad_norm": 0.0021531928796321154, + "learning_rate": 1.7515511503487455e-05, + "loss": 0.0286, + "step": 126420 + }, + { + "epoch": 27.00973839571034, + "grad_norm": 0.6068945527076721, + "learning_rate": 1.7512502482442365e-05, + "loss": 0.0577, + 
"step": 126430 + }, + { + "epoch": 27.009792558089153, + "grad_norm": 0.0014371599536389112, + "learning_rate": 1.750949346139727e-05, + "loss": 0.005, + "step": 126440 + }, + { + "epoch": 27.009846720467962, + "grad_norm": 0.001978287473320961, + "learning_rate": 1.7506484440352177e-05, + "loss": 0.0191, + "step": 126450 + }, + { + "epoch": 27.009900882846775, + "grad_norm": 0.024199984967708588, + "learning_rate": 1.7503475419307083e-05, + "loss": 0.0929, + "step": 126460 + }, + { + "epoch": 27.009955045225585, + "grad_norm": 10.51973819732666, + "learning_rate": 1.7500466398261993e-05, + "loss": 0.0592, + "step": 126470 + }, + { + "epoch": 27.010009207604398, + "grad_norm": 0.0110634695738554, + "learning_rate": 1.7497457377216896e-05, + "loss": 0.0588, + "step": 126480 + }, + { + "epoch": 27.01006336998321, + "grad_norm": 0.0015211217105388641, + "learning_rate": 1.7494448356171802e-05, + "loss": 0.0405, + "step": 126490 + }, + { + "epoch": 27.01011753236202, + "grad_norm": 0.9748439788818359, + "learning_rate": 1.7491439335126712e-05, + "loss": 0.0391, + "step": 126500 + }, + { + "epoch": 27.010171694740833, + "grad_norm": 0.002026344882324338, + "learning_rate": 1.7488430314081615e-05, + "loss": 0.0243, + "step": 126510 + }, + { + "epoch": 27.010225857119643, + "grad_norm": 0.0013643515994772315, + "learning_rate": 1.7485421293036524e-05, + "loss": 0.0336, + "step": 126520 + }, + { + "epoch": 27.010280019498456, + "grad_norm": 0.008042876608669758, + "learning_rate": 1.748241227199143e-05, + "loss": 0.0331, + "step": 126530 + }, + { + "epoch": 27.01033418187727, + "grad_norm": 0.15469609200954437, + "learning_rate": 1.7479403250946337e-05, + "loss": 0.0079, + "step": 126540 + }, + { + "epoch": 27.01038834425608, + "grad_norm": 0.003354710293933749, + "learning_rate": 1.7476394229901243e-05, + "loss": 0.0178, + "step": 126550 + }, + { + "epoch": 27.01044250663489, + "grad_norm": 0.09501762688159943, + "learning_rate": 1.7473385208856153e-05, + "loss": 0.0341, + "step": 126560 + }, + { + "epoch": 27.010496669013705, + "grad_norm": 0.001294926623813808, + "learning_rate": 1.747037618781106e-05, + "loss": 0.0446, + "step": 126570 + }, + { + "epoch": 27.010550831392514, + "grad_norm": 0.0015351673355326056, + "learning_rate": 1.7467367166765965e-05, + "loss": 0.1055, + "step": 126580 + }, + { + "epoch": 27.010604993771327, + "grad_norm": 0.014649536460638046, + "learning_rate": 1.7464358145720872e-05, + "loss": 0.0503, + "step": 126590 + }, + { + "epoch": 27.010659156150137, + "grad_norm": 0.0023871916346251965, + "learning_rate": 1.746134912467578e-05, + "loss": 0.039, + "step": 126600 + }, + { + "epoch": 27.01071331852895, + "grad_norm": 0.5701262354850769, + "learning_rate": 1.7458340103630684e-05, + "loss": 0.0344, + "step": 126610 + }, + { + "epoch": 27.010767480907763, + "grad_norm": 0.03758478909730911, + "learning_rate": 1.7455331082585594e-05, + "loss": 0.0604, + "step": 126620 + }, + { + "epoch": 27.010821643286572, + "grad_norm": 0.047125063836574554, + "learning_rate": 1.74523220615405e-05, + "loss": 0.0363, + "step": 126630 + }, + { + "epoch": 27.010875805665385, + "grad_norm": 0.24735186994075775, + "learning_rate": 1.7449313040495403e-05, + "loss": 0.0913, + "step": 126640 + }, + { + "epoch": 27.010929968044195, + "grad_norm": 0.002647592220455408, + "learning_rate": 1.7446304019450313e-05, + "loss": 0.023, + "step": 126650 + }, + { + "epoch": 27.010984130423008, + "grad_norm": 5.345956802368164, + "learning_rate": 1.744329499840522e-05, + "loss": 0.1018, + "step": 126660 + 
}, + { + "epoch": 27.01103829280182, + "grad_norm": 0.0013737642439082265, + "learning_rate": 1.7440285977360125e-05, + "loss": 0.0586, + "step": 126670 + }, + { + "epoch": 27.01109245518063, + "grad_norm": 1.0811153650283813, + "learning_rate": 1.743727695631503e-05, + "loss": 0.084, + "step": 126680 + }, + { + "epoch": 27.011146617559444, + "grad_norm": 0.17393918335437775, + "learning_rate": 1.743426793526994e-05, + "loss": 0.0178, + "step": 126690 + }, + { + "epoch": 27.011200779938253, + "grad_norm": 0.050700053572654724, + "learning_rate": 1.7431258914224848e-05, + "loss": 0.014, + "step": 126700 + }, + { + "epoch": 27.011254942317066, + "grad_norm": 0.6450880765914917, + "learning_rate": 1.7428249893179754e-05, + "loss": 0.023, + "step": 126710 + }, + { + "epoch": 27.01130910469588, + "grad_norm": 0.0074565475806593895, + "learning_rate": 1.742524087213466e-05, + "loss": 0.0076, + "step": 126720 + }, + { + "epoch": 27.01136326707469, + "grad_norm": 0.26356109976768494, + "learning_rate": 1.742223185108957e-05, + "loss": 0.0167, + "step": 126730 + }, + { + "epoch": 27.011417429453502, + "grad_norm": 0.0024874319788068533, + "learning_rate": 1.7419222830044473e-05, + "loss": 0.0098, + "step": 126740 + }, + { + "epoch": 27.011471591832315, + "grad_norm": 37.19004440307617, + "learning_rate": 1.7416213808999382e-05, + "loss": 0.0945, + "step": 126750 + }, + { + "epoch": 27.011525754211124, + "grad_norm": 0.0011750413104891777, + "learning_rate": 1.741320478795429e-05, + "loss": 0.0133, + "step": 126760 + }, + { + "epoch": 27.011579916589938, + "grad_norm": 0.9011620283126831, + "learning_rate": 1.7410195766909195e-05, + "loss": 0.0242, + "step": 126770 + }, + { + "epoch": 27.011634078968747, + "grad_norm": 0.003502080449834466, + "learning_rate": 1.74071867458641e-05, + "loss": 0.0171, + "step": 126780 + }, + { + "epoch": 27.01168824134756, + "grad_norm": 0.02473708800971508, + "learning_rate": 1.7404177724819007e-05, + "loss": 0.0099, + "step": 126790 + }, + { + "epoch": 27.011742403726373, + "grad_norm": 0.001202149549499154, + "learning_rate": 1.7401168703773914e-05, + "loss": 0.1086, + "step": 126800 + }, + { + "epoch": 27.011796566105183, + "grad_norm": 0.5581299066543579, + "learning_rate": 1.739815968272882e-05, + "loss": 0.0423, + "step": 126810 + }, + { + "epoch": 27.011850728483996, + "grad_norm": 0.037177011370658875, + "learning_rate": 1.739515066168373e-05, + "loss": 0.0711, + "step": 126820 + }, + { + "epoch": 27.011904890862805, + "grad_norm": 0.02237994410097599, + "learning_rate": 1.7392141640638636e-05, + "loss": 0.0217, + "step": 126830 + }, + { + "epoch": 27.01195905324162, + "grad_norm": 0.006481124088168144, + "learning_rate": 1.7389132619593542e-05, + "loss": 0.002, + "step": 126840 + }, + { + "epoch": 27.01201321562043, + "grad_norm": 0.04392660781741142, + "learning_rate": 1.738612359854845e-05, + "loss": 0.0316, + "step": 126850 + }, + { + "epoch": 27.01206737799924, + "grad_norm": 0.0019663020502775908, + "learning_rate": 1.7383114577503358e-05, + "loss": 0.0108, + "step": 126860 + }, + { + "epoch": 27.012121540378054, + "grad_norm": 2.304368734359741, + "learning_rate": 1.738010555645826e-05, + "loss": 0.0179, + "step": 126870 + }, + { + "epoch": 27.012175702756863, + "grad_norm": 0.07309019565582275, + "learning_rate": 1.737709653541317e-05, + "loss": 0.0265, + "step": 126880 + }, + { + "epoch": 27.012229865135676, + "grad_norm": 0.6814529299736023, + "learning_rate": 1.7374087514368077e-05, + "loss": 0.0109, + "step": 126890 + }, + { + "epoch": 
27.01228402751449, + "grad_norm": 0.001128733274526894, + "learning_rate": 1.7371078493322983e-05, + "loss": 0.0627, + "step": 126900 + }, + { + "epoch": 27.0123381898933, + "grad_norm": 0.007810278329998255, + "learning_rate": 1.736806947227789e-05, + "loss": 0.0311, + "step": 126910 + }, + { + "epoch": 27.012392352272112, + "grad_norm": 0.0036641189362853765, + "learning_rate": 1.73650604512328e-05, + "loss": 0.0358, + "step": 126920 + }, + { + "epoch": 27.012446514650925, + "grad_norm": 0.13740329444408417, + "learning_rate": 1.7362051430187702e-05, + "loss": 0.0563, + "step": 126930 + }, + { + "epoch": 27.012500677029735, + "grad_norm": 0.002054450800642371, + "learning_rate": 1.735904240914261e-05, + "loss": 0.037, + "step": 126940 + }, + { + "epoch": 27.012554839408548, + "grad_norm": 0.07054304331541061, + "learning_rate": 1.7356033388097518e-05, + "loss": 0.0333, + "step": 126950 + }, + { + "epoch": 27.012609001787357, + "grad_norm": 9.834628105163574, + "learning_rate": 1.7353024367052424e-05, + "loss": 0.1737, + "step": 126960 + }, + { + "epoch": 27.01266316416617, + "grad_norm": 0.0014152411604300141, + "learning_rate": 1.735001534600733e-05, + "loss": 0.0003, + "step": 126970 + }, + { + "epoch": 27.012717326544983, + "grad_norm": 0.004006372299045324, + "learning_rate": 1.7347006324962237e-05, + "loss": 0.1113, + "step": 126980 + }, + { + "epoch": 27.012771488923793, + "grad_norm": 0.0014722226187586784, + "learning_rate": 1.7343997303917146e-05, + "loss": 0.0146, + "step": 126990 + }, + { + "epoch": 27.012825651302606, + "grad_norm": 0.0012798451352864504, + "learning_rate": 1.734098828287205e-05, + "loss": 0.0026, + "step": 127000 + }, + { + "epoch": 27.012879813681415, + "grad_norm": 1.2523813247680664, + "learning_rate": 1.733797926182696e-05, + "loss": 0.0243, + "step": 127010 + }, + { + "epoch": 27.01293397606023, + "grad_norm": 10.794889450073242, + "learning_rate": 1.7334970240781865e-05, + "loss": 0.063, + "step": 127020 + }, + { + "epoch": 27.01298813843904, + "grad_norm": 0.019525155425071716, + "learning_rate": 1.733196121973677e-05, + "loss": 0.0145, + "step": 127030 + }, + { + "epoch": 27.01304230081785, + "grad_norm": 3.721374750137329, + "learning_rate": 1.7328952198691678e-05, + "loss": 0.0107, + "step": 127040 + }, + { + "epoch": 27.013096463196664, + "grad_norm": 0.0013670568587258458, + "learning_rate": 1.7325943177646587e-05, + "loss": 0.0375, + "step": 127050 + }, + { + "epoch": 27.013150625575474, + "grad_norm": 0.005481599830091, + "learning_rate": 1.732293415660149e-05, + "loss": 0.0594, + "step": 127060 + }, + { + "epoch": 27.013204787954287, + "grad_norm": 2.6318299770355225, + "learning_rate": 1.73199251355564e-05, + "loss": 0.0556, + "step": 127070 + }, + { + "epoch": 27.0132589503331, + "grad_norm": 0.0012069386430084705, + "learning_rate": 1.7316916114511306e-05, + "loss": 0.0104, + "step": 127080 + }, + { + "epoch": 27.01331311271191, + "grad_norm": 0.0025325543247163296, + "learning_rate": 1.7313907093466213e-05, + "loss": 0.0577, + "step": 127090 + }, + { + "epoch": 27.013367275090722, + "grad_norm": 0.0023693584371358156, + "learning_rate": 1.731089807242112e-05, + "loss": 0.0195, + "step": 127100 + }, + { + "epoch": 27.013421437469535, + "grad_norm": 3.8716421127319336, + "learning_rate": 1.7307889051376025e-05, + "loss": 0.0864, + "step": 127110 + }, + { + "epoch": 27.013475599848345, + "grad_norm": 0.001184569438919425, + "learning_rate": 1.7304880030330935e-05, + "loss": 0.0519, + "step": 127120 + }, + { + "epoch": 27.013529762227158, + 
"grad_norm": 0.07628283649682999, + "learning_rate": 1.7301871009285838e-05, + "loss": 0.0488, + "step": 127130 + }, + { + "epoch": 27.013583924605967, + "grad_norm": 0.013673569075763226, + "learning_rate": 1.7298861988240747e-05, + "loss": 0.0361, + "step": 127140 + }, + { + "epoch": 27.01363808698478, + "grad_norm": 0.0012246445985510945, + "learning_rate": 1.7295852967195654e-05, + "loss": 0.0055, + "step": 127150 + }, + { + "epoch": 27.013692249363594, + "grad_norm": 0.0012336603831499815, + "learning_rate": 1.729284394615056e-05, + "loss": 0.0979, + "step": 127160 + }, + { + "epoch": 27.013746411742403, + "grad_norm": 0.003592150751501322, + "learning_rate": 1.7289834925105466e-05, + "loss": 0.0864, + "step": 127170 + }, + { + "epoch": 27.013800574121216, + "grad_norm": 0.001359546440653503, + "learning_rate": 1.7286825904060376e-05, + "loss": 0.0043, + "step": 127180 + }, + { + "epoch": 27.013854736500026, + "grad_norm": 3.1182026863098145, + "learning_rate": 1.728381688301528e-05, + "loss": 0.0374, + "step": 127190 + }, + { + "epoch": 27.01390889887884, + "grad_norm": 0.0038811094127595425, + "learning_rate": 1.728080786197019e-05, + "loss": 0.0051, + "step": 127200 + }, + { + "epoch": 27.013963061257652, + "grad_norm": 2.768841505050659, + "learning_rate": 1.7277798840925095e-05, + "loss": 0.1163, + "step": 127210 + }, + { + "epoch": 27.01401722363646, + "grad_norm": 0.4323040843009949, + "learning_rate": 1.727478981988e-05, + "loss": 0.0513, + "step": 127220 + }, + { + "epoch": 27.014071386015274, + "grad_norm": 0.0012791830813512206, + "learning_rate": 1.7271780798834907e-05, + "loss": 0.0231, + "step": 127230 + }, + { + "epoch": 27.014125548394084, + "grad_norm": 0.3186591863632202, + "learning_rate": 1.7268771777789813e-05, + "loss": 0.0468, + "step": 127240 + }, + { + "epoch": 27.014179710772897, + "grad_norm": 0.0016320666763931513, + "learning_rate": 1.7265762756744723e-05, + "loss": 0.0247, + "step": 127250 + }, + { + "epoch": 27.01423387315171, + "grad_norm": 0.002300388179719448, + "learning_rate": 1.7262753735699626e-05, + "loss": 0.0815, + "step": 127260 + }, + { + "epoch": 27.01428803553052, + "grad_norm": 2.6833794116973877, + "learning_rate": 1.7259744714654536e-05, + "loss": 0.0392, + "step": 127270 + }, + { + "epoch": 27.014342197909333, + "grad_norm": 0.0014442192623391747, + "learning_rate": 1.7256735693609442e-05, + "loss": 0.0538, + "step": 127280 + }, + { + "epoch": 27.014396360288146, + "grad_norm": 0.06042671948671341, + "learning_rate": 1.7253726672564348e-05, + "loss": 0.0253, + "step": 127290 + }, + { + "epoch": 27.014450522666955, + "grad_norm": 0.0381193645298481, + "learning_rate": 1.7250717651519255e-05, + "loss": 0.0425, + "step": 127300 + }, + { + "epoch": 27.014504685045768, + "grad_norm": 0.0014607921475544572, + "learning_rate": 1.7247708630474164e-05, + "loss": 0.0719, + "step": 127310 + }, + { + "epoch": 27.014558847424578, + "grad_norm": 0.004413103684782982, + "learning_rate": 1.7244699609429067e-05, + "loss": 0.0407, + "step": 127320 + }, + { + "epoch": 27.01461300980339, + "grad_norm": 1.0385137796401978, + "learning_rate": 1.7241690588383977e-05, + "loss": 0.0141, + "step": 127330 + }, + { + "epoch": 27.014667172182204, + "grad_norm": 0.9989345669746399, + "learning_rate": 1.7238681567338883e-05, + "loss": 0.041, + "step": 127340 + }, + { + "epoch": 27.014721334561013, + "grad_norm": 0.0015176224987953901, + "learning_rate": 1.723567254629379e-05, + "loss": 0.0763, + "step": 127350 + }, + { + "epoch": 27.014775496939826, + "grad_norm": 
3.7003729343414307, + "learning_rate": 1.7232663525248696e-05, + "loss": 0.0776, + "step": 127360 + }, + { + "epoch": 27.014829659318636, + "grad_norm": 0.004973096307367086, + "learning_rate": 1.7229654504203605e-05, + "loss": 0.0972, + "step": 127370 + }, + { + "epoch": 27.01488382169745, + "grad_norm": 0.0032224380411207676, + "learning_rate": 1.722664548315851e-05, + "loss": 0.021, + "step": 127380 + }, + { + "epoch": 27.014937984076262, + "grad_norm": 0.04541664570569992, + "learning_rate": 1.7223636462113414e-05, + "loss": 0.0264, + "step": 127390 + }, + { + "epoch": 27.01499214645507, + "grad_norm": 0.013894150033593178, + "learning_rate": 1.7220627441068324e-05, + "loss": 0.0045, + "step": 127400 + }, + { + "epoch": 27.015046308833885, + "grad_norm": 0.001954553881660104, + "learning_rate": 1.721761842002323e-05, + "loss": 0.0273, + "step": 127410 + }, + { + "epoch": 27.015100471212694, + "grad_norm": 2.3566412925720215, + "learning_rate": 1.7214609398978137e-05, + "loss": 0.0531, + "step": 127420 + }, + { + "epoch": 27.015154633591507, + "grad_norm": 0.0015644300729036331, + "learning_rate": 1.7211600377933043e-05, + "loss": 0.0249, + "step": 127430 + }, + { + "epoch": 27.01520879597032, + "grad_norm": 0.003575878916308284, + "learning_rate": 1.7208591356887953e-05, + "loss": 0.1413, + "step": 127440 + }, + { + "epoch": 27.01526295834913, + "grad_norm": 1.9494991302490234, + "learning_rate": 1.7205582335842855e-05, + "loss": 0.0487, + "step": 127450 + }, + { + "epoch": 27.015317120727943, + "grad_norm": 0.5325559973716736, + "learning_rate": 1.7202573314797765e-05, + "loss": 0.0337, + "step": 127460 + }, + { + "epoch": 27.015371283106752, + "grad_norm": 0.0061002932488918304, + "learning_rate": 1.719956429375267e-05, + "loss": 0.06, + "step": 127470 + }, + { + "epoch": 27.015425445485565, + "grad_norm": 2.6167209148406982, + "learning_rate": 1.7196555272707578e-05, + "loss": 0.044, + "step": 127480 + }, + { + "epoch": 27.01547960786438, + "grad_norm": 0.17774879932403564, + "learning_rate": 1.7193546251662484e-05, + "loss": 0.0321, + "step": 127490 + }, + { + "epoch": 27.015533770243188, + "grad_norm": 0.4816402196884155, + "learning_rate": 1.7190537230617394e-05, + "loss": 0.0264, + "step": 127500 + }, + { + "epoch": 27.015587932622, + "grad_norm": 0.003531487425789237, + "learning_rate": 1.71875282095723e-05, + "loss": 0.014, + "step": 127510 + }, + { + "epoch": 27.015642095000814, + "grad_norm": 0.001412784680724144, + "learning_rate": 1.7184519188527206e-05, + "loss": 0.0098, + "step": 127520 + }, + { + "epoch": 27.015696257379624, + "grad_norm": 0.0011717566521838307, + "learning_rate": 1.7181510167482112e-05, + "loss": 0.0568, + "step": 127530 + }, + { + "epoch": 27.015750419758437, + "grad_norm": 0.0145330261439085, + "learning_rate": 1.717850114643702e-05, + "loss": 0.0371, + "step": 127540 + }, + { + "epoch": 27.015804582137246, + "grad_norm": 0.0011377267073839903, + "learning_rate": 1.7175492125391925e-05, + "loss": 0.0529, + "step": 127550 + }, + { + "epoch": 27.01585874451606, + "grad_norm": 0.001275520189665258, + "learning_rate": 1.717248310434683e-05, + "loss": 0.0001, + "step": 127560 + }, + { + "epoch": 27.015912906894872, + "grad_norm": 0.002375583862885833, + "learning_rate": 1.716947408330174e-05, + "loss": 0.0161, + "step": 127570 + }, + { + "epoch": 27.01596706927368, + "grad_norm": 0.05077945813536644, + "learning_rate": 1.7166465062256644e-05, + "loss": 0.0627, + "step": 127580 + }, + { + "epoch": 27.016021231652495, + "grad_norm": 0.008057131431996822, + 
"learning_rate": 1.7163456041211553e-05, + "loss": 0.0378, + "step": 127590 + }, + { + "epoch": 27.016075394031304, + "grad_norm": 0.0013389593223109841, + "learning_rate": 1.716044702016646e-05, + "loss": 0.0004, + "step": 127600 + }, + { + "epoch": 27.016129556410117, + "grad_norm": 0.4423978626728058, + "learning_rate": 1.7157437999121366e-05, + "loss": 0.0577, + "step": 127610 + }, + { + "epoch": 27.01618371878893, + "grad_norm": 1.2595947980880737, + "learning_rate": 1.7154428978076272e-05, + "loss": 0.0646, + "step": 127620 + }, + { + "epoch": 27.01623788116774, + "grad_norm": 0.004081040620803833, + "learning_rate": 1.7151419957031182e-05, + "loss": 0.0537, + "step": 127630 + }, + { + "epoch": 27.016292043546553, + "grad_norm": 0.16321833431720734, + "learning_rate": 1.7148410935986088e-05, + "loss": 0.0269, + "step": 127640 + }, + { + "epoch": 27.016346205925363, + "grad_norm": 0.03321417048573494, + "learning_rate": 1.7145401914940994e-05, + "loss": 0.0192, + "step": 127650 + }, + { + "epoch": 27.016400368304176, + "grad_norm": 1.8532034158706665, + "learning_rate": 1.71423928938959e-05, + "loss": 0.1254, + "step": 127660 + }, + { + "epoch": 27.01645453068299, + "grad_norm": 1.7915037870407104, + "learning_rate": 1.713938387285081e-05, + "loss": 0.0187, + "step": 127670 + }, + { + "epoch": 27.016508693061798, + "grad_norm": 0.09268465638160706, + "learning_rate": 1.7136374851805713e-05, + "loss": 0.1715, + "step": 127680 + }, + { + "epoch": 27.01656285544061, + "grad_norm": 0.0012407408794388175, + "learning_rate": 1.713336583076062e-05, + "loss": 0.0449, + "step": 127690 + }, + { + "epoch": 27.016617017819424, + "grad_norm": 0.02463383786380291, + "learning_rate": 1.713035680971553e-05, + "loss": 0.0225, + "step": 127700 + }, + { + "epoch": 27.016671180198234, + "grad_norm": 2.589510202407837, + "learning_rate": 1.7127347788670432e-05, + "loss": 0.0263, + "step": 127710 + }, + { + "epoch": 27.016725342577047, + "grad_norm": 0.6806936860084534, + "learning_rate": 1.7124338767625342e-05, + "loss": 0.1257, + "step": 127720 + }, + { + "epoch": 27.016779504955856, + "grad_norm": 0.016387686133384705, + "learning_rate": 1.7121329746580248e-05, + "loss": 0.0011, + "step": 127730 + }, + { + "epoch": 27.01683366733467, + "grad_norm": 0.005547930486500263, + "learning_rate": 1.7118320725535154e-05, + "loss": 0.015, + "step": 127740 + }, + { + "epoch": 27.016887829713482, + "grad_norm": 0.03564789518713951, + "learning_rate": 1.711531170449006e-05, + "loss": 0.0212, + "step": 127750 + }, + { + "epoch": 27.016941992092292, + "grad_norm": 0.0019730220083147287, + "learning_rate": 1.711230268344497e-05, + "loss": 0.1242, + "step": 127760 + }, + { + "epoch": 27.016996154471105, + "grad_norm": 0.0014194468967616558, + "learning_rate": 1.7109293662399877e-05, + "loss": 0.0147, + "step": 127770 + }, + { + "epoch": 27.017050316849915, + "grad_norm": 0.0021978728473186493, + "learning_rate": 1.7106284641354783e-05, + "loss": 0.0903, + "step": 127780 + }, + { + "epoch": 27.017104479228728, + "grad_norm": 0.12225591391324997, + "learning_rate": 1.710327562030969e-05, + "loss": 0.0277, + "step": 127790 + }, + { + "epoch": 27.01715864160754, + "grad_norm": 0.7046177387237549, + "learning_rate": 1.71002665992646e-05, + "loss": 0.0159, + "step": 127800 + }, + { + "epoch": 27.01721280398635, + "grad_norm": 1.2708766460418701, + "learning_rate": 1.70972575782195e-05, + "loss": 0.0509, + "step": 127810 + }, + { + "epoch": 27.017266966365163, + "grad_norm": 0.04359958693385124, + "learning_rate": 
1.709424855717441e-05, + "loss": 0.009, + "step": 127820 + }, + { + "epoch": 27.017321128743973, + "grad_norm": 0.016404887661337852, + "learning_rate": 1.7091239536129318e-05, + "loss": 0.004, + "step": 127830 + }, + { + "epoch": 27.017375291122786, + "grad_norm": 0.001545037142932415, + "learning_rate": 1.708823051508422e-05, + "loss": 0.0309, + "step": 127840 + }, + { + "epoch": 27.0174294535016, + "grad_norm": 0.0028426966164261103, + "learning_rate": 1.708522149403913e-05, + "loss": 0.028, + "step": 127850 + }, + { + "epoch": 27.01748361588041, + "grad_norm": 0.02357609011232853, + "learning_rate": 1.7082212472994036e-05, + "loss": 0.0084, + "step": 127860 + }, + { + "epoch": 27.01753777825922, + "grad_norm": 0.9700624942779541, + "learning_rate": 1.7079203451948943e-05, + "loss": 0.0563, + "step": 127870 + }, + { + "epoch": 27.017591940638034, + "grad_norm": 0.018010295927524567, + "learning_rate": 1.707619443090385e-05, + "loss": 0.013, + "step": 127880 + }, + { + "epoch": 27.017646103016844, + "grad_norm": 0.5641188025474548, + "learning_rate": 1.707318540985876e-05, + "loss": 0.0156, + "step": 127890 + }, + { + "epoch": 27.017700265395657, + "grad_norm": 16.849775314331055, + "learning_rate": 1.7070176388813665e-05, + "loss": 0.1409, + "step": 127900 + }, + { + "epoch": 27.017754427774467, + "grad_norm": 1.3293538093566895, + "learning_rate": 1.706716736776857e-05, + "loss": 0.0438, + "step": 127910 + }, + { + "epoch": 27.01780859015328, + "grad_norm": 0.0013713767984881997, + "learning_rate": 1.7064158346723477e-05, + "loss": 0.0642, + "step": 127920 + }, + { + "epoch": 27.017862752532093, + "grad_norm": 0.6688695549964905, + "learning_rate": 1.7061149325678387e-05, + "loss": 0.0413, + "step": 127930 + }, + { + "epoch": 27.017916914910902, + "grad_norm": 0.019671084359288216, + "learning_rate": 1.705814030463329e-05, + "loss": 0.0453, + "step": 127940 + }, + { + "epoch": 27.017971077289715, + "grad_norm": 0.0019055672455579042, + "learning_rate": 1.70551312835882e-05, + "loss": 0.0286, + "step": 127950 + }, + { + "epoch": 27.018025239668525, + "grad_norm": 0.0012414148077368736, + "learning_rate": 1.7052122262543106e-05, + "loss": 0.0062, + "step": 127960 + }, + { + "epoch": 27.018079402047338, + "grad_norm": 0.001184628577902913, + "learning_rate": 1.7049113241498012e-05, + "loss": 0.005, + "step": 127970 + }, + { + "epoch": 27.01813356442615, + "grad_norm": 0.003806950757279992, + "learning_rate": 1.704610422045292e-05, + "loss": 0.0352, + "step": 127980 + }, + { + "epoch": 27.01818772680496, + "grad_norm": 0.05391277000308037, + "learning_rate": 1.7043095199407825e-05, + "loss": 0.0359, + "step": 127990 + }, + { + "epoch": 27.018241889183773, + "grad_norm": 0.08884914964437485, + "learning_rate": 1.704008617836273e-05, + "loss": 0.0262, + "step": 128000 + }, + { + "epoch": 27.018296051562583, + "grad_norm": 0.004250614438205957, + "learning_rate": 1.7037077157317637e-05, + "loss": 0.0001, + "step": 128010 + }, + { + "epoch": 27.018350213941396, + "grad_norm": 0.0036129599902778864, + "learning_rate": 1.7034068136272547e-05, + "loss": 0.0006, + "step": 128020 + }, + { + "epoch": 27.01840437632021, + "grad_norm": 0.06772156804800034, + "learning_rate": 1.7031059115227453e-05, + "loss": 0.0187, + "step": 128030 + }, + { + "epoch": 27.01845853869902, + "grad_norm": 0.0013099635252729058, + "learning_rate": 1.702805009418236e-05, + "loss": 0.081, + "step": 128040 + }, + { + "epoch": 27.01851270107783, + "grad_norm": 0.09139319509267807, + "learning_rate": 1.7025041073137266e-05, + 
"loss": 0.0206, + "step": 128050 + }, + { + "epoch": 27.018566863456645, + "grad_norm": 0.0011974861845374107, + "learning_rate": 1.7022032052092175e-05, + "loss": 0.0031, + "step": 128060 + }, + { + "epoch": 27.018621025835454, + "grad_norm": 7.380950927734375, + "learning_rate": 1.701902303104708e-05, + "loss": 0.1339, + "step": 128070 + }, + { + "epoch": 27.018675188214267, + "grad_norm": 0.0011967034079134464, + "learning_rate": 1.7016014010001988e-05, + "loss": 0.0951, + "step": 128080 + }, + { + "epoch": 27.018729350593077, + "grad_norm": 0.19925321638584137, + "learning_rate": 1.7013004988956894e-05, + "loss": 0.0524, + "step": 128090 + }, + { + "epoch": 27.01878351297189, + "grad_norm": 0.002160710748285055, + "learning_rate": 1.70099959679118e-05, + "loss": 0.0398, + "step": 128100 + }, + { + "epoch": 27.018837675350703, + "grad_norm": 3.4366135597229004, + "learning_rate": 1.7006986946866707e-05, + "loss": 0.0945, + "step": 128110 + }, + { + "epoch": 27.018891837729512, + "grad_norm": 0.003316173329949379, + "learning_rate": 1.7003977925821616e-05, + "loss": 0.033, + "step": 128120 + }, + { + "epoch": 27.018946000108325, + "grad_norm": 0.002329170471057296, + "learning_rate": 1.700096890477652e-05, + "loss": 0.024, + "step": 128130 + }, + { + "epoch": 27.019000162487135, + "grad_norm": 0.0017735057044774294, + "learning_rate": 1.6997959883731426e-05, + "loss": 0.0027, + "step": 128140 + }, + { + "epoch": 27.019054324865948, + "grad_norm": 0.010716499760746956, + "learning_rate": 1.6994950862686335e-05, + "loss": 0.0086, + "step": 128150 + }, + { + "epoch": 27.01910848724476, + "grad_norm": 0.07916419208049774, + "learning_rate": 1.699194184164124e-05, + "loss": 0.0059, + "step": 128160 + }, + { + "epoch": 27.01916264962357, + "grad_norm": 0.001240299898199737, + "learning_rate": 1.6988932820596148e-05, + "loss": 0.0289, + "step": 128170 + }, + { + "epoch": 27.019216812002384, + "grad_norm": 0.006870293524116278, + "learning_rate": 1.6985923799551054e-05, + "loss": 0.0082, + "step": 128180 + }, + { + "epoch": 27.019270974381193, + "grad_norm": 0.2654552757740021, + "learning_rate": 1.6982914778505964e-05, + "loss": 0.0319, + "step": 128190 + }, + { + "epoch": 27.019325136760006, + "grad_norm": 0.014268441125750542, + "learning_rate": 1.6979905757460867e-05, + "loss": 0.0087, + "step": 128200 + }, + { + "epoch": 27.01937929913882, + "grad_norm": 0.5854828953742981, + "learning_rate": 1.6976896736415776e-05, + "loss": 0.0126, + "step": 128210 + }, + { + "epoch": 27.01943346151763, + "grad_norm": 0.0020178703125566244, + "learning_rate": 1.6973887715370683e-05, + "loss": 0.035, + "step": 128220 + }, + { + "epoch": 27.019487623896442, + "grad_norm": 0.01469582598656416, + "learning_rate": 1.697087869432559e-05, + "loss": 0.0454, + "step": 128230 + }, + { + "epoch": 27.019541786275255, + "grad_norm": 1.2073618173599243, + "learning_rate": 1.6967869673280495e-05, + "loss": 0.0138, + "step": 128240 + }, + { + "epoch": 27.019595948654064, + "grad_norm": 0.0011771174613386393, + "learning_rate": 1.6964860652235405e-05, + "loss": 0.0253, + "step": 128250 + }, + { + "epoch": 27.019650111032878, + "grad_norm": 0.0012682988308370113, + "learning_rate": 1.6961851631190308e-05, + "loss": 0.0125, + "step": 128260 + }, + { + "epoch": 27.019704273411687, + "grad_norm": 1.6651633977890015, + "learning_rate": 1.6958842610145217e-05, + "loss": 0.0311, + "step": 128270 + }, + { + "epoch": 27.0197584357905, + "grad_norm": 0.0046780407428741455, + "learning_rate": 1.6955833589100124e-05, + "loss": 
0.0315, + "step": 128280 + }, + { + "epoch": 27.019812598169313, + "grad_norm": 0.001549244625493884, + "learning_rate": 1.695282456805503e-05, + "loss": 0.0015, + "step": 128290 + }, + { + "epoch": 27.019866760548123, + "grad_norm": 8.430404663085938, + "learning_rate": 1.6949815547009936e-05, + "loss": 0.0851, + "step": 128300 + }, + { + "epoch": 27.019920922926936, + "grad_norm": 0.01332101970911026, + "learning_rate": 1.6946806525964842e-05, + "loss": 0.0089, + "step": 128310 + }, + { + "epoch": 27.019975085305745, + "grad_norm": 0.0009606993407942355, + "learning_rate": 1.6943797504919752e-05, + "loss": 0.0212, + "step": 128320 + }, + { + "epoch": 27.02002924768456, + "grad_norm": 2.3127713203430176, + "learning_rate": 1.6940788483874655e-05, + "loss": 0.0368, + "step": 128330 + }, + { + "epoch": 27.02008341006337, + "grad_norm": 1.1474753618240356, + "learning_rate": 1.6937779462829565e-05, + "loss": 0.0618, + "step": 128340 + }, + { + "epoch": 27.02013757244218, + "grad_norm": 0.0017018730286508799, + "learning_rate": 1.693477044178447e-05, + "loss": 0.0567, + "step": 128350 + }, + { + "epoch": 27.020191734820994, + "grad_norm": 0.0022039422765374184, + "learning_rate": 1.6931761420739377e-05, + "loss": 0.0006, + "step": 128360 + }, + { + "epoch": 27.020245897199803, + "grad_norm": 0.0009572331327944994, + "learning_rate": 1.6928752399694284e-05, + "loss": 0.1173, + "step": 128370 + }, + { + "epoch": 27.020300059578616, + "grad_norm": 0.6596554517745972, + "learning_rate": 1.6925743378649193e-05, + "loss": 0.018, + "step": 128380 + }, + { + "epoch": 27.02035422195743, + "grad_norm": 3.7107386589050293, + "learning_rate": 1.6922734357604096e-05, + "loss": 0.116, + "step": 128390 + }, + { + "epoch": 27.02040838433624, + "grad_norm": 0.010232027620077133, + "learning_rate": 1.6919725336559006e-05, + "loss": 0.025, + "step": 128400 + }, + { + "epoch": 27.020462546715052, + "grad_norm": 0.014599586836993694, + "learning_rate": 1.6916716315513912e-05, + "loss": 0.0065, + "step": 128410 + }, + { + "epoch": 27.020516709093865, + "grad_norm": 0.003561965189874172, + "learning_rate": 1.6913707294468818e-05, + "loss": 0.0555, + "step": 128420 + }, + { + "epoch": 27.020570871472675, + "grad_norm": 0.0009439211571589112, + "learning_rate": 1.6910698273423725e-05, + "loss": 0.006, + "step": 128430 + }, + { + "epoch": 27.020625033851488, + "grad_norm": 0.03718459978699684, + "learning_rate": 1.690768925237863e-05, + "loss": 0.2171, + "step": 128440 + }, + { + "epoch": 27.020679196230297, + "grad_norm": 0.0012639195192605257, + "learning_rate": 1.690468023133354e-05, + "loss": 0.0398, + "step": 128450 + }, + { + "epoch": 27.02073335860911, + "grad_norm": 0.0022791693918406963, + "learning_rate": 1.6901671210288443e-05, + "loss": 0.0068, + "step": 128460 + }, + { + "epoch": 27.020787520987923, + "grad_norm": 0.001009710831567645, + "learning_rate": 1.6898662189243353e-05, + "loss": 0.0009, + "step": 128470 + }, + { + "epoch": 27.020841683366733, + "grad_norm": 0.0011214797850698233, + "learning_rate": 1.689565316819826e-05, + "loss": 0.0787, + "step": 128480 + }, + { + "epoch": 27.020895845745546, + "grad_norm": 0.007487883325666189, + "learning_rate": 1.6892644147153166e-05, + "loss": 0.0008, + "step": 128490 + }, + { + "epoch": 27.020950008124355, + "grad_norm": 0.0014454950578510761, + "learning_rate": 1.6889635126108072e-05, + "loss": 0.1076, + "step": 128500 + }, + { + "epoch": 27.02100417050317, + "grad_norm": 0.002089510904625058, + "learning_rate": 1.688662610506298e-05, + "loss": 0.073, + 
"step": 128510 + }, + { + "epoch": 27.02105833288198, + "grad_norm": 0.0011963517172262073, + "learning_rate": 1.6883617084017884e-05, + "loss": 0.0002, + "step": 128520 + }, + { + "epoch": 27.02111249526079, + "grad_norm": 0.0016405293717980385, + "learning_rate": 1.6880608062972794e-05, + "loss": 0.0489, + "step": 128530 + }, + { + "epoch": 27.021166657639604, + "grad_norm": 1.2424160242080688, + "learning_rate": 1.68775990419277e-05, + "loss": 0.11, + "step": 128540 + }, + { + "epoch": 27.021220820018414, + "grad_norm": 0.11027256399393082, + "learning_rate": 1.6874590020882607e-05, + "loss": 0.0303, + "step": 128550 + }, + { + "epoch": 27.021274982397227, + "grad_norm": 0.039332132786512375, + "learning_rate": 1.6871580999837513e-05, + "loss": 0.02, + "step": 128560 + }, + { + "epoch": 27.02132914477604, + "grad_norm": 0.0012017737608402967, + "learning_rate": 1.6868571978792423e-05, + "loss": 0.0007, + "step": 128570 + }, + { + "epoch": 27.02138330715485, + "grad_norm": 0.2961244285106659, + "learning_rate": 1.686556295774733e-05, + "loss": 0.1443, + "step": 128580 + }, + { + "epoch": 27.021437469533662, + "grad_norm": 1.2202427387237549, + "learning_rate": 1.6862553936702232e-05, + "loss": 0.0709, + "step": 128590 + }, + { + "epoch": 27.021491631912472, + "grad_norm": 0.0300214234739542, + "learning_rate": 1.685954491565714e-05, + "loss": 0.0312, + "step": 128600 + }, + { + "epoch": 27.021545794291285, + "grad_norm": 0.2554672360420227, + "learning_rate": 1.6856535894612048e-05, + "loss": 0.1117, + "step": 128610 + }, + { + "epoch": 27.021599956670098, + "grad_norm": 0.0021842927671968937, + "learning_rate": 1.6853526873566954e-05, + "loss": 0.001, + "step": 128620 + }, + { + "epoch": 27.021654119048907, + "grad_norm": 0.0286469217389822, + "learning_rate": 1.685051785252186e-05, + "loss": 0.0082, + "step": 128630 + }, + { + "epoch": 27.02170828142772, + "grad_norm": 0.2836906909942627, + "learning_rate": 1.684750883147677e-05, + "loss": 0.0039, + "step": 128640 + }, + { + "epoch": 27.021762443806534, + "grad_norm": 0.007542968261986971, + "learning_rate": 1.6844499810431673e-05, + "loss": 0.0522, + "step": 128650 + }, + { + "epoch": 27.021816606185343, + "grad_norm": 0.08932830393314362, + "learning_rate": 1.6841490789386582e-05, + "loss": 0.0075, + "step": 128660 + }, + { + "epoch": 27.021870768564156, + "grad_norm": 0.005600167438387871, + "learning_rate": 1.683848176834149e-05, + "loss": 0.0286, + "step": 128670 + }, + { + "epoch": 27.021924930942966, + "grad_norm": 0.0011264054337516427, + "learning_rate": 1.6835472747296395e-05, + "loss": 0.0306, + "step": 128680 + }, + { + "epoch": 27.02197909332178, + "grad_norm": 1.7507308721542358, + "learning_rate": 1.68324637262513e-05, + "loss": 0.0357, + "step": 128690 + }, + { + "epoch": 27.022033255700592, + "grad_norm": 0.00904883723706007, + "learning_rate": 1.682945470520621e-05, + "loss": 0.0209, + "step": 128700 + }, + { + "epoch": 27.0220874180794, + "grad_norm": 0.04498058930039406, + "learning_rate": 1.6826445684161117e-05, + "loss": 0.0508, + "step": 128710 + }, + { + "epoch": 27.022141580458214, + "grad_norm": 0.01120409183204174, + "learning_rate": 1.6823436663116023e-05, + "loss": 0.0505, + "step": 128720 + }, + { + "epoch": 27.022195742837024, + "grad_norm": 0.006556329783052206, + "learning_rate": 1.682042764207093e-05, + "loss": 0.1177, + "step": 128730 + }, + { + "epoch": 27.022249905215837, + "grad_norm": 0.002716024639084935, + "learning_rate": 1.6817418621025836e-05, + "loss": 0.1502, + "step": 128740 + }, + { + 
"epoch": 27.02230406759465, + "grad_norm": 0.0019118993077427149, + "learning_rate": 1.6814409599980742e-05, + "loss": 0.0143, + "step": 128750 + }, + { + "epoch": 27.02235822997346, + "grad_norm": 0.07064281404018402, + "learning_rate": 1.681140057893565e-05, + "loss": 0.0245, + "step": 128760 + }, + { + "epoch": 27.022412392352273, + "grad_norm": 0.0028918427415192127, + "learning_rate": 1.6808391557890558e-05, + "loss": 0.0495, + "step": 128770 + }, + { + "epoch": 27.022466554731082, + "grad_norm": 0.25255244970321655, + "learning_rate": 1.680538253684546e-05, + "loss": 0.0723, + "step": 128780 + }, + { + "epoch": 27.022520717109895, + "grad_norm": 0.01712716743350029, + "learning_rate": 1.680237351580037e-05, + "loss": 0.0535, + "step": 128790 + }, + { + "epoch": 27.022574879488708, + "grad_norm": 0.0071464162319898605, + "learning_rate": 1.6799364494755277e-05, + "loss": 0.0369, + "step": 128800 + }, + { + "epoch": 27.022629041867518, + "grad_norm": 0.07005513459444046, + "learning_rate": 1.6796355473710183e-05, + "loss": 0.0355, + "step": 128810 + }, + { + "epoch": 27.02268320424633, + "grad_norm": 0.41290712356567383, + "learning_rate": 1.679334645266509e-05, + "loss": 0.0058, + "step": 128820 + }, + { + "epoch": 27.022737366625144, + "grad_norm": 0.06259632110595703, + "learning_rate": 1.679033743162e-05, + "loss": 0.0395, + "step": 128830 + }, + { + "epoch": 27.022791529003953, + "grad_norm": 0.001281251898035407, + "learning_rate": 1.6787328410574906e-05, + "loss": 0.0024, + "step": 128840 + }, + { + "epoch": 27.022845691382766, + "grad_norm": 1.9097421169281006, + "learning_rate": 1.6784319389529812e-05, + "loss": 0.0586, + "step": 128850 + }, + { + "epoch": 27.022899853761576, + "grad_norm": 1.5573315620422363, + "learning_rate": 1.6781310368484718e-05, + "loss": 0.037, + "step": 128860 + }, + { + "epoch": 27.02295401614039, + "grad_norm": 0.0026717926375567913, + "learning_rate": 1.6778301347439628e-05, + "loss": 0.0029, + "step": 128870 + }, + { + "epoch": 27.023008178519202, + "grad_norm": 0.0017588865011930466, + "learning_rate": 1.677529232639453e-05, + "loss": 0.0446, + "step": 128880 + }, + { + "epoch": 27.02306234089801, + "grad_norm": 3.948554515838623, + "learning_rate": 1.6772283305349437e-05, + "loss": 0.0606, + "step": 128890 + }, + { + "epoch": 27.023116503276825, + "grad_norm": 0.038919489830732346, + "learning_rate": 1.6769274284304347e-05, + "loss": 0.0217, + "step": 128900 + }, + { + "epoch": 27.023170665655634, + "grad_norm": 0.378645658493042, + "learning_rate": 1.676626526325925e-05, + "loss": 0.0094, + "step": 128910 + }, + { + "epoch": 27.023224828034447, + "grad_norm": 0.009027251973748207, + "learning_rate": 1.676325624221416e-05, + "loss": 0.0655, + "step": 128920 + }, + { + "epoch": 27.02327899041326, + "grad_norm": 4.886764049530029, + "learning_rate": 1.6760247221169065e-05, + "loss": 0.0303, + "step": 128930 + }, + { + "epoch": 27.02333315279207, + "grad_norm": 0.01910972036421299, + "learning_rate": 1.675723820012397e-05, + "loss": 0.0285, + "step": 128940 + }, + { + "epoch": 27.023387315170883, + "grad_norm": 0.0012451534857973456, + "learning_rate": 1.6754229179078878e-05, + "loss": 0.0557, + "step": 128950 + }, + { + "epoch": 27.023441477549692, + "grad_norm": 0.001768809393979609, + "learning_rate": 1.6751220158033788e-05, + "loss": 0.0564, + "step": 128960 + }, + { + "epoch": 27.023495639928505, + "grad_norm": 0.00815387349575758, + "learning_rate": 1.6748211136988694e-05, + "loss": 0.004, + "step": 128970 + }, + { + "epoch": 
27.02354980230732, + "grad_norm": 0.001712506520561874, + "learning_rate": 1.67452021159436e-05, + "loss": 0.0446, + "step": 128980 + }, + { + "epoch": 27.023603964686128, + "grad_norm": 0.060788244009017944, + "learning_rate": 1.6742193094898506e-05, + "loss": 0.0072, + "step": 128990 + }, + { + "epoch": 27.02365812706494, + "grad_norm": 0.0015466321492567658, + "learning_rate": 1.6739184073853416e-05, + "loss": 0.0041, + "step": 129000 + }, + { + "epoch": 27.023712289443754, + "grad_norm": 0.0038470416329801083, + "learning_rate": 1.673617505280832e-05, + "loss": 0.1213, + "step": 129010 + }, + { + "epoch": 27.023766451822564, + "grad_norm": 0.513943076133728, + "learning_rate": 1.673316603176323e-05, + "loss": 0.1054, + "step": 129020 + }, + { + "epoch": 27.023820614201377, + "grad_norm": 0.00145208858884871, + "learning_rate": 1.6730157010718135e-05, + "loss": 0.0539, + "step": 129030 + }, + { + "epoch": 27.023874776580186, + "grad_norm": 2.055772304534912, + "learning_rate": 1.6727147989673038e-05, + "loss": 0.0871, + "step": 129040 + }, + { + "epoch": 27.023928938959, + "grad_norm": 0.001317234244197607, + "learning_rate": 1.6724138968627947e-05, + "loss": 0.0169, + "step": 129050 + }, + { + "epoch": 27.023983101337812, + "grad_norm": 0.020818253979086876, + "learning_rate": 1.6721129947582854e-05, + "loss": 0.0612, + "step": 129060 + }, + { + "epoch": 27.02403726371662, + "grad_norm": 0.0018790926551446319, + "learning_rate": 1.671812092653776e-05, + "loss": 0.0238, + "step": 129070 + }, + { + "epoch": 27.024091426095435, + "grad_norm": 0.008677592501044273, + "learning_rate": 1.6715111905492666e-05, + "loss": 0.0098, + "step": 129080 + }, + { + "epoch": 27.024145588474244, + "grad_norm": 0.0019432712579146028, + "learning_rate": 1.6712102884447576e-05, + "loss": 0.0712, + "step": 129090 + }, + { + "epoch": 27.024199750853057, + "grad_norm": 0.023800836876034737, + "learning_rate": 1.6709093863402482e-05, + "loss": 0.1132, + "step": 129100 + }, + { + "epoch": 27.02425391323187, + "grad_norm": 0.018094616010785103, + "learning_rate": 1.670608484235739e-05, + "loss": 0.0505, + "step": 129110 + }, + { + "epoch": 27.02430807561068, + "grad_norm": 0.0015060391742736101, + "learning_rate": 1.6703075821312295e-05, + "loss": 0.0594, + "step": 129120 + }, + { + "epoch": 27.024362237989493, + "grad_norm": 0.4934857189655304, + "learning_rate": 1.6700066800267204e-05, + "loss": 0.0212, + "step": 129130 + }, + { + "epoch": 27.024416400368303, + "grad_norm": 3.4979748725891113, + "learning_rate": 1.6697057779222107e-05, + "loss": 0.1027, + "step": 129140 + }, + { + "epoch": 27.024470562747116, + "grad_norm": 0.03655615821480751, + "learning_rate": 1.6694048758177017e-05, + "loss": 0.0344, + "step": 129150 + }, + { + "epoch": 27.02452472512593, + "grad_norm": 0.0014346131356433034, + "learning_rate": 1.6691039737131923e-05, + "loss": 0.0045, + "step": 129160 + }, + { + "epoch": 27.024578887504738, + "grad_norm": 0.003261404810473323, + "learning_rate": 1.668803071608683e-05, + "loss": 0.0259, + "step": 129170 + }, + { + "epoch": 27.02463304988355, + "grad_norm": 24.551538467407227, + "learning_rate": 1.6685021695041736e-05, + "loss": 0.0637, + "step": 129180 + }, + { + "epoch": 27.024687212262364, + "grad_norm": 1.3342615365982056, + "learning_rate": 1.6682012673996642e-05, + "loss": 0.0536, + "step": 129190 + }, + { + "epoch": 27.024741374641174, + "grad_norm": 0.0018117238068953156, + "learning_rate": 1.667900365295155e-05, + "loss": 0.0237, + "step": 129200 + }, + { + "epoch": 
27.024795537019987, + "grad_norm": 0.0012859365670010448, + "learning_rate": 1.6675994631906455e-05, + "loss": 0.0045, + "step": 129210 + }, + { + "epoch": 27.024849699398796, + "grad_norm": 0.038885682821273804, + "learning_rate": 1.6672985610861364e-05, + "loss": 0.0036, + "step": 129220 + }, + { + "epoch": 27.02490386177761, + "grad_norm": 0.046391963958740234, + "learning_rate": 1.666997658981627e-05, + "loss": 0.0453, + "step": 129230 + }, + { + "epoch": 27.024958024156422, + "grad_norm": 0.003627225523814559, + "learning_rate": 1.6666967568771177e-05, + "loss": 0.0681, + "step": 129240 + }, + { + "epoch": 27.02500135405947, + "eval_accuracy": 0.856629653821032, + "eval_loss": 0.8934354186058044, + "eval_runtime": 118.2782, + "eval_samples_per_second": 25.888, + "eval_steps_per_second": 3.238, + "step": 129248 + }, + { + "epoch": 28.000010832475763, + "grad_norm": 0.10902927815914154, + "learning_rate": 1.6663958547726083e-05, + "loss": 0.0928, + "step": 129250 + }, + { + "epoch": 28.000064994854576, + "grad_norm": 0.0013811567332595587, + "learning_rate": 1.6660949526680993e-05, + "loss": 0.0119, + "step": 129260 + }, + { + "epoch": 28.000119157233385, + "grad_norm": 0.04271673411130905, + "learning_rate": 1.6657940505635896e-05, + "loss": 0.0438, + "step": 129270 + }, + { + "epoch": 28.0001733196122, + "grad_norm": 0.001677747000940144, + "learning_rate": 1.6654931484590805e-05, + "loss": 0.0737, + "step": 129280 + }, + { + "epoch": 28.000227481991008, + "grad_norm": 0.005322233773767948, + "learning_rate": 1.665192246354571e-05, + "loss": 0.033, + "step": 129290 + }, + { + "epoch": 28.00028164436982, + "grad_norm": 0.010235190391540527, + "learning_rate": 1.6648913442500618e-05, + "loss": 0.1076, + "step": 129300 + }, + { + "epoch": 28.000335806748634, + "grad_norm": 0.006441912148147821, + "learning_rate": 1.6645904421455524e-05, + "loss": 0.0103, + "step": 129310 + }, + { + "epoch": 28.000389969127443, + "grad_norm": 0.6839904189109802, + "learning_rate": 1.6642895400410434e-05, + "loss": 0.0326, + "step": 129320 + }, + { + "epoch": 28.000444131506256, + "grad_norm": 1.1440956592559814, + "learning_rate": 1.6639886379365337e-05, + "loss": 0.0151, + "step": 129330 + }, + { + "epoch": 28.000498293885066, + "grad_norm": 0.009986195713281631, + "learning_rate": 1.6636877358320243e-05, + "loss": 0.0151, + "step": 129340 + }, + { + "epoch": 28.00055245626388, + "grad_norm": 0.0013924168888479471, + "learning_rate": 1.6633868337275153e-05, + "loss": 0.009, + "step": 129350 + }, + { + "epoch": 28.000606618642692, + "grad_norm": 0.0017950338078662753, + "learning_rate": 1.663085931623006e-05, + "loss": 0.049, + "step": 129360 + }, + { + "epoch": 28.0006607810215, + "grad_norm": 0.0011781491339206696, + "learning_rate": 1.6627850295184965e-05, + "loss": 0.025, + "step": 129370 + }, + { + "epoch": 28.000714943400315, + "grad_norm": 2.965139865875244, + "learning_rate": 1.662484127413987e-05, + "loss": 0.0442, + "step": 129380 + }, + { + "epoch": 28.000769105779124, + "grad_norm": 0.4044248163700104, + "learning_rate": 1.662183225309478e-05, + "loss": 0.0035, + "step": 129390 + }, + { + "epoch": 28.000823268157937, + "grad_norm": 0.0012968372320756316, + "learning_rate": 1.6618823232049684e-05, + "loss": 0.021, + "step": 129400 + }, + { + "epoch": 28.00087743053675, + "grad_norm": 0.0012380575062707067, + "learning_rate": 1.6615814211004594e-05, + "loss": 0.0231, + "step": 129410 + }, + { + "epoch": 28.00093159291556, + "grad_norm": 0.0020684755872935057, + "learning_rate": 
1.66128051899595e-05, + "loss": 0.0439, + "step": 129420 + }, + { + "epoch": 28.000985755294373, + "grad_norm": 0.0012171610724180937, + "learning_rate": 1.6609796168914406e-05, + "loss": 0.093, + "step": 129430 + }, + { + "epoch": 28.001039917673186, + "grad_norm": 0.9153107404708862, + "learning_rate": 1.6606787147869312e-05, + "loss": 0.0198, + "step": 129440 + }, + { + "epoch": 28.001094080051995, + "grad_norm": 0.0030592025723308325, + "learning_rate": 1.6603778126824222e-05, + "loss": 0.011, + "step": 129450 + }, + { + "epoch": 28.00114824243081, + "grad_norm": 0.0050598992966115475, + "learning_rate": 1.6600769105779125e-05, + "loss": 0.078, + "step": 129460 + }, + { + "epoch": 28.001202404809618, + "grad_norm": 0.3059041500091553, + "learning_rate": 1.6597760084734035e-05, + "loss": 0.0031, + "step": 129470 + }, + { + "epoch": 28.00125656718843, + "grad_norm": 0.002397483214735985, + "learning_rate": 1.659475106368894e-05, + "loss": 0.0243, + "step": 129480 + }, + { + "epoch": 28.001310729567244, + "grad_norm": 0.004786746576428413, + "learning_rate": 1.6591742042643847e-05, + "loss": 0.0606, + "step": 129490 + }, + { + "epoch": 28.001364891946054, + "grad_norm": 1.0594444274902344, + "learning_rate": 1.6588733021598754e-05, + "loss": 0.0185, + "step": 129500 + }, + { + "epoch": 28.001419054324867, + "grad_norm": 0.004477751906961203, + "learning_rate": 1.658572400055366e-05, + "loss": 0.0119, + "step": 129510 + }, + { + "epoch": 28.001473216703676, + "grad_norm": 0.002267408650368452, + "learning_rate": 1.658271497950857e-05, + "loss": 0.0393, + "step": 129520 + }, + { + "epoch": 28.00152737908249, + "grad_norm": 0.09879808872938156, + "learning_rate": 1.6579705958463472e-05, + "loss": 0.0036, + "step": 129530 + }, + { + "epoch": 28.001581541461302, + "grad_norm": 0.5254073143005371, + "learning_rate": 1.6576696937418382e-05, + "loss": 0.0439, + "step": 129540 + }, + { + "epoch": 28.001635703840112, + "grad_norm": 0.0011950411135330796, + "learning_rate": 1.6573687916373288e-05, + "loss": 0.026, + "step": 129550 + }, + { + "epoch": 28.001689866218925, + "grad_norm": 0.0012280249502509832, + "learning_rate": 1.6570678895328195e-05, + "loss": 0.0802, + "step": 129560 + }, + { + "epoch": 28.001744028597734, + "grad_norm": 2.045048236846924, + "learning_rate": 1.65676698742831e-05, + "loss": 0.1148, + "step": 129570 + }, + { + "epoch": 28.001798190976547, + "grad_norm": 0.0013950330903753638, + "learning_rate": 1.656466085323801e-05, + "loss": 0.0009, + "step": 129580 + }, + { + "epoch": 28.00185235335536, + "grad_norm": 0.006280303467065096, + "learning_rate": 1.6561651832192913e-05, + "loss": 0.0236, + "step": 129590 + }, + { + "epoch": 28.00190651573417, + "grad_norm": 0.0016850673127919436, + "learning_rate": 1.6558642811147823e-05, + "loss": 0.006, + "step": 129600 + }, + { + "epoch": 28.001960678112983, + "grad_norm": 0.14757446944713593, + "learning_rate": 1.655563379010273e-05, + "loss": 0.0656, + "step": 129610 + }, + { + "epoch": 28.002014840491796, + "grad_norm": 0.0015006415778771043, + "learning_rate": 1.6552624769057636e-05, + "loss": 0.0392, + "step": 129620 + }, + { + "epoch": 28.002069002870606, + "grad_norm": 0.0024259034544229507, + "learning_rate": 1.6549615748012542e-05, + "loss": 0.0666, + "step": 129630 + }, + { + "epoch": 28.00212316524942, + "grad_norm": 0.0013214644277468324, + "learning_rate": 1.6546606726967448e-05, + "loss": 0.034, + "step": 129640 + }, + { + "epoch": 28.002177327628228, + "grad_norm": 0.8192682266235352, + "learning_rate": 
1.6543597705922358e-05, + "loss": 0.0672, + "step": 129650 + }, + { + "epoch": 28.00223149000704, + "grad_norm": 0.10303469002246857, + "learning_rate": 1.654058868487726e-05, + "loss": 0.0182, + "step": 129660 + }, + { + "epoch": 28.002285652385854, + "grad_norm": 0.014051783829927444, + "learning_rate": 1.653757966383217e-05, + "loss": 0.0303, + "step": 129670 + }, + { + "epoch": 28.002339814764664, + "grad_norm": 0.0011834470788016915, + "learning_rate": 1.6534570642787077e-05, + "loss": 0.0525, + "step": 129680 + }, + { + "epoch": 28.002393977143477, + "grad_norm": 0.0012092376127839088, + "learning_rate": 1.6531561621741983e-05, + "loss": 0.0097, + "step": 129690 + }, + { + "epoch": 28.002448139522286, + "grad_norm": 0.0012238852214068174, + "learning_rate": 1.652855260069689e-05, + "loss": 0.0269, + "step": 129700 + }, + { + "epoch": 28.0025023019011, + "grad_norm": 0.00886338111013174, + "learning_rate": 1.65255435796518e-05, + "loss": 0.0356, + "step": 129710 + }, + { + "epoch": 28.002556464279913, + "grad_norm": 0.00859817024320364, + "learning_rate": 1.6522534558606702e-05, + "loss": 0.015, + "step": 129720 + }, + { + "epoch": 28.002610626658722, + "grad_norm": 0.0017295668367296457, + "learning_rate": 1.651952553756161e-05, + "loss": 0.0003, + "step": 129730 + }, + { + "epoch": 28.002664789037535, + "grad_norm": 0.09287331253290176, + "learning_rate": 1.6516516516516518e-05, + "loss": 0.0414, + "step": 129740 + }, + { + "epoch": 28.002718951416345, + "grad_norm": 0.0012426739558577538, + "learning_rate": 1.6513507495471424e-05, + "loss": 0.0022, + "step": 129750 + }, + { + "epoch": 28.002773113795158, + "grad_norm": 3.369598150253296, + "learning_rate": 1.651049847442633e-05, + "loss": 0.052, + "step": 129760 + }, + { + "epoch": 28.00282727617397, + "grad_norm": 0.016463037580251694, + "learning_rate": 1.650748945338124e-05, + "loss": 0.0474, + "step": 129770 + }, + { + "epoch": 28.00288143855278, + "grad_norm": 2.3303189277648926, + "learning_rate": 1.6504480432336146e-05, + "loss": 0.0535, + "step": 129780 + }, + { + "epoch": 28.002935600931593, + "grad_norm": 0.0015888102352619171, + "learning_rate": 1.650147141129105e-05, + "loss": 0.0288, + "step": 129790 + }, + { + "epoch": 28.002989763310406, + "grad_norm": 1.7409392595291138, + "learning_rate": 1.649846239024596e-05, + "loss": 0.0194, + "step": 129800 + }, + { + "epoch": 28.003043925689216, + "grad_norm": 0.048276063054800034, + "learning_rate": 1.6495453369200865e-05, + "loss": 0.0007, + "step": 129810 + }, + { + "epoch": 28.00309808806803, + "grad_norm": 0.0014945976436138153, + "learning_rate": 1.649244434815577e-05, + "loss": 0.0201, + "step": 129820 + }, + { + "epoch": 28.00315225044684, + "grad_norm": 0.001030805753543973, + "learning_rate": 1.6489435327110678e-05, + "loss": 0.0269, + "step": 129830 + }, + { + "epoch": 28.00320641282565, + "grad_norm": 0.03045765869319439, + "learning_rate": 1.6486426306065587e-05, + "loss": 0.0268, + "step": 129840 + }, + { + "epoch": 28.003260575204465, + "grad_norm": 0.0010515428148210049, + "learning_rate": 1.648341728502049e-05, + "loss": 0.0583, + "step": 129850 + }, + { + "epoch": 28.003314737583274, + "grad_norm": 0.5225806832313538, + "learning_rate": 1.64804082639754e-05, + "loss": 0.0245, + "step": 129860 + }, + { + "epoch": 28.003368899962087, + "grad_norm": 0.0015064437175169587, + "learning_rate": 1.6477399242930306e-05, + "loss": 0.0379, + "step": 129870 + }, + { + "epoch": 28.003423062340897, + "grad_norm": 0.021960997954010963, + "learning_rate": 
1.6474390221885212e-05, + "loss": 0.0366, + "step": 129880 + }, + { + "epoch": 28.00347722471971, + "grad_norm": 0.0012037558481097221, + "learning_rate": 1.647138120084012e-05, + "loss": 0.1085, + "step": 129890 + }, + { + "epoch": 28.003531387098523, + "grad_norm": 0.4089851975440979, + "learning_rate": 1.6468372179795028e-05, + "loss": 0.0975, + "step": 129900 + }, + { + "epoch": 28.003585549477332, + "grad_norm": 0.003491367679089308, + "learning_rate": 1.6465363158749935e-05, + "loss": 0.0288, + "step": 129910 + }, + { + "epoch": 28.003639711856145, + "grad_norm": 0.0014377040788531303, + "learning_rate": 1.646235413770484e-05, + "loss": 0.0741, + "step": 129920 + }, + { + "epoch": 28.003693874234955, + "grad_norm": 0.015407111495733261, + "learning_rate": 1.6459345116659747e-05, + "loss": 0.0502, + "step": 129930 + }, + { + "epoch": 28.003748036613768, + "grad_norm": 0.0018103920156136155, + "learning_rate": 1.6456336095614653e-05, + "loss": 0.0373, + "step": 129940 + }, + { + "epoch": 28.00380219899258, + "grad_norm": 0.0019593380857259035, + "learning_rate": 1.645332707456956e-05, + "loss": 0.0014, + "step": 129950 + }, + { + "epoch": 28.00385636137139, + "grad_norm": 0.02008170261979103, + "learning_rate": 1.6450318053524466e-05, + "loss": 0.0023, + "step": 129960 + }, + { + "epoch": 28.003910523750204, + "grad_norm": 0.5293545722961426, + "learning_rate": 1.6447309032479376e-05, + "loss": 0.0344, + "step": 129970 + }, + { + "epoch": 28.003964686129013, + "grad_norm": 0.002850957913324237, + "learning_rate": 1.644430001143428e-05, + "loss": 0.0544, + "step": 129980 + }, + { + "epoch": 28.004018848507826, + "grad_norm": 2.0882742404937744, + "learning_rate": 1.6441290990389188e-05, + "loss": 0.0353, + "step": 129990 + }, + { + "epoch": 28.00407301088664, + "grad_norm": 0.0010769640794023871, + "learning_rate": 1.6438281969344094e-05, + "loss": 0.0135, + "step": 130000 + }, + { + "epoch": 28.00412717326545, + "grad_norm": 0.10990235954523087, + "learning_rate": 1.6435272948299e-05, + "loss": 0.0102, + "step": 130010 + }, + { + "epoch": 28.00418133564426, + "grad_norm": 0.1323474794626236, + "learning_rate": 1.6432263927253907e-05, + "loss": 0.0449, + "step": 130020 + }, + { + "epoch": 28.004235498023075, + "grad_norm": 1.7415670156478882, + "learning_rate": 1.6429254906208817e-05, + "loss": 0.1267, + "step": 130030 + }, + { + "epoch": 28.004289660401884, + "grad_norm": 0.06497974693775177, + "learning_rate": 1.6426245885163723e-05, + "loss": 0.0815, + "step": 130040 + }, + { + "epoch": 28.004343822780697, + "grad_norm": 0.0012322377879172564, + "learning_rate": 1.642323686411863e-05, + "loss": 0.03, + "step": 130050 + }, + { + "epoch": 28.004397985159507, + "grad_norm": 0.8962012529373169, + "learning_rate": 1.6420227843073535e-05, + "loss": 0.0382, + "step": 130060 + }, + { + "epoch": 28.00445214753832, + "grad_norm": 0.5321817994117737, + "learning_rate": 1.6417218822028445e-05, + "loss": 0.027, + "step": 130070 + }, + { + "epoch": 28.004506309917133, + "grad_norm": 0.00166158692445606, + "learning_rate": 1.6414209800983348e-05, + "loss": 0.086, + "step": 130080 + }, + { + "epoch": 28.004560472295942, + "grad_norm": 0.00138638645876199, + "learning_rate": 1.6411200779938254e-05, + "loss": 0.0497, + "step": 130090 + }, + { + "epoch": 28.004614634674756, + "grad_norm": 0.001171346870251, + "learning_rate": 1.6408191758893164e-05, + "loss": 0.0121, + "step": 130100 + }, + { + "epoch": 28.004668797053565, + "grad_norm": 0.058992817997932434, + "learning_rate": 1.6405182737848067e-05, 
+ "loss": 0.0334, + "step": 130110 + }, + { + "epoch": 28.004722959432378, + "grad_norm": 1.701917052268982, + "learning_rate": 1.6402173716802976e-05, + "loss": 0.0168, + "step": 130120 + }, + { + "epoch": 28.00477712181119, + "grad_norm": 1.193941354751587, + "learning_rate": 1.6399164695757883e-05, + "loss": 0.0478, + "step": 130130 + }, + { + "epoch": 28.00483128419, + "grad_norm": 0.6039138436317444, + "learning_rate": 1.639615567471279e-05, + "loss": 0.0145, + "step": 130140 + }, + { + "epoch": 28.004885446568814, + "grad_norm": 0.0014101783744990826, + "learning_rate": 1.6393146653667695e-05, + "loss": 0.025, + "step": 130150 + }, + { + "epoch": 28.004939608947623, + "grad_norm": 0.0015370292821899056, + "learning_rate": 1.6390137632622605e-05, + "loss": 0.018, + "step": 130160 + }, + { + "epoch": 28.004993771326436, + "grad_norm": 0.0011564193991944194, + "learning_rate": 1.638712861157751e-05, + "loss": 0.098, + "step": 130170 + }, + { + "epoch": 28.00504793370525, + "grad_norm": 0.9410964250564575, + "learning_rate": 1.6384119590532417e-05, + "loss": 0.0279, + "step": 130180 + }, + { + "epoch": 28.00510209608406, + "grad_norm": 0.0017029792070388794, + "learning_rate": 1.6381110569487324e-05, + "loss": 0.0144, + "step": 130190 + }, + { + "epoch": 28.005156258462872, + "grad_norm": 0.0011081754928454757, + "learning_rate": 1.6378101548442233e-05, + "loss": 0.0016, + "step": 130200 + }, + { + "epoch": 28.005210420841685, + "grad_norm": 0.013738096691668034, + "learning_rate": 1.6375092527397136e-05, + "loss": 0.0076, + "step": 130210 + }, + { + "epoch": 28.005264583220495, + "grad_norm": 0.001048934180289507, + "learning_rate": 1.6372083506352046e-05, + "loss": 0.0002, + "step": 130220 + }, + { + "epoch": 28.005318745599308, + "grad_norm": 0.0028459252789616585, + "learning_rate": 1.6369074485306952e-05, + "loss": 0.005, + "step": 130230 + }, + { + "epoch": 28.005372907978117, + "grad_norm": 0.0010454322909936309, + "learning_rate": 1.6366065464261855e-05, + "loss": 0.0406, + "step": 130240 + }, + { + "epoch": 28.00542707035693, + "grad_norm": 1.1989842653274536, + "learning_rate": 1.6363056443216765e-05, + "loss": 0.0179, + "step": 130250 + }, + { + "epoch": 28.005481232735743, + "grad_norm": 0.0068631041795015335, + "learning_rate": 1.636004742217167e-05, + "loss": 0.0026, + "step": 130260 + }, + { + "epoch": 28.005535395114553, + "grad_norm": 0.001766803441569209, + "learning_rate": 1.6357038401126577e-05, + "loss": 0.0497, + "step": 130270 + }, + { + "epoch": 28.005589557493366, + "grad_norm": 0.0392676517367363, + "learning_rate": 1.6354029380081484e-05, + "loss": 0.0846, + "step": 130280 + }, + { + "epoch": 28.005643719872175, + "grad_norm": 11.710029602050781, + "learning_rate": 1.6351020359036393e-05, + "loss": 0.0991, + "step": 130290 + }, + { + "epoch": 28.00569788225099, + "grad_norm": 0.0010312506929039955, + "learning_rate": 1.63480113379913e-05, + "loss": 0.0657, + "step": 130300 + }, + { + "epoch": 28.0057520446298, + "grad_norm": 0.0010371358366683125, + "learning_rate": 1.6345002316946206e-05, + "loss": 0.0184, + "step": 130310 + }, + { + "epoch": 28.00580620700861, + "grad_norm": 0.4876607656478882, + "learning_rate": 1.6341993295901112e-05, + "loss": 0.1275, + "step": 130320 + }, + { + "epoch": 28.005860369387424, + "grad_norm": 0.0011183128226548433, + "learning_rate": 1.6338984274856022e-05, + "loss": 0.0386, + "step": 130330 + }, + { + "epoch": 28.005914531766233, + "grad_norm": 0.0011289350222796202, + "learning_rate": 1.6335975253810925e-05, + "loss": 
0.0082, + "step": 130340 + }, + { + "epoch": 28.005968694145047, + "grad_norm": 0.16720886528491974, + "learning_rate": 1.6332966232765834e-05, + "loss": 0.1021, + "step": 130350 + }, + { + "epoch": 28.00602285652386, + "grad_norm": 0.0014964862493798137, + "learning_rate": 1.632995721172074e-05, + "loss": 0.0146, + "step": 130360 + }, + { + "epoch": 28.00607701890267, + "grad_norm": 0.0014525563456118107, + "learning_rate": 1.6326948190675647e-05, + "loss": 0.0831, + "step": 130370 + }, + { + "epoch": 28.006131181281482, + "grad_norm": 0.0010732851224020123, + "learning_rate": 1.6323939169630553e-05, + "loss": 0.035, + "step": 130380 + }, + { + "epoch": 28.006185343660295, + "grad_norm": 0.0011682066833600402, + "learning_rate": 1.632093014858546e-05, + "loss": 0.0582, + "step": 130390 + }, + { + "epoch": 28.006239506039105, + "grad_norm": 0.23288594186306, + "learning_rate": 1.6317921127540366e-05, + "loss": 0.0384, + "step": 130400 + }, + { + "epoch": 28.006293668417918, + "grad_norm": 0.5291587710380554, + "learning_rate": 1.6314912106495272e-05, + "loss": 0.031, + "step": 130410 + }, + { + "epoch": 28.006347830796727, + "grad_norm": 0.0011358585907146335, + "learning_rate": 1.631190308545018e-05, + "loss": 0.0112, + "step": 130420 + }, + { + "epoch": 28.00640199317554, + "grad_norm": 0.6907745599746704, + "learning_rate": 1.6308894064405088e-05, + "loss": 0.0391, + "step": 130430 + }, + { + "epoch": 28.006456155554353, + "grad_norm": 0.5885301828384399, + "learning_rate": 1.6305885043359994e-05, + "loss": 0.042, + "step": 130440 + }, + { + "epoch": 28.006510317933163, + "grad_norm": 0.0013295381795614958, + "learning_rate": 1.63028760223149e-05, + "loss": 0.0619, + "step": 130450 + }, + { + "epoch": 28.006564480311976, + "grad_norm": 0.0011026582214981318, + "learning_rate": 1.629986700126981e-05, + "loss": 0.0569, + "step": 130460 + }, + { + "epoch": 28.006618642690785, + "grad_norm": 0.0010470881825312972, + "learning_rate": 1.6296857980224713e-05, + "loss": 0.0194, + "step": 130470 + }, + { + "epoch": 28.0066728050696, + "grad_norm": 0.00822262093424797, + "learning_rate": 1.6293848959179623e-05, + "loss": 0.0035, + "step": 130480 + }, + { + "epoch": 28.00672696744841, + "grad_norm": 0.001102633774280548, + "learning_rate": 1.629083993813453e-05, + "loss": 0.0217, + "step": 130490 + }, + { + "epoch": 28.00678112982722, + "grad_norm": 0.002983664395287633, + "learning_rate": 1.6287830917089435e-05, + "loss": 0.0196, + "step": 130500 + }, + { + "epoch": 28.006835292206034, + "grad_norm": 0.4431779384613037, + "learning_rate": 1.628482189604434e-05, + "loss": 0.0101, + "step": 130510 + }, + { + "epoch": 28.006889454584844, + "grad_norm": 0.0010401451727375388, + "learning_rate": 1.628181287499925e-05, + "loss": 0.0303, + "step": 130520 + }, + { + "epoch": 28.006943616963657, + "grad_norm": 0.4554862082004547, + "learning_rate": 1.6278803853954154e-05, + "loss": 0.0113, + "step": 130530 + }, + { + "epoch": 28.00699777934247, + "grad_norm": 0.2263910323381424, + "learning_rate": 1.627579483290906e-05, + "loss": 0.022, + "step": 130540 + }, + { + "epoch": 28.00705194172128, + "grad_norm": 1.0815868377685547, + "learning_rate": 1.627278581186397e-05, + "loss": 0.0114, + "step": 130550 + }, + { + "epoch": 28.007106104100092, + "grad_norm": 0.07524343580007553, + "learning_rate": 1.6269776790818876e-05, + "loss": 0.0132, + "step": 130560 + }, + { + "epoch": 28.007160266478905, + "grad_norm": 1.006465196609497, + "learning_rate": 1.6266767769773783e-05, + "loss": 0.0132, + "step": 130570 + 
}, + { + "epoch": 28.007214428857715, + "grad_norm": 0.0017051630420610309, + "learning_rate": 1.626375874872869e-05, + "loss": 0.0794, + "step": 130580 + }, + { + "epoch": 28.007268591236528, + "grad_norm": 1.5323412418365479, + "learning_rate": 1.62607497276836e-05, + "loss": 0.0573, + "step": 130590 + }, + { + "epoch": 28.007322753615338, + "grad_norm": 0.17517302930355072, + "learning_rate": 1.62577407066385e-05, + "loss": 0.098, + "step": 130600 + }, + { + "epoch": 28.00737691599415, + "grad_norm": 0.0010184274287894368, + "learning_rate": 1.625473168559341e-05, + "loss": 0.056, + "step": 130610 + }, + { + "epoch": 28.007431078372964, + "grad_norm": 0.0014461182290688157, + "learning_rate": 1.6251722664548317e-05, + "loss": 0.0621, + "step": 130620 + }, + { + "epoch": 28.007485240751773, + "grad_norm": 0.2562750279903412, + "learning_rate": 1.6248713643503224e-05, + "loss": 0.0183, + "step": 130630 + }, + { + "epoch": 28.007539403130586, + "grad_norm": 0.018759120255708694, + "learning_rate": 1.624570462245813e-05, + "loss": 0.0274, + "step": 130640 + }, + { + "epoch": 28.007593565509396, + "grad_norm": 0.7260323762893677, + "learning_rate": 1.624269560141304e-05, + "loss": 0.0289, + "step": 130650 + }, + { + "epoch": 28.00764772788821, + "grad_norm": 0.5760477781295776, + "learning_rate": 1.6239686580367942e-05, + "loss": 0.0132, + "step": 130660 + }, + { + "epoch": 28.007701890267022, + "grad_norm": 0.0013210320612415671, + "learning_rate": 1.6236677559322852e-05, + "loss": 0.0491, + "step": 130670 + }, + { + "epoch": 28.00775605264583, + "grad_norm": 0.0015266359550878406, + "learning_rate": 1.623366853827776e-05, + "loss": 0.041, + "step": 130680 + }, + { + "epoch": 28.007810215024644, + "grad_norm": 0.34573009610176086, + "learning_rate": 1.6230659517232665e-05, + "loss": 0.0599, + "step": 130690 + }, + { + "epoch": 28.007864377403454, + "grad_norm": 0.07606829702854156, + "learning_rate": 1.622765049618757e-05, + "loss": 0.0566, + "step": 130700 + }, + { + "epoch": 28.007918539782267, + "grad_norm": 1.190408706665039, + "learning_rate": 1.6224641475142477e-05, + "loss": 0.0185, + "step": 130710 + }, + { + "epoch": 28.00797270216108, + "grad_norm": 0.000997241004370153, + "learning_rate": 1.6221632454097387e-05, + "loss": 0.0022, + "step": 130720 + }, + { + "epoch": 28.00802686453989, + "grad_norm": 0.07635117322206497, + "learning_rate": 1.621862343305229e-05, + "loss": 0.0079, + "step": 130730 + }, + { + "epoch": 28.008081026918703, + "grad_norm": 2.1405656337738037, + "learning_rate": 1.62156144120072e-05, + "loss": 0.0442, + "step": 130740 + }, + { + "epoch": 28.008135189297516, + "grad_norm": 0.001058813650161028, + "learning_rate": 1.6212605390962106e-05, + "loss": 0.0334, + "step": 130750 + }, + { + "epoch": 28.008189351676325, + "grad_norm": 0.0010176609503105283, + "learning_rate": 1.6209596369917012e-05, + "loss": 0.0695, + "step": 130760 + }, + { + "epoch": 28.00824351405514, + "grad_norm": 0.6469788551330566, + "learning_rate": 1.6206587348871918e-05, + "loss": 0.0085, + "step": 130770 + }, + { + "epoch": 28.008297676433948, + "grad_norm": 0.0009752711048349738, + "learning_rate": 1.6203578327826828e-05, + "loss": 0.0555, + "step": 130780 + }, + { + "epoch": 28.00835183881276, + "grad_norm": 0.005517063196748495, + "learning_rate": 1.620056930678173e-05, + "loss": 0.0366, + "step": 130790 + }, + { + "epoch": 28.008406001191574, + "grad_norm": 0.0009689785074442625, + "learning_rate": 1.619756028573664e-05, + "loss": 0.0067, + "step": 130800 + }, + { + "epoch": 
28.008460163570383, + "grad_norm": 0.0013858547899872065, + "learning_rate": 1.6194551264691547e-05, + "loss": 0.1104, + "step": 130810 + }, + { + "epoch": 28.008514325949196, + "grad_norm": 0.0018955901032313704, + "learning_rate": 1.6191542243646453e-05, + "loss": 0.0813, + "step": 130820 + }, + { + "epoch": 28.008568488328006, + "grad_norm": 0.37451109290122986, + "learning_rate": 1.618853322260136e-05, + "loss": 0.049, + "step": 130830 + }, + { + "epoch": 28.00862265070682, + "grad_norm": 0.009996960870921612, + "learning_rate": 1.6185524201556265e-05, + "loss": 0.0487, + "step": 130840 + }, + { + "epoch": 28.008676813085632, + "grad_norm": 0.0010218751849606633, + "learning_rate": 1.6182515180511175e-05, + "loss": 0.0586, + "step": 130850 + }, + { + "epoch": 28.00873097546444, + "grad_norm": 0.025369463488459587, + "learning_rate": 1.6179506159466078e-05, + "loss": 0.0423, + "step": 130860 + }, + { + "epoch": 28.008785137843255, + "grad_norm": 0.0014115950325503945, + "learning_rate": 1.6176497138420988e-05, + "loss": 0.0547, + "step": 130870 + }, + { + "epoch": 28.008839300222064, + "grad_norm": 0.0018537509022280574, + "learning_rate": 1.6173488117375894e-05, + "loss": 0.1203, + "step": 130880 + }, + { + "epoch": 28.008893462600877, + "grad_norm": 0.0016517278272658587, + "learning_rate": 1.61704790963308e-05, + "loss": 0.0097, + "step": 130890 + }, + { + "epoch": 28.00894762497969, + "grad_norm": 0.0021157790906727314, + "learning_rate": 1.6167470075285707e-05, + "loss": 0.0297, + "step": 130900 + }, + { + "epoch": 28.0090017873585, + "grad_norm": 3.7135751247406006, + "learning_rate": 1.6164461054240616e-05, + "loss": 0.0695, + "step": 130910 + }, + { + "epoch": 28.009055949737313, + "grad_norm": 0.18452267348766327, + "learning_rate": 1.616145203319552e-05, + "loss": 0.0546, + "step": 130920 + }, + { + "epoch": 28.009110112116126, + "grad_norm": 0.002400605473667383, + "learning_rate": 1.615844301215043e-05, + "loss": 0.0203, + "step": 130930 + }, + { + "epoch": 28.009164274494935, + "grad_norm": 0.0016505923122167587, + "learning_rate": 1.6155433991105335e-05, + "loss": 0.0044, + "step": 130940 + }, + { + "epoch": 28.00921843687375, + "grad_norm": 0.8699111342430115, + "learning_rate": 1.615242497006024e-05, + "loss": 0.0294, + "step": 130950 + }, + { + "epoch": 28.009272599252558, + "grad_norm": 0.5417248606681824, + "learning_rate": 1.6149415949015148e-05, + "loss": 0.0179, + "step": 130960 + }, + { + "epoch": 28.00932676163137, + "grad_norm": 0.0017499647801741958, + "learning_rate": 1.6146406927970054e-05, + "loss": 0.0219, + "step": 130970 + }, + { + "epoch": 28.009380924010184, + "grad_norm": 0.6419848203659058, + "learning_rate": 1.6143397906924963e-05, + "loss": 0.0386, + "step": 130980 + }, + { + "epoch": 28.009435086388994, + "grad_norm": 0.09426411241292953, + "learning_rate": 1.6140388885879866e-05, + "loss": 0.019, + "step": 130990 + }, + { + "epoch": 28.009489248767807, + "grad_norm": 6.911012172698975, + "learning_rate": 1.6137379864834776e-05, + "loss": 0.0673, + "step": 131000 + }, + { + "epoch": 28.009543411146616, + "grad_norm": 0.0023542274720966816, + "learning_rate": 1.6134370843789682e-05, + "loss": 0.0814, + "step": 131010 + }, + { + "epoch": 28.00959757352543, + "grad_norm": 0.0015880281571298838, + "learning_rate": 1.613136182274459e-05, + "loss": 0.0112, + "step": 131020 + }, + { + "epoch": 28.009651735904242, + "grad_norm": 0.0014337899629026651, + "learning_rate": 1.6128352801699495e-05, + "loss": 0.0543, + "step": 131030 + }, + { + "epoch": 
28.009705898283052, + "grad_norm": 0.6327234506607056, + "learning_rate": 1.6125343780654405e-05, + "loss": 0.02, + "step": 131040 + }, + { + "epoch": 28.009760060661865, + "grad_norm": 0.001253536087460816, + "learning_rate": 1.6122334759609307e-05, + "loss": 0.0009, + "step": 131050 + }, + { + "epoch": 28.009814223040674, + "grad_norm": 0.9385182857513428, + "learning_rate": 1.6119325738564217e-05, + "loss": 0.0332, + "step": 131060 + }, + { + "epoch": 28.009868385419487, + "grad_norm": 0.002544662682339549, + "learning_rate": 1.6116316717519123e-05, + "loss": 0.0416, + "step": 131070 + }, + { + "epoch": 28.0099225477983, + "grad_norm": 0.001197251258417964, + "learning_rate": 1.611330769647403e-05, + "loss": 0.0278, + "step": 131080 + }, + { + "epoch": 28.00997671017711, + "grad_norm": 0.016426360234618187, + "learning_rate": 1.6110298675428936e-05, + "loss": 0.0458, + "step": 131090 + }, + { + "epoch": 28.010030872555923, + "grad_norm": 0.062499675899744034, + "learning_rate": 1.6107289654383846e-05, + "loss": 0.1316, + "step": 131100 + }, + { + "epoch": 28.010085034934733, + "grad_norm": 0.0012029390782117844, + "learning_rate": 1.6104280633338752e-05, + "loss": 0.0856, + "step": 131110 + }, + { + "epoch": 28.010139197313546, + "grad_norm": 0.02628476172685623, + "learning_rate": 1.6101271612293655e-05, + "loss": 0.065, + "step": 131120 + }, + { + "epoch": 28.01019335969236, + "grad_norm": 0.0016308127669617534, + "learning_rate": 1.6098262591248564e-05, + "loss": 0.0154, + "step": 131130 + }, + { + "epoch": 28.010247522071168, + "grad_norm": 0.00280945235863328, + "learning_rate": 1.609525357020347e-05, + "loss": 0.0194, + "step": 131140 + }, + { + "epoch": 28.01030168444998, + "grad_norm": 0.0014275130815804005, + "learning_rate": 1.6092244549158377e-05, + "loss": 0.0802, + "step": 131150 + }, + { + "epoch": 28.010355846828794, + "grad_norm": 0.0017552425852045417, + "learning_rate": 1.6089235528113283e-05, + "loss": 0.0066, + "step": 131160 + }, + { + "epoch": 28.010410009207604, + "grad_norm": 0.30028092861175537, + "learning_rate": 1.6086226507068193e-05, + "loss": 0.063, + "step": 131170 + }, + { + "epoch": 28.010464171586417, + "grad_norm": 1.1844639778137207, + "learning_rate": 1.6083217486023096e-05, + "loss": 0.0051, + "step": 131180 + }, + { + "epoch": 28.010518333965226, + "grad_norm": 0.003948296420276165, + "learning_rate": 1.6080208464978005e-05, + "loss": 0.0091, + "step": 131190 + }, + { + "epoch": 28.01057249634404, + "grad_norm": 0.0011088958708569407, + "learning_rate": 1.6077199443932912e-05, + "loss": 0.018, + "step": 131200 + }, + { + "epoch": 28.010626658722853, + "grad_norm": 0.009923124685883522, + "learning_rate": 1.6074190422887818e-05, + "loss": 0.0333, + "step": 131210 + }, + { + "epoch": 28.010680821101662, + "grad_norm": 0.001313624670729041, + "learning_rate": 1.6071181401842724e-05, + "loss": 0.0456, + "step": 131220 + }, + { + "epoch": 28.010734983480475, + "grad_norm": 0.010027412325143814, + "learning_rate": 1.6068172380797634e-05, + "loss": 0.0462, + "step": 131230 + }, + { + "epoch": 28.010789145859285, + "grad_norm": 0.0010367641225457191, + "learning_rate": 1.606516335975254e-05, + "loss": 0.0994, + "step": 131240 + }, + { + "epoch": 28.010843308238098, + "grad_norm": 0.04909486696124077, + "learning_rate": 1.6062154338707446e-05, + "loss": 0.0292, + "step": 131250 + }, + { + "epoch": 28.01089747061691, + "grad_norm": 0.47635236382484436, + "learning_rate": 1.6059145317662353e-05, + "loss": 0.0222, + "step": 131260 + }, + { + "epoch": 
28.01095163299572, + "grad_norm": 0.0010578497312963009, + "learning_rate": 1.605613629661726e-05, + "loss": 0.0187, + "step": 131270 + }, + { + "epoch": 28.011005795374533, + "grad_norm": 0.0012136328732594848, + "learning_rate": 1.6053127275572165e-05, + "loss": 0.0037, + "step": 131280 + }, + { + "epoch": 28.011059957753343, + "grad_norm": 0.0011758481850847602, + "learning_rate": 1.605011825452707e-05, + "loss": 0.013, + "step": 131290 + }, + { + "epoch": 28.011114120132156, + "grad_norm": 0.0037557214964181185, + "learning_rate": 1.604710923348198e-05, + "loss": 0.0046, + "step": 131300 + }, + { + "epoch": 28.01116828251097, + "grad_norm": 0.001595469075255096, + "learning_rate": 1.6044100212436884e-05, + "loss": 0.0188, + "step": 131310 + }, + { + "epoch": 28.01122244488978, + "grad_norm": 0.0018470536451786757, + "learning_rate": 1.6041091191391794e-05, + "loss": 0.0011, + "step": 131320 + }, + { + "epoch": 28.01127660726859, + "grad_norm": 0.8447710275650024, + "learning_rate": 1.60380821703467e-05, + "loss": 0.0466, + "step": 131330 + }, + { + "epoch": 28.011330769647405, + "grad_norm": 0.8054483532905579, + "learning_rate": 1.6035073149301606e-05, + "loss": 0.018, + "step": 131340 + }, + { + "epoch": 28.011384932026214, + "grad_norm": 0.0009689496364444494, + "learning_rate": 1.6032064128256513e-05, + "loss": 0.0607, + "step": 131350 + }, + { + "epoch": 28.011439094405027, + "grad_norm": 1.0159921646118164, + "learning_rate": 1.6029055107211422e-05, + "loss": 0.0131, + "step": 131360 + }, + { + "epoch": 28.011493256783837, + "grad_norm": 0.004260296002030373, + "learning_rate": 1.602604608616633e-05, + "loss": 0.1647, + "step": 131370 + }, + { + "epoch": 28.01154741916265, + "grad_norm": 9.688926696777344, + "learning_rate": 1.6023037065121235e-05, + "loss": 0.0931, + "step": 131380 + }, + { + "epoch": 28.011601581541463, + "grad_norm": 0.0013385943602770567, + "learning_rate": 1.602002804407614e-05, + "loss": 0.1091, + "step": 131390 + }, + { + "epoch": 28.011655743920272, + "grad_norm": 0.0011566696921363473, + "learning_rate": 1.601701902303105e-05, + "loss": 0.0001, + "step": 131400 + }, + { + "epoch": 28.011709906299085, + "grad_norm": 0.001437623519450426, + "learning_rate": 1.6014010001985954e-05, + "loss": 0.0211, + "step": 131410 + }, + { + "epoch": 28.011764068677895, + "grad_norm": 4.9429192543029785, + "learning_rate": 1.601100098094086e-05, + "loss": 0.1461, + "step": 131420 + }, + { + "epoch": 28.011818231056708, + "grad_norm": 0.8548331260681152, + "learning_rate": 1.600799195989577e-05, + "loss": 0.0372, + "step": 131430 + }, + { + "epoch": 28.01187239343552, + "grad_norm": 0.12150277942419052, + "learning_rate": 1.6004982938850672e-05, + "loss": 0.0658, + "step": 131440 + }, + { + "epoch": 28.01192655581433, + "grad_norm": 3.7076213359832764, + "learning_rate": 1.6001973917805582e-05, + "loss": 0.1117, + "step": 131450 + }, + { + "epoch": 28.011980718193144, + "grad_norm": 0.06166791543364525, + "learning_rate": 1.599896489676049e-05, + "loss": 0.071, + "step": 131460 + }, + { + "epoch": 28.012034880571953, + "grad_norm": 0.007899094372987747, + "learning_rate": 1.5995955875715395e-05, + "loss": 0.0548, + "step": 131470 + }, + { + "epoch": 28.012089042950766, + "grad_norm": 0.00285547343082726, + "learning_rate": 1.59929468546703e-05, + "loss": 0.0071, + "step": 131480 + }, + { + "epoch": 28.01214320532958, + "grad_norm": 0.0012019469868391752, + "learning_rate": 1.598993783362521e-05, + "loss": 0.0574, + "step": 131490 + }, + { + "epoch": 28.01219736770839, + 
"grad_norm": 0.43688973784446716, + "learning_rate": 1.5986928812580117e-05, + "loss": 0.0283, + "step": 131500 + }, + { + "epoch": 28.0122515300872, + "grad_norm": 0.3228369653224945, + "learning_rate": 1.5983919791535023e-05, + "loss": 0.0118, + "step": 131510 + }, + { + "epoch": 28.012305692466015, + "grad_norm": 0.0012730041053146124, + "learning_rate": 1.598091077048993e-05, + "loss": 0.0065, + "step": 131520 + }, + { + "epoch": 28.012359854844824, + "grad_norm": 0.0011336896568536758, + "learning_rate": 1.597790174944484e-05, + "loss": 0.0604, + "step": 131530 + }, + { + "epoch": 28.012414017223637, + "grad_norm": 0.0011609926586970687, + "learning_rate": 1.5974892728399742e-05, + "loss": 0.0269, + "step": 131540 + }, + { + "epoch": 28.012468179602447, + "grad_norm": 0.004589736927300692, + "learning_rate": 1.597188370735465e-05, + "loss": 0.0147, + "step": 131550 + }, + { + "epoch": 28.01252234198126, + "grad_norm": 0.0010711975628510118, + "learning_rate": 1.5968874686309558e-05, + "loss": 0.0572, + "step": 131560 + }, + { + "epoch": 28.012576504360073, + "grad_norm": 0.6506468057632446, + "learning_rate": 1.596586566526446e-05, + "loss": 0.1298, + "step": 131570 + }, + { + "epoch": 28.012630666738882, + "grad_norm": 0.006246057339012623, + "learning_rate": 1.596285664421937e-05, + "loss": 0.0146, + "step": 131580 + }, + { + "epoch": 28.012684829117696, + "grad_norm": 0.6230150461196899, + "learning_rate": 1.5959847623174277e-05, + "loss": 0.0155, + "step": 131590 + }, + { + "epoch": 28.012738991496505, + "grad_norm": 0.0025544515810906887, + "learning_rate": 1.5956838602129183e-05, + "loss": 0.0336, + "step": 131600 + }, + { + "epoch": 28.012793153875318, + "grad_norm": 0.01635883003473282, + "learning_rate": 1.595382958108409e-05, + "loss": 0.0492, + "step": 131610 + }, + { + "epoch": 28.01284731625413, + "grad_norm": 0.01940922811627388, + "learning_rate": 1.5950820560039e-05, + "loss": 0.0138, + "step": 131620 + }, + { + "epoch": 28.01290147863294, + "grad_norm": 0.13080859184265137, + "learning_rate": 1.5947811538993905e-05, + "loss": 0.0293, + "step": 131630 + }, + { + "epoch": 28.012955641011754, + "grad_norm": 0.2924156188964844, + "learning_rate": 1.594480251794881e-05, + "loss": 0.0352, + "step": 131640 + }, + { + "epoch": 28.013009803390563, + "grad_norm": 0.0017575203673914075, + "learning_rate": 1.5941793496903718e-05, + "loss": 0.0203, + "step": 131650 + }, + { + "epoch": 28.013063965769376, + "grad_norm": 0.01346070971339941, + "learning_rate": 1.5938784475858627e-05, + "loss": 0.0485, + "step": 131660 + }, + { + "epoch": 28.01311812814819, + "grad_norm": 0.29676583409309387, + "learning_rate": 1.593577545481353e-05, + "loss": 0.0054, + "step": 131670 + }, + { + "epoch": 28.013172290527, + "grad_norm": 0.6098618507385254, + "learning_rate": 1.593276643376844e-05, + "loss": 0.0322, + "step": 131680 + }, + { + "epoch": 28.013226452905812, + "grad_norm": 0.2912326455116272, + "learning_rate": 1.5929757412723346e-05, + "loss": 0.0464, + "step": 131690 + }, + { + "epoch": 28.013280615284625, + "grad_norm": 0.0009770835749804974, + "learning_rate": 1.5926748391678253e-05, + "loss": 0.0162, + "step": 131700 + }, + { + "epoch": 28.013334777663434, + "grad_norm": 1.2670376300811768, + "learning_rate": 1.592373937063316e-05, + "loss": 0.1427, + "step": 131710 + }, + { + "epoch": 28.013388940042248, + "grad_norm": 0.03501320257782936, + "learning_rate": 1.5920730349588065e-05, + "loss": 0.0023, + "step": 131720 + }, + { + "epoch": 28.013443102421057, + "grad_norm": 
0.0013948746491223574, + "learning_rate": 1.591772132854297e-05, + "loss": 0.0207, + "step": 131730 + }, + { + "epoch": 28.01349726479987, + "grad_norm": 0.0013441990595310926, + "learning_rate": 1.5914712307497878e-05, + "loss": 0.0701, + "step": 131740 + }, + { + "epoch": 28.013551427178683, + "grad_norm": 0.0011713397689163685, + "learning_rate": 1.5911703286452787e-05, + "loss": 0.0288, + "step": 131750 + }, + { + "epoch": 28.013605589557493, + "grad_norm": 0.05147545412182808, + "learning_rate": 1.5908694265407694e-05, + "loss": 0.0626, + "step": 131760 + }, + { + "epoch": 28.013659751936306, + "grad_norm": 0.42057016491889954, + "learning_rate": 1.59056852443626e-05, + "loss": 0.0502, + "step": 131770 + }, + { + "epoch": 28.013713914315115, + "grad_norm": 0.001151336939074099, + "learning_rate": 1.5902676223317506e-05, + "loss": 0.0517, + "step": 131780 + }, + { + "epoch": 28.01376807669393, + "grad_norm": 0.002459681360051036, + "learning_rate": 1.5899667202272416e-05, + "loss": 0.0038, + "step": 131790 + }, + { + "epoch": 28.01382223907274, + "grad_norm": 0.0010945441899821162, + "learning_rate": 1.589665818122732e-05, + "loss": 0.0697, + "step": 131800 + }, + { + "epoch": 28.01387640145155, + "grad_norm": 0.5773826837539673, + "learning_rate": 1.589364916018223e-05, + "loss": 0.0286, + "step": 131810 + }, + { + "epoch": 28.013930563830364, + "grad_norm": 0.001469497918151319, + "learning_rate": 1.5890640139137135e-05, + "loss": 0.0091, + "step": 131820 + }, + { + "epoch": 28.013984726209173, + "grad_norm": 0.001609772676602006, + "learning_rate": 1.588763111809204e-05, + "loss": 0.0189, + "step": 131830 + }, + { + "epoch": 28.014038888587987, + "grad_norm": 0.027969183400273323, + "learning_rate": 1.5884622097046947e-05, + "loss": 0.0692, + "step": 131840 + }, + { + "epoch": 28.0140930509668, + "grad_norm": 0.001261936966329813, + "learning_rate": 1.5881613076001857e-05, + "loss": 0.0282, + "step": 131850 + }, + { + "epoch": 28.01414721334561, + "grad_norm": 0.0012552756816148758, + "learning_rate": 1.587860405495676e-05, + "loss": 0.0111, + "step": 131860 + }, + { + "epoch": 28.014201375724422, + "grad_norm": 0.0021669778507202864, + "learning_rate": 1.5875595033911666e-05, + "loss": 0.0985, + "step": 131870 + }, + { + "epoch": 28.014255538103235, + "grad_norm": 0.0030495093669742346, + "learning_rate": 1.5872586012866576e-05, + "loss": 0.08, + "step": 131880 + }, + { + "epoch": 28.014309700482045, + "grad_norm": 0.9909594655036926, + "learning_rate": 1.5869576991821482e-05, + "loss": 0.0703, + "step": 131890 + }, + { + "epoch": 28.014363862860858, + "grad_norm": 0.001717085367999971, + "learning_rate": 1.5866567970776388e-05, + "loss": 0.0452, + "step": 131900 + }, + { + "epoch": 28.014418025239667, + "grad_norm": 0.0011627617059275508, + "learning_rate": 1.5863558949731294e-05, + "loss": 0.0364, + "step": 131910 + }, + { + "epoch": 28.01447218761848, + "grad_norm": 0.0015421939315274358, + "learning_rate": 1.5860549928686204e-05, + "loss": 0.0334, + "step": 131920 + }, + { + "epoch": 28.014526349997293, + "grad_norm": 0.0019063071813434362, + "learning_rate": 1.5857540907641107e-05, + "loss": 0.0281, + "step": 131930 + }, + { + "epoch": 28.014580512376103, + "grad_norm": 0.0038164083380252123, + "learning_rate": 1.5854531886596017e-05, + "loss": 0.0297, + "step": 131940 + }, + { + "epoch": 28.014634674754916, + "grad_norm": 0.0033660843037068844, + "learning_rate": 1.5851522865550923e-05, + "loss": 0.0162, + "step": 131950 + }, + { + "epoch": 28.014688837133725, + "grad_norm": 
0.001505651162005961, + "learning_rate": 1.584851384450583e-05, + "loss": 0.0103, + "step": 131960 + }, + { + "epoch": 28.01474299951254, + "grad_norm": 0.0019096999894827604, + "learning_rate": 1.5845504823460736e-05, + "loss": 0.0543, + "step": 131970 + }, + { + "epoch": 28.01479716189135, + "grad_norm": 3.718461751937866, + "learning_rate": 1.5842495802415645e-05, + "loss": 0.1137, + "step": 131980 + }, + { + "epoch": 28.01485132427016, + "grad_norm": 0.0017954543000087142, + "learning_rate": 1.5839486781370548e-05, + "loss": 0.0691, + "step": 131990 + }, + { + "epoch": 28.014905486648974, + "grad_norm": 1.0418694019317627, + "learning_rate": 1.5836477760325458e-05, + "loss": 0.0982, + "step": 132000 + }, + { + "epoch": 28.014959649027784, + "grad_norm": 0.05847616866230965, + "learning_rate": 1.5833468739280364e-05, + "loss": 0.028, + "step": 132010 + }, + { + "epoch": 28.015013811406597, + "grad_norm": 0.0014808300184085965, + "learning_rate": 1.583045971823527e-05, + "loss": 0.0321, + "step": 132020 + }, + { + "epoch": 28.01506797378541, + "grad_norm": 0.006788286846131086, + "learning_rate": 1.5827450697190177e-05, + "loss": 0.0096, + "step": 132030 + }, + { + "epoch": 28.01512213616422, + "grad_norm": 1.802736520767212, + "learning_rate": 1.5824441676145083e-05, + "loss": 0.0098, + "step": 132040 + }, + { + "epoch": 28.015176298543032, + "grad_norm": 0.0011708353413268924, + "learning_rate": 1.5821432655099992e-05, + "loss": 0.0189, + "step": 132050 + }, + { + "epoch": 28.015230460921845, + "grad_norm": 0.0382271409034729, + "learning_rate": 1.5818423634054895e-05, + "loss": 0.1072, + "step": 132060 + }, + { + "epoch": 28.015284623300655, + "grad_norm": 0.5167362093925476, + "learning_rate": 1.5815414613009805e-05, + "loss": 0.0047, + "step": 132070 + }, + { + "epoch": 28.015338785679468, + "grad_norm": 0.8607936501502991, + "learning_rate": 1.581240559196471e-05, + "loss": 0.0426, + "step": 132080 + }, + { + "epoch": 28.015392948058278, + "grad_norm": 0.7421157956123352, + "learning_rate": 1.5809396570919618e-05, + "loss": 0.0208, + "step": 132090 + }, + { + "epoch": 28.01544711043709, + "grad_norm": 0.001855051494203508, + "learning_rate": 1.5806387549874524e-05, + "loss": 0.0668, + "step": 132100 + }, + { + "epoch": 28.015501272815904, + "grad_norm": 1.952971339225769, + "learning_rate": 1.5803378528829434e-05, + "loss": 0.0615, + "step": 132110 + }, + { + "epoch": 28.015555435194713, + "grad_norm": 1.0760838985443115, + "learning_rate": 1.5800369507784336e-05, + "loss": 0.0764, + "step": 132120 + }, + { + "epoch": 28.015609597573526, + "grad_norm": 3.4057085514068604, + "learning_rate": 1.5797360486739246e-05, + "loss": 0.0734, + "step": 132130 + }, + { + "epoch": 28.015663759952336, + "grad_norm": 0.840890109539032, + "learning_rate": 1.5794351465694152e-05, + "loss": 0.0536, + "step": 132140 + }, + { + "epoch": 28.01571792233115, + "grad_norm": 1.9754676818847656, + "learning_rate": 1.579134244464906e-05, + "loss": 0.0302, + "step": 132150 + }, + { + "epoch": 28.015772084709962, + "grad_norm": 0.40661129355430603, + "learning_rate": 1.5788333423603965e-05, + "loss": 0.0873, + "step": 132160 + }, + { + "epoch": 28.01582624708877, + "grad_norm": 0.0015295393532142043, + "learning_rate": 1.578532440255887e-05, + "loss": 0.0354, + "step": 132170 + }, + { + "epoch": 28.015880409467584, + "grad_norm": 0.8179415464401245, + "learning_rate": 1.578231538151378e-05, + "loss": 0.0755, + "step": 132180 + }, + { + "epoch": 28.015934571846394, + "grad_norm": 0.7865458726882935, + 
"learning_rate": 1.5779306360468684e-05, + "loss": 0.0507, + "step": 132190 + }, + { + "epoch": 28.015988734225207, + "grad_norm": 0.0066210003569722176, + "learning_rate": 1.5776297339423593e-05, + "loss": 0.0439, + "step": 132200 + }, + { + "epoch": 28.01604289660402, + "grad_norm": 0.0010780274169519544, + "learning_rate": 1.57732883183785e-05, + "loss": 0.0359, + "step": 132210 + }, + { + "epoch": 28.01609705898283, + "grad_norm": 0.11495484411716461, + "learning_rate": 1.5770279297333406e-05, + "loss": 0.0398, + "step": 132220 + }, + { + "epoch": 28.016151221361643, + "grad_norm": 0.009042943827807903, + "learning_rate": 1.5767270276288312e-05, + "loss": 0.0082, + "step": 132230 + }, + { + "epoch": 28.016205383740452, + "grad_norm": 0.0010444381041452289, + "learning_rate": 1.5764261255243222e-05, + "loss": 0.0185, + "step": 132240 + }, + { + "epoch": 28.016259546119265, + "grad_norm": 0.002622907515615225, + "learning_rate": 1.5761252234198125e-05, + "loss": 0.0205, + "step": 132250 + }, + { + "epoch": 28.01631370849808, + "grad_norm": 0.0011551908683031797, + "learning_rate": 1.5758243213153034e-05, + "loss": 0.0162, + "step": 132260 + }, + { + "epoch": 28.016367870876888, + "grad_norm": 0.05592844635248184, + "learning_rate": 1.575523419210794e-05, + "loss": 0.0207, + "step": 132270 + }, + { + "epoch": 28.0164220332557, + "grad_norm": 1.0559535026550293, + "learning_rate": 1.5752225171062847e-05, + "loss": 0.0574, + "step": 132280 + }, + { + "epoch": 28.016476195634514, + "grad_norm": 0.0014074601931497455, + "learning_rate": 1.5749216150017753e-05, + "loss": 0.0018, + "step": 132290 + }, + { + "epoch": 28.016530358013323, + "grad_norm": 6.37316370010376, + "learning_rate": 1.5746207128972663e-05, + "loss": 0.0031, + "step": 132300 + }, + { + "epoch": 28.016584520392136, + "grad_norm": 0.00101807143073529, + "learning_rate": 1.574319810792757e-05, + "loss": 0.1121, + "step": 132310 + }, + { + "epoch": 28.016638682770946, + "grad_norm": 0.0012818323448300362, + "learning_rate": 1.5740189086882472e-05, + "loss": 0.1247, + "step": 132320 + }, + { + "epoch": 28.01669284514976, + "grad_norm": 0.0016740410355851054, + "learning_rate": 1.5737180065837382e-05, + "loss": 0.0775, + "step": 132330 + }, + { + "epoch": 28.016747007528572, + "grad_norm": 0.7471458911895752, + "learning_rate": 1.5734171044792288e-05, + "loss": 0.0076, + "step": 132340 + }, + { + "epoch": 28.01680116990738, + "grad_norm": 0.1090325266122818, + "learning_rate": 1.5731162023747194e-05, + "loss": 0.0219, + "step": 132350 + }, + { + "epoch": 28.016855332286195, + "grad_norm": 0.006267366465181112, + "learning_rate": 1.57281530027021e-05, + "loss": 0.0177, + "step": 132360 + }, + { + "epoch": 28.016909494665004, + "grad_norm": 0.0019007676746696234, + "learning_rate": 1.572514398165701e-05, + "loss": 0.0116, + "step": 132370 + }, + { + "epoch": 28.016963657043817, + "grad_norm": 0.0017278362065553665, + "learning_rate": 1.5722134960611913e-05, + "loss": 0.0714, + "step": 132380 + }, + { + "epoch": 28.01701781942263, + "grad_norm": 1.2233392000198364, + "learning_rate": 1.5719125939566823e-05, + "loss": 0.0747, + "step": 132390 + }, + { + "epoch": 28.01707198180144, + "grad_norm": 0.0233074352145195, + "learning_rate": 1.571611691852173e-05, + "loss": 0.0131, + "step": 132400 + }, + { + "epoch": 28.017126144180253, + "grad_norm": 0.07650650292634964, + "learning_rate": 1.5713107897476635e-05, + "loss": 0.0342, + "step": 132410 + }, + { + "epoch": 28.017180306559062, + "grad_norm": 0.0025120468344539404, + 
"learning_rate": 1.571009887643154e-05, + "loss": 0.1068, + "step": 132420 + }, + { + "epoch": 28.017234468937875, + "grad_norm": 0.007097309455275536, + "learning_rate": 1.570708985538645e-05, + "loss": 0.0132, + "step": 132430 + }, + { + "epoch": 28.01728863131669, + "grad_norm": 0.002311894204467535, + "learning_rate": 1.5704080834341358e-05, + "loss": 0.0298, + "step": 132440 + }, + { + "epoch": 28.017342793695498, + "grad_norm": 0.0022205442655831575, + "learning_rate": 1.5701071813296264e-05, + "loss": 0.0261, + "step": 132450 + }, + { + "epoch": 28.01739695607431, + "grad_norm": 0.0019583520479500294, + "learning_rate": 1.569806279225117e-05, + "loss": 0.003, + "step": 132460 + }, + { + "epoch": 28.017451118453124, + "grad_norm": 0.2029016613960266, + "learning_rate": 1.5695053771206076e-05, + "loss": 0.1075, + "step": 132470 + }, + { + "epoch": 28.017505280831934, + "grad_norm": 5.024944305419922, + "learning_rate": 1.5692044750160983e-05, + "loss": 0.1256, + "step": 132480 + }, + { + "epoch": 28.017559443210747, + "grad_norm": 1.1180381774902344, + "learning_rate": 1.568903572911589e-05, + "loss": 0.0357, + "step": 132490 + }, + { + "epoch": 28.017613605589556, + "grad_norm": 0.34258317947387695, + "learning_rate": 1.56860267080708e-05, + "loss": 0.0041, + "step": 132500 + }, + { + "epoch": 28.01766776796837, + "grad_norm": 0.5925453901290894, + "learning_rate": 1.56830176870257e-05, + "loss": 0.0274, + "step": 132510 + }, + { + "epoch": 28.017721930347182, + "grad_norm": 0.18594703078269958, + "learning_rate": 1.568000866598061e-05, + "loss": 0.036, + "step": 132520 + }, + { + "epoch": 28.017776092725992, + "grad_norm": 0.07048450410366058, + "learning_rate": 1.5676999644935517e-05, + "loss": 0.0966, + "step": 132530 + }, + { + "epoch": 28.017830255104805, + "grad_norm": 0.36229968070983887, + "learning_rate": 1.5673990623890424e-05, + "loss": 0.0222, + "step": 132540 + }, + { + "epoch": 28.017884417483614, + "grad_norm": 0.9492143392562866, + "learning_rate": 1.567098160284533e-05, + "loss": 0.0693, + "step": 132550 + }, + { + "epoch": 28.017938579862427, + "grad_norm": 0.0013440451584756374, + "learning_rate": 1.566797258180024e-05, + "loss": 0.0119, + "step": 132560 + }, + { + "epoch": 28.01799274224124, + "grad_norm": 0.0012796681839972734, + "learning_rate": 1.5664963560755146e-05, + "loss": 0.0173, + "step": 132570 + }, + { + "epoch": 28.01804690462005, + "grad_norm": 1.0363472700119019, + "learning_rate": 1.5661954539710052e-05, + "loss": 0.0218, + "step": 132580 + }, + { + "epoch": 28.018101066998863, + "grad_norm": 0.6494547724723816, + "learning_rate": 1.565894551866496e-05, + "loss": 0.0151, + "step": 132590 + }, + { + "epoch": 28.018155229377673, + "grad_norm": 0.0010734102688729763, + "learning_rate": 1.5655936497619865e-05, + "loss": 0.0665, + "step": 132600 + }, + { + "epoch": 28.018209391756486, + "grad_norm": 0.001064796233549714, + "learning_rate": 1.565292747657477e-05, + "loss": 0.0055, + "step": 132610 + }, + { + "epoch": 28.0182635541353, + "grad_norm": 0.09735960513353348, + "learning_rate": 1.5649918455529677e-05, + "loss": 0.0455, + "step": 132620 + }, + { + "epoch": 28.018317716514108, + "grad_norm": 0.06444528698921204, + "learning_rate": 1.5646909434484587e-05, + "loss": 0.0211, + "step": 132630 + }, + { + "epoch": 28.01837187889292, + "grad_norm": 0.8592331409454346, + "learning_rate": 1.564390041343949e-05, + "loss": 0.0642, + "step": 132640 + }, + { + "epoch": 28.018426041271734, + "grad_norm": 0.04910512641072273, + "learning_rate": 
1.56408913923944e-05, + "loss": 0.0213, + "step": 132650 + }, + { + "epoch": 28.018480203650544, + "grad_norm": 0.02874310500919819, + "learning_rate": 1.5637882371349306e-05, + "loss": 0.0322, + "step": 132660 + }, + { + "epoch": 28.018534366029357, + "grad_norm": 0.0012254550820216537, + "learning_rate": 1.5634873350304212e-05, + "loss": 0.1187, + "step": 132670 + }, + { + "epoch": 28.018588528408166, + "grad_norm": 0.0018481449224054813, + "learning_rate": 1.5631864329259118e-05, + "loss": 0.0819, + "step": 132680 + }, + { + "epoch": 28.01864269078698, + "grad_norm": 0.03873195871710777, + "learning_rate": 1.5628855308214028e-05, + "loss": 0.0731, + "step": 132690 + }, + { + "epoch": 28.018696853165793, + "grad_norm": 0.0011301682097837329, + "learning_rate": 1.5625846287168934e-05, + "loss": 0.0125, + "step": 132700 + }, + { + "epoch": 28.018751015544602, + "grad_norm": 0.0021636139135807753, + "learning_rate": 1.562283726612384e-05, + "loss": 0.1553, + "step": 132710 + }, + { + "epoch": 28.018805177923415, + "grad_norm": 0.039920397102832794, + "learning_rate": 1.5619828245078747e-05, + "loss": 0.003, + "step": 132720 + }, + { + "epoch": 28.018859340302225, + "grad_norm": 0.0030231012497097254, + "learning_rate": 1.5616819224033653e-05, + "loss": 0.0045, + "step": 132730 + }, + { + "epoch": 28.018913502681038, + "grad_norm": 0.001937576220370829, + "learning_rate": 1.561381020298856e-05, + "loss": 0.0187, + "step": 132740 + }, + { + "epoch": 28.01896766505985, + "grad_norm": 0.0015424853190779686, + "learning_rate": 1.561080118194347e-05, + "loss": 0.0075, + "step": 132750 + }, + { + "epoch": 28.01902182743866, + "grad_norm": 0.0018241774523630738, + "learning_rate": 1.5607792160898375e-05, + "loss": 0.0099, + "step": 132760 + }, + { + "epoch": 28.019075989817473, + "grad_norm": 0.2687102258205414, + "learning_rate": 1.5604783139853278e-05, + "loss": 0.0216, + "step": 132770 + }, + { + "epoch": 28.019130152196283, + "grad_norm": 0.010148055851459503, + "learning_rate": 1.5601774118808188e-05, + "loss": 0.008, + "step": 132780 + }, + { + "epoch": 28.019184314575096, + "grad_norm": 0.7059053778648376, + "learning_rate": 1.5598765097763094e-05, + "loss": 0.0384, + "step": 132790 + }, + { + "epoch": 28.01923847695391, + "grad_norm": 0.0019763733725994825, + "learning_rate": 1.5595756076718e-05, + "loss": 0.0477, + "step": 132800 + }, + { + "epoch": 28.01929263933272, + "grad_norm": 0.6761925220489502, + "learning_rate": 1.5592747055672907e-05, + "loss": 0.0474, + "step": 132810 + }, + { + "epoch": 28.01934680171153, + "grad_norm": 0.14130575954914093, + "learning_rate": 1.5589738034627816e-05, + "loss": 0.0013, + "step": 132820 + }, + { + "epoch": 28.019400964090345, + "grad_norm": 0.004901786334812641, + "learning_rate": 1.5586729013582723e-05, + "loss": 0.0189, + "step": 132830 + }, + { + "epoch": 28.019455126469154, + "grad_norm": 6.737905025482178, + "learning_rate": 1.558371999253763e-05, + "loss": 0.0509, + "step": 132840 + }, + { + "epoch": 28.019509288847967, + "grad_norm": 0.031383976340293884, + "learning_rate": 1.5580710971492535e-05, + "loss": 0.0059, + "step": 132850 + }, + { + "epoch": 28.019563451226777, + "grad_norm": 0.018358301371335983, + "learning_rate": 1.5577701950447445e-05, + "loss": 0.0287, + "step": 132860 + }, + { + "epoch": 28.01961761360559, + "grad_norm": 0.2682872414588928, + "learning_rate": 1.5574692929402348e-05, + "loss": 0.039, + "step": 132870 + }, + { + "epoch": 28.019671775984403, + "grad_norm": 0.2901243567466736, + "learning_rate": 
1.5571683908357257e-05, + "loss": 0.0454, + "step": 132880 + }, + { + "epoch": 28.019725938363212, + "grad_norm": 0.0013779174769297242, + "learning_rate": 1.5568674887312164e-05, + "loss": 0.0001, + "step": 132890 + }, + { + "epoch": 28.019780100742025, + "grad_norm": 0.0013303630985319614, + "learning_rate": 1.556566586626707e-05, + "loss": 0.0016, + "step": 132900 + }, + { + "epoch": 28.019834263120835, + "grad_norm": 0.012907867319881916, + "learning_rate": 1.5562656845221976e-05, + "loss": 0.0655, + "step": 132910 + }, + { + "epoch": 28.019888425499648, + "grad_norm": 0.001917414367198944, + "learning_rate": 1.5559647824176882e-05, + "loss": 0.0308, + "step": 132920 + }, + { + "epoch": 28.01994258787846, + "grad_norm": 0.07030254602432251, + "learning_rate": 1.555663880313179e-05, + "loss": 0.1457, + "step": 132930 + }, + { + "epoch": 28.01999675025727, + "grad_norm": 0.13555240631103516, + "learning_rate": 1.5553629782086695e-05, + "loss": 0.0123, + "step": 132940 + }, + { + "epoch": 28.020050912636083, + "grad_norm": 0.003456619568169117, + "learning_rate": 1.5550620761041605e-05, + "loss": 0.0154, + "step": 132950 + }, + { + "epoch": 28.020105075014893, + "grad_norm": 0.0014135220553725958, + "learning_rate": 1.554761173999651e-05, + "loss": 0.057, + "step": 132960 + }, + { + "epoch": 28.020159237393706, + "grad_norm": 0.05715344846248627, + "learning_rate": 1.5544602718951417e-05, + "loss": 0.0068, + "step": 132970 + }, + { + "epoch": 28.02021339977252, + "grad_norm": 0.2684767544269562, + "learning_rate": 1.5541593697906323e-05, + "loss": 0.0319, + "step": 132980 + }, + { + "epoch": 28.02026756215133, + "grad_norm": 1.5804722309112549, + "learning_rate": 1.5538584676861233e-05, + "loss": 0.0266, + "step": 132990 + }, + { + "epoch": 28.02032172453014, + "grad_norm": 2.2312612533569336, + "learning_rate": 1.5535575655816136e-05, + "loss": 0.0575, + "step": 133000 + }, + { + "epoch": 28.020375886908955, + "grad_norm": 0.0011035555507987738, + "learning_rate": 1.5532566634771046e-05, + "loss": 0.0546, + "step": 133010 + }, + { + "epoch": 28.020430049287764, + "grad_norm": 1.3794183731079102, + "learning_rate": 1.5529557613725952e-05, + "loss": 0.0546, + "step": 133020 + }, + { + "epoch": 28.020484211666577, + "grad_norm": 0.4027445614337921, + "learning_rate": 1.5526548592680858e-05, + "loss": 0.0042, + "step": 133030 + }, + { + "epoch": 28.020538374045387, + "grad_norm": 0.11606644839048386, + "learning_rate": 1.5523539571635764e-05, + "loss": 0.0368, + "step": 133040 + }, + { + "epoch": 28.0205925364242, + "grad_norm": 0.0009798919782042503, + "learning_rate": 1.5520530550590674e-05, + "loss": 0.0348, + "step": 133050 + }, + { + "epoch": 28.020646698803013, + "grad_norm": 0.0011442303657531738, + "learning_rate": 1.5517521529545577e-05, + "loss": 0.0004, + "step": 133060 + }, + { + "epoch": 28.020700861181822, + "grad_norm": 0.27149781584739685, + "learning_rate": 1.5514512508500483e-05, + "loss": 0.0307, + "step": 133070 + }, + { + "epoch": 28.020755023560636, + "grad_norm": 0.0014495467767119408, + "learning_rate": 1.5511503487455393e-05, + "loss": 0.1328, + "step": 133080 + }, + { + "epoch": 28.020809185939445, + "grad_norm": 0.00181293033529073, + "learning_rate": 1.55084944664103e-05, + "loss": 0.0206, + "step": 133090 + }, + { + "epoch": 28.020863348318258, + "grad_norm": 0.02120986580848694, + "learning_rate": 1.5505485445365206e-05, + "loss": 0.1241, + "step": 133100 + }, + { + "epoch": 28.02091751069707, + "grad_norm": 0.0013013132847845554, + "learning_rate": 
1.5502476424320112e-05, + "loss": 0.0312, + "step": 133110 + }, + { + "epoch": 28.02097167307588, + "grad_norm": 4.429993152618408, + "learning_rate": 1.549946740327502e-05, + "loss": 0.1007, + "step": 133120 + }, + { + "epoch": 28.021025835454694, + "grad_norm": 0.0010755662806332111, + "learning_rate": 1.5496458382229924e-05, + "loss": 0.0921, + "step": 133130 + }, + { + "epoch": 28.021079997833503, + "grad_norm": 1.877479076385498, + "learning_rate": 1.5493449361184834e-05, + "loss": 0.0517, + "step": 133140 + }, + { + "epoch": 28.021134160212316, + "grad_norm": 0.015191525220870972, + "learning_rate": 1.549044034013974e-05, + "loss": 0.0016, + "step": 133150 + }, + { + "epoch": 28.02118832259113, + "grad_norm": 1.4350473880767822, + "learning_rate": 1.5487431319094647e-05, + "loss": 0.0499, + "step": 133160 + }, + { + "epoch": 28.02124248496994, + "grad_norm": 0.12337932735681534, + "learning_rate": 1.5484422298049553e-05, + "loss": 0.0476, + "step": 133170 + }, + { + "epoch": 28.021296647348752, + "grad_norm": 1.3578680753707886, + "learning_rate": 1.5481413277004462e-05, + "loss": 0.0677, + "step": 133180 + }, + { + "epoch": 28.021350809727565, + "grad_norm": 0.0017502299742773175, + "learning_rate": 1.5478404255959365e-05, + "loss": 0.0053, + "step": 133190 + }, + { + "epoch": 28.021404972106374, + "grad_norm": 0.00419851066544652, + "learning_rate": 1.5475395234914275e-05, + "loss": 0.0219, + "step": 133200 + }, + { + "epoch": 28.021459134485188, + "grad_norm": 0.0011205383343622088, + "learning_rate": 1.547238621386918e-05, + "loss": 0.0202, + "step": 133210 + }, + { + "epoch": 28.021513296863997, + "grad_norm": 0.010711521841585636, + "learning_rate": 1.5469377192824088e-05, + "loss": 0.0523, + "step": 133220 + }, + { + "epoch": 28.02156745924281, + "grad_norm": 0.012004864402115345, + "learning_rate": 1.5466368171778994e-05, + "loss": 0.0457, + "step": 133230 + }, + { + "epoch": 28.021621621621623, + "grad_norm": 0.9846242070198059, + "learning_rate": 1.54633591507339e-05, + "loss": 0.1413, + "step": 133240 + }, + { + "epoch": 28.021675784000433, + "grad_norm": 0.003988504409790039, + "learning_rate": 1.546035012968881e-05, + "loss": 0.0486, + "step": 133250 + }, + { + "epoch": 28.021729946379246, + "grad_norm": 1.0831669569015503, + "learning_rate": 1.5457341108643713e-05, + "loss": 0.0828, + "step": 133260 + }, + { + "epoch": 28.021784108758055, + "grad_norm": 0.04735938459634781, + "learning_rate": 1.5454332087598622e-05, + "loss": 0.0446, + "step": 133270 + }, + { + "epoch": 28.02183827113687, + "grad_norm": 1.4228047132492065, + "learning_rate": 1.545132306655353e-05, + "loss": 0.0371, + "step": 133280 + }, + { + "epoch": 28.02189243351568, + "grad_norm": 0.008267024531960487, + "learning_rate": 1.5448314045508435e-05, + "loss": 0.0242, + "step": 133290 + }, + { + "epoch": 28.02194659589449, + "grad_norm": 0.8438092470169067, + "learning_rate": 1.544530502446334e-05, + "loss": 0.0701, + "step": 133300 + }, + { + "epoch": 28.022000758273304, + "grad_norm": 0.020209496840834618, + "learning_rate": 1.544229600341825e-05, + "loss": 0.0318, + "step": 133310 + }, + { + "epoch": 28.022054920652113, + "grad_norm": 0.022743264213204384, + "learning_rate": 1.5439286982373154e-05, + "loss": 0.0561, + "step": 133320 + }, + { + "epoch": 28.022109083030927, + "grad_norm": 0.0012260179501026869, + "learning_rate": 1.5436277961328063e-05, + "loss": 0.0147, + "step": 133330 + }, + { + "epoch": 28.02216324540974, + "grad_norm": 0.0012241260847076774, + "learning_rate": 1.543326894028297e-05, 
+ "loss": 0.0116, + "step": 133340 + }, + { + "epoch": 28.02221740778855, + "grad_norm": 0.08282791823148727, + "learning_rate": 1.5430259919237876e-05, + "loss": 0.0088, + "step": 133350 + }, + { + "epoch": 28.022271570167362, + "grad_norm": 0.0059851985424757, + "learning_rate": 1.5427250898192782e-05, + "loss": 0.0162, + "step": 133360 + }, + { + "epoch": 28.02232573254617, + "grad_norm": 0.0017620802391320467, + "learning_rate": 1.542424187714769e-05, + "loss": 0.0094, + "step": 133370 + }, + { + "epoch": 28.022379894924985, + "grad_norm": 0.0012775476789101958, + "learning_rate": 1.5421232856102598e-05, + "loss": 0.0111, + "step": 133380 + }, + { + "epoch": 28.022434057303798, + "grad_norm": 4.585992813110352, + "learning_rate": 1.54182238350575e-05, + "loss": 0.0626, + "step": 133390 + }, + { + "epoch": 28.022488219682607, + "grad_norm": 1.6328351497650146, + "learning_rate": 1.541521481401241e-05, + "loss": 0.0518, + "step": 133400 + }, + { + "epoch": 28.02254238206142, + "grad_norm": 6.578761577606201, + "learning_rate": 1.5412205792967317e-05, + "loss": 0.0539, + "step": 133410 + }, + { + "epoch": 28.022596544440233, + "grad_norm": 0.0022973762825131416, + "learning_rate": 1.5409196771922223e-05, + "loss": 0.0861, + "step": 133420 + }, + { + "epoch": 28.022650706819043, + "grad_norm": 0.0016296353423967957, + "learning_rate": 1.540618775087713e-05, + "loss": 0.0017, + "step": 133430 + }, + { + "epoch": 28.022704869197856, + "grad_norm": 0.04120276868343353, + "learning_rate": 1.540317872983204e-05, + "loss": 0.0036, + "step": 133440 + }, + { + "epoch": 28.022759031576665, + "grad_norm": 0.19106388092041016, + "learning_rate": 1.5400169708786942e-05, + "loss": 0.0267, + "step": 133450 + }, + { + "epoch": 28.02281319395548, + "grad_norm": 0.005003406200557947, + "learning_rate": 1.5397160687741852e-05, + "loss": 0.018, + "step": 133460 + }, + { + "epoch": 28.02286735633429, + "grad_norm": 1.0414891242980957, + "learning_rate": 1.5394151666696758e-05, + "loss": 0.0352, + "step": 133470 + }, + { + "epoch": 28.0229215187131, + "grad_norm": 0.0011269139358773828, + "learning_rate": 1.5391142645651664e-05, + "loss": 0.0538, + "step": 133480 + }, + { + "epoch": 28.022975681091914, + "grad_norm": 0.0027016066014766693, + "learning_rate": 1.538813362460657e-05, + "loss": 0.0053, + "step": 133490 + }, + { + "epoch": 28.023029843470724, + "grad_norm": 1.1671373844146729, + "learning_rate": 1.538512460356148e-05, + "loss": 0.0159, + "step": 133500 + }, + { + "epoch": 28.023084005849537, + "grad_norm": 2.085441827774048, + "learning_rate": 1.5382115582516386e-05, + "loss": 0.1194, + "step": 133510 + }, + { + "epoch": 28.02313816822835, + "grad_norm": 0.0011637583374977112, + "learning_rate": 1.537910656147129e-05, + "loss": 0.0003, + "step": 133520 + }, + { + "epoch": 28.02319233060716, + "grad_norm": 0.001230793073773384, + "learning_rate": 1.53760975404262e-05, + "loss": 0.0657, + "step": 133530 + }, + { + "epoch": 28.023246492985972, + "grad_norm": 0.28337714076042175, + "learning_rate": 1.5373088519381105e-05, + "loss": 0.1116, + "step": 133540 + }, + { + "epoch": 28.023300655364782, + "grad_norm": 0.6083505749702454, + "learning_rate": 1.537007949833601e-05, + "loss": 0.0902, + "step": 133550 + }, + { + "epoch": 28.023354817743595, + "grad_norm": 0.0016691219061613083, + "learning_rate": 1.5367070477290918e-05, + "loss": 0.0684, + "step": 133560 + }, + { + "epoch": 28.023408980122408, + "grad_norm": 0.4039580225944519, + "learning_rate": 1.5364061456245828e-05, + "loss": 0.0293, + "step": 
133570 + }, + { + "epoch": 28.023463142501218, + "grad_norm": 0.0012484656181186438, + "learning_rate": 1.536105243520073e-05, + "loss": 0.0349, + "step": 133580 + }, + { + "epoch": 28.02351730488003, + "grad_norm": 1.1112693548202515, + "learning_rate": 1.535804341415564e-05, + "loss": 0.1714, + "step": 133590 + }, + { + "epoch": 28.023571467258844, + "grad_norm": 11.346073150634766, + "learning_rate": 1.5355034393110546e-05, + "loss": 0.1137, + "step": 133600 + }, + { + "epoch": 28.023625629637653, + "grad_norm": 0.0060307797975838184, + "learning_rate": 1.5352025372065453e-05, + "loss": 0.0187, + "step": 133610 + }, + { + "epoch": 28.023679792016466, + "grad_norm": 0.7118394374847412, + "learning_rate": 1.534901635102036e-05, + "loss": 0.0355, + "step": 133620 + }, + { + "epoch": 28.023733954395276, + "grad_norm": 0.0013991480227559805, + "learning_rate": 1.534600732997527e-05, + "loss": 0.0113, + "step": 133630 + }, + { + "epoch": 28.02378811677409, + "grad_norm": 0.24619223177433014, + "learning_rate": 1.5342998308930175e-05, + "loss": 0.0092, + "step": 133640 + }, + { + "epoch": 28.023842279152902, + "grad_norm": 1.2424399852752686, + "learning_rate": 1.533998928788508e-05, + "loss": 0.0325, + "step": 133650 + }, + { + "epoch": 28.02389644153171, + "grad_norm": 0.8973178267478943, + "learning_rate": 1.5336980266839987e-05, + "loss": 0.1173, + "step": 133660 + }, + { + "epoch": 28.023950603910524, + "grad_norm": 0.0013288372429087758, + "learning_rate": 1.5333971245794894e-05, + "loss": 0.0665, + "step": 133670 + }, + { + "epoch": 28.024004766289334, + "grad_norm": 0.0012805177830159664, + "learning_rate": 1.53309622247498e-05, + "loss": 0.1126, + "step": 133680 + }, + { + "epoch": 28.024058928668147, + "grad_norm": 0.0055419509299099445, + "learning_rate": 1.5327953203704706e-05, + "loss": 0.0223, + "step": 133690 + }, + { + "epoch": 28.02411309104696, + "grad_norm": 0.0778842642903328, + "learning_rate": 1.5324944182659616e-05, + "loss": 0.0202, + "step": 133700 + }, + { + "epoch": 28.02416725342577, + "grad_norm": 0.4565168619155884, + "learning_rate": 1.532193516161452e-05, + "loss": 0.0086, + "step": 133710 + }, + { + "epoch": 28.024221415804583, + "grad_norm": 0.04483634978532791, + "learning_rate": 1.531892614056943e-05, + "loss": 0.0278, + "step": 133720 + }, + { + "epoch": 28.024275578183392, + "grad_norm": 0.0012685989495366812, + "learning_rate": 1.5315917119524335e-05, + "loss": 0.0766, + "step": 133730 + }, + { + "epoch": 28.024329740562205, + "grad_norm": 0.06067318096756935, + "learning_rate": 1.531290809847924e-05, + "loss": 0.0324, + "step": 133740 + }, + { + "epoch": 28.024383902941018, + "grad_norm": 0.0013099347706884146, + "learning_rate": 1.5309899077434147e-05, + "loss": 0.026, + "step": 133750 + }, + { + "epoch": 28.024438065319828, + "grad_norm": 0.0013233991339802742, + "learning_rate": 1.5306890056389057e-05, + "loss": 0.0116, + "step": 133760 + }, + { + "epoch": 28.02449222769864, + "grad_norm": 0.011330324225127697, + "learning_rate": 1.5303881035343963e-05, + "loss": 0.0513, + "step": 133770 + }, + { + "epoch": 28.024546390077454, + "grad_norm": 0.03600993752479553, + "learning_rate": 1.530087201429887e-05, + "loss": 0.0784, + "step": 133780 + }, + { + "epoch": 28.024600552456263, + "grad_norm": 0.016300959512591362, + "learning_rate": 1.5297862993253776e-05, + "loss": 0.0316, + "step": 133790 + }, + { + "epoch": 28.024654714835076, + "grad_norm": 0.0012705783592537045, + "learning_rate": 1.5294853972208682e-05, + "loss": 0.0257, + "step": 133800 + }, + { 
+ "epoch": 28.024708877213886, + "grad_norm": 0.5422767400741577, + "learning_rate": 1.5291844951163588e-05, + "loss": 0.0042, + "step": 133810 + }, + { + "epoch": 28.0247630395927, + "grad_norm": 0.03299478441476822, + "learning_rate": 1.5288835930118495e-05, + "loss": 0.0165, + "step": 133820 + }, + { + "epoch": 28.024817201971512, + "grad_norm": 0.003083727788180113, + "learning_rate": 1.5285826909073404e-05, + "loss": 0.0261, + "step": 133830 + }, + { + "epoch": 28.02487136435032, + "grad_norm": 0.05327107012271881, + "learning_rate": 1.5282817888028307e-05, + "loss": 0.0536, + "step": 133840 + }, + { + "epoch": 28.024925526729135, + "grad_norm": 0.14346326887607574, + "learning_rate": 1.5279808866983217e-05, + "loss": 0.0147, + "step": 133850 + }, + { + "epoch": 28.024979689107944, + "grad_norm": 0.0135506521910429, + "learning_rate": 1.5276799845938123e-05, + "loss": 0.0224, + "step": 133860 + }, + { + "epoch": 28.02500135405947, + "eval_accuracy": 0.8471587197909863, + "eval_loss": 0.9309638738632202, + "eval_runtime": 118.2734, + "eval_samples_per_second": 25.889, + "eval_steps_per_second": 3.238, + "step": 133864 + }, + { + "epoch": 29.000032497427288, + "grad_norm": 0.3311435282230377, + "learning_rate": 1.527379082489303e-05, + "loss": 0.0192, + "step": 133870 + }, + { + "epoch": 29.000086659806097, + "grad_norm": 0.004768325947225094, + "learning_rate": 1.5270781803847936e-05, + "loss": 0.0119, + "step": 133880 + }, + { + "epoch": 29.00014082218491, + "grad_norm": 0.5477191209793091, + "learning_rate": 1.5267772782802845e-05, + "loss": 0.0315, + "step": 133890 + }, + { + "epoch": 29.000194984563723, + "grad_norm": 0.016086716204881668, + "learning_rate": 1.526476376175775e-05, + "loss": 0.0331, + "step": 133900 + }, + { + "epoch": 29.000249146942533, + "grad_norm": 1.5061454772949219, + "learning_rate": 1.5261754740712658e-05, + "loss": 0.047, + "step": 133910 + }, + { + "epoch": 29.000303309321346, + "grad_norm": 0.06501590460538864, + "learning_rate": 1.5258745719667564e-05, + "loss": 0.0431, + "step": 133920 + }, + { + "epoch": 29.000357471700156, + "grad_norm": 10.131319046020508, + "learning_rate": 1.5255736698622472e-05, + "loss": 0.1466, + "step": 133930 + }, + { + "epoch": 29.00041163407897, + "grad_norm": 0.0010969346622005105, + "learning_rate": 1.5252727677577378e-05, + "loss": 0.0097, + "step": 133940 + }, + { + "epoch": 29.00046579645778, + "grad_norm": 0.0011518248356878757, + "learning_rate": 1.5249718656532286e-05, + "loss": 0.0454, + "step": 133950 + }, + { + "epoch": 29.00051995883659, + "grad_norm": 0.040738411247730255, + "learning_rate": 1.5246709635487191e-05, + "loss": 0.0149, + "step": 133960 + }, + { + "epoch": 29.000574121215404, + "grad_norm": 0.0055405753664672375, + "learning_rate": 1.5243700614442097e-05, + "loss": 0.0152, + "step": 133970 + }, + { + "epoch": 29.000628283594214, + "grad_norm": 0.0010965256951749325, + "learning_rate": 1.5240691593397005e-05, + "loss": 0.0119, + "step": 133980 + }, + { + "epoch": 29.000682445973027, + "grad_norm": 0.7440513968467712, + "learning_rate": 1.5237682572351911e-05, + "loss": 0.0444, + "step": 133990 + }, + { + "epoch": 29.00073660835184, + "grad_norm": 0.0011254495475441217, + "learning_rate": 1.523467355130682e-05, + "loss": 0.053, + "step": 134000 + }, + { + "epoch": 29.00079077073065, + "grad_norm": 0.019635861739516258, + "learning_rate": 1.5231664530261724e-05, + "loss": 0.0227, + "step": 134010 + }, + { + "epoch": 29.000844933109462, + "grad_norm": 0.0028460538014769554, + "learning_rate": 
1.5228655509216632e-05, + "loss": 0.0605, + "step": 134020 + }, + { + "epoch": 29.000899095488275, + "grad_norm": 0.5430970788002014, + "learning_rate": 1.5225646488171538e-05, + "loss": 0.0259, + "step": 134030 + }, + { + "epoch": 29.000953257867085, + "grad_norm": 0.6198046803474426, + "learning_rate": 1.5222637467126446e-05, + "loss": 0.0439, + "step": 134040 + }, + { + "epoch": 29.001007420245898, + "grad_norm": 3.69958758354187, + "learning_rate": 1.5219628446081352e-05, + "loss": 0.0202, + "step": 134050 + }, + { + "epoch": 29.001061582624708, + "grad_norm": 0.001032282947562635, + "learning_rate": 1.521661942503626e-05, + "loss": 0.01, + "step": 134060 + }, + { + "epoch": 29.00111574500352, + "grad_norm": 1.0306408405303955, + "learning_rate": 1.5213610403991167e-05, + "loss": 0.0245, + "step": 134070 + }, + { + "epoch": 29.001169907382334, + "grad_norm": 0.0017351210117340088, + "learning_rate": 1.5210601382946075e-05, + "loss": 0.0003, + "step": 134080 + }, + { + "epoch": 29.001224069761143, + "grad_norm": 0.6841885447502136, + "learning_rate": 1.520759236190098e-05, + "loss": 0.0873, + "step": 134090 + }, + { + "epoch": 29.001278232139956, + "grad_norm": 0.4002379775047302, + "learning_rate": 1.5204583340855887e-05, + "loss": 0.1755, + "step": 134100 + }, + { + "epoch": 29.001332394518766, + "grad_norm": 0.018726997077465057, + "learning_rate": 1.5201574319810793e-05, + "loss": 0.0346, + "step": 134110 + }, + { + "epoch": 29.00138655689758, + "grad_norm": 0.0027492931112647057, + "learning_rate": 1.51985652987657e-05, + "loss": 0.0468, + "step": 134120 + }, + { + "epoch": 29.001440719276392, + "grad_norm": 0.02128375507891178, + "learning_rate": 1.5195556277720608e-05, + "loss": 0.0003, + "step": 134130 + }, + { + "epoch": 29.0014948816552, + "grad_norm": 1.4702974557876587, + "learning_rate": 1.5192547256675512e-05, + "loss": 0.021, + "step": 134140 + }, + { + "epoch": 29.001549044034014, + "grad_norm": 0.001478124177083373, + "learning_rate": 1.5189538235630422e-05, + "loss": 0.0182, + "step": 134150 + }, + { + "epoch": 29.001603206412824, + "grad_norm": 10.067268371582031, + "learning_rate": 1.5186529214585327e-05, + "loss": 0.1392, + "step": 134160 + }, + { + "epoch": 29.001657368791637, + "grad_norm": 10.48708724975586, + "learning_rate": 1.5183520193540235e-05, + "loss": 0.1593, + "step": 134170 + }, + { + "epoch": 29.00171153117045, + "grad_norm": 0.36103829741477966, + "learning_rate": 1.518051117249514e-05, + "loss": 0.0822, + "step": 134180 + }, + { + "epoch": 29.00176569354926, + "grad_norm": 0.007854383438825607, + "learning_rate": 1.5177502151450049e-05, + "loss": 0.0233, + "step": 134190 + }, + { + "epoch": 29.001819855928073, + "grad_norm": 0.008236411958932877, + "learning_rate": 1.5174493130404955e-05, + "loss": 0.0031, + "step": 134200 + }, + { + "epoch": 29.001874018306886, + "grad_norm": 0.002531011588871479, + "learning_rate": 1.5171484109359863e-05, + "loss": 0.0115, + "step": 134210 + }, + { + "epoch": 29.001928180685695, + "grad_norm": 0.0014171380316838622, + "learning_rate": 1.5168475088314768e-05, + "loss": 0.021, + "step": 134220 + }, + { + "epoch": 29.00198234306451, + "grad_norm": 0.0022581256926059723, + "learning_rate": 1.5165466067269676e-05, + "loss": 0.1069, + "step": 134230 + }, + { + "epoch": 29.002036505443318, + "grad_norm": 0.8240188360214233, + "learning_rate": 1.5162457046224582e-05, + "loss": 0.0797, + "step": 134240 + }, + { + "epoch": 29.00209066782213, + "grad_norm": 0.0011783079244196415, + "learning_rate": 1.515944802517949e-05, + 
"loss": 0.0252, + "step": 134250 + }, + { + "epoch": 29.002144830200944, + "grad_norm": 0.006793162319809198, + "learning_rate": 1.5156439004134396e-05, + "loss": 0.0275, + "step": 134260 + }, + { + "epoch": 29.002198992579753, + "grad_norm": 2.1922996044158936, + "learning_rate": 1.51534299830893e-05, + "loss": 0.0325, + "step": 134270 + }, + { + "epoch": 29.002253154958566, + "grad_norm": 0.0025666742585599422, + "learning_rate": 1.515042096204421e-05, + "loss": 0.0374, + "step": 134280 + }, + { + "epoch": 29.002307317337376, + "grad_norm": 1.1024655103683472, + "learning_rate": 1.5147411940999115e-05, + "loss": 0.0908, + "step": 134290 + }, + { + "epoch": 29.00236147971619, + "grad_norm": 0.054750729352235794, + "learning_rate": 1.5144402919954023e-05, + "loss": 0.0318, + "step": 134300 + }, + { + "epoch": 29.002415642095002, + "grad_norm": 0.002269495977088809, + "learning_rate": 1.5141393898908929e-05, + "loss": 0.0009, + "step": 134310 + }, + { + "epoch": 29.00246980447381, + "grad_norm": 0.04196862503886223, + "learning_rate": 1.5138384877863837e-05, + "loss": 0.0144, + "step": 134320 + }, + { + "epoch": 29.002523966852625, + "grad_norm": 0.8985464572906494, + "learning_rate": 1.5135375856818743e-05, + "loss": 0.0368, + "step": 134330 + }, + { + "epoch": 29.002578129231434, + "grad_norm": 0.003229771740734577, + "learning_rate": 1.5132366835773651e-05, + "loss": 0.0085, + "step": 134340 + }, + { + "epoch": 29.002632291610247, + "grad_norm": 0.6510912775993347, + "learning_rate": 1.5129357814728556e-05, + "loss": 0.0869, + "step": 134350 + }, + { + "epoch": 29.00268645398906, + "grad_norm": 0.139036163687706, + "learning_rate": 1.5126348793683464e-05, + "loss": 0.0776, + "step": 134360 + }, + { + "epoch": 29.00274061636787, + "grad_norm": 0.001356966677121818, + "learning_rate": 1.512333977263837e-05, + "loss": 0.0135, + "step": 134370 + }, + { + "epoch": 29.002794778746683, + "grad_norm": 0.0019245522562414408, + "learning_rate": 1.5120330751593278e-05, + "loss": 0.1047, + "step": 134380 + }, + { + "epoch": 29.002848941125496, + "grad_norm": 0.9017113447189331, + "learning_rate": 1.5117321730548184e-05, + "loss": 0.0601, + "step": 134390 + }, + { + "epoch": 29.002903103504305, + "grad_norm": 0.0026183065492659807, + "learning_rate": 1.5114312709503092e-05, + "loss": 0.008, + "step": 134400 + }, + { + "epoch": 29.00295726588312, + "grad_norm": 0.0011570487404242158, + "learning_rate": 1.5111303688457999e-05, + "loss": 0.0055, + "step": 134410 + }, + { + "epoch": 29.003011428261928, + "grad_norm": 0.02835371531546116, + "learning_rate": 1.5108294667412903e-05, + "loss": 0.0809, + "step": 134420 + }, + { + "epoch": 29.00306559064074, + "grad_norm": 4.110370635986328, + "learning_rate": 1.5105285646367811e-05, + "loss": 0.051, + "step": 134430 + }, + { + "epoch": 29.003119753019554, + "grad_norm": 0.0011133190710097551, + "learning_rate": 1.5102276625322717e-05, + "loss": 0.0082, + "step": 134440 + }, + { + "epoch": 29.003173915398364, + "grad_norm": 0.0010990806622430682, + "learning_rate": 1.5099267604277625e-05, + "loss": 0.0318, + "step": 134450 + }, + { + "epoch": 29.003228077777177, + "grad_norm": 0.005297174211591482, + "learning_rate": 1.5096258583232532e-05, + "loss": 0.0003, + "step": 134460 + }, + { + "epoch": 29.003282240155986, + "grad_norm": 0.08093482255935669, + "learning_rate": 1.509324956218744e-05, + "loss": 0.0369, + "step": 134470 + }, + { + "epoch": 29.0033364025348, + "grad_norm": 3.9409286975860596, + "learning_rate": 1.5090240541142344e-05, + "loss": 0.0743, + 
"step": 134480 + }, + { + "epoch": 29.003390564913612, + "grad_norm": 0.058487266302108765, + "learning_rate": 1.5087231520097252e-05, + "loss": 0.0115, + "step": 134490 + }, + { + "epoch": 29.003444727292422, + "grad_norm": 1.0110373497009277, + "learning_rate": 1.5084222499052159e-05, + "loss": 0.0323, + "step": 134500 + }, + { + "epoch": 29.003498889671235, + "grad_norm": 1.0743259191513062, + "learning_rate": 1.5081213478007066e-05, + "loss": 0.0214, + "step": 134510 + }, + { + "epoch": 29.003553052050044, + "grad_norm": 1.0327086448669434, + "learning_rate": 1.5078204456961973e-05, + "loss": 0.0962, + "step": 134520 + }, + { + "epoch": 29.003607214428857, + "grad_norm": 0.6004754900932312, + "learning_rate": 1.507519543591688e-05, + "loss": 0.0476, + "step": 134530 + }, + { + "epoch": 29.00366137680767, + "grad_norm": 0.0011409148573875427, + "learning_rate": 1.5072186414871787e-05, + "loss": 0.0468, + "step": 134540 + }, + { + "epoch": 29.00371553918648, + "grad_norm": 2.060243606567383, + "learning_rate": 1.5069177393826695e-05, + "loss": 0.0247, + "step": 134550 + }, + { + "epoch": 29.003769701565293, + "grad_norm": 0.370513379573822, + "learning_rate": 1.50661683727816e-05, + "loss": 0.0157, + "step": 134560 + }, + { + "epoch": 29.003823863944106, + "grad_norm": 0.016240019351243973, + "learning_rate": 1.5063159351736506e-05, + "loss": 0.0588, + "step": 134570 + }, + { + "epoch": 29.003878026322916, + "grad_norm": 0.09650033712387085, + "learning_rate": 1.5060150330691414e-05, + "loss": 0.0653, + "step": 134580 + }, + { + "epoch": 29.00393218870173, + "grad_norm": 0.02861672453582287, + "learning_rate": 1.505714130964632e-05, + "loss": 0.0319, + "step": 134590 + }, + { + "epoch": 29.00398635108054, + "grad_norm": 0.40443331003189087, + "learning_rate": 1.5054132288601228e-05, + "loss": 0.148, + "step": 134600 + }, + { + "epoch": 29.00404051345935, + "grad_norm": 1.089013695716858, + "learning_rate": 1.5051123267556133e-05, + "loss": 0.0332, + "step": 134610 + }, + { + "epoch": 29.004094675838164, + "grad_norm": 0.017625248059630394, + "learning_rate": 1.504811424651104e-05, + "loss": 0.0311, + "step": 134620 + }, + { + "epoch": 29.004148838216974, + "grad_norm": 0.6688411831855774, + "learning_rate": 1.5045105225465947e-05, + "loss": 0.1331, + "step": 134630 + }, + { + "epoch": 29.004203000595787, + "grad_norm": 0.0013370353262871504, + "learning_rate": 1.5042096204420855e-05, + "loss": 0.0134, + "step": 134640 + }, + { + "epoch": 29.004257162974596, + "grad_norm": 0.0853591337800026, + "learning_rate": 1.5039087183375761e-05, + "loss": 0.0034, + "step": 134650 + }, + { + "epoch": 29.00431132535341, + "grad_norm": 0.053089920431375504, + "learning_rate": 1.5036078162330669e-05, + "loss": 0.0066, + "step": 134660 + }, + { + "epoch": 29.004365487732223, + "grad_norm": 0.0020931288599967957, + "learning_rate": 1.5033069141285575e-05, + "loss": 0.0091, + "step": 134670 + }, + { + "epoch": 29.004419650111032, + "grad_norm": 1.010377287864685, + "learning_rate": 1.5030060120240483e-05, + "loss": 0.0972, + "step": 134680 + }, + { + "epoch": 29.004473812489845, + "grad_norm": 0.0014294898137450218, + "learning_rate": 1.5027051099195388e-05, + "loss": 0.0306, + "step": 134690 + }, + { + "epoch": 29.004527974868655, + "grad_norm": 0.08537038415670395, + "learning_rate": 1.5024042078150296e-05, + "loss": 0.025, + "step": 134700 + }, + { + "epoch": 29.004582137247468, + "grad_norm": 0.001132581033743918, + "learning_rate": 1.5021033057105202e-05, + "loss": 0.1014, + "step": 134710 + }, + { + 
"epoch": 29.00463629962628, + "grad_norm": 0.0013926562387496233, + "learning_rate": 1.5018024036060108e-05, + "loss": 0.0472, + "step": 134720 + }, + { + "epoch": 29.00469046200509, + "grad_norm": 0.5375736355781555, + "learning_rate": 1.5015015015015016e-05, + "loss": 0.021, + "step": 134730 + }, + { + "epoch": 29.004744624383903, + "grad_norm": 0.0011980487033724785, + "learning_rate": 1.5012005993969921e-05, + "loss": 0.0009, + "step": 134740 + }, + { + "epoch": 29.004798786762716, + "grad_norm": 0.028722912073135376, + "learning_rate": 1.5008996972924829e-05, + "loss": 0.0071, + "step": 134750 + }, + { + "epoch": 29.004852949141526, + "grad_norm": 0.1599704474210739, + "learning_rate": 1.5005987951879735e-05, + "loss": 0.0211, + "step": 134760 + }, + { + "epoch": 29.00490711152034, + "grad_norm": 0.867590606212616, + "learning_rate": 1.5002978930834643e-05, + "loss": 0.021, + "step": 134770 + }, + { + "epoch": 29.00496127389915, + "grad_norm": 0.47453275322914124, + "learning_rate": 1.499996990978955e-05, + "loss": 0.0246, + "step": 134780 + }, + { + "epoch": 29.00501543627796, + "grad_norm": 0.10040512681007385, + "learning_rate": 1.4996960888744457e-05, + "loss": 0.072, + "step": 134790 + }, + { + "epoch": 29.005069598656775, + "grad_norm": 0.0011210939846932888, + "learning_rate": 1.4993951867699364e-05, + "loss": 0.0517, + "step": 134800 + }, + { + "epoch": 29.005123761035584, + "grad_norm": 0.051674630492925644, + "learning_rate": 1.4990942846654272e-05, + "loss": 0.027, + "step": 134810 + }, + { + "epoch": 29.005177923414397, + "grad_norm": 0.0011521652340888977, + "learning_rate": 1.4987933825609176e-05, + "loss": 0.0384, + "step": 134820 + }, + { + "epoch": 29.005232085793207, + "grad_norm": 0.03142724558711052, + "learning_rate": 1.4984924804564084e-05, + "loss": 0.1129, + "step": 134830 + }, + { + "epoch": 29.00528624817202, + "grad_norm": 0.0012110145762562752, + "learning_rate": 1.498191578351899e-05, + "loss": 0.0893, + "step": 134840 + }, + { + "epoch": 29.005340410550833, + "grad_norm": 0.0039839851669967175, + "learning_rate": 1.4978906762473898e-05, + "loss": 0.0668, + "step": 134850 + }, + { + "epoch": 29.005394572929642, + "grad_norm": 0.3643752932548523, + "learning_rate": 1.4975897741428805e-05, + "loss": 0.0044, + "step": 134860 + }, + { + "epoch": 29.005448735308455, + "grad_norm": 0.31198057532310486, + "learning_rate": 1.497288872038371e-05, + "loss": 0.0378, + "step": 134870 + }, + { + "epoch": 29.005502897687265, + "grad_norm": 1.8925384283065796, + "learning_rate": 1.4969879699338617e-05, + "loss": 0.0479, + "step": 134880 + }, + { + "epoch": 29.005557060066078, + "grad_norm": 0.001202804851345718, + "learning_rate": 1.4966870678293524e-05, + "loss": 0.0795, + "step": 134890 + }, + { + "epoch": 29.00561122244489, + "grad_norm": 0.006730124820023775, + "learning_rate": 1.4963861657248432e-05, + "loss": 0.0413, + "step": 134900 + }, + { + "epoch": 29.0056653848237, + "grad_norm": 0.002406737767159939, + "learning_rate": 1.4960852636203338e-05, + "loss": 0.0022, + "step": 134910 + }, + { + "epoch": 29.005719547202514, + "grad_norm": 0.007381856441497803, + "learning_rate": 1.4957843615158246e-05, + "loss": 0.0106, + "step": 134920 + }, + { + "epoch": 29.005773709581323, + "grad_norm": 0.0326833613216877, + "learning_rate": 1.4954834594113152e-05, + "loss": 0.003, + "step": 134930 + }, + { + "epoch": 29.005827871960136, + "grad_norm": 0.8039588332176208, + "learning_rate": 1.495182557306806e-05, + "loss": 0.0648, + "step": 134940 + }, + { + "epoch": 
29.00588203433895, + "grad_norm": 0.06421443074941635, + "learning_rate": 1.4948816552022965e-05, + "loss": 0.0461, + "step": 134950 + }, + { + "epoch": 29.00593619671776, + "grad_norm": 0.001191923045553267, + "learning_rate": 1.4945807530977873e-05, + "loss": 0.0499, + "step": 134960 + }, + { + "epoch": 29.00599035909657, + "grad_norm": 0.001644762814976275, + "learning_rate": 1.4942798509932779e-05, + "loss": 0.0091, + "step": 134970 + }, + { + "epoch": 29.006044521475385, + "grad_norm": 0.04141295701265335, + "learning_rate": 1.4939789488887687e-05, + "loss": 0.0013, + "step": 134980 + }, + { + "epoch": 29.006098683854194, + "grad_norm": 0.0011249256785959005, + "learning_rate": 1.4936780467842593e-05, + "loss": 0.0122, + "step": 134990 + }, + { + "epoch": 29.006152846233007, + "grad_norm": 0.7214692234992981, + "learning_rate": 1.4933771446797501e-05, + "loss": 0.0209, + "step": 135000 + }, + { + "epoch": 29.006207008611817, + "grad_norm": 0.9616790413856506, + "learning_rate": 1.4930762425752407e-05, + "loss": 0.0769, + "step": 135010 + }, + { + "epoch": 29.00626117099063, + "grad_norm": 1.9875376224517822, + "learning_rate": 1.4927753404707312e-05, + "loss": 0.0314, + "step": 135020 + }, + { + "epoch": 29.006315333369443, + "grad_norm": 0.0012031645746901631, + "learning_rate": 1.492474438366222e-05, + "loss": 0.0791, + "step": 135030 + }, + { + "epoch": 29.006369495748253, + "grad_norm": 0.04210326820611954, + "learning_rate": 1.4921735362617126e-05, + "loss": 0.0665, + "step": 135040 + }, + { + "epoch": 29.006423658127066, + "grad_norm": 0.0011092924978584051, + "learning_rate": 1.4918726341572034e-05, + "loss": 0.0163, + "step": 135050 + }, + { + "epoch": 29.006477820505875, + "grad_norm": 0.20302218198776245, + "learning_rate": 1.491571732052694e-05, + "loss": 0.0177, + "step": 135060 + }, + { + "epoch": 29.006531982884688, + "grad_norm": 0.40679219365119934, + "learning_rate": 1.4912708299481848e-05, + "loss": 0.0078, + "step": 135070 + }, + { + "epoch": 29.0065861452635, + "grad_norm": 0.3319483697414398, + "learning_rate": 1.4909699278436753e-05, + "loss": 0.0018, + "step": 135080 + }, + { + "epoch": 29.00664030764231, + "grad_norm": 1.4298882484436035, + "learning_rate": 1.4906690257391661e-05, + "loss": 0.0532, + "step": 135090 + }, + { + "epoch": 29.006694470021124, + "grad_norm": 0.030795035883784294, + "learning_rate": 1.4903681236346567e-05, + "loss": 0.013, + "step": 135100 + }, + { + "epoch": 29.006748632399933, + "grad_norm": 0.0011413522297516465, + "learning_rate": 1.4900672215301475e-05, + "loss": 0.1348, + "step": 135110 + }, + { + "epoch": 29.006802794778746, + "grad_norm": 0.6392168402671814, + "learning_rate": 1.4897663194256381e-05, + "loss": 0.0463, + "step": 135120 + }, + { + "epoch": 29.00685695715756, + "grad_norm": 0.021201735362410545, + "learning_rate": 1.489465417321129e-05, + "loss": 0.0313, + "step": 135130 + }, + { + "epoch": 29.00691111953637, + "grad_norm": 0.00308014964684844, + "learning_rate": 1.4891645152166196e-05, + "loss": 0.0454, + "step": 135140 + }, + { + "epoch": 29.006965281915182, + "grad_norm": 0.0021926758345216513, + "learning_rate": 1.4888636131121104e-05, + "loss": 0.0007, + "step": 135150 + }, + { + "epoch": 29.007019444293995, + "grad_norm": 0.0010677448008209467, + "learning_rate": 1.4885627110076008e-05, + "loss": 0.0179, + "step": 135160 + }, + { + "epoch": 29.007073606672805, + "grad_norm": 0.14893166720867157, + "learning_rate": 1.4882618089030914e-05, + "loss": 0.0173, + "step": 135170 + }, + { + "epoch": 
29.007127769051618, + "grad_norm": 0.0011246521025896072, + "learning_rate": 1.4879609067985822e-05, + "loss": 0.0414, + "step": 135180 + }, + { + "epoch": 29.007181931430427, + "grad_norm": 0.012448581866919994, + "learning_rate": 1.4876600046940729e-05, + "loss": 0.0353, + "step": 135190 + }, + { + "epoch": 29.00723609380924, + "grad_norm": 0.004121995065361261, + "learning_rate": 1.4873591025895637e-05, + "loss": 0.0815, + "step": 135200 + }, + { + "epoch": 29.007290256188053, + "grad_norm": 0.0010805773781612515, + "learning_rate": 1.4870582004850541e-05, + "loss": 0.0169, + "step": 135210 + }, + { + "epoch": 29.007344418566863, + "grad_norm": 1.9500465393066406, + "learning_rate": 1.486757298380545e-05, + "loss": 0.0205, + "step": 135220 + }, + { + "epoch": 29.007398580945676, + "grad_norm": 0.001100937370210886, + "learning_rate": 1.4864563962760356e-05, + "loss": 0.02, + "step": 135230 + }, + { + "epoch": 29.007452743324485, + "grad_norm": 0.0011038158554583788, + "learning_rate": 1.4861554941715263e-05, + "loss": 0.052, + "step": 135240 + }, + { + "epoch": 29.0075069057033, + "grad_norm": 6.204117298126221, + "learning_rate": 1.485854592067017e-05, + "loss": 0.105, + "step": 135250 + }, + { + "epoch": 29.00756106808211, + "grad_norm": 0.001605106284841895, + "learning_rate": 1.4855536899625078e-05, + "loss": 0.0148, + "step": 135260 + }, + { + "epoch": 29.00761523046092, + "grad_norm": 0.21470455825328827, + "learning_rate": 1.4852527878579984e-05, + "loss": 0.0008, + "step": 135270 + }, + { + "epoch": 29.007669392839734, + "grad_norm": 5.919979572296143, + "learning_rate": 1.4849518857534892e-05, + "loss": 0.0667, + "step": 135280 + }, + { + "epoch": 29.007723555218544, + "grad_norm": 0.001129745040088892, + "learning_rate": 1.4846509836489797e-05, + "loss": 0.0115, + "step": 135290 + }, + { + "epoch": 29.007777717597357, + "grad_norm": 0.017500223591923714, + "learning_rate": 1.4843500815444705e-05, + "loss": 0.0124, + "step": 135300 + }, + { + "epoch": 29.00783187997617, + "grad_norm": 0.004485153127461672, + "learning_rate": 1.484049179439961e-05, + "loss": 0.013, + "step": 135310 + }, + { + "epoch": 29.00788604235498, + "grad_norm": 0.0010987237328663468, + "learning_rate": 1.4837482773354517e-05, + "loss": 0.0238, + "step": 135320 + }, + { + "epoch": 29.007940204733792, + "grad_norm": 0.0010286953765898943, + "learning_rate": 1.4834473752309425e-05, + "loss": 0.0591, + "step": 135330 + }, + { + "epoch": 29.007994367112605, + "grad_norm": 1.1500895023345947, + "learning_rate": 1.483146473126433e-05, + "loss": 0.0287, + "step": 135340 + }, + { + "epoch": 29.008048529491415, + "grad_norm": 0.000995114678516984, + "learning_rate": 1.4828455710219238e-05, + "loss": 0.0006, + "step": 135350 + }, + { + "epoch": 29.008102691870228, + "grad_norm": 0.009620032273232937, + "learning_rate": 1.4825446689174144e-05, + "loss": 0.004, + "step": 135360 + }, + { + "epoch": 29.008156854249037, + "grad_norm": 1.5096354484558105, + "learning_rate": 1.4822437668129052e-05, + "loss": 0.2211, + "step": 135370 + }, + { + "epoch": 29.00821101662785, + "grad_norm": 0.0010476632742211223, + "learning_rate": 1.4819428647083958e-05, + "loss": 0.0516, + "step": 135380 + }, + { + "epoch": 29.008265179006663, + "grad_norm": 0.0011396192712709308, + "learning_rate": 1.4816419626038866e-05, + "loss": 0.0041, + "step": 135390 + }, + { + "epoch": 29.008319341385473, + "grad_norm": 0.00869819987565279, + "learning_rate": 1.4813410604993772e-05, + "loss": 0.0218, + "step": 135400 + }, + { + "epoch": 
29.008373503764286, + "grad_norm": 0.02438826486468315, + "learning_rate": 1.481040158394868e-05, + "loss": 0.0828, + "step": 135410 + }, + { + "epoch": 29.008427666143096, + "grad_norm": 0.5740218162536621, + "learning_rate": 1.4807392562903585e-05, + "loss": 0.0206, + "step": 135420 + }, + { + "epoch": 29.00848182852191, + "grad_norm": 0.20225736498832703, + "learning_rate": 1.4804383541858493e-05, + "loss": 0.1289, + "step": 135430 + }, + { + "epoch": 29.00853599090072, + "grad_norm": 0.7093781232833862, + "learning_rate": 1.4801374520813399e-05, + "loss": 0.0666, + "step": 135440 + }, + { + "epoch": 29.00859015327953, + "grad_norm": 0.001851308741606772, + "learning_rate": 1.4798365499768307e-05, + "loss": 0.0314, + "step": 135450 + }, + { + "epoch": 29.008644315658344, + "grad_norm": 0.8572037220001221, + "learning_rate": 1.4795356478723213e-05, + "loss": 0.0799, + "step": 135460 + }, + { + "epoch": 29.008698478037154, + "grad_norm": 0.004309754353016615, + "learning_rate": 1.4792347457678118e-05, + "loss": 0.0251, + "step": 135470 + }, + { + "epoch": 29.008752640415967, + "grad_norm": 0.0019956217147409916, + "learning_rate": 1.4789338436633026e-05, + "loss": 0.0009, + "step": 135480 + }, + { + "epoch": 29.00880680279478, + "grad_norm": 1.9890425205230713, + "learning_rate": 1.4786329415587932e-05, + "loss": 0.033, + "step": 135490 + }, + { + "epoch": 29.00886096517359, + "grad_norm": 0.0025118584744632244, + "learning_rate": 1.478332039454284e-05, + "loss": 0.0632, + "step": 135500 + }, + { + "epoch": 29.008915127552402, + "grad_norm": 0.0019410460954532027, + "learning_rate": 1.4780311373497746e-05, + "loss": 0.035, + "step": 135510 + }, + { + "epoch": 29.008969289931215, + "grad_norm": 0.005997858475893736, + "learning_rate": 1.4777302352452654e-05, + "loss": 0.0045, + "step": 135520 + }, + { + "epoch": 29.009023452310025, + "grad_norm": 0.3370564579963684, + "learning_rate": 1.477429333140756e-05, + "loss": 0.0247, + "step": 135530 + }, + { + "epoch": 29.009077614688838, + "grad_norm": 0.0026018167845904827, + "learning_rate": 1.4771284310362469e-05, + "loss": 0.0822, + "step": 135540 + }, + { + "epoch": 29.009131777067648, + "grad_norm": 0.0011214505648240447, + "learning_rate": 1.4768275289317373e-05, + "loss": 0.1489, + "step": 135550 + }, + { + "epoch": 29.00918593944646, + "grad_norm": 0.0019419988384470344, + "learning_rate": 1.4765266268272281e-05, + "loss": 0.0128, + "step": 135560 + }, + { + "epoch": 29.009240101825274, + "grad_norm": 0.049767766147851944, + "learning_rate": 1.4762257247227187e-05, + "loss": 0.0164, + "step": 135570 + }, + { + "epoch": 29.009294264204083, + "grad_norm": 0.0017086936859413981, + "learning_rate": 1.4759248226182095e-05, + "loss": 0.0014, + "step": 135580 + }, + { + "epoch": 29.009348426582896, + "grad_norm": 0.0011558873811736703, + "learning_rate": 1.4756239205137002e-05, + "loss": 0.0079, + "step": 135590 + }, + { + "epoch": 29.009402588961706, + "grad_norm": 0.0016927389660850167, + "learning_rate": 1.475323018409191e-05, + "loss": 0.0549, + "step": 135600 + }, + { + "epoch": 29.00945675134052, + "grad_norm": 0.001453097560442984, + "learning_rate": 1.4750221163046814e-05, + "loss": 0.0282, + "step": 135610 + }, + { + "epoch": 29.009510913719332, + "grad_norm": 1.6815557479858398, + "learning_rate": 1.474721214200172e-05, + "loss": 0.0173, + "step": 135620 + }, + { + "epoch": 29.00956507609814, + "grad_norm": 0.0031160006765276194, + "learning_rate": 1.4744203120956629e-05, + "loss": 0.0458, + "step": 135630 + }, + { + "epoch": 
29.009619238476954, + "grad_norm": 0.08745237439870834, + "learning_rate": 1.4741194099911535e-05, + "loss": 0.0064, + "step": 135640 + }, + { + "epoch": 29.009673400855764, + "grad_norm": 0.0010678924154490232, + "learning_rate": 1.4738185078866443e-05, + "loss": 0.0008, + "step": 135650 + }, + { + "epoch": 29.009727563234577, + "grad_norm": 0.0019535748288035393, + "learning_rate": 1.4735176057821349e-05, + "loss": 0.0632, + "step": 135660 + }, + { + "epoch": 29.00978172561339, + "grad_norm": 1.2862528562545776, + "learning_rate": 1.4732167036776257e-05, + "loss": 0.0753, + "step": 135670 + }, + { + "epoch": 29.0098358879922, + "grad_norm": 0.7078820466995239, + "learning_rate": 1.4729158015731162e-05, + "loss": 0.0208, + "step": 135680 + }, + { + "epoch": 29.009890050371013, + "grad_norm": 0.005491011310368776, + "learning_rate": 1.472614899468607e-05, + "loss": 0.0528, + "step": 135690 + }, + { + "epoch": 29.009944212749826, + "grad_norm": 0.0026262628380209208, + "learning_rate": 1.4723139973640976e-05, + "loss": 0.0282, + "step": 135700 + }, + { + "epoch": 29.009998375128635, + "grad_norm": 0.8968016505241394, + "learning_rate": 1.4720130952595884e-05, + "loss": 0.0461, + "step": 135710 + }, + { + "epoch": 29.01005253750745, + "grad_norm": 5.255450248718262, + "learning_rate": 1.471712193155079e-05, + "loss": 0.0374, + "step": 135720 + }, + { + "epoch": 29.010106699886258, + "grad_norm": 0.005341735202819109, + "learning_rate": 1.4714112910505698e-05, + "loss": 0.0271, + "step": 135730 + }, + { + "epoch": 29.01016086226507, + "grad_norm": 0.0009532485855743289, + "learning_rate": 1.4711103889460603e-05, + "loss": 0.0791, + "step": 135740 + }, + { + "epoch": 29.010215024643884, + "grad_norm": 8.9947509765625, + "learning_rate": 1.4708094868415512e-05, + "loss": 0.0091, + "step": 135750 + }, + { + "epoch": 29.010269187022693, + "grad_norm": 3.349388360977173, + "learning_rate": 1.4705085847370417e-05, + "loss": 0.0571, + "step": 135760 + }, + { + "epoch": 29.010323349401506, + "grad_norm": 0.0010402960469946265, + "learning_rate": 1.4702076826325323e-05, + "loss": 0.0603, + "step": 135770 + }, + { + "epoch": 29.010377511780316, + "grad_norm": 0.00628312723711133, + "learning_rate": 1.4699067805280231e-05, + "loss": 0.1036, + "step": 135780 + }, + { + "epoch": 29.01043167415913, + "grad_norm": 0.2515001595020294, + "learning_rate": 1.4696058784235137e-05, + "loss": 0.0338, + "step": 135790 + }, + { + "epoch": 29.010485836537942, + "grad_norm": 0.0013167441356927156, + "learning_rate": 1.4693049763190045e-05, + "loss": 0.0088, + "step": 135800 + }, + { + "epoch": 29.01053999891675, + "grad_norm": 0.0014149779453873634, + "learning_rate": 1.469004074214495e-05, + "loss": 0.0322, + "step": 135810 + }, + { + "epoch": 29.010594161295565, + "grad_norm": 0.5790715217590332, + "learning_rate": 1.4687031721099858e-05, + "loss": 0.0752, + "step": 135820 + }, + { + "epoch": 29.010648323674374, + "grad_norm": 0.0015794358914718032, + "learning_rate": 1.4684022700054764e-05, + "loss": 0.0681, + "step": 135830 + }, + { + "epoch": 29.010702486053187, + "grad_norm": 1.6252179145812988, + "learning_rate": 1.4681013679009672e-05, + "loss": 0.0407, + "step": 135840 + }, + { + "epoch": 29.010756648432, + "grad_norm": 0.0014859844231978059, + "learning_rate": 1.4678004657964578e-05, + "loss": 0.0031, + "step": 135850 + }, + { + "epoch": 29.01081081081081, + "grad_norm": 0.0023885394912213087, + "learning_rate": 1.4674995636919486e-05, + "loss": 0.0064, + "step": 135860 + }, + { + "epoch": 
29.010864973189623, + "grad_norm": 0.0012430617352947593, + "learning_rate": 1.4671986615874393e-05, + "loss": 0.0413, + "step": 135870 + }, + { + "epoch": 29.010919135568436, + "grad_norm": 0.0020143925212323666, + "learning_rate": 1.46689775948293e-05, + "loss": 0.0253, + "step": 135880 + }, + { + "epoch": 29.010973297947245, + "grad_norm": 0.6763007044792175, + "learning_rate": 1.4665968573784205e-05, + "loss": 0.0215, + "step": 135890 + }, + { + "epoch": 29.01102746032606, + "grad_norm": 3.1539368629455566, + "learning_rate": 1.4662959552739113e-05, + "loss": 0.0543, + "step": 135900 + }, + { + "epoch": 29.011081622704868, + "grad_norm": 0.020161353051662445, + "learning_rate": 1.465995053169402e-05, + "loss": 0.0004, + "step": 135910 + }, + { + "epoch": 29.01113578508368, + "grad_norm": 0.0011427206918597221, + "learning_rate": 1.4656941510648926e-05, + "loss": 0.065, + "step": 135920 + }, + { + "epoch": 29.011189947462494, + "grad_norm": 0.0027016664389520884, + "learning_rate": 1.4653932489603834e-05, + "loss": 0.0064, + "step": 135930 + }, + { + "epoch": 29.011244109841304, + "grad_norm": 0.0634007379412651, + "learning_rate": 1.4650923468558738e-05, + "loss": 0.0004, + "step": 135940 + }, + { + "epoch": 29.011298272220117, + "grad_norm": 0.7801255583763123, + "learning_rate": 1.4647914447513646e-05, + "loss": 0.056, + "step": 135950 + }, + { + "epoch": 29.011352434598926, + "grad_norm": 0.001657202374190092, + "learning_rate": 1.4644905426468553e-05, + "loss": 0.0782, + "step": 135960 + }, + { + "epoch": 29.01140659697774, + "grad_norm": 0.0011775223538279533, + "learning_rate": 1.464189640542346e-05, + "loss": 0.005, + "step": 135970 + }, + { + "epoch": 29.011460759356552, + "grad_norm": 2.10640549659729, + "learning_rate": 1.4638887384378367e-05, + "loss": 0.0399, + "step": 135980 + }, + { + "epoch": 29.011514921735362, + "grad_norm": 0.03290526941418648, + "learning_rate": 1.4635878363333275e-05, + "loss": 0.0139, + "step": 135990 + }, + { + "epoch": 29.011569084114175, + "grad_norm": 0.028713632375001907, + "learning_rate": 1.4632869342288181e-05, + "loss": 0.0253, + "step": 136000 + }, + { + "epoch": 29.011623246492984, + "grad_norm": 0.0014858623035252094, + "learning_rate": 1.4629860321243089e-05, + "loss": 0.061, + "step": 136010 + }, + { + "epoch": 29.011677408871797, + "grad_norm": 2.9084770679473877, + "learning_rate": 1.4626851300197994e-05, + "loss": 0.0307, + "step": 136020 + }, + { + "epoch": 29.01173157125061, + "grad_norm": 0.026795335114002228, + "learning_rate": 1.4623842279152902e-05, + "loss": 0.0723, + "step": 136030 + }, + { + "epoch": 29.01178573362942, + "grad_norm": 0.010672207921743393, + "learning_rate": 1.4620833258107808e-05, + "loss": 0.027, + "step": 136040 + }, + { + "epoch": 29.011839896008233, + "grad_norm": 0.025660919025540352, + "learning_rate": 1.4617824237062716e-05, + "loss": 0.0648, + "step": 136050 + }, + { + "epoch": 29.011894058387043, + "grad_norm": 1.1011128425598145, + "learning_rate": 1.4614815216017622e-05, + "loss": 0.0629, + "step": 136060 + }, + { + "epoch": 29.011948220765856, + "grad_norm": 0.0010554947657510638, + "learning_rate": 1.4611806194972527e-05, + "loss": 0.0683, + "step": 136070 + }, + { + "epoch": 29.01200238314467, + "grad_norm": 0.3093312680721283, + "learning_rate": 1.4608797173927435e-05, + "loss": 0.0361, + "step": 136080 + }, + { + "epoch": 29.01205654552348, + "grad_norm": 0.023679979145526886, + "learning_rate": 1.4605788152882341e-05, + "loss": 0.01, + "step": 136090 + }, + { + "epoch": 29.01211070790229, 
+ "grad_norm": 0.005221368744969368, + "learning_rate": 1.4602779131837249e-05, + "loss": 0.0414, + "step": 136100 + }, + { + "epoch": 29.012164870281104, + "grad_norm": 0.04904154688119888, + "learning_rate": 1.4599770110792155e-05, + "loss": 0.0084, + "step": 136110 + }, + { + "epoch": 29.012219032659914, + "grad_norm": 0.0543457493185997, + "learning_rate": 1.4596761089747063e-05, + "loss": 0.025, + "step": 136120 + }, + { + "epoch": 29.012273195038727, + "grad_norm": 1.1276224851608276, + "learning_rate": 1.459375206870197e-05, + "loss": 0.044, + "step": 136130 + }, + { + "epoch": 29.012327357417536, + "grad_norm": 0.04886627942323685, + "learning_rate": 1.4590743047656877e-05, + "loss": 0.0141, + "step": 136140 + }, + { + "epoch": 29.01238151979635, + "grad_norm": 0.0010573676554486156, + "learning_rate": 1.4587734026611782e-05, + "loss": 0.0033, + "step": 136150 + }, + { + "epoch": 29.012435682175163, + "grad_norm": 0.001170345232822001, + "learning_rate": 1.458472500556669e-05, + "loss": 0.0789, + "step": 136160 + }, + { + "epoch": 29.012489844553972, + "grad_norm": 0.0010735372779890895, + "learning_rate": 1.4581715984521596e-05, + "loss": 0.048, + "step": 136170 + }, + { + "epoch": 29.012544006932785, + "grad_norm": 0.0015143572818487883, + "learning_rate": 1.4578706963476504e-05, + "loss": 0.0929, + "step": 136180 + }, + { + "epoch": 29.012598169311595, + "grad_norm": 0.2454458624124527, + "learning_rate": 1.457569794243141e-05, + "loss": 0.1558, + "step": 136190 + }, + { + "epoch": 29.012652331690408, + "grad_norm": 0.015275262296199799, + "learning_rate": 1.4572688921386318e-05, + "loss": 0.0268, + "step": 136200 + }, + { + "epoch": 29.01270649406922, + "grad_norm": 0.09542746096849442, + "learning_rate": 1.4569679900341223e-05, + "loss": 0.0599, + "step": 136210 + }, + { + "epoch": 29.01276065644803, + "grad_norm": 0.29944735765457153, + "learning_rate": 1.456667087929613e-05, + "loss": 0.0539, + "step": 136220 + }, + { + "epoch": 29.012814818826843, + "grad_norm": 1.2212218046188354, + "learning_rate": 1.4563661858251037e-05, + "loss": 0.0518, + "step": 136230 + }, + { + "epoch": 29.012868981205653, + "grad_norm": 0.49514341354370117, + "learning_rate": 1.4560652837205943e-05, + "loss": 0.01, + "step": 136240 + }, + { + "epoch": 29.012923143584466, + "grad_norm": 0.06437639892101288, + "learning_rate": 1.4557643816160851e-05, + "loss": 0.0414, + "step": 136250 + }, + { + "epoch": 29.01297730596328, + "grad_norm": 0.0010390487732365727, + "learning_rate": 1.4554634795115758e-05, + "loss": 0.033, + "step": 136260 + }, + { + "epoch": 29.01303146834209, + "grad_norm": 0.05234331637620926, + "learning_rate": 1.4551625774070666e-05, + "loss": 0.0189, + "step": 136270 + }, + { + "epoch": 29.0130856307209, + "grad_norm": 0.20523983240127563, + "learning_rate": 1.454861675302557e-05, + "loss": 0.0495, + "step": 136280 + }, + { + "epoch": 29.013139793099715, + "grad_norm": 0.7158628106117249, + "learning_rate": 1.4545607731980478e-05, + "loss": 0.0485, + "step": 136290 + }, + { + "epoch": 29.013193955478524, + "grad_norm": 1.08737051486969, + "learning_rate": 1.4542598710935385e-05, + "loss": 0.0252, + "step": 136300 + }, + { + "epoch": 29.013248117857337, + "grad_norm": 0.027565548196434975, + "learning_rate": 1.4539589689890292e-05, + "loss": 0.0354, + "step": 136310 + }, + { + "epoch": 29.013302280236147, + "grad_norm": 0.09433392435312271, + "learning_rate": 1.4536580668845199e-05, + "loss": 0.0181, + "step": 136320 + }, + { + "epoch": 29.01335644261496, + "grad_norm": 
0.0013021074701100588, + "learning_rate": 1.4533571647800107e-05, + "loss": 0.0593, + "step": 136330 + }, + { + "epoch": 29.013410604993773, + "grad_norm": 0.012501894496381283, + "learning_rate": 1.4530562626755011e-05, + "loss": 0.0808, + "step": 136340 + }, + { + "epoch": 29.013464767372582, + "grad_norm": 0.06065906956791878, + "learning_rate": 1.4527553605709921e-05, + "loss": 0.0161, + "step": 136350 + }, + { + "epoch": 29.013518929751395, + "grad_norm": 0.5514750480651855, + "learning_rate": 1.4524544584664826e-05, + "loss": 0.0114, + "step": 136360 + }, + { + "epoch": 29.013573092130205, + "grad_norm": 0.1761956512928009, + "learning_rate": 1.4521535563619732e-05, + "loss": 0.0029, + "step": 136370 + }, + { + "epoch": 29.013627254509018, + "grad_norm": 0.0009201784851029515, + "learning_rate": 1.451852654257464e-05, + "loss": 0.0009, + "step": 136380 + }, + { + "epoch": 29.01368141688783, + "grad_norm": 4.867364406585693, + "learning_rate": 1.4515517521529546e-05, + "loss": 0.0356, + "step": 136390 + }, + { + "epoch": 29.01373557926664, + "grad_norm": 0.001417541061528027, + "learning_rate": 1.4512508500484454e-05, + "loss": 0.0316, + "step": 136400 + }, + { + "epoch": 29.013789741645454, + "grad_norm": 0.02416575327515602, + "learning_rate": 1.4509499479439359e-05, + "loss": 0.0426, + "step": 136410 + }, + { + "epoch": 29.013843904024263, + "grad_norm": 0.0011249319650232792, + "learning_rate": 1.4506490458394267e-05, + "loss": 0.023, + "step": 136420 + }, + { + "epoch": 29.013898066403076, + "grad_norm": 0.001863874844275415, + "learning_rate": 1.4503481437349173e-05, + "loss": 0.0643, + "step": 136430 + }, + { + "epoch": 29.01395222878189, + "grad_norm": 0.023295825347304344, + "learning_rate": 1.450047241630408e-05, + "loss": 0.0128, + "step": 136440 + }, + { + "epoch": 29.0140063911607, + "grad_norm": 0.2470082938671112, + "learning_rate": 1.4497463395258987e-05, + "loss": 0.0857, + "step": 136450 + }, + { + "epoch": 29.01406055353951, + "grad_norm": 0.0027103072497993708, + "learning_rate": 1.4494454374213895e-05, + "loss": 0.0001, + "step": 136460 + }, + { + "epoch": 29.014114715918325, + "grad_norm": 0.0010045471135526896, + "learning_rate": 1.44914453531688e-05, + "loss": 0.0051, + "step": 136470 + }, + { + "epoch": 29.014168878297134, + "grad_norm": 0.0012459908612072468, + "learning_rate": 1.448843633212371e-05, + "loss": 0.1003, + "step": 136480 + }, + { + "epoch": 29.014223040675947, + "grad_norm": 1.8322278261184692, + "learning_rate": 1.4485427311078614e-05, + "loss": 0.0745, + "step": 136490 + }, + { + "epoch": 29.014277203054757, + "grad_norm": 0.0036892953794449568, + "learning_rate": 1.4482418290033522e-05, + "loss": 0.0571, + "step": 136500 + }, + { + "epoch": 29.01433136543357, + "grad_norm": 0.008196700364351273, + "learning_rate": 1.4479409268988428e-05, + "loss": 0.0243, + "step": 136510 + }, + { + "epoch": 29.014385527812383, + "grad_norm": 0.0010182576952502131, + "learning_rate": 1.4476400247943334e-05, + "loss": 0.0156, + "step": 136520 + }, + { + "epoch": 29.014439690191193, + "grad_norm": 0.007764573674649, + "learning_rate": 1.4473391226898242e-05, + "loss": 0.0004, + "step": 136530 + }, + { + "epoch": 29.014493852570006, + "grad_norm": 0.09031554311513901, + "learning_rate": 1.4470382205853147e-05, + "loss": 0.0158, + "step": 136540 + }, + { + "epoch": 29.014548014948815, + "grad_norm": 0.0654132291674614, + "learning_rate": 1.4467373184808055e-05, + "loss": 0.0375, + "step": 136550 + }, + { + "epoch": 29.014602177327628, + "grad_norm": 
0.001196608878672123, + "learning_rate": 1.4464364163762961e-05, + "loss": 0.0453, + "step": 136560 + }, + { + "epoch": 29.01465633970644, + "grad_norm": 0.004425338935106993, + "learning_rate": 1.446135514271787e-05, + "loss": 0.0407, + "step": 136570 + }, + { + "epoch": 29.01471050208525, + "grad_norm": 0.0021731099113821983, + "learning_rate": 1.4458346121672775e-05, + "loss": 0.0026, + "step": 136580 + }, + { + "epoch": 29.014764664464064, + "grad_norm": 0.677463948726654, + "learning_rate": 1.4455337100627683e-05, + "loss": 0.018, + "step": 136590 + }, + { + "epoch": 29.014818826842873, + "grad_norm": 0.02092144452035427, + "learning_rate": 1.4452328079582588e-05, + "loss": 0.0271, + "step": 136600 + }, + { + "epoch": 29.014872989221686, + "grad_norm": 0.2422066181898117, + "learning_rate": 1.4449319058537498e-05, + "loss": 0.0239, + "step": 136610 + }, + { + "epoch": 29.0149271516005, + "grad_norm": 0.04389714077115059, + "learning_rate": 1.4446310037492402e-05, + "loss": 0.0196, + "step": 136620 + }, + { + "epoch": 29.01498131397931, + "grad_norm": 0.04871762543916702, + "learning_rate": 1.444330101644731e-05, + "loss": 0.0094, + "step": 136630 + }, + { + "epoch": 29.015035476358122, + "grad_norm": 0.9127642512321472, + "learning_rate": 1.4440291995402216e-05, + "loss": 0.0171, + "step": 136640 + }, + { + "epoch": 29.015089638736935, + "grad_norm": 0.0008295879233628511, + "learning_rate": 1.4437282974357123e-05, + "loss": 0.0133, + "step": 136650 + }, + { + "epoch": 29.015143801115745, + "grad_norm": 3.373298406600952, + "learning_rate": 1.443427395331203e-05, + "loss": 0.133, + "step": 136660 + }, + { + "epoch": 29.015197963494558, + "grad_norm": 0.022470900788903236, + "learning_rate": 1.4431264932266935e-05, + "loss": 0.0152, + "step": 136670 + }, + { + "epoch": 29.015252125873367, + "grad_norm": 2.173490285873413, + "learning_rate": 1.4428255911221843e-05, + "loss": 0.086, + "step": 136680 + }, + { + "epoch": 29.01530628825218, + "grad_norm": 0.0009232745505869389, + "learning_rate": 1.442524689017675e-05, + "loss": 0.0154, + "step": 136690 + }, + { + "epoch": 29.015360450630993, + "grad_norm": 0.007494818419218063, + "learning_rate": 1.4422237869131658e-05, + "loss": 0.0129, + "step": 136700 + }, + { + "epoch": 29.015414613009803, + "grad_norm": 0.8879398703575134, + "learning_rate": 1.4419228848086564e-05, + "loss": 0.064, + "step": 136710 + }, + { + "epoch": 29.015468775388616, + "grad_norm": 0.5236414074897766, + "learning_rate": 1.4416219827041472e-05, + "loss": 0.027, + "step": 136720 + }, + { + "epoch": 29.015522937767425, + "grad_norm": 1.1113371849060059, + "learning_rate": 1.4413210805996378e-05, + "loss": 0.0081, + "step": 136730 + }, + { + "epoch": 29.01557710014624, + "grad_norm": 0.0011378072667866945, + "learning_rate": 1.4410201784951286e-05, + "loss": 0.0009, + "step": 136740 + }, + { + "epoch": 29.01563126252505, + "grad_norm": 1.6084316968917847, + "learning_rate": 1.440719276390619e-05, + "loss": 0.0296, + "step": 136750 + }, + { + "epoch": 29.01568542490386, + "grad_norm": 0.0009103210759349167, + "learning_rate": 1.4404183742861099e-05, + "loss": 0.0218, + "step": 136760 + }, + { + "epoch": 29.015739587282674, + "grad_norm": 0.0015954634873196483, + "learning_rate": 1.4401174721816005e-05, + "loss": 0.0118, + "step": 136770 + }, + { + "epoch": 29.015793749661484, + "grad_norm": 1.1611082553863525, + "learning_rate": 1.4398165700770913e-05, + "loss": 0.067, + "step": 136780 + }, + { + "epoch": 29.015847912040297, + "grad_norm": 0.0008619948639534414, + 
"learning_rate": 1.4395156679725819e-05, + "loss": 0.007, + "step": 136790 + }, + { + "epoch": 29.01590207441911, + "grad_norm": 0.007799440063536167, + "learning_rate": 1.4392147658680724e-05, + "loss": 0.0634, + "step": 136800 + }, + { + "epoch": 29.01595623679792, + "grad_norm": 0.02688918448984623, + "learning_rate": 1.4389138637635632e-05, + "loss": 0.0306, + "step": 136810 + }, + { + "epoch": 29.016010399176732, + "grad_norm": 0.061075787991285324, + "learning_rate": 1.4386129616590538e-05, + "loss": 0.0307, + "step": 136820 + }, + { + "epoch": 29.016064561555545, + "grad_norm": 0.0011354632442817092, + "learning_rate": 1.4383120595545446e-05, + "loss": 0.0053, + "step": 136830 + }, + { + "epoch": 29.016118723934355, + "grad_norm": 0.0008543056901544333, + "learning_rate": 1.4380111574500352e-05, + "loss": 0.01, + "step": 136840 + }, + { + "epoch": 29.016172886313168, + "grad_norm": 0.9196966290473938, + "learning_rate": 1.437710255345526e-05, + "loss": 0.1529, + "step": 136850 + }, + { + "epoch": 29.016227048691977, + "grad_norm": 0.0016105073736980557, + "learning_rate": 1.4374093532410166e-05, + "loss": 0.0356, + "step": 136860 + }, + { + "epoch": 29.01628121107079, + "grad_norm": 1.0265742540359497, + "learning_rate": 1.4371084511365074e-05, + "loss": 0.0478, + "step": 136870 + }, + { + "epoch": 29.016335373449603, + "grad_norm": 0.0016851937398314476, + "learning_rate": 1.4368075490319979e-05, + "loss": 0.0836, + "step": 136880 + }, + { + "epoch": 29.016389535828413, + "grad_norm": 0.013685758225619793, + "learning_rate": 1.4365066469274887e-05, + "loss": 0.0569, + "step": 136890 + }, + { + "epoch": 29.016443698207226, + "grad_norm": 0.06971456855535507, + "learning_rate": 1.4362057448229793e-05, + "loss": 0.0033, + "step": 136900 + }, + { + "epoch": 29.016497860586036, + "grad_norm": 0.002385868225246668, + "learning_rate": 1.4359048427184701e-05, + "loss": 0.0064, + "step": 136910 + }, + { + "epoch": 29.01655202296485, + "grad_norm": 0.06665533781051636, + "learning_rate": 1.4356039406139607e-05, + "loss": 0.033, + "step": 136920 + }, + { + "epoch": 29.01660618534366, + "grad_norm": 0.046318165957927704, + "learning_rate": 1.4353030385094515e-05, + "loss": 0.0039, + "step": 136930 + }, + { + "epoch": 29.01666034772247, + "grad_norm": 0.0012871065409854054, + "learning_rate": 1.435002136404942e-05, + "loss": 0.0025, + "step": 136940 + }, + { + "epoch": 29.016714510101284, + "grad_norm": 0.000907055102288723, + "learning_rate": 1.4347012343004326e-05, + "loss": 0.0606, + "step": 136950 + }, + { + "epoch": 29.016768672480094, + "grad_norm": 0.0012346188304945827, + "learning_rate": 1.4344003321959234e-05, + "loss": 0.0658, + "step": 136960 + }, + { + "epoch": 29.016822834858907, + "grad_norm": 0.30067506432533264, + "learning_rate": 1.434099430091414e-05, + "loss": 0.0015, + "step": 136970 + }, + { + "epoch": 29.01687699723772, + "grad_norm": 0.06046541407704353, + "learning_rate": 1.4337985279869048e-05, + "loss": 0.0254, + "step": 136980 + }, + { + "epoch": 29.01693115961653, + "grad_norm": 1.6111952066421509, + "learning_rate": 1.4334976258823955e-05, + "loss": 0.0381, + "step": 136990 + }, + { + "epoch": 29.016985321995342, + "grad_norm": 0.029337381944060326, + "learning_rate": 1.4331967237778863e-05, + "loss": 0.0297, + "step": 137000 + }, + { + "epoch": 29.017039484374155, + "grad_norm": 0.6005001664161682, + "learning_rate": 1.4328958216733767e-05, + "loss": 0.0161, + "step": 137010 + }, + { + "epoch": 29.017093646752965, + "grad_norm": 0.0013416280271485448, + 
"learning_rate": 1.4325949195688675e-05, + "loss": 0.0888, + "step": 137020 + }, + { + "epoch": 29.017147809131778, + "grad_norm": 0.0012205636594444513, + "learning_rate": 1.4322940174643582e-05, + "loss": 0.0233, + "step": 137030 + }, + { + "epoch": 29.017201971510588, + "grad_norm": 0.0011681870091706514, + "learning_rate": 1.431993115359849e-05, + "loss": 0.0762, + "step": 137040 + }, + { + "epoch": 29.0172561338894, + "grad_norm": 0.0017266884678974748, + "learning_rate": 1.4316922132553396e-05, + "loss": 0.0255, + "step": 137050 + }, + { + "epoch": 29.017310296268214, + "grad_norm": 0.0009092522086575627, + "learning_rate": 1.4313913111508304e-05, + "loss": 0.0343, + "step": 137060 + }, + { + "epoch": 29.017364458647023, + "grad_norm": 8.372208595275879, + "learning_rate": 1.4310904090463208e-05, + "loss": 0.1117, + "step": 137070 + }, + { + "epoch": 29.017418621025836, + "grad_norm": 0.062400829046964645, + "learning_rate": 1.4307895069418118e-05, + "loss": 0.0291, + "step": 137080 + }, + { + "epoch": 29.017472783404646, + "grad_norm": 0.014306161552667618, + "learning_rate": 1.4304886048373023e-05, + "loss": 0.0034, + "step": 137090 + }, + { + "epoch": 29.01752694578346, + "grad_norm": 0.8359270095825195, + "learning_rate": 1.4301877027327929e-05, + "loss": 0.0147, + "step": 137100 + }, + { + "epoch": 29.017581108162272, + "grad_norm": 0.1601998209953308, + "learning_rate": 1.4298868006282837e-05, + "loss": 0.0119, + "step": 137110 + }, + { + "epoch": 29.01763527054108, + "grad_norm": 0.0008853141916915774, + "learning_rate": 1.4295858985237743e-05, + "loss": 0.032, + "step": 137120 + }, + { + "epoch": 29.017689432919894, + "grad_norm": 0.0009236649493686855, + "learning_rate": 1.4292849964192651e-05, + "loss": 0.0375, + "step": 137130 + }, + { + "epoch": 29.017743595298704, + "grad_norm": 0.0011175224790349603, + "learning_rate": 1.4289840943147556e-05, + "loss": 0.0116, + "step": 137140 + }, + { + "epoch": 29.017797757677517, + "grad_norm": 0.0010067401453852654, + "learning_rate": 1.4286831922102464e-05, + "loss": 0.0692, + "step": 137150 + }, + { + "epoch": 29.01785192005633, + "grad_norm": 0.706923246383667, + "learning_rate": 1.428382290105737e-05, + "loss": 0.0513, + "step": 137160 + }, + { + "epoch": 29.01790608243514, + "grad_norm": 0.006177070550620556, + "learning_rate": 1.4280813880012278e-05, + "loss": 0.0462, + "step": 137170 + }, + { + "epoch": 29.017960244813953, + "grad_norm": 0.14629198610782623, + "learning_rate": 1.4277804858967184e-05, + "loss": 0.0279, + "step": 137180 + }, + { + "epoch": 29.018014407192762, + "grad_norm": 3.387006998062134, + "learning_rate": 1.4274795837922092e-05, + "loss": 0.0217, + "step": 137190 + }, + { + "epoch": 29.018068569571575, + "grad_norm": 0.9338724613189697, + "learning_rate": 1.4271786816876997e-05, + "loss": 0.01, + "step": 137200 + }, + { + "epoch": 29.01812273195039, + "grad_norm": 1.0013301372528076, + "learning_rate": 1.4268777795831906e-05, + "loss": 0.2008, + "step": 137210 + }, + { + "epoch": 29.018176894329198, + "grad_norm": 0.001225721905939281, + "learning_rate": 1.4265768774786811e-05, + "loss": 0.0983, + "step": 137220 + }, + { + "epoch": 29.01823105670801, + "grad_norm": 0.001040036091580987, + "learning_rate": 1.4262759753741719e-05, + "loss": 0.0261, + "step": 137230 + }, + { + "epoch": 29.018285219086824, + "grad_norm": 0.0011436550412327051, + "learning_rate": 1.4259750732696625e-05, + "loss": 0.0108, + "step": 137240 + }, + { + "epoch": 29.018339381465633, + "grad_norm": 0.0012377878883853555, + 
"learning_rate": 1.4256741711651531e-05, + "loss": 0.1048, + "step": 137250 + }, + { + "epoch": 29.018393543844446, + "grad_norm": 0.0011147946352139115, + "learning_rate": 1.425373269060644e-05, + "loss": 0.0052, + "step": 137260 + }, + { + "epoch": 29.018447706223256, + "grad_norm": 0.05544280260801315, + "learning_rate": 1.4250723669561344e-05, + "loss": 0.0203, + "step": 137270 + }, + { + "epoch": 29.01850186860207, + "grad_norm": 0.0027980059385299683, + "learning_rate": 1.4247714648516252e-05, + "loss": 0.0464, + "step": 137280 + }, + { + "epoch": 29.018556030980882, + "grad_norm": 0.16661252081394196, + "learning_rate": 1.4244705627471158e-05, + "loss": 0.024, + "step": 137290 + }, + { + "epoch": 29.01861019335969, + "grad_norm": 0.0034273494966328144, + "learning_rate": 1.4241696606426066e-05, + "loss": 0.0368, + "step": 137300 + }, + { + "epoch": 29.018664355738505, + "grad_norm": 4.776010036468506, + "learning_rate": 1.4238687585380972e-05, + "loss": 0.0922, + "step": 137310 + }, + { + "epoch": 29.018718518117314, + "grad_norm": 0.0010822713375091553, + "learning_rate": 1.423567856433588e-05, + "loss": 0.0381, + "step": 137320 + }, + { + "epoch": 29.018772680496127, + "grad_norm": 0.0014271916588768363, + "learning_rate": 1.4232669543290785e-05, + "loss": 0.0077, + "step": 137330 + }, + { + "epoch": 29.01882684287494, + "grad_norm": 0.0011265911161899567, + "learning_rate": 1.4229660522245695e-05, + "loss": 0.0074, + "step": 137340 + }, + { + "epoch": 29.01888100525375, + "grad_norm": 0.8288344740867615, + "learning_rate": 1.42266515012006e-05, + "loss": 0.0155, + "step": 137350 + }, + { + "epoch": 29.018935167632563, + "grad_norm": 3.3719239234924316, + "learning_rate": 1.4223642480155507e-05, + "loss": 0.069, + "step": 137360 + }, + { + "epoch": 29.018989330011372, + "grad_norm": 1.6665760278701782, + "learning_rate": 1.4220633459110413e-05, + "loss": 0.0631, + "step": 137370 + }, + { + "epoch": 29.019043492390185, + "grad_norm": 0.0011789370328187943, + "learning_rate": 1.4217624438065321e-05, + "loss": 0.0177, + "step": 137380 + }, + { + "epoch": 29.019097654769, + "grad_norm": 1.291474461555481, + "learning_rate": 1.4214615417020228e-05, + "loss": 0.0392, + "step": 137390 + }, + { + "epoch": 29.019151817147808, + "grad_norm": 0.0011675446294248104, + "learning_rate": 1.4211606395975132e-05, + "loss": 0.0546, + "step": 137400 + }, + { + "epoch": 29.01920597952662, + "grad_norm": 0.4624645411968231, + "learning_rate": 1.420859737493004e-05, + "loss": 0.0887, + "step": 137410 + }, + { + "epoch": 29.019260141905434, + "grad_norm": 0.0010789020452648401, + "learning_rate": 1.4205588353884947e-05, + "loss": 0.0645, + "step": 137420 + }, + { + "epoch": 29.019314304284244, + "grad_norm": 0.9256618618965149, + "learning_rate": 1.4202579332839855e-05, + "loss": 0.0532, + "step": 137430 + }, + { + "epoch": 29.019368466663057, + "grad_norm": 0.05334649980068207, + "learning_rate": 1.419957031179476e-05, + "loss": 0.0127, + "step": 137440 + }, + { + "epoch": 29.019422629041866, + "grad_norm": 0.0010177807416766882, + "learning_rate": 1.4196561290749669e-05, + "loss": 0.0432, + "step": 137450 + }, + { + "epoch": 29.01947679142068, + "grad_norm": 0.0032229593489319086, + "learning_rate": 1.4193552269704573e-05, + "loss": 0.0334, + "step": 137460 + }, + { + "epoch": 29.019530953799492, + "grad_norm": 0.0010566716082394123, + "learning_rate": 1.4190543248659483e-05, + "loss": 0.011, + "step": 137470 + }, + { + "epoch": 29.019585116178302, + "grad_norm": 0.001206046319566667, + 
"learning_rate": 1.4187534227614388e-05, + "loss": 0.0311, + "step": 137480 + }, + { + "epoch": 29.019639278557115, + "grad_norm": 10.13215446472168, + "learning_rate": 1.4184525206569296e-05, + "loss": 0.013, + "step": 137490 + }, + { + "epoch": 29.019693440935924, + "grad_norm": 1.792426586151123, + "learning_rate": 1.4181516185524202e-05, + "loss": 0.0285, + "step": 137500 + }, + { + "epoch": 29.019747603314737, + "grad_norm": 0.6328930854797363, + "learning_rate": 1.417850716447911e-05, + "loss": 0.0086, + "step": 137510 + }, + { + "epoch": 29.01980176569355, + "grad_norm": 1.7223687171936035, + "learning_rate": 1.4175498143434016e-05, + "loss": 0.0803, + "step": 137520 + }, + { + "epoch": 29.01985592807236, + "grad_norm": 0.026286108419299126, + "learning_rate": 1.4172489122388924e-05, + "loss": 0.0086, + "step": 137530 + }, + { + "epoch": 29.019910090451173, + "grad_norm": 0.06390439718961716, + "learning_rate": 1.4169480101343829e-05, + "loss": 0.0008, + "step": 137540 + }, + { + "epoch": 29.019964252829983, + "grad_norm": 0.037892647087574005, + "learning_rate": 1.4166471080298735e-05, + "loss": 0.0221, + "step": 137550 + }, + { + "epoch": 29.020018415208796, + "grad_norm": 0.001234991941601038, + "learning_rate": 1.4163462059253643e-05, + "loss": 0.027, + "step": 137560 + }, + { + "epoch": 29.02007257758761, + "grad_norm": 1.3176813125610352, + "learning_rate": 1.4160453038208549e-05, + "loss": 0.0488, + "step": 137570 + }, + { + "epoch": 29.02012673996642, + "grad_norm": 0.0009339769603684545, + "learning_rate": 1.4157444017163457e-05, + "loss": 0.0031, + "step": 137580 + }, + { + "epoch": 29.02018090234523, + "grad_norm": 0.001729040639474988, + "learning_rate": 1.4154434996118363e-05, + "loss": 0.0562, + "step": 137590 + }, + { + "epoch": 29.020235064724044, + "grad_norm": 0.48737749457359314, + "learning_rate": 1.4151425975073271e-05, + "loss": 0.0136, + "step": 137600 + }, + { + "epoch": 29.020289227102854, + "grad_norm": 1.5142384767532349, + "learning_rate": 1.4148416954028176e-05, + "loss": 0.0754, + "step": 137610 + }, + { + "epoch": 29.020343389481667, + "grad_norm": 1.7690900564193726, + "learning_rate": 1.4145407932983084e-05, + "loss": 0.1454, + "step": 137620 + }, + { + "epoch": 29.020397551860476, + "grad_norm": 0.0015494059771299362, + "learning_rate": 1.414239891193799e-05, + "loss": 0.053, + "step": 137630 + }, + { + "epoch": 29.02045171423929, + "grad_norm": 2.6944022178649902, + "learning_rate": 1.4139389890892898e-05, + "loss": 0.0249, + "step": 137640 + }, + { + "epoch": 29.020505876618103, + "grad_norm": 0.0014324478106573224, + "learning_rate": 1.4136380869847804e-05, + "loss": 0.0324, + "step": 137650 + }, + { + "epoch": 29.020560038996912, + "grad_norm": 0.0023731654509902, + "learning_rate": 1.4133371848802712e-05, + "loss": 0.1186, + "step": 137660 + }, + { + "epoch": 29.020614201375725, + "grad_norm": 4.140650272369385, + "learning_rate": 1.4130362827757617e-05, + "loss": 0.0731, + "step": 137670 + }, + { + "epoch": 29.020668363754535, + "grad_norm": 0.0016719324048608541, + "learning_rate": 1.4127353806712527e-05, + "loss": 0.0006, + "step": 137680 + }, + { + "epoch": 29.020722526133348, + "grad_norm": 0.028567688539624214, + "learning_rate": 1.4124344785667431e-05, + "loss": 0.0301, + "step": 137690 + }, + { + "epoch": 29.02077668851216, + "grad_norm": 0.002351607196033001, + "learning_rate": 1.4121335764622337e-05, + "loss": 0.0472, + "step": 137700 + }, + { + "epoch": 29.02083085089097, + "grad_norm": 0.08017820119857788, + "learning_rate": 
1.4118326743577245e-05, + "loss": 0.01, + "step": 137710 + }, + { + "epoch": 29.020885013269783, + "grad_norm": 6.115312099456787, + "learning_rate": 1.4115317722532152e-05, + "loss": 0.1472, + "step": 137720 + }, + { + "epoch": 29.020939175648593, + "grad_norm": 0.042907945811748505, + "learning_rate": 1.411230870148706e-05, + "loss": 0.0005, + "step": 137730 + }, + { + "epoch": 29.020993338027406, + "grad_norm": 0.001604067045263946, + "learning_rate": 1.4109299680441964e-05, + "loss": 0.0736, + "step": 137740 + }, + { + "epoch": 29.02104750040622, + "grad_norm": 3.070174217224121, + "learning_rate": 1.4106290659396872e-05, + "loss": 0.0407, + "step": 137750 + }, + { + "epoch": 29.02110166278503, + "grad_norm": 0.22772973775863647, + "learning_rate": 1.4103281638351779e-05, + "loss": 0.0623, + "step": 137760 + }, + { + "epoch": 29.02115582516384, + "grad_norm": 0.011354553513228893, + "learning_rate": 1.4100272617306686e-05, + "loss": 0.0406, + "step": 137770 + }, + { + "epoch": 29.021209987542655, + "grad_norm": 0.0014833626337349415, + "learning_rate": 1.4097263596261593e-05, + "loss": 0.0092, + "step": 137780 + }, + { + "epoch": 29.021264149921464, + "grad_norm": 0.4775385558605194, + "learning_rate": 1.40942545752165e-05, + "loss": 0.0257, + "step": 137790 + }, + { + "epoch": 29.021318312300277, + "grad_norm": 0.0014459671219810843, + "learning_rate": 1.4091245554171405e-05, + "loss": 0.0173, + "step": 137800 + }, + { + "epoch": 29.021372474679087, + "grad_norm": 3.958150625228882, + "learning_rate": 1.4088236533126315e-05, + "loss": 0.0534, + "step": 137810 + }, + { + "epoch": 29.0214266370579, + "grad_norm": 0.0013834529090672731, + "learning_rate": 1.408522751208122e-05, + "loss": 0.0169, + "step": 137820 + }, + { + "epoch": 29.021480799436713, + "grad_norm": 0.005152693949639797, + "learning_rate": 1.4082218491036128e-05, + "loss": 0.0223, + "step": 137830 + }, + { + "epoch": 29.021534961815522, + "grad_norm": 0.4415989816188812, + "learning_rate": 1.4079209469991034e-05, + "loss": 0.0499, + "step": 137840 + }, + { + "epoch": 29.021589124194335, + "grad_norm": 0.3477841019630432, + "learning_rate": 1.407620044894594e-05, + "loss": 0.0319, + "step": 137850 + }, + { + "epoch": 29.021643286573145, + "grad_norm": 0.019179772585630417, + "learning_rate": 1.4073191427900848e-05, + "loss": 0.0276, + "step": 137860 + }, + { + "epoch": 29.021697448951958, + "grad_norm": 0.8201265931129456, + "learning_rate": 1.4070182406855753e-05, + "loss": 0.0352, + "step": 137870 + }, + { + "epoch": 29.02175161133077, + "grad_norm": 0.0010704539017751813, + "learning_rate": 1.406717338581066e-05, + "loss": 0.0435, + "step": 137880 + }, + { + "epoch": 29.02180577370958, + "grad_norm": 0.46940651535987854, + "learning_rate": 1.4064164364765567e-05, + "loss": 0.0399, + "step": 137890 + }, + { + "epoch": 29.021859936088394, + "grad_norm": 0.7406325936317444, + "learning_rate": 1.4061155343720475e-05, + "loss": 0.0361, + "step": 137900 + }, + { + "epoch": 29.021914098467203, + "grad_norm": 0.002314672339707613, + "learning_rate": 1.4058146322675381e-05, + "loss": 0.0451, + "step": 137910 + }, + { + "epoch": 29.021968260846016, + "grad_norm": 0.0010494240559637547, + "learning_rate": 1.4055137301630289e-05, + "loss": 0.0433, + "step": 137920 + }, + { + "epoch": 29.02202242322483, + "grad_norm": 1.37785804271698, + "learning_rate": 1.4052128280585194e-05, + "loss": 0.0325, + "step": 137930 + }, + { + "epoch": 29.02207658560364, + "grad_norm": 2.8029003143310547, + "learning_rate": 1.4049119259540103e-05, + 
"loss": 0.0805, + "step": 137940 + }, + { + "epoch": 29.02213074798245, + "grad_norm": 0.7211646437644958, + "learning_rate": 1.4046110238495008e-05, + "loss": 0.0979, + "step": 137950 + }, + { + "epoch": 29.022184910361265, + "grad_norm": 4.123842716217041, + "learning_rate": 1.4043101217449916e-05, + "loss": 0.1377, + "step": 137960 + }, + { + "epoch": 29.022239072740074, + "grad_norm": 0.012201678939163685, + "learning_rate": 1.4040092196404822e-05, + "loss": 0.024, + "step": 137970 + }, + { + "epoch": 29.022293235118887, + "grad_norm": 0.0013043781509622931, + "learning_rate": 1.403708317535973e-05, + "loss": 0.0116, + "step": 137980 + }, + { + "epoch": 29.022347397497697, + "grad_norm": 0.9302355051040649, + "learning_rate": 1.4034074154314636e-05, + "loss": 0.0524, + "step": 137990 + }, + { + "epoch": 29.02240155987651, + "grad_norm": 0.0016146990237757564, + "learning_rate": 1.4031065133269541e-05, + "loss": 0.1115, + "step": 138000 + }, + { + "epoch": 29.022455722255323, + "grad_norm": 0.0019403300248086452, + "learning_rate": 1.4028056112224449e-05, + "loss": 0.0165, + "step": 138010 + }, + { + "epoch": 29.022509884634133, + "grad_norm": 0.16129855811595917, + "learning_rate": 1.4025047091179355e-05, + "loss": 0.0037, + "step": 138020 + }, + { + "epoch": 29.022564047012946, + "grad_norm": 0.0016841983888298273, + "learning_rate": 1.4022038070134263e-05, + "loss": 0.0109, + "step": 138030 + }, + { + "epoch": 29.022618209391755, + "grad_norm": 11.259441375732422, + "learning_rate": 1.401902904908917e-05, + "loss": 0.099, + "step": 138040 + }, + { + "epoch": 29.022672371770568, + "grad_norm": 1.1566601991653442, + "learning_rate": 1.4016020028044077e-05, + "loss": 0.0144, + "step": 138050 + }, + { + "epoch": 29.02272653414938, + "grad_norm": 0.002518636407330632, + "learning_rate": 1.4013011006998982e-05, + "loss": 0.0659, + "step": 138060 + }, + { + "epoch": 29.02278069652819, + "grad_norm": 0.0014521342236548662, + "learning_rate": 1.4010001985953892e-05, + "loss": 0.0178, + "step": 138070 + }, + { + "epoch": 29.022834858907004, + "grad_norm": 0.0010830878745764494, + "learning_rate": 1.4006992964908796e-05, + "loss": 0.0008, + "step": 138080 + }, + { + "epoch": 29.022889021285813, + "grad_norm": 0.0025973378214985132, + "learning_rate": 1.4003983943863704e-05, + "loss": 0.0303, + "step": 138090 + }, + { + "epoch": 29.022943183664626, + "grad_norm": 0.08751651644706726, + "learning_rate": 1.400097492281861e-05, + "loss": 0.0484, + "step": 138100 + }, + { + "epoch": 29.02299734604344, + "grad_norm": 0.017854535952210426, + "learning_rate": 1.3997965901773518e-05, + "loss": 0.0094, + "step": 138110 + }, + { + "epoch": 29.02305150842225, + "grad_norm": 0.0028815995901823044, + "learning_rate": 1.3994956880728425e-05, + "loss": 0.1223, + "step": 138120 + }, + { + "epoch": 29.023105670801062, + "grad_norm": 0.0029320635367184877, + "learning_rate": 1.3991947859683333e-05, + "loss": 0.0569, + "step": 138130 + }, + { + "epoch": 29.023159833179875, + "grad_norm": 3.1047513484954834, + "learning_rate": 1.3988938838638237e-05, + "loss": 0.082, + "step": 138140 + }, + { + "epoch": 29.023213995558685, + "grad_norm": 0.222146674990654, + "learning_rate": 1.3985929817593144e-05, + "loss": 0.0125, + "step": 138150 + }, + { + "epoch": 29.023268157937498, + "grad_norm": 0.042453378438949585, + "learning_rate": 1.3982920796548052e-05, + "loss": 0.0316, + "step": 138160 + }, + { + "epoch": 29.023322320316307, + "grad_norm": 1.3160640001296997, + "learning_rate": 1.3979911775502958e-05, + "loss": 
0.0212, + "step": 138170 + }, + { + "epoch": 29.02337648269512, + "grad_norm": 0.294459730386734, + "learning_rate": 1.3976902754457866e-05, + "loss": 0.0337, + "step": 138180 + }, + { + "epoch": 29.023430645073933, + "grad_norm": 2.3360867500305176, + "learning_rate": 1.397389373341277e-05, + "loss": 0.0241, + "step": 138190 + }, + { + "epoch": 29.023484807452743, + "grad_norm": 1.0838671922683716, + "learning_rate": 1.397088471236768e-05, + "loss": 0.0877, + "step": 138200 + }, + { + "epoch": 29.023538969831556, + "grad_norm": 0.0014574414817616343, + "learning_rate": 1.3967875691322585e-05, + "loss": 0.0608, + "step": 138210 + }, + { + "epoch": 29.023593132210365, + "grad_norm": 0.0013538795756176114, + "learning_rate": 1.3964866670277493e-05, + "loss": 0.0432, + "step": 138220 + }, + { + "epoch": 29.02364729458918, + "grad_norm": 0.005886403378099203, + "learning_rate": 1.3961857649232399e-05, + "loss": 0.0006, + "step": 138230 + }, + { + "epoch": 29.02370145696799, + "grad_norm": 0.0013432990526780486, + "learning_rate": 1.3958848628187307e-05, + "loss": 0.0045, + "step": 138240 + }, + { + "epoch": 29.0237556193468, + "grad_norm": 0.8334677219390869, + "learning_rate": 1.3955839607142213e-05, + "loss": 0.0541, + "step": 138250 + }, + { + "epoch": 29.023809781725614, + "grad_norm": 0.008941123262047768, + "learning_rate": 1.3952830586097121e-05, + "loss": 0.0565, + "step": 138260 + }, + { + "epoch": 29.023863944104423, + "grad_norm": 0.001078106346540153, + "learning_rate": 1.3949821565052026e-05, + "loss": 0.0678, + "step": 138270 + }, + { + "epoch": 29.023918106483237, + "grad_norm": 1.017729640007019, + "learning_rate": 1.3946812544006935e-05, + "loss": 0.0241, + "step": 138280 + }, + { + "epoch": 29.02397226886205, + "grad_norm": 0.0011515115620568395, + "learning_rate": 1.394380352296184e-05, + "loss": 0.0075, + "step": 138290 + }, + { + "epoch": 29.02402643124086, + "grad_norm": 1.0960094928741455, + "learning_rate": 1.3940794501916746e-05, + "loss": 0.0645, + "step": 138300 + }, + { + "epoch": 29.024080593619672, + "grad_norm": 0.0028099699411541224, + "learning_rate": 1.3937785480871654e-05, + "loss": 0.076, + "step": 138310 + }, + { + "epoch": 29.02413475599848, + "grad_norm": 0.7377060651779175, + "learning_rate": 1.3934776459826559e-05, + "loss": 0.0519, + "step": 138320 + }, + { + "epoch": 29.024188918377295, + "grad_norm": 0.5722109079360962, + "learning_rate": 1.3931767438781468e-05, + "loss": 0.0622, + "step": 138330 + }, + { + "epoch": 29.024243080756108, + "grad_norm": 0.0013953922316432, + "learning_rate": 1.3928758417736373e-05, + "loss": 0.0068, + "step": 138340 + }, + { + "epoch": 29.024297243134917, + "grad_norm": 0.03607865795493126, + "learning_rate": 1.3925749396691281e-05, + "loss": 0.0659, + "step": 138350 + }, + { + "epoch": 29.02435140551373, + "grad_norm": 2.123250961303711, + "learning_rate": 1.3922740375646187e-05, + "loss": 0.0219, + "step": 138360 + }, + { + "epoch": 29.024405567892543, + "grad_norm": 0.7715107202529907, + "learning_rate": 1.3919731354601095e-05, + "loss": 0.0697, + "step": 138370 + }, + { + "epoch": 29.024459730271353, + "grad_norm": 0.0011168451746925712, + "learning_rate": 1.3916722333556001e-05, + "loss": 0.0729, + "step": 138380 + }, + { + "epoch": 29.024513892650166, + "grad_norm": 0.0015373743372038007, + "learning_rate": 1.391371331251091e-05, + "loss": 0.0085, + "step": 138390 + }, + { + "epoch": 29.024568055028976, + "grad_norm": 0.5182665586471558, + "learning_rate": 1.3910704291465814e-05, + "loss": 0.0274, + "step": 138400 
+ }, + { + "epoch": 29.02462221740779, + "grad_norm": 0.09579063951969147, + "learning_rate": 1.3907695270420724e-05, + "loss": 0.0424, + "step": 138410 + }, + { + "epoch": 29.0246763797866, + "grad_norm": 0.00561527768149972, + "learning_rate": 1.3904686249375628e-05, + "loss": 0.0497, + "step": 138420 + }, + { + "epoch": 29.02473054216541, + "grad_norm": 0.0010057835606858134, + "learning_rate": 1.3901677228330536e-05, + "loss": 0.0272, + "step": 138430 + }, + { + "epoch": 29.024784704544224, + "grad_norm": 0.006153935566544533, + "learning_rate": 1.3898668207285442e-05, + "loss": 0.0481, + "step": 138440 + }, + { + "epoch": 29.024838866923034, + "grad_norm": 0.06736212968826294, + "learning_rate": 1.3895659186240349e-05, + "loss": 0.0224, + "step": 138450 + }, + { + "epoch": 29.024893029301847, + "grad_norm": 0.0016050394624471664, + "learning_rate": 1.3892650165195257e-05, + "loss": 0.0039, + "step": 138460 + }, + { + "epoch": 29.02494719168066, + "grad_norm": 0.043433938175439835, + "learning_rate": 1.3889641144150161e-05, + "loss": 0.0088, + "step": 138470 + }, + { + "epoch": 29.02500135405947, + "grad_norm": 0.0010403543710708618, + "learning_rate": 1.388663212310507e-05, + "loss": 0.0432, + "step": 138480 + }, + { + "epoch": 29.02500135405947, + "eval_accuracy": 0.8641410842586544, + "eval_loss": 0.8324839472770691, + "eval_runtime": 113.246, + "eval_samples_per_second": 27.038, + "eval_steps_per_second": 3.382, + "step": 138480 + }, + { + "epoch": 30.000054162378813, + "grad_norm": 0.10976050049066544, + "learning_rate": 1.3883623102059976e-05, + "loss": 0.0109, + "step": 138490 + }, + { + "epoch": 30.000108324757623, + "grad_norm": 0.02296866849064827, + "learning_rate": 1.3880614081014884e-05, + "loss": 0.0044, + "step": 138500 + }, + { + "epoch": 30.000162487136436, + "grad_norm": 17.942174911499023, + "learning_rate": 1.387760505996979e-05, + "loss": 0.083, + "step": 138510 + }, + { + "epoch": 30.000216649515245, + "grad_norm": 0.0009538385784253478, + "learning_rate": 1.3874596038924698e-05, + "loss": 0.0258, + "step": 138520 + }, + { + "epoch": 30.000270811894058, + "grad_norm": 0.0009595148148946464, + "learning_rate": 1.3871587017879602e-05, + "loss": 0.025, + "step": 138530 + }, + { + "epoch": 30.00032497427287, + "grad_norm": 0.002057319739833474, + "learning_rate": 1.3868577996834512e-05, + "loss": 0.0615, + "step": 138540 + }, + { + "epoch": 30.00037913665168, + "grad_norm": 0.0011708278907462955, + "learning_rate": 1.3865568975789417e-05, + "loss": 0.0763, + "step": 138550 + }, + { + "epoch": 30.000433299030494, + "grad_norm": 0.0011869609588757157, + "learning_rate": 1.3862559954744325e-05, + "loss": 0.008, + "step": 138560 + }, + { + "epoch": 30.000487461409303, + "grad_norm": 0.8824222683906555, + "learning_rate": 1.385955093369923e-05, + "loss": 0.0067, + "step": 138570 + }, + { + "epoch": 30.000541623788116, + "grad_norm": 0.06694720685482025, + "learning_rate": 1.3856541912654139e-05, + "loss": 0.0718, + "step": 138580 + }, + { + "epoch": 30.00059578616693, + "grad_norm": 0.0011411912273615599, + "learning_rate": 1.3853532891609045e-05, + "loss": 0.0051, + "step": 138590 + }, + { + "epoch": 30.00064994854574, + "grad_norm": 4.759274482727051, + "learning_rate": 1.385052387056395e-05, + "loss": 0.0527, + "step": 138600 + }, + { + "epoch": 30.000704110924552, + "grad_norm": 1.0812617540359497, + "learning_rate": 1.3847514849518858e-05, + "loss": 0.0192, + "step": 138610 + }, + { + "epoch": 30.000758273303365, + "grad_norm": 0.001365778036415577, + "learning_rate": 
1.3844505828473764e-05, + "loss": 0.0178, + "step": 138620 + }, + { + "epoch": 30.000812435682175, + "grad_norm": 0.0011204121401533484, + "learning_rate": 1.3841496807428672e-05, + "loss": 0.0459, + "step": 138630 + }, + { + "epoch": 30.000866598060988, + "grad_norm": 0.0011002658866345882, + "learning_rate": 1.3838487786383578e-05, + "loss": 0.0262, + "step": 138640 + }, + { + "epoch": 30.000920760439797, + "grad_norm": 0.21767382323741913, + "learning_rate": 1.3835478765338486e-05, + "loss": 0.0005, + "step": 138650 + }, + { + "epoch": 30.00097492281861, + "grad_norm": 0.0009497120045125484, + "learning_rate": 1.383246974429339e-05, + "loss": 0.0182, + "step": 138660 + }, + { + "epoch": 30.001029085197423, + "grad_norm": 0.0011804227251559496, + "learning_rate": 1.38294607232483e-05, + "loss": 0.0739, + "step": 138670 + }, + { + "epoch": 30.001083247576233, + "grad_norm": 0.0009978408925235271, + "learning_rate": 1.3826451702203205e-05, + "loss": 0.091, + "step": 138680 + }, + { + "epoch": 30.001137409955046, + "grad_norm": 0.000974586873780936, + "learning_rate": 1.3823442681158113e-05, + "loss": 0.0364, + "step": 138690 + }, + { + "epoch": 30.001191572333855, + "grad_norm": 0.21060337126255035, + "learning_rate": 1.382043366011302e-05, + "loss": 0.0118, + "step": 138700 + }, + { + "epoch": 30.00124573471267, + "grad_norm": 14.2246675491333, + "learning_rate": 1.3817424639067927e-05, + "loss": 0.1993, + "step": 138710 + }, + { + "epoch": 30.00129989709148, + "grad_norm": 0.6603474020957947, + "learning_rate": 1.3814415618022833e-05, + "loss": 0.0673, + "step": 138720 + }, + { + "epoch": 30.00135405947029, + "grad_norm": 0.001121916458941996, + "learning_rate": 1.3811406596977741e-05, + "loss": 0.0166, + "step": 138730 + }, + { + "epoch": 30.001408221849104, + "grad_norm": 4.967649936676025, + "learning_rate": 1.3808397575932646e-05, + "loss": 0.022, + "step": 138740 + }, + { + "epoch": 30.001462384227914, + "grad_norm": 0.0013437796151265502, + "learning_rate": 1.3805388554887552e-05, + "loss": 0.0364, + "step": 138750 + }, + { + "epoch": 30.001516546606727, + "grad_norm": 0.31015047430992126, + "learning_rate": 1.380237953384246e-05, + "loss": 0.0643, + "step": 138760 + }, + { + "epoch": 30.00157070898554, + "grad_norm": 0.03246820345520973, + "learning_rate": 1.3799370512797366e-05, + "loss": 0.0446, + "step": 138770 + }, + { + "epoch": 30.00162487136435, + "grad_norm": 0.003151151817291975, + "learning_rate": 1.3796361491752274e-05, + "loss": 0.0218, + "step": 138780 + }, + { + "epoch": 30.001679033743162, + "grad_norm": 0.028842603787779808, + "learning_rate": 1.3793352470707179e-05, + "loss": 0.0046, + "step": 138790 + }, + { + "epoch": 30.001733196121975, + "grad_norm": 0.002234856365248561, + "learning_rate": 1.3790343449662089e-05, + "loss": 0.0303, + "step": 138800 + }, + { + "epoch": 30.001787358500785, + "grad_norm": 2.3447470664978027, + "learning_rate": 1.3787334428616993e-05, + "loss": 0.0138, + "step": 138810 + }, + { + "epoch": 30.001841520879598, + "grad_norm": 0.0016427722293883562, + "learning_rate": 1.3784325407571901e-05, + "loss": 0.0621, + "step": 138820 + }, + { + "epoch": 30.001895683258407, + "grad_norm": 0.005449227057397366, + "learning_rate": 1.3781316386526808e-05, + "loss": 0.0805, + "step": 138830 + }, + { + "epoch": 30.00194984563722, + "grad_norm": 0.0009809606708586216, + "learning_rate": 1.3778307365481715e-05, + "loss": 0.0589, + "step": 138840 + }, + { + "epoch": 30.002004008016034, + "grad_norm": 0.0011078763054683805, + "learning_rate": 
1.3775298344436622e-05, + "loss": 0.0165, + "step": 138850 + }, + { + "epoch": 30.002058170394843, + "grad_norm": 0.0010544228134676814, + "learning_rate": 1.377228932339153e-05, + "loss": 0.0064, + "step": 138860 + }, + { + "epoch": 30.002112332773656, + "grad_norm": 0.7453145980834961, + "learning_rate": 1.3769280302346434e-05, + "loss": 0.0149, + "step": 138870 + }, + { + "epoch": 30.002166495152466, + "grad_norm": 0.35436445474624634, + "learning_rate": 1.3766271281301344e-05, + "loss": 0.0754, + "step": 138880 + }, + { + "epoch": 30.00222065753128, + "grad_norm": 1.6646168231964111, + "learning_rate": 1.3763262260256249e-05, + "loss": 0.0598, + "step": 138890 + }, + { + "epoch": 30.00227481991009, + "grad_norm": 3.260178327560425, + "learning_rate": 1.3760253239211155e-05, + "loss": 0.0754, + "step": 138900 + }, + { + "epoch": 30.0023289822889, + "grad_norm": 0.0009908931097015738, + "learning_rate": 1.3757244218166063e-05, + "loss": 0.05, + "step": 138910 + }, + { + "epoch": 30.002383144667714, + "grad_norm": 0.05052602291107178, + "learning_rate": 1.3754235197120967e-05, + "loss": 0.031, + "step": 138920 + }, + { + "epoch": 30.002437307046524, + "grad_norm": 0.0012528728693723679, + "learning_rate": 1.3751226176075877e-05, + "loss": 0.0235, + "step": 138930 + }, + { + "epoch": 30.002491469425337, + "grad_norm": 0.0073355273343622684, + "learning_rate": 1.3748217155030782e-05, + "loss": 0.0282, + "step": 138940 + }, + { + "epoch": 30.00254563180415, + "grad_norm": 0.04626655951142311, + "learning_rate": 1.374520813398569e-05, + "loss": 0.0195, + "step": 138950 + }, + { + "epoch": 30.00259979418296, + "grad_norm": 0.0009972364641726017, + "learning_rate": 1.3742199112940596e-05, + "loss": 0.0323, + "step": 138960 + }, + { + "epoch": 30.002653956561772, + "grad_norm": 0.006610110867768526, + "learning_rate": 1.3739190091895504e-05, + "loss": 0.014, + "step": 138970 + }, + { + "epoch": 30.002708118940586, + "grad_norm": 0.0009713415638543665, + "learning_rate": 1.373618107085041e-05, + "loss": 0.0363, + "step": 138980 + }, + { + "epoch": 30.002762281319395, + "grad_norm": 3.2172391414642334, + "learning_rate": 1.3733172049805318e-05, + "loss": 0.0636, + "step": 138990 + }, + { + "epoch": 30.002816443698208, + "grad_norm": 0.5673621892929077, + "learning_rate": 1.3730163028760223e-05, + "loss": 0.0104, + "step": 139000 + }, + { + "epoch": 30.002870606077018, + "grad_norm": 0.0013431374682113528, + "learning_rate": 1.3727154007715132e-05, + "loss": 0.0579, + "step": 139010 + }, + { + "epoch": 30.00292476845583, + "grad_norm": 5.126908779144287, + "learning_rate": 1.3724144986670037e-05, + "loss": 0.095, + "step": 139020 + }, + { + "epoch": 30.002978930834644, + "grad_norm": 0.0026249848306179047, + "learning_rate": 1.3721135965624945e-05, + "loss": 0.0914, + "step": 139030 + }, + { + "epoch": 30.003033093213453, + "grad_norm": 2.2942934036254883, + "learning_rate": 1.3718126944579851e-05, + "loss": 0.0709, + "step": 139040 + }, + { + "epoch": 30.003087255592266, + "grad_norm": 0.653549075126648, + "learning_rate": 1.3715117923534756e-05, + "loss": 0.0433, + "step": 139050 + }, + { + "epoch": 30.003141417971076, + "grad_norm": 0.0013255313970148563, + "learning_rate": 1.3712108902489665e-05, + "loss": 0.015, + "step": 139060 + }, + { + "epoch": 30.00319558034989, + "grad_norm": 0.0010615467326715589, + "learning_rate": 1.370909988144457e-05, + "loss": 0.0127, + "step": 139070 + }, + { + "epoch": 30.003249742728702, + "grad_norm": 0.710225522518158, + "learning_rate": 
1.3706090860399478e-05, + "loss": 0.026, + "step": 139080 + }, + { + "epoch": 30.00330390510751, + "grad_norm": 0.2675127685070038, + "learning_rate": 1.3703081839354384e-05, + "loss": 0.0565, + "step": 139090 + }, + { + "epoch": 30.003358067486325, + "grad_norm": 1.5609959363937378, + "learning_rate": 1.3700072818309292e-05, + "loss": 0.0545, + "step": 139100 + }, + { + "epoch": 30.003412229865134, + "grad_norm": 0.7413500547409058, + "learning_rate": 1.3697063797264198e-05, + "loss": 0.0171, + "step": 139110 + }, + { + "epoch": 30.003466392243947, + "grad_norm": 0.0010389778763055801, + "learning_rate": 1.3694054776219106e-05, + "loss": 0.0757, + "step": 139120 + }, + { + "epoch": 30.00352055462276, + "grad_norm": 0.0011987584875896573, + "learning_rate": 1.3691045755174011e-05, + "loss": 0.0039, + "step": 139130 + }, + { + "epoch": 30.00357471700157, + "grad_norm": 0.001111264224164188, + "learning_rate": 1.368803673412892e-05, + "loss": 0.1154, + "step": 139140 + }, + { + "epoch": 30.003628879380383, + "grad_norm": 0.0012294326443225145, + "learning_rate": 1.3685027713083825e-05, + "loss": 0.0098, + "step": 139150 + }, + { + "epoch": 30.003683041759196, + "grad_norm": 0.9317246675491333, + "learning_rate": 1.3682018692038733e-05, + "loss": 0.0665, + "step": 139160 + }, + { + "epoch": 30.003737204138005, + "grad_norm": 0.3280019462108612, + "learning_rate": 1.367900967099364e-05, + "loss": 0.0633, + "step": 139170 + }, + { + "epoch": 30.00379136651682, + "grad_norm": 0.06740517169237137, + "learning_rate": 1.3676000649948547e-05, + "loss": 0.0997, + "step": 139180 + }, + { + "epoch": 30.003845528895628, + "grad_norm": 0.0010052102152258158, + "learning_rate": 1.3672991628903454e-05, + "loss": 0.0635, + "step": 139190 + }, + { + "epoch": 30.00389969127444, + "grad_norm": 0.0010094994213432074, + "learning_rate": 1.3669982607858358e-05, + "loss": 0.0521, + "step": 139200 + }, + { + "epoch": 30.003953853653254, + "grad_norm": 0.08693958073854446, + "learning_rate": 1.3666973586813266e-05, + "loss": 0.0815, + "step": 139210 + }, + { + "epoch": 30.004008016032063, + "grad_norm": 0.0526759997010231, + "learning_rate": 1.3663964565768173e-05, + "loss": 0.0217, + "step": 139220 + }, + { + "epoch": 30.004062178410877, + "grad_norm": 0.0010100625222548842, + "learning_rate": 1.366095554472308e-05, + "loss": 0.038, + "step": 139230 + }, + { + "epoch": 30.004116340789686, + "grad_norm": 0.0015208483673632145, + "learning_rate": 1.3657946523677987e-05, + "loss": 0.0155, + "step": 139240 + }, + { + "epoch": 30.0041705031685, + "grad_norm": 0.7380883693695068, + "learning_rate": 1.3654937502632895e-05, + "loss": 0.0515, + "step": 139250 + }, + { + "epoch": 30.004224665547312, + "grad_norm": 0.049591101706027985, + "learning_rate": 1.36519284815878e-05, + "loss": 0.0115, + "step": 139260 + }, + { + "epoch": 30.00427882792612, + "grad_norm": 0.008980856277048588, + "learning_rate": 1.3648919460542709e-05, + "loss": 0.0121, + "step": 139270 + }, + { + "epoch": 30.004332990304935, + "grad_norm": 0.6251480579376221, + "learning_rate": 1.3645910439497614e-05, + "loss": 0.0278, + "step": 139280 + }, + { + "epoch": 30.004387152683744, + "grad_norm": 1.2597931623458862, + "learning_rate": 1.3642901418452522e-05, + "loss": 0.0609, + "step": 139290 + }, + { + "epoch": 30.004441315062557, + "grad_norm": 2.3640899658203125, + "learning_rate": 1.3639892397407428e-05, + "loss": 0.0255, + "step": 139300 + }, + { + "epoch": 30.00449547744137, + "grad_norm": 0.0018031809013336897, + "learning_rate": 
1.3636883376362336e-05, + "loss": 0.025, + "step": 139310 + }, + { + "epoch": 30.00454963982018, + "grad_norm": 0.0011717225424945354, + "learning_rate": 1.3633874355317242e-05, + "loss": 0.001, + "step": 139320 + }, + { + "epoch": 30.004603802198993, + "grad_norm": 0.0016206679865717888, + "learning_rate": 1.363086533427215e-05, + "loss": 0.009, + "step": 139330 + }, + { + "epoch": 30.004657964577806, + "grad_norm": 0.0011664885096251965, + "learning_rate": 1.3627856313227055e-05, + "loss": 0.0203, + "step": 139340 + }, + { + "epoch": 30.004712126956615, + "grad_norm": 0.002670158864930272, + "learning_rate": 1.3624847292181961e-05, + "loss": 0.0725, + "step": 139350 + }, + { + "epoch": 30.00476628933543, + "grad_norm": 0.9140305519104004, + "learning_rate": 1.3621838271136869e-05, + "loss": 0.012, + "step": 139360 + }, + { + "epoch": 30.004820451714238, + "grad_norm": 0.0037807468324899673, + "learning_rate": 1.3618829250091775e-05, + "loss": 0.0599, + "step": 139370 + }, + { + "epoch": 30.00487461409305, + "grad_norm": 0.0011559037957340479, + "learning_rate": 1.3615820229046683e-05, + "loss": 0.0487, + "step": 139380 + }, + { + "epoch": 30.004928776471864, + "grad_norm": 0.0009051589877344668, + "learning_rate": 1.3612811208001588e-05, + "loss": 0.0046, + "step": 139390 + }, + { + "epoch": 30.004982938850674, + "grad_norm": 6.799541473388672, + "learning_rate": 1.3609802186956497e-05, + "loss": 0.1284, + "step": 139400 + }, + { + "epoch": 30.005037101229487, + "grad_norm": 0.09302418678998947, + "learning_rate": 1.3606793165911402e-05, + "loss": 0.0007, + "step": 139410 + }, + { + "epoch": 30.005091263608296, + "grad_norm": 0.055866487324237823, + "learning_rate": 1.360378414486631e-05, + "loss": 0.0474, + "step": 139420 + }, + { + "epoch": 30.00514542598711, + "grad_norm": 0.005351018160581589, + "learning_rate": 1.3600775123821216e-05, + "loss": 0.0749, + "step": 139430 + }, + { + "epoch": 30.005199588365922, + "grad_norm": 0.002262710127979517, + "learning_rate": 1.3597766102776124e-05, + "loss": 0.0143, + "step": 139440 + }, + { + "epoch": 30.005253750744732, + "grad_norm": 0.001354701118543744, + "learning_rate": 1.359475708173103e-05, + "loss": 0.0838, + "step": 139450 + }, + { + "epoch": 30.005307913123545, + "grad_norm": 1.2974518537521362, + "learning_rate": 1.3591748060685938e-05, + "loss": 0.0172, + "step": 139460 + }, + { + "epoch": 30.005362075502354, + "grad_norm": 1.000426173210144, + "learning_rate": 1.3588739039640843e-05, + "loss": 0.0948, + "step": 139470 + }, + { + "epoch": 30.005416237881168, + "grad_norm": 0.1826702058315277, + "learning_rate": 1.3585730018595753e-05, + "loss": 0.0586, + "step": 139480 + }, + { + "epoch": 30.00547040025998, + "grad_norm": 0.007049197796732187, + "learning_rate": 1.3582720997550657e-05, + "loss": 0.0106, + "step": 139490 + }, + { + "epoch": 30.00552456263879, + "grad_norm": 1.9737991094589233, + "learning_rate": 1.3579711976505563e-05, + "loss": 0.0641, + "step": 139500 + }, + { + "epoch": 30.005578725017603, + "grad_norm": 0.0015148891834542155, + "learning_rate": 1.3576702955460471e-05, + "loss": 0.1235, + "step": 139510 + }, + { + "epoch": 30.005632887396416, + "grad_norm": 0.0082296933978796, + "learning_rate": 1.3573693934415376e-05, + "loss": 0.0503, + "step": 139520 + }, + { + "epoch": 30.005687049775226, + "grad_norm": 0.003776038996875286, + "learning_rate": 1.3570684913370286e-05, + "loss": 0.0856, + "step": 139530 + }, + { + "epoch": 30.00574121215404, + "grad_norm": 6.216279983520508, + "learning_rate": 
1.356767589232519e-05, + "loss": 0.0057, + "step": 139540 + }, + { + "epoch": 30.00579537453285, + "grad_norm": 0.28730759024620056, + "learning_rate": 1.3564666871280098e-05, + "loss": 0.0313, + "step": 139550 + }, + { + "epoch": 30.00584953691166, + "grad_norm": 0.18604305386543274, + "learning_rate": 1.3561657850235005e-05, + "loss": 0.0527, + "step": 139560 + }, + { + "epoch": 30.005903699290474, + "grad_norm": 0.07290593534708023, + "learning_rate": 1.3558648829189912e-05, + "loss": 0.0116, + "step": 139570 + }, + { + "epoch": 30.005957861669284, + "grad_norm": 0.039249420166015625, + "learning_rate": 1.3555639808144819e-05, + "loss": 0.0549, + "step": 139580 + }, + { + "epoch": 30.006012024048097, + "grad_norm": 0.003125265706330538, + "learning_rate": 1.3552630787099727e-05, + "loss": 0.0637, + "step": 139590 + }, + { + "epoch": 30.006066186426906, + "grad_norm": 1.328411340713501, + "learning_rate": 1.3549621766054631e-05, + "loss": 0.0094, + "step": 139600 + }, + { + "epoch": 30.00612034880572, + "grad_norm": 1.0280174016952515, + "learning_rate": 1.3546612745009541e-05, + "loss": 0.02, + "step": 139610 + }, + { + "epoch": 30.006174511184533, + "grad_norm": 0.9233240485191345, + "learning_rate": 1.3543603723964446e-05, + "loss": 0.0138, + "step": 139620 + }, + { + "epoch": 30.006228673563342, + "grad_norm": 0.0009931904496625066, + "learning_rate": 1.3540594702919354e-05, + "loss": 0.009, + "step": 139630 + }, + { + "epoch": 30.006282835942155, + "grad_norm": 0.0010135384509339929, + "learning_rate": 1.353758568187426e-05, + "loss": 0.084, + "step": 139640 + }, + { + "epoch": 30.006336998320965, + "grad_norm": 5.739273548126221, + "learning_rate": 1.3534576660829164e-05, + "loss": 0.1343, + "step": 139650 + }, + { + "epoch": 30.006391160699778, + "grad_norm": 0.007555374410003424, + "learning_rate": 1.3531567639784074e-05, + "loss": 0.015, + "step": 139660 + }, + { + "epoch": 30.00644532307859, + "grad_norm": 0.0009669282007962465, + "learning_rate": 1.3528558618738979e-05, + "loss": 0.0029, + "step": 139670 + }, + { + "epoch": 30.0064994854574, + "grad_norm": 0.8920565247535706, + "learning_rate": 1.3525549597693887e-05, + "loss": 0.0161, + "step": 139680 + }, + { + "epoch": 30.006553647836213, + "grad_norm": 1.6001511812210083, + "learning_rate": 1.3522540576648793e-05, + "loss": 0.0901, + "step": 139690 + }, + { + "epoch": 30.006607810215023, + "grad_norm": 0.0010872356360778213, + "learning_rate": 1.35195315556037e-05, + "loss": 0.0368, + "step": 139700 + }, + { + "epoch": 30.006661972593836, + "grad_norm": 0.3472674787044525, + "learning_rate": 1.3516522534558607e-05, + "loss": 0.1102, + "step": 139710 + }, + { + "epoch": 30.00671613497265, + "grad_norm": 0.0010094509925693274, + "learning_rate": 1.3513513513513515e-05, + "loss": 0.0083, + "step": 139720 + }, + { + "epoch": 30.00677029735146, + "grad_norm": 0.0023331325501203537, + "learning_rate": 1.351050449246842e-05, + "loss": 0.0298, + "step": 139730 + }, + { + "epoch": 30.00682445973027, + "grad_norm": 0.001135371276177466, + "learning_rate": 1.350749547142333e-05, + "loss": 0.062, + "step": 139740 + }, + { + "epoch": 30.006878622109085, + "grad_norm": 0.009766790084540844, + "learning_rate": 1.3504486450378234e-05, + "loss": 0.0268, + "step": 139750 + }, + { + "epoch": 30.006932784487894, + "grad_norm": 0.13285669684410095, + "learning_rate": 1.3501477429333142e-05, + "loss": 0.0269, + "step": 139760 + }, + { + "epoch": 30.006986946866707, + "grad_norm": 0.08565661311149597, + "learning_rate": 1.3498468408288048e-05, + 
"loss": 0.089, + "step": 139770 + }, + { + "epoch": 30.007041109245517, + "grad_norm": 0.3317391872406006, + "learning_rate": 1.3495459387242956e-05, + "loss": 0.0286, + "step": 139780 + }, + { + "epoch": 30.00709527162433, + "grad_norm": 0.001061939401552081, + "learning_rate": 1.3492450366197862e-05, + "loss": 0.0073, + "step": 139790 + }, + { + "epoch": 30.007149434003143, + "grad_norm": 1.3075804710388184, + "learning_rate": 1.3489441345152767e-05, + "loss": 0.0261, + "step": 139800 + }, + { + "epoch": 30.007203596381952, + "grad_norm": 0.011708625592291355, + "learning_rate": 1.3486432324107675e-05, + "loss": 0.0027, + "step": 139810 + }, + { + "epoch": 30.007257758760765, + "grad_norm": 0.4787430465221405, + "learning_rate": 1.3483423303062581e-05, + "loss": 0.0156, + "step": 139820 + }, + { + "epoch": 30.007311921139575, + "grad_norm": 0.0009859626879915595, + "learning_rate": 1.348041428201749e-05, + "loss": 0.0305, + "step": 139830 + }, + { + "epoch": 30.007366083518388, + "grad_norm": 0.022072849795222282, + "learning_rate": 1.3477405260972395e-05, + "loss": 0.0018, + "step": 139840 + }, + { + "epoch": 30.0074202458972, + "grad_norm": 0.0009032832458615303, + "learning_rate": 1.3474396239927303e-05, + "loss": 0.0072, + "step": 139850 + }, + { + "epoch": 30.00747440827601, + "grad_norm": 0.002210067817941308, + "learning_rate": 1.3471387218882208e-05, + "loss": 0.0356, + "step": 139860 + }, + { + "epoch": 30.007528570654824, + "grad_norm": 0.00127699866425246, + "learning_rate": 1.3468378197837118e-05, + "loss": 0.0003, + "step": 139870 + }, + { + "epoch": 30.007582733033633, + "grad_norm": 0.27245813608169556, + "learning_rate": 1.3465369176792022e-05, + "loss": 0.0548, + "step": 139880 + }, + { + "epoch": 30.007636895412446, + "grad_norm": 45.38032531738281, + "learning_rate": 1.346236015574693e-05, + "loss": 0.0393, + "step": 139890 + }, + { + "epoch": 30.00769105779126, + "grad_norm": 0.0008859532536007464, + "learning_rate": 1.3459351134701836e-05, + "loss": 0.0271, + "step": 139900 + }, + { + "epoch": 30.00774522017007, + "grad_norm": 0.000869595562107861, + "learning_rate": 1.3456342113656744e-05, + "loss": 0.0072, + "step": 139910 + }, + { + "epoch": 30.007799382548882, + "grad_norm": 1.406944751739502, + "learning_rate": 1.345333309261165e-05, + "loss": 0.0112, + "step": 139920 + }, + { + "epoch": 30.007853544927695, + "grad_norm": 0.03655560314655304, + "learning_rate": 1.3450324071566559e-05, + "loss": 0.0061, + "step": 139930 + }, + { + "epoch": 30.007907707306504, + "grad_norm": 0.0008253091364167631, + "learning_rate": 1.3447315050521463e-05, + "loss": 0.0547, + "step": 139940 + }, + { + "epoch": 30.007961869685317, + "grad_norm": 0.04106289893388748, + "learning_rate": 1.344430602947637e-05, + "loss": 0.0046, + "step": 139950 + }, + { + "epoch": 30.008016032064127, + "grad_norm": 0.007154711987823248, + "learning_rate": 1.3441297008431278e-05, + "loss": 0.0597, + "step": 139960 + }, + { + "epoch": 30.00807019444294, + "grad_norm": 0.004401014186441898, + "learning_rate": 1.3438287987386184e-05, + "loss": 0.0507, + "step": 139970 + }, + { + "epoch": 30.008124356821753, + "grad_norm": 0.0009074354893527925, + "learning_rate": 1.3435278966341092e-05, + "loss": 0.0453, + "step": 139980 + }, + { + "epoch": 30.008178519200563, + "grad_norm": 0.006477907299995422, + "learning_rate": 1.3432269945295996e-05, + "loss": 0.0045, + "step": 139990 + }, + { + "epoch": 30.008232681579376, + "grad_norm": 0.000953121401835233, + "learning_rate": 1.3429260924250906e-05, + "loss": 
0.0003, + "step": 140000 + }, + { + "epoch": 30.008286843958185, + "grad_norm": 0.0009386612218804657, + "learning_rate": 1.342625190320581e-05, + "loss": 0.0417, + "step": 140010 + }, + { + "epoch": 30.008341006336998, + "grad_norm": 0.0011327681131660938, + "learning_rate": 1.3423242882160719e-05, + "loss": 0.0058, + "step": 140020 + }, + { + "epoch": 30.00839516871581, + "grad_norm": 0.0008390609873458743, + "learning_rate": 1.3420233861115625e-05, + "loss": 0.0017, + "step": 140030 + }, + { + "epoch": 30.00844933109462, + "grad_norm": 0.0012442822335287929, + "learning_rate": 1.3417224840070533e-05, + "loss": 0.0475, + "step": 140040 + }, + { + "epoch": 30.008503493473434, + "grad_norm": 1.211889624595642, + "learning_rate": 1.3414215819025439e-05, + "loss": 0.128, + "step": 140050 + }, + { + "epoch": 30.008557655852243, + "grad_norm": 0.0008691027760505676, + "learning_rate": 1.3411206797980347e-05, + "loss": 0.0214, + "step": 140060 + }, + { + "epoch": 30.008611818231056, + "grad_norm": 1.0159168243408203, + "learning_rate": 1.3408197776935252e-05, + "loss": 0.0235, + "step": 140070 + }, + { + "epoch": 30.00866598060987, + "grad_norm": 12.269556999206543, + "learning_rate": 1.3405188755890161e-05, + "loss": 0.082, + "step": 140080 + }, + { + "epoch": 30.00872014298868, + "grad_norm": 0.057913463562726974, + "learning_rate": 1.3402179734845066e-05, + "loss": 0.0184, + "step": 140090 + }, + { + "epoch": 30.008774305367492, + "grad_norm": 0.0011707795783877373, + "learning_rate": 1.3399170713799972e-05, + "loss": 0.0003, + "step": 140100 + }, + { + "epoch": 30.008828467746305, + "grad_norm": 0.0009153418359346688, + "learning_rate": 1.339616169275488e-05, + "loss": 0.0418, + "step": 140110 + }, + { + "epoch": 30.008882630125115, + "grad_norm": 0.1535148024559021, + "learning_rate": 1.3393152671709785e-05, + "loss": 0.0299, + "step": 140120 + }, + { + "epoch": 30.008936792503928, + "grad_norm": 2.5271615982055664, + "learning_rate": 1.3390143650664694e-05, + "loss": 0.0231, + "step": 140130 + }, + { + "epoch": 30.008990954882737, + "grad_norm": 0.0008608180796727538, + "learning_rate": 1.3387134629619599e-05, + "loss": 0.0051, + "step": 140140 + }, + { + "epoch": 30.00904511726155, + "grad_norm": 0.001761824474669993, + "learning_rate": 1.3384125608574507e-05, + "loss": 0.0169, + "step": 140150 + }, + { + "epoch": 30.009099279640363, + "grad_norm": 0.34240254759788513, + "learning_rate": 1.3381116587529413e-05, + "loss": 0.0981, + "step": 140160 + }, + { + "epoch": 30.009153442019173, + "grad_norm": 5.3894853591918945, + "learning_rate": 1.3378107566484321e-05, + "loss": 0.1008, + "step": 140170 + }, + { + "epoch": 30.009207604397986, + "grad_norm": 33.622413635253906, + "learning_rate": 1.3375098545439227e-05, + "loss": 0.082, + "step": 140180 + }, + { + "epoch": 30.009261766776795, + "grad_norm": 0.0013348014326766133, + "learning_rate": 1.3372089524394135e-05, + "loss": 0.0015, + "step": 140190 + }, + { + "epoch": 30.00931592915561, + "grad_norm": 2.1425156593322754, + "learning_rate": 1.336908050334904e-05, + "loss": 0.031, + "step": 140200 + }, + { + "epoch": 30.00937009153442, + "grad_norm": 0.06749147176742554, + "learning_rate": 1.336607148230395e-05, + "loss": 0.0938, + "step": 140210 + }, + { + "epoch": 30.00942425391323, + "grad_norm": 0.00714115472510457, + "learning_rate": 1.3363062461258854e-05, + "loss": 0.0589, + "step": 140220 + }, + { + "epoch": 30.009478416292044, + "grad_norm": 4.237223148345947, + "learning_rate": 1.3360053440213762e-05, + "loss": 0.1677, + "step": 
140230 + }, + { + "epoch": 30.009532578670854, + "grad_norm": 0.06022903323173523, + "learning_rate": 1.3357044419168668e-05, + "loss": 0.094, + "step": 140240 + }, + { + "epoch": 30.009586741049667, + "grad_norm": 0.5157176852226257, + "learning_rate": 1.3354035398123573e-05, + "loss": 0.0233, + "step": 140250 + }, + { + "epoch": 30.00964090342848, + "grad_norm": 3.1983232498168945, + "learning_rate": 1.3351026377078483e-05, + "loss": 0.0586, + "step": 140260 + }, + { + "epoch": 30.00969506580729, + "grad_norm": 0.0009447821066714823, + "learning_rate": 1.3348017356033387e-05, + "loss": 0.0036, + "step": 140270 + }, + { + "epoch": 30.009749228186102, + "grad_norm": 0.005743660032749176, + "learning_rate": 1.3345008334988295e-05, + "loss": 0.0337, + "step": 140280 + }, + { + "epoch": 30.009803390564915, + "grad_norm": 0.7673547863960266, + "learning_rate": 1.3341999313943202e-05, + "loss": 0.0161, + "step": 140290 + }, + { + "epoch": 30.009857552943725, + "grad_norm": 0.2614358365535736, + "learning_rate": 1.333899029289811e-05, + "loss": 0.0141, + "step": 140300 + }, + { + "epoch": 30.009911715322538, + "grad_norm": 0.0013736821711063385, + "learning_rate": 1.3335981271853016e-05, + "loss": 0.0825, + "step": 140310 + }, + { + "epoch": 30.009965877701347, + "grad_norm": 5.944368362426758, + "learning_rate": 1.3332972250807924e-05, + "loss": 0.1411, + "step": 140320 + }, + { + "epoch": 30.01002004008016, + "grad_norm": 0.002347419271245599, + "learning_rate": 1.3329963229762828e-05, + "loss": 0.0119, + "step": 140330 + }, + { + "epoch": 30.010074202458974, + "grad_norm": 1.4616177082061768, + "learning_rate": 1.3326954208717738e-05, + "loss": 0.0301, + "step": 140340 + }, + { + "epoch": 30.010128364837783, + "grad_norm": 0.015606241300702095, + "learning_rate": 1.3323945187672643e-05, + "loss": 0.0871, + "step": 140350 + }, + { + "epoch": 30.010182527216596, + "grad_norm": 0.0009636401664465666, + "learning_rate": 1.332093616662755e-05, + "loss": 0.0759, + "step": 140360 + }, + { + "epoch": 30.010236689595406, + "grad_norm": 0.702969491481781, + "learning_rate": 1.3317927145582457e-05, + "loss": 0.0816, + "step": 140370 + }, + { + "epoch": 30.01029085197422, + "grad_norm": 1.7499016523361206, + "learning_rate": 1.3314918124537365e-05, + "loss": 0.0364, + "step": 140380 + }, + { + "epoch": 30.01034501435303, + "grad_norm": 0.0010361778549849987, + "learning_rate": 1.3311909103492271e-05, + "loss": 0.0021, + "step": 140390 + }, + { + "epoch": 30.01039917673184, + "grad_norm": 0.10270586609840393, + "learning_rate": 1.3308900082447176e-05, + "loss": 0.0487, + "step": 140400 + }, + { + "epoch": 30.010453339110654, + "grad_norm": 0.6358445882797241, + "learning_rate": 1.3305891061402084e-05, + "loss": 0.0684, + "step": 140410 + }, + { + "epoch": 30.010507501489464, + "grad_norm": 0.039135321974754333, + "learning_rate": 1.330288204035699e-05, + "loss": 0.0602, + "step": 140420 + }, + { + "epoch": 30.010561663868277, + "grad_norm": 0.0021292243618518114, + "learning_rate": 1.3299873019311898e-05, + "loss": 0.059, + "step": 140430 + }, + { + "epoch": 30.01061582624709, + "grad_norm": 0.05508744716644287, + "learning_rate": 1.3296863998266804e-05, + "loss": 0.022, + "step": 140440 + }, + { + "epoch": 30.0106699886259, + "grad_norm": 0.06263545155525208, + "learning_rate": 1.3293854977221712e-05, + "loss": 0.0359, + "step": 140450 + }, + { + "epoch": 30.010724151004712, + "grad_norm": 0.0010586526477709413, + "learning_rate": 1.3290845956176617e-05, + "loss": 0.033, + "step": 140460 + }, + { + 
"epoch": 30.010778313383526, + "grad_norm": 0.06038999930024147, + "learning_rate": 1.3287836935131526e-05, + "loss": 0.0129, + "step": 140470 + }, + { + "epoch": 30.010832475762335, + "grad_norm": 0.0014380336506292224, + "learning_rate": 1.3284827914086431e-05, + "loss": 0.0578, + "step": 140480 + }, + { + "epoch": 30.010886638141148, + "grad_norm": 0.058689653873443604, + "learning_rate": 1.3281818893041339e-05, + "loss": 0.0417, + "step": 140490 + }, + { + "epoch": 30.010940800519958, + "grad_norm": 0.00104427186306566, + "learning_rate": 1.3278809871996245e-05, + "loss": 0.0107, + "step": 140500 + }, + { + "epoch": 30.01099496289877, + "grad_norm": 3.893934726715088, + "learning_rate": 1.3275800850951153e-05, + "loss": 0.0429, + "step": 140510 + }, + { + "epoch": 30.011049125277584, + "grad_norm": 0.001052183099091053, + "learning_rate": 1.327279182990606e-05, + "loss": 0.0286, + "step": 140520 + }, + { + "epoch": 30.011103287656393, + "grad_norm": 0.33387303352355957, + "learning_rate": 1.3269782808860967e-05, + "loss": 0.0251, + "step": 140530 + }, + { + "epoch": 30.011157450035206, + "grad_norm": 0.3215467035770416, + "learning_rate": 1.3266773787815872e-05, + "loss": 0.0312, + "step": 140540 + }, + { + "epoch": 30.011211612414016, + "grad_norm": 2.16555118560791, + "learning_rate": 1.3263764766770778e-05, + "loss": 0.1007, + "step": 140550 + }, + { + "epoch": 30.01126577479283, + "grad_norm": 0.00386997452005744, + "learning_rate": 1.3260755745725686e-05, + "loss": 0.0494, + "step": 140560 + }, + { + "epoch": 30.011319937171642, + "grad_norm": 1.58185613155365, + "learning_rate": 1.3257746724680592e-05, + "loss": 0.0336, + "step": 140570 + }, + { + "epoch": 30.01137409955045, + "grad_norm": 0.046750687062740326, + "learning_rate": 1.32547377036355e-05, + "loss": 0.0182, + "step": 140580 + }, + { + "epoch": 30.011428261929264, + "grad_norm": 0.8912496566772461, + "learning_rate": 1.3251728682590405e-05, + "loss": 0.0842, + "step": 140590 + }, + { + "epoch": 30.011482424308074, + "grad_norm": 0.0023354454897344112, + "learning_rate": 1.3248719661545315e-05, + "loss": 0.0453, + "step": 140600 + }, + { + "epoch": 30.011536586686887, + "grad_norm": 0.0032025170512497425, + "learning_rate": 1.324571064050022e-05, + "loss": 0.0193, + "step": 140610 + }, + { + "epoch": 30.0115907490657, + "grad_norm": 0.09735354781150818, + "learning_rate": 1.3242701619455127e-05, + "loss": 0.0256, + "step": 140620 + }, + { + "epoch": 30.01164491144451, + "grad_norm": 0.001035218476317823, + "learning_rate": 1.3239692598410034e-05, + "loss": 0.0233, + "step": 140630 + }, + { + "epoch": 30.011699073823323, + "grad_norm": 0.09337108582258224, + "learning_rate": 1.3236683577364941e-05, + "loss": 0.0254, + "step": 140640 + }, + { + "epoch": 30.011753236202136, + "grad_norm": 0.006050207186490297, + "learning_rate": 1.3233674556319848e-05, + "loss": 0.0232, + "step": 140650 + }, + { + "epoch": 30.011807398580945, + "grad_norm": 0.6411912441253662, + "learning_rate": 1.3230665535274756e-05, + "loss": 0.0057, + "step": 140660 + }, + { + "epoch": 30.01186156095976, + "grad_norm": 0.0010149043519049883, + "learning_rate": 1.322765651422966e-05, + "loss": 0.0165, + "step": 140670 + }, + { + "epoch": 30.011915723338568, + "grad_norm": 0.09021039307117462, + "learning_rate": 1.322464749318457e-05, + "loss": 0.1455, + "step": 140680 + }, + { + "epoch": 30.01196988571738, + "grad_norm": 0.02849547192454338, + "learning_rate": 1.3221638472139475e-05, + "loss": 0.0209, + "step": 140690 + }, + { + "epoch": 
30.012024048096194, + "grad_norm": 0.032421305775642395, + "learning_rate": 1.321862945109438e-05, + "loss": 0.0414, + "step": 140700 + }, + { + "epoch": 30.012078210475003, + "grad_norm": 0.24134334921836853, + "learning_rate": 1.3215620430049289e-05, + "loss": 0.0375, + "step": 140710 + }, + { + "epoch": 30.012132372853817, + "grad_norm": 0.05954517796635628, + "learning_rate": 1.3212611409004193e-05, + "loss": 0.0721, + "step": 140720 + }, + { + "epoch": 30.012186535232626, + "grad_norm": 3.163980484008789, + "learning_rate": 1.3209602387959103e-05, + "loss": 0.0694, + "step": 140730 + }, + { + "epoch": 30.01224069761144, + "grad_norm": 3.3097822666168213, + "learning_rate": 1.3206593366914008e-05, + "loss": 0.0516, + "step": 140740 + }, + { + "epoch": 30.012294859990252, + "grad_norm": 0.09265866875648499, + "learning_rate": 1.3203584345868916e-05, + "loss": 0.0126, + "step": 140750 + }, + { + "epoch": 30.01234902236906, + "grad_norm": 0.001250268891453743, + "learning_rate": 1.3200575324823822e-05, + "loss": 0.0043, + "step": 140760 + }, + { + "epoch": 30.012403184747875, + "grad_norm": 0.0010695054661482573, + "learning_rate": 1.319756630377873e-05, + "loss": 0.0894, + "step": 140770 + }, + { + "epoch": 30.012457347126684, + "grad_norm": 2.2482104301452637, + "learning_rate": 1.3194557282733636e-05, + "loss": 0.0219, + "step": 140780 + }, + { + "epoch": 30.012511509505497, + "grad_norm": 0.011002596467733383, + "learning_rate": 1.3191548261688544e-05, + "loss": 0.0106, + "step": 140790 + }, + { + "epoch": 30.01256567188431, + "grad_norm": 0.0565987192094326, + "learning_rate": 1.3188539240643449e-05, + "loss": 0.0443, + "step": 140800 + }, + { + "epoch": 30.01261983426312, + "grad_norm": 0.0013085316168144345, + "learning_rate": 1.3185530219598358e-05, + "loss": 0.0135, + "step": 140810 + }, + { + "epoch": 30.012673996641933, + "grad_norm": 0.002161230891942978, + "learning_rate": 1.3182521198553263e-05, + "loss": 0.0007, + "step": 140820 + }, + { + "epoch": 30.012728159020742, + "grad_norm": 1.1591893434524536, + "learning_rate": 1.3179512177508171e-05, + "loss": 0.0344, + "step": 140830 + }, + { + "epoch": 30.012782321399555, + "grad_norm": 0.001051528612151742, + "learning_rate": 1.3176503156463077e-05, + "loss": 0.0022, + "step": 140840 + }, + { + "epoch": 30.01283648377837, + "grad_norm": 0.001485602930188179, + "learning_rate": 1.3173494135417982e-05, + "loss": 0.0109, + "step": 140850 + }, + { + "epoch": 30.012890646157178, + "grad_norm": 0.0017362352227792144, + "learning_rate": 1.3170485114372891e-05, + "loss": 0.0003, + "step": 140860 + }, + { + "epoch": 30.01294480853599, + "grad_norm": 0.04915193095803261, + "learning_rate": 1.3167476093327796e-05, + "loss": 0.0279, + "step": 140870 + }, + { + "epoch": 30.012998970914804, + "grad_norm": 0.0010243426077067852, + "learning_rate": 1.3164467072282704e-05, + "loss": 0.0696, + "step": 140880 + }, + { + "epoch": 30.013053133293614, + "grad_norm": 0.0010392635595053434, + "learning_rate": 1.316145805123761e-05, + "loss": 0.0177, + "step": 140890 + }, + { + "epoch": 30.013107295672427, + "grad_norm": 0.0015449250349774957, + "learning_rate": 1.3158449030192518e-05, + "loss": 0.1143, + "step": 140900 + }, + { + "epoch": 30.013161458051236, + "grad_norm": 1.378412127494812, + "learning_rate": 1.3155440009147424e-05, + "loss": 0.017, + "step": 140910 + }, + { + "epoch": 30.01321562043005, + "grad_norm": 0.0016460021724924445, + "learning_rate": 1.3152430988102332e-05, + "loss": 0.0239, + "step": 140920 + }, + { + "epoch": 
30.013269782808862, + "grad_norm": 0.10634436458349228, + "learning_rate": 1.3149421967057237e-05, + "loss": 0.0678, + "step": 140930 + }, + { + "epoch": 30.013323945187672, + "grad_norm": 0.001346774399280548, + "learning_rate": 1.3146412946012147e-05, + "loss": 0.0187, + "step": 140940 + }, + { + "epoch": 30.013378107566485, + "grad_norm": 0.0011706264922395349, + "learning_rate": 1.3143403924967051e-05, + "loss": 0.0192, + "step": 140950 + }, + { + "epoch": 30.013432269945294, + "grad_norm": 0.0035867253318428993, + "learning_rate": 1.314039490392196e-05, + "loss": 0.0235, + "step": 140960 + }, + { + "epoch": 30.013486432324108, + "grad_norm": 0.05160539597272873, + "learning_rate": 1.3137385882876865e-05, + "loss": 0.0254, + "step": 140970 + }, + { + "epoch": 30.01354059470292, + "grad_norm": 0.00310452189296484, + "learning_rate": 1.3134376861831773e-05, + "loss": 0.0224, + "step": 140980 + }, + { + "epoch": 30.01359475708173, + "grad_norm": 1.9286961555480957, + "learning_rate": 1.313136784078668e-05, + "loss": 0.0575, + "step": 140990 + }, + { + "epoch": 30.013648919460543, + "grad_norm": 0.0010278268018737435, + "learning_rate": 1.3128358819741584e-05, + "loss": 0.0467, + "step": 141000 + }, + { + "epoch": 30.013703081839353, + "grad_norm": 1.0835353136062622, + "learning_rate": 1.3125349798696492e-05, + "loss": 0.0649, + "step": 141010 + }, + { + "epoch": 30.013757244218166, + "grad_norm": 0.5735268592834473, + "learning_rate": 1.3122340777651399e-05, + "loss": 0.0111, + "step": 141020 + }, + { + "epoch": 30.01381140659698, + "grad_norm": 0.001039267866872251, + "learning_rate": 1.3119331756606307e-05, + "loss": 0.0005, + "step": 141030 + }, + { + "epoch": 30.01386556897579, + "grad_norm": 0.0021226329263299704, + "learning_rate": 1.3116322735561213e-05, + "loss": 0.0557, + "step": 141040 + }, + { + "epoch": 30.0139197313546, + "grad_norm": 0.014117216691374779, + "learning_rate": 1.311331371451612e-05, + "loss": 0.0729, + "step": 141050 + }, + { + "epoch": 30.013973893733414, + "grad_norm": 0.6620268225669861, + "learning_rate": 1.3110304693471025e-05, + "loss": 0.0107, + "step": 141060 + }, + { + "epoch": 30.014028056112224, + "grad_norm": 0.0010666563175618649, + "learning_rate": 1.3107295672425935e-05, + "loss": 0.0389, + "step": 141070 + }, + { + "epoch": 30.014082218491037, + "grad_norm": 2.3074986934661865, + "learning_rate": 1.310428665138084e-05, + "loss": 0.0566, + "step": 141080 + }, + { + "epoch": 30.014136380869846, + "grad_norm": 0.0010394788114354014, + "learning_rate": 1.3101277630335748e-05, + "loss": 0.0216, + "step": 141090 + }, + { + "epoch": 30.01419054324866, + "grad_norm": 1.2266199588775635, + "learning_rate": 1.3098268609290654e-05, + "loss": 0.111, + "step": 141100 + }, + { + "epoch": 30.014244705627473, + "grad_norm": 21.197975158691406, + "learning_rate": 1.3095259588245562e-05, + "loss": 0.0844, + "step": 141110 + }, + { + "epoch": 30.014298868006282, + "grad_norm": 0.002547938609495759, + "learning_rate": 1.3092250567200468e-05, + "loss": 0.0035, + "step": 141120 + }, + { + "epoch": 30.014353030385095, + "grad_norm": 5.854017734527588, + "learning_rate": 1.3089241546155376e-05, + "loss": 0.0135, + "step": 141130 + }, + { + "epoch": 30.014407192763905, + "grad_norm": 1.1730939149856567, + "learning_rate": 1.308623252511028e-05, + "loss": 0.0439, + "step": 141140 + }, + { + "epoch": 30.014461355142718, + "grad_norm": 0.0010351010132580996, + "learning_rate": 1.3083223504065187e-05, + "loss": 0.0116, + "step": 141150 + }, + { + "epoch": 
30.01451551752153, + "grad_norm": 0.0017063155537471175, + "learning_rate": 1.3080214483020095e-05, + "loss": 0.1371, + "step": 141160 + }, + { + "epoch": 30.01456967990034, + "grad_norm": 0.002778181340545416, + "learning_rate": 1.3077205461975001e-05, + "loss": 0.0236, + "step": 141170 + }, + { + "epoch": 30.014623842279153, + "grad_norm": 0.055219218134880066, + "learning_rate": 1.3074196440929909e-05, + "loss": 0.1562, + "step": 141180 + }, + { + "epoch": 30.014678004657963, + "grad_norm": 0.8389903903007507, + "learning_rate": 1.3071187419884814e-05, + "loss": 0.0148, + "step": 141190 + }, + { + "epoch": 30.014732167036776, + "grad_norm": 0.0021152545232325792, + "learning_rate": 1.3068178398839723e-05, + "loss": 0.0347, + "step": 141200 + }, + { + "epoch": 30.01478632941559, + "grad_norm": 0.09180602431297302, + "learning_rate": 1.3065169377794628e-05, + "loss": 0.0315, + "step": 141210 + }, + { + "epoch": 30.0148404917944, + "grad_norm": 1.3692090511322021, + "learning_rate": 1.3062160356749536e-05, + "loss": 0.0247, + "step": 141220 + }, + { + "epoch": 30.01489465417321, + "grad_norm": 0.0068956706672906876, + "learning_rate": 1.3059151335704442e-05, + "loss": 0.0262, + "step": 141230 + }, + { + "epoch": 30.014948816552025, + "grad_norm": 1.0875091552734375, + "learning_rate": 1.305614231465935e-05, + "loss": 0.0345, + "step": 141240 + }, + { + "epoch": 30.015002978930834, + "grad_norm": 0.0013410600367933512, + "learning_rate": 1.3053133293614256e-05, + "loss": 0.0166, + "step": 141250 + }, + { + "epoch": 30.015057141309647, + "grad_norm": 0.5650318264961243, + "learning_rate": 1.3050124272569164e-05, + "loss": 0.0318, + "step": 141260 + }, + { + "epoch": 30.015111303688457, + "grad_norm": 3.827171564102173, + "learning_rate": 1.3047115251524069e-05, + "loss": 0.0275, + "step": 141270 + }, + { + "epoch": 30.01516546606727, + "grad_norm": 0.07832467555999756, + "learning_rate": 1.3044106230478979e-05, + "loss": 0.009, + "step": 141280 + }, + { + "epoch": 30.015219628446083, + "grad_norm": 0.016242392361164093, + "learning_rate": 1.3041097209433883e-05, + "loss": 0.0346, + "step": 141290 + }, + { + "epoch": 30.015273790824892, + "grad_norm": 0.03845600038766861, + "learning_rate": 1.303808818838879e-05, + "loss": 0.05, + "step": 141300 + }, + { + "epoch": 30.015327953203705, + "grad_norm": 0.001013064058497548, + "learning_rate": 1.3035079167343697e-05, + "loss": 0.0232, + "step": 141310 + }, + { + "epoch": 30.015382115582515, + "grad_norm": 0.7695217132568359, + "learning_rate": 1.3032070146298602e-05, + "loss": 0.0632, + "step": 141320 + }, + { + "epoch": 30.015436277961328, + "grad_norm": 0.9804859161376953, + "learning_rate": 1.3029061125253512e-05, + "loss": 0.0219, + "step": 141330 + }, + { + "epoch": 30.01549044034014, + "grad_norm": 0.005280131474137306, + "learning_rate": 1.3026052104208416e-05, + "loss": 0.0742, + "step": 141340 + }, + { + "epoch": 30.01554460271895, + "grad_norm": 0.0030994724947959185, + "learning_rate": 1.3023043083163324e-05, + "loss": 0.0114, + "step": 141350 + }, + { + "epoch": 30.015598765097764, + "grad_norm": 0.003505566157400608, + "learning_rate": 1.302003406211823e-05, + "loss": 0.0129, + "step": 141360 + }, + { + "epoch": 30.015652927476573, + "grad_norm": 0.020563632249832153, + "learning_rate": 1.3017025041073138e-05, + "loss": 0.0027, + "step": 141370 + }, + { + "epoch": 30.015707089855386, + "grad_norm": 0.0012027360498905182, + "learning_rate": 1.3014016020028045e-05, + "loss": 0.0099, + "step": 141380 + }, + { + "epoch": 
30.0157612522342, + "grad_norm": 3.2338192462921143, + "learning_rate": 1.3011006998982953e-05, + "loss": 0.0811, + "step": 141390 + }, + { + "epoch": 30.01581541461301, + "grad_norm": 0.001048337435349822, + "learning_rate": 1.3007997977937857e-05, + "loss": 0.073, + "step": 141400 + }, + { + "epoch": 30.015869576991822, + "grad_norm": 0.006042728666216135, + "learning_rate": 1.3004988956892767e-05, + "loss": 0.0468, + "step": 141410 + }, + { + "epoch": 30.015923739370635, + "grad_norm": 0.0009754739585332572, + "learning_rate": 1.3001979935847672e-05, + "loss": 0.0247, + "step": 141420 + }, + { + "epoch": 30.015977901749444, + "grad_norm": 24.0122013092041, + "learning_rate": 1.299897091480258e-05, + "loss": 0.0378, + "step": 141430 + }, + { + "epoch": 30.016032064128257, + "grad_norm": 10.713151931762695, + "learning_rate": 1.2995961893757486e-05, + "loss": 0.1234, + "step": 141440 + }, + { + "epoch": 30.016086226507067, + "grad_norm": 0.0016298171831294894, + "learning_rate": 1.299295287271239e-05, + "loss": 0.1774, + "step": 141450 + }, + { + "epoch": 30.01614038888588, + "grad_norm": 0.0016852234257385135, + "learning_rate": 1.29899438516673e-05, + "loss": 0.0027, + "step": 141460 + }, + { + "epoch": 30.016194551264693, + "grad_norm": 0.0025641194079071283, + "learning_rate": 1.2986934830622205e-05, + "loss": 0.0485, + "step": 141470 + }, + { + "epoch": 30.016248713643503, + "grad_norm": 0.14335589110851288, + "learning_rate": 1.2983925809577113e-05, + "loss": 0.0191, + "step": 141480 + }, + { + "epoch": 30.016302876022316, + "grad_norm": 0.0013417009031400084, + "learning_rate": 1.2980916788532019e-05, + "loss": 0.0222, + "step": 141490 + }, + { + "epoch": 30.016357038401125, + "grad_norm": 0.2479100078344345, + "learning_rate": 1.2977907767486927e-05, + "loss": 0.0278, + "step": 141500 + }, + { + "epoch": 30.016411200779938, + "grad_norm": 0.001656531821936369, + "learning_rate": 1.2974898746441833e-05, + "loss": 0.0769, + "step": 141510 + }, + { + "epoch": 30.01646536315875, + "grad_norm": 0.0012572036357596517, + "learning_rate": 1.2971889725396741e-05, + "loss": 0.0797, + "step": 141520 + }, + { + "epoch": 30.01651952553756, + "grad_norm": 0.00805676355957985, + "learning_rate": 1.2968880704351646e-05, + "loss": 0.0457, + "step": 141530 + }, + { + "epoch": 30.016573687916374, + "grad_norm": 0.4595228135585785, + "learning_rate": 1.2965871683306555e-05, + "loss": 0.041, + "step": 141540 + }, + { + "epoch": 30.016627850295183, + "grad_norm": 0.06865271925926208, + "learning_rate": 1.296286266226146e-05, + "loss": 0.034, + "step": 141550 + }, + { + "epoch": 30.016682012673996, + "grad_norm": 0.03672216460108757, + "learning_rate": 1.2959853641216368e-05, + "loss": 0.0307, + "step": 141560 + }, + { + "epoch": 30.01673617505281, + "grad_norm": 0.9217135310173035, + "learning_rate": 1.2956844620171274e-05, + "loss": 0.0508, + "step": 141570 + }, + { + "epoch": 30.01679033743162, + "grad_norm": 0.0011949827894568443, + "learning_rate": 1.2953835599126182e-05, + "loss": 0.0004, + "step": 141580 + }, + { + "epoch": 30.016844499810432, + "grad_norm": 0.0011473214253783226, + "learning_rate": 1.2950826578081088e-05, + "loss": 0.0272, + "step": 141590 + }, + { + "epoch": 30.016898662189245, + "grad_norm": 0.0011829425347968936, + "learning_rate": 1.2947817557035993e-05, + "loss": 0.0296, + "step": 141600 + }, + { + "epoch": 30.016952824568055, + "grad_norm": 0.0015619854675605893, + "learning_rate": 1.2944808535990901e-05, + "loss": 0.0252, + "step": 141610 + }, + { + "epoch": 
30.017006986946868, + "grad_norm": 0.0011200807057321072, + "learning_rate": 1.2941799514945807e-05, + "loss": 0.0058, + "step": 141620 + }, + { + "epoch": 30.017061149325677, + "grad_norm": 0.48295632004737854, + "learning_rate": 1.2938790493900715e-05, + "loss": 0.0178, + "step": 141630 + }, + { + "epoch": 30.01711531170449, + "grad_norm": 0.0011373745510354638, + "learning_rate": 1.2935781472855621e-05, + "loss": 0.0367, + "step": 141640 + }, + { + "epoch": 30.017169474083303, + "grad_norm": 0.001679193926975131, + "learning_rate": 1.293277245181053e-05, + "loss": 0.0332, + "step": 141650 + }, + { + "epoch": 30.017223636462113, + "grad_norm": 0.01592082343995571, + "learning_rate": 1.2929763430765434e-05, + "loss": 0.0598, + "step": 141660 + }, + { + "epoch": 30.017277798840926, + "grad_norm": 0.8671791553497314, + "learning_rate": 1.2926754409720344e-05, + "loss": 0.0151, + "step": 141670 + }, + { + "epoch": 30.017331961219735, + "grad_norm": 0.00401806877925992, + "learning_rate": 1.2923745388675248e-05, + "loss": 0.0172, + "step": 141680 + }, + { + "epoch": 30.01738612359855, + "grad_norm": 0.0036458054091781378, + "learning_rate": 1.2920736367630156e-05, + "loss": 0.0104, + "step": 141690 + }, + { + "epoch": 30.01744028597736, + "grad_norm": 0.35464122891426086, + "learning_rate": 1.2917727346585062e-05, + "loss": 0.0074, + "step": 141700 + }, + { + "epoch": 30.01749444835617, + "grad_norm": 0.001001255470328033, + "learning_rate": 1.291471832553997e-05, + "loss": 0.0235, + "step": 141710 + }, + { + "epoch": 30.017548610734984, + "grad_norm": 0.08402521163225174, + "learning_rate": 1.2911709304494877e-05, + "loss": 0.0007, + "step": 141720 + }, + { + "epoch": 30.017602773113794, + "grad_norm": 0.4019148051738739, + "learning_rate": 1.2908700283449785e-05, + "loss": 0.0565, + "step": 141730 + }, + { + "epoch": 30.017656935492607, + "grad_norm": 0.0012905315961688757, + "learning_rate": 1.290569126240469e-05, + "loss": 0.0516, + "step": 141740 + }, + { + "epoch": 30.01771109787142, + "grad_norm": 5.126029014587402, + "learning_rate": 1.2902682241359596e-05, + "loss": 0.1963, + "step": 141750 + }, + { + "epoch": 30.01776526025023, + "grad_norm": 1.462839126586914, + "learning_rate": 1.2899673220314504e-05, + "loss": 0.0167, + "step": 141760 + }, + { + "epoch": 30.017819422629042, + "grad_norm": 0.045301903039216995, + "learning_rate": 1.289666419926941e-05, + "loss": 0.0182, + "step": 141770 + }, + { + "epoch": 30.017873585007855, + "grad_norm": 0.18671315908432007, + "learning_rate": 1.2893655178224318e-05, + "loss": 0.0513, + "step": 141780 + }, + { + "epoch": 30.017927747386665, + "grad_norm": 0.0010806635254994035, + "learning_rate": 1.2890646157179222e-05, + "loss": 0.0612, + "step": 141790 + }, + { + "epoch": 30.017981909765478, + "grad_norm": 0.21094204485416412, + "learning_rate": 1.2887637136134132e-05, + "loss": 0.0013, + "step": 141800 + }, + { + "epoch": 30.018036072144287, + "grad_norm": 0.03044936992228031, + "learning_rate": 1.2884628115089037e-05, + "loss": 0.0364, + "step": 141810 + }, + { + "epoch": 30.0180902345231, + "grad_norm": 0.1494038701057434, + "learning_rate": 1.2881619094043945e-05, + "loss": 0.1551, + "step": 141820 + }, + { + "epoch": 30.018144396901913, + "grad_norm": 0.002141143660992384, + "learning_rate": 1.287861007299885e-05, + "loss": 0.071, + "step": 141830 + }, + { + "epoch": 30.018198559280723, + "grad_norm": 7.101075172424316, + "learning_rate": 1.2875601051953759e-05, + "loss": 0.0465, + "step": 141840 + }, + { + "epoch": 30.018252721659536, + 
"grad_norm": 0.008898278698325157, + "learning_rate": 1.2872592030908665e-05, + "loss": 0.0045, + "step": 141850 + }, + { + "epoch": 30.018306884038346, + "grad_norm": 0.0014320502523332834, + "learning_rate": 1.2869583009863573e-05, + "loss": 0.0858, + "step": 141860 + }, + { + "epoch": 30.01836104641716, + "grad_norm": 0.1970401406288147, + "learning_rate": 1.2866573988818478e-05, + "loss": 0.0347, + "step": 141870 + }, + { + "epoch": 30.01841520879597, + "grad_norm": 0.02182084694504738, + "learning_rate": 1.2863564967773387e-05, + "loss": 0.0205, + "step": 141880 + }, + { + "epoch": 30.01846937117478, + "grad_norm": 0.001178423990495503, + "learning_rate": 1.2860555946728292e-05, + "loss": 0.0107, + "step": 141890 + }, + { + "epoch": 30.018523533553594, + "grad_norm": 0.002906169043853879, + "learning_rate": 1.2857546925683198e-05, + "loss": 0.023, + "step": 141900 + }, + { + "epoch": 30.018577695932404, + "grad_norm": 0.32332560420036316, + "learning_rate": 1.2854537904638106e-05, + "loss": 0.0198, + "step": 141910 + }, + { + "epoch": 30.018631858311217, + "grad_norm": 1.0629277229309082, + "learning_rate": 1.285152888359301e-05, + "loss": 0.0117, + "step": 141920 + }, + { + "epoch": 30.01868602069003, + "grad_norm": 0.0011315064039081335, + "learning_rate": 1.284851986254792e-05, + "loss": 0.035, + "step": 141930 + }, + { + "epoch": 30.01874018306884, + "grad_norm": 0.002112024463713169, + "learning_rate": 1.2845510841502825e-05, + "loss": 0.0032, + "step": 141940 + }, + { + "epoch": 30.018794345447652, + "grad_norm": 2.73275089263916, + "learning_rate": 1.2842501820457733e-05, + "loss": 0.0297, + "step": 141950 + }, + { + "epoch": 30.018848507826462, + "grad_norm": 0.4512849450111389, + "learning_rate": 1.283949279941264e-05, + "loss": 0.0525, + "step": 141960 + }, + { + "epoch": 30.018902670205275, + "grad_norm": 0.007091611623764038, + "learning_rate": 1.2836483778367547e-05, + "loss": 0.003, + "step": 141970 + }, + { + "epoch": 30.018956832584088, + "grad_norm": 1.0942808389663696, + "learning_rate": 1.2833474757322453e-05, + "loss": 0.029, + "step": 141980 + }, + { + "epoch": 30.019010994962898, + "grad_norm": 0.0010197405936196446, + "learning_rate": 1.2830465736277361e-05, + "loss": 0.0235, + "step": 141990 + }, + { + "epoch": 30.01906515734171, + "grad_norm": 0.0011064795544371009, + "learning_rate": 1.2827456715232266e-05, + "loss": 0.0338, + "step": 142000 + }, + { + "epoch": 30.019119319720524, + "grad_norm": 0.009687509387731552, + "learning_rate": 1.2824447694187176e-05, + "loss": 0.1946, + "step": 142010 + }, + { + "epoch": 30.019173482099333, + "grad_norm": 0.9801973700523376, + "learning_rate": 1.282143867314208e-05, + "loss": 0.0548, + "step": 142020 + }, + { + "epoch": 30.019227644478146, + "grad_norm": 0.005619056522846222, + "learning_rate": 1.2818429652096988e-05, + "loss": 0.01, + "step": 142030 + }, + { + "epoch": 30.019281806856956, + "grad_norm": 0.0011539279948920012, + "learning_rate": 1.2815420631051894e-05, + "loss": 0.0689, + "step": 142040 + }, + { + "epoch": 30.01933596923577, + "grad_norm": 0.03911615535616875, + "learning_rate": 1.2812411610006799e-05, + "loss": 0.0213, + "step": 142050 + }, + { + "epoch": 30.019390131614582, + "grad_norm": 1.431229829788208, + "learning_rate": 1.2809402588961709e-05, + "loss": 0.058, + "step": 142060 + }, + { + "epoch": 30.01944429399339, + "grad_norm": 0.0026845301035791636, + "learning_rate": 1.2806393567916613e-05, + "loss": 0.0655, + "step": 142070 + }, + { + "epoch": 30.019498456372204, + "grad_norm": 
0.0011952138738706708, + "learning_rate": 1.2803384546871521e-05, + "loss": 0.0237, + "step": 142080 + }, + { + "epoch": 30.019552618751014, + "grad_norm": 0.05999800190329552, + "learning_rate": 1.2800375525826428e-05, + "loss": 0.0183, + "step": 142090 + }, + { + "epoch": 30.019606781129827, + "grad_norm": 0.570354163646698, + "learning_rate": 1.2797366504781335e-05, + "loss": 0.0247, + "step": 142100 + }, + { + "epoch": 30.01966094350864, + "grad_norm": 0.4478234648704529, + "learning_rate": 1.2794357483736242e-05, + "loss": 0.0193, + "step": 142110 + }, + { + "epoch": 30.01971510588745, + "grad_norm": 0.0010452037677168846, + "learning_rate": 1.279134846269115e-05, + "loss": 0.0076, + "step": 142120 + }, + { + "epoch": 30.019769268266263, + "grad_norm": 0.0010166428983211517, + "learning_rate": 1.2788339441646054e-05, + "loss": 0.0085, + "step": 142130 + }, + { + "epoch": 30.019823430645072, + "grad_norm": 0.062409091740846634, + "learning_rate": 1.2785330420600964e-05, + "loss": 0.0014, + "step": 142140 + }, + { + "epoch": 30.019877593023885, + "grad_norm": 0.1590421050786972, + "learning_rate": 1.2782321399555869e-05, + "loss": 0.0074, + "step": 142150 + }, + { + "epoch": 30.0199317554027, + "grad_norm": 0.0011036151554435492, + "learning_rate": 1.2779312378510777e-05, + "loss": 0.003, + "step": 142160 + }, + { + "epoch": 30.019985917781508, + "grad_norm": 1.4613816738128662, + "learning_rate": 1.2776303357465683e-05, + "loss": 0.0781, + "step": 142170 + }, + { + "epoch": 30.02004008016032, + "grad_norm": 0.0014042637776583433, + "learning_rate": 1.277329433642059e-05, + "loss": 0.1075, + "step": 142180 + }, + { + "epoch": 30.020094242539134, + "grad_norm": 0.0018994539277628064, + "learning_rate": 1.2770285315375497e-05, + "loss": 0.0384, + "step": 142190 + }, + { + "epoch": 30.020148404917943, + "grad_norm": 0.42979294061660767, + "learning_rate": 1.2767276294330402e-05, + "loss": 0.0559, + "step": 142200 + }, + { + "epoch": 30.020202567296757, + "grad_norm": 2.386732816696167, + "learning_rate": 1.276426727328531e-05, + "loss": 0.0326, + "step": 142210 + }, + { + "epoch": 30.020256729675566, + "grad_norm": 1.0781079530715942, + "learning_rate": 1.2761258252240216e-05, + "loss": 0.0692, + "step": 142220 + }, + { + "epoch": 30.02031089205438, + "grad_norm": 0.003499601734802127, + "learning_rate": 1.2758249231195124e-05, + "loss": 0.0211, + "step": 142230 + }, + { + "epoch": 30.020365054433192, + "grad_norm": 0.034491926431655884, + "learning_rate": 1.275524021015003e-05, + "loss": 0.0138, + "step": 142240 + }, + { + "epoch": 30.020419216812, + "grad_norm": 0.0010698537807911634, + "learning_rate": 1.2752231189104938e-05, + "loss": 0.0769, + "step": 142250 + }, + { + "epoch": 30.020473379190815, + "grad_norm": 0.5856369733810425, + "learning_rate": 1.2749222168059843e-05, + "loss": 0.041, + "step": 142260 + }, + { + "epoch": 30.020527541569624, + "grad_norm": 0.0015369397588074207, + "learning_rate": 1.2746213147014752e-05, + "loss": 0.0488, + "step": 142270 + }, + { + "epoch": 30.020581703948437, + "grad_norm": 0.15670324862003326, + "learning_rate": 1.2743204125969657e-05, + "loss": 0.0084, + "step": 142280 + }, + { + "epoch": 30.02063586632725, + "grad_norm": 0.5788416266441345, + "learning_rate": 1.2740195104924565e-05, + "loss": 0.0551, + "step": 142290 + }, + { + "epoch": 30.02069002870606, + "grad_norm": 1.8416179418563843, + "learning_rate": 1.2737186083879471e-05, + "loss": 0.0586, + "step": 142300 + }, + { + "epoch": 30.020744191084873, + "grad_norm": 2.2982265949249268, + 
"learning_rate": 1.2734177062834379e-05, + "loss": 0.1946, + "step": 142310 + }, + { + "epoch": 30.020798353463682, + "grad_norm": 0.0020227180793881416, + "learning_rate": 1.2731168041789285e-05, + "loss": 0.0571, + "step": 142320 + }, + { + "epoch": 30.020852515842495, + "grad_norm": 0.001703007728792727, + "learning_rate": 1.272815902074419e-05, + "loss": 0.0079, + "step": 142330 + }, + { + "epoch": 30.02090667822131, + "grad_norm": 0.0011425198754295707, + "learning_rate": 1.2725149999699098e-05, + "loss": 0.0787, + "step": 142340 + }, + { + "epoch": 30.020960840600118, + "grad_norm": 0.03673820197582245, + "learning_rate": 1.2722140978654004e-05, + "loss": 0.0245, + "step": 142350 + }, + { + "epoch": 30.02101500297893, + "grad_norm": 0.0033640002366155386, + "learning_rate": 1.2719131957608912e-05, + "loss": 0.084, + "step": 142360 + }, + { + "epoch": 30.021069165357744, + "grad_norm": 1.1336801052093506, + "learning_rate": 1.2716122936563818e-05, + "loss": 0.0497, + "step": 142370 + }, + { + "epoch": 30.021123327736554, + "grad_norm": 0.00289287231862545, + "learning_rate": 1.2713113915518726e-05, + "loss": 0.0341, + "step": 142380 + }, + { + "epoch": 30.021177490115367, + "grad_norm": 0.0011967504397034645, + "learning_rate": 1.2710104894473631e-05, + "loss": 0.0014, + "step": 142390 + }, + { + "epoch": 30.021231652494176, + "grad_norm": 0.0014611692167818546, + "learning_rate": 1.270709587342854e-05, + "loss": 0.0072, + "step": 142400 + }, + { + "epoch": 30.02128581487299, + "grad_norm": 1.172899603843689, + "learning_rate": 1.2704086852383445e-05, + "loss": 0.0048, + "step": 142410 + }, + { + "epoch": 30.021339977251802, + "grad_norm": 0.0012036623666062951, + "learning_rate": 1.2701077831338353e-05, + "loss": 0.0001, + "step": 142420 + }, + { + "epoch": 30.021394139630612, + "grad_norm": 0.39012548327445984, + "learning_rate": 1.269806881029326e-05, + "loss": 0.0022, + "step": 142430 + }, + { + "epoch": 30.021448302009425, + "grad_norm": 0.002258870517835021, + "learning_rate": 1.2695059789248167e-05, + "loss": 0.0275, + "step": 142440 + }, + { + "epoch": 30.021502464388234, + "grad_norm": 0.11350119858980179, + "learning_rate": 1.2692050768203074e-05, + "loss": 0.0524, + "step": 142450 + }, + { + "epoch": 30.021556626767048, + "grad_norm": 0.1504385620355606, + "learning_rate": 1.2689041747157982e-05, + "loss": 0.0231, + "step": 142460 + }, + { + "epoch": 30.02161078914586, + "grad_norm": 0.04665962606668472, + "learning_rate": 1.2686032726112886e-05, + "loss": 0.0035, + "step": 142470 + }, + { + "epoch": 30.02166495152467, + "grad_norm": 0.10605419427156448, + "learning_rate": 1.2683023705067793e-05, + "loss": 0.0029, + "step": 142480 + }, + { + "epoch": 30.021719113903483, + "grad_norm": 0.0009965786011889577, + "learning_rate": 1.26800146840227e-05, + "loss": 0.0366, + "step": 142490 + }, + { + "epoch": 30.021773276282293, + "grad_norm": 0.001367919147014618, + "learning_rate": 1.2677005662977607e-05, + "loss": 0.0113, + "step": 142500 + }, + { + "epoch": 30.021827438661106, + "grad_norm": 0.0018069354118779302, + "learning_rate": 1.2673996641932515e-05, + "loss": 0.0439, + "step": 142510 + }, + { + "epoch": 30.02188160103992, + "grad_norm": 4.688146114349365, + "learning_rate": 1.267098762088742e-05, + "loss": 0.0659, + "step": 142520 + }, + { + "epoch": 30.02193576341873, + "grad_norm": 0.0019095876486971974, + "learning_rate": 1.2667978599842329e-05, + "loss": 0.0352, + "step": 142530 + }, + { + "epoch": 30.02198992579754, + "grad_norm": 0.001212434028275311, + 
"learning_rate": 1.2664969578797234e-05, + "loss": 0.0523, + "step": 142540 + }, + { + "epoch": 30.022044088176354, + "grad_norm": 0.023311514407396317, + "learning_rate": 1.2661960557752142e-05, + "loss": 0.0048, + "step": 142550 + }, + { + "epoch": 30.022098250555164, + "grad_norm": 1.3921451568603516, + "learning_rate": 1.2658951536707048e-05, + "loss": 0.0454, + "step": 142560 + }, + { + "epoch": 30.022152412933977, + "grad_norm": 0.9764531254768372, + "learning_rate": 1.2655942515661956e-05, + "loss": 0.057, + "step": 142570 + }, + { + "epoch": 30.022206575312786, + "grad_norm": 0.0031810745131224394, + "learning_rate": 1.2652933494616862e-05, + "loss": 0.028, + "step": 142580 + }, + { + "epoch": 30.0222607376916, + "grad_norm": 3.7701504230499268, + "learning_rate": 1.264992447357177e-05, + "loss": 0.0514, + "step": 142590 + }, + { + "epoch": 30.022314900070413, + "grad_norm": 0.0010089168790727854, + "learning_rate": 1.2646915452526675e-05, + "loss": 0.0626, + "step": 142600 + }, + { + "epoch": 30.022369062449222, + "grad_norm": 0.5982241630554199, + "learning_rate": 1.2643906431481584e-05, + "loss": 0.0574, + "step": 142610 + }, + { + "epoch": 30.022423224828035, + "grad_norm": 0.9391607046127319, + "learning_rate": 1.2640897410436489e-05, + "loss": 0.0214, + "step": 142620 + }, + { + "epoch": 30.022477387206845, + "grad_norm": 3.8234689235687256, + "learning_rate": 1.2637888389391395e-05, + "loss": 0.0783, + "step": 142630 + }, + { + "epoch": 30.022531549585658, + "grad_norm": 0.0013686807360500097, + "learning_rate": 1.2634879368346303e-05, + "loss": 0.0519, + "step": 142640 + }, + { + "epoch": 30.02258571196447, + "grad_norm": 0.006221618968993425, + "learning_rate": 1.2631870347301208e-05, + "loss": 0.0427, + "step": 142650 + }, + { + "epoch": 30.02263987434328, + "grad_norm": 0.0014394710306078196, + "learning_rate": 1.2628861326256117e-05, + "loss": 0.0022, + "step": 142660 + }, + { + "epoch": 30.022694036722093, + "grad_norm": 1.0187182426452637, + "learning_rate": 1.2625852305211022e-05, + "loss": 0.1137, + "step": 142670 + }, + { + "epoch": 30.022748199100903, + "grad_norm": 0.9735836386680603, + "learning_rate": 1.262284328416593e-05, + "loss": 0.0194, + "step": 142680 + }, + { + "epoch": 30.022802361479716, + "grad_norm": 0.6425798535346985, + "learning_rate": 1.2619834263120836e-05, + "loss": 0.013, + "step": 142690 + }, + { + "epoch": 30.02285652385853, + "grad_norm": 0.0027526759076863527, + "learning_rate": 1.2616825242075744e-05, + "loss": 0.0161, + "step": 142700 + }, + { + "epoch": 30.02291068623734, + "grad_norm": 0.017164170742034912, + "learning_rate": 1.261381622103065e-05, + "loss": 0.0465, + "step": 142710 + }, + { + "epoch": 30.02296484861615, + "grad_norm": 0.149788498878479, + "learning_rate": 1.2610807199985558e-05, + "loss": 0.1058, + "step": 142720 + }, + { + "epoch": 30.023019010994965, + "grad_norm": 0.001184823107905686, + "learning_rate": 1.2607798178940463e-05, + "loss": 0.0708, + "step": 142730 + }, + { + "epoch": 30.023073173373774, + "grad_norm": 1.3379244804382324, + "learning_rate": 1.2604789157895373e-05, + "loss": 0.0222, + "step": 142740 + }, + { + "epoch": 30.023127335752587, + "grad_norm": 0.1334417462348938, + "learning_rate": 1.2601780136850277e-05, + "loss": 0.0076, + "step": 142750 + }, + { + "epoch": 30.023181498131397, + "grad_norm": 0.02854982577264309, + "learning_rate": 1.2598771115805185e-05, + "loss": 0.0057, + "step": 142760 + }, + { + "epoch": 30.02323566051021, + "grad_norm": 0.001899291411973536, + "learning_rate": 
1.2595762094760091e-05, + "loss": 0.0225, + "step": 142770 + }, + { + "epoch": 30.023289822889023, + "grad_norm": 0.0011044983984902501, + "learning_rate": 1.2592753073714996e-05, + "loss": 0.0079, + "step": 142780 + }, + { + "epoch": 30.023343985267832, + "grad_norm": 15.910932540893555, + "learning_rate": 1.2589744052669906e-05, + "loss": 0.0725, + "step": 142790 + }, + { + "epoch": 30.023398147646645, + "grad_norm": 0.0014784007798880339, + "learning_rate": 1.258673503162481e-05, + "loss": 0.0662, + "step": 142800 + }, + { + "epoch": 30.023452310025455, + "grad_norm": 0.0828239768743515, + "learning_rate": 1.2583726010579718e-05, + "loss": 0.1255, + "step": 142810 + }, + { + "epoch": 30.023506472404268, + "grad_norm": 6.086184024810791, + "learning_rate": 1.2580716989534625e-05, + "loss": 0.0785, + "step": 142820 + }, + { + "epoch": 30.02356063478308, + "grad_norm": 0.3250199258327484, + "learning_rate": 1.2577707968489533e-05, + "loss": 0.0021, + "step": 142830 + }, + { + "epoch": 30.02361479716189, + "grad_norm": 0.008415734395384789, + "learning_rate": 1.2574698947444439e-05, + "loss": 0.004, + "step": 142840 + }, + { + "epoch": 30.023668959540704, + "grad_norm": 0.21556209027767181, + "learning_rate": 1.2571689926399347e-05, + "loss": 0.0365, + "step": 142850 + }, + { + "epoch": 30.023723121919513, + "grad_norm": 0.002164107048884034, + "learning_rate": 1.2568680905354251e-05, + "loss": 0.0093, + "step": 142860 + }, + { + "epoch": 30.023777284298326, + "grad_norm": 0.0021114188712090254, + "learning_rate": 1.2565671884309161e-05, + "loss": 0.0009, + "step": 142870 + }, + { + "epoch": 30.02383144667714, + "grad_norm": 0.003054507775232196, + "learning_rate": 1.2562662863264066e-05, + "loss": 0.0133, + "step": 142880 + }, + { + "epoch": 30.02388560905595, + "grad_norm": 0.02792513184249401, + "learning_rate": 1.2559653842218974e-05, + "loss": 0.0084, + "step": 142890 + }, + { + "epoch": 30.023939771434762, + "grad_norm": 0.08716808259487152, + "learning_rate": 1.255664482117388e-05, + "loss": 0.0497, + "step": 142900 + }, + { + "epoch": 30.023993933813575, + "grad_norm": 0.0024035342503339052, + "learning_rate": 1.2553635800128788e-05, + "loss": 0.0076, + "step": 142910 + }, + { + "epoch": 30.024048096192384, + "grad_norm": 0.0019653206691145897, + "learning_rate": 1.2550626779083694e-05, + "loss": 0.0128, + "step": 142920 + }, + { + "epoch": 30.024102258571197, + "grad_norm": 0.4089268445968628, + "learning_rate": 1.2547617758038599e-05, + "loss": 0.021, + "step": 142930 + }, + { + "epoch": 30.024156420950007, + "grad_norm": 0.0015874814707785845, + "learning_rate": 1.2544608736993507e-05, + "loss": 0.0395, + "step": 142940 + }, + { + "epoch": 30.02421058332882, + "grad_norm": 0.0013670267071574926, + "learning_rate": 1.2541599715948413e-05, + "loss": 0.0236, + "step": 142950 + }, + { + "epoch": 30.024264745707633, + "grad_norm": 0.003026761580258608, + "learning_rate": 1.2538590694903321e-05, + "loss": 0.0246, + "step": 142960 + }, + { + "epoch": 30.024318908086443, + "grad_norm": 2.404752492904663, + "learning_rate": 1.2535581673858227e-05, + "loss": 0.0412, + "step": 142970 + }, + { + "epoch": 30.024373070465256, + "grad_norm": 0.0016484976513311267, + "learning_rate": 1.2532572652813135e-05, + "loss": 0.0231, + "step": 142980 + }, + { + "epoch": 30.024427232844065, + "grad_norm": 0.001101527945138514, + "learning_rate": 1.252956363176804e-05, + "loss": 0.0478, + "step": 142990 + }, + { + "epoch": 30.024481395222878, + "grad_norm": 5.853210926055908, + "learning_rate": 
1.252655461072295e-05, + "loss": 0.1066, + "step": 143000 + }, + { + "epoch": 30.02453555760169, + "grad_norm": 0.1850150227546692, + "learning_rate": 1.2523545589677854e-05, + "loss": 0.019, + "step": 143010 + }, + { + "epoch": 30.0245897199805, + "grad_norm": 0.3694545030593872, + "learning_rate": 1.2520536568632762e-05, + "loss": 0.0445, + "step": 143020 + }, + { + "epoch": 30.024643882359314, + "grad_norm": 0.001741160056553781, + "learning_rate": 1.2517527547587668e-05, + "loss": 0.1474, + "step": 143030 + }, + { + "epoch": 30.024698044738123, + "grad_norm": 0.0016258053947240114, + "learning_rate": 1.2514518526542576e-05, + "loss": 0.0572, + "step": 143040 + }, + { + "epoch": 30.024752207116936, + "grad_norm": 0.019140511751174927, + "learning_rate": 1.2511509505497482e-05, + "loss": 0.1188, + "step": 143050 + }, + { + "epoch": 30.02480636949575, + "grad_norm": 0.0024198240134865046, + "learning_rate": 1.250850048445239e-05, + "loss": 0.0856, + "step": 143060 + }, + { + "epoch": 30.02486053187456, + "grad_norm": 0.007469006814062595, + "learning_rate": 1.2505491463407295e-05, + "loss": 0.0123, + "step": 143070 + }, + { + "epoch": 30.024914694253372, + "grad_norm": 0.002343543339520693, + "learning_rate": 1.2502482442362201e-05, + "loss": 0.0281, + "step": 143080 + }, + { + "epoch": 30.02496885663218, + "grad_norm": 0.0030854525975883007, + "learning_rate": 1.249947342131711e-05, + "loss": 0.019, + "step": 143090 + }, + { + "epoch": 30.02500135405947, + "eval_accuracy": 0.8357282821685174, + "eval_loss": 0.8224306106567383, + "eval_runtime": 113.3316, + "eval_samples_per_second": 27.018, + "eval_steps_per_second": 3.379, + "step": 143096 + }, + { + "epoch": 31.000021664951525, + "grad_norm": 0.03774125128984451, + "learning_rate": 1.2496464400272015e-05, + "loss": 0.0152, + "step": 143100 + }, + { + "epoch": 31.000075827330335, + "grad_norm": 0.003980347886681557, + "learning_rate": 1.2493455379226923e-05, + "loss": 0.007, + "step": 143110 + }, + { + "epoch": 31.000129989709148, + "grad_norm": 0.11374767869710922, + "learning_rate": 1.249044635818183e-05, + "loss": 0.0188, + "step": 143120 + }, + { + "epoch": 31.00018415208796, + "grad_norm": 0.006183468736708164, + "learning_rate": 1.2487437337136738e-05, + "loss": 0.1076, + "step": 143130 + }, + { + "epoch": 31.00023831446677, + "grad_norm": 0.84281325340271, + "learning_rate": 1.2484428316091644e-05, + "loss": 0.059, + "step": 143140 + }, + { + "epoch": 31.000292476845583, + "grad_norm": 0.0019127107225358486, + "learning_rate": 1.248141929504655e-05, + "loss": 0.0299, + "step": 143150 + }, + { + "epoch": 31.000346639224396, + "grad_norm": 0.07268435508012772, + "learning_rate": 1.2478410274001457e-05, + "loss": 0.0611, + "step": 143160 + }, + { + "epoch": 31.000400801603206, + "grad_norm": 0.012855501845479012, + "learning_rate": 1.2475401252956363e-05, + "loss": 0.0396, + "step": 143170 + }, + { + "epoch": 31.00045496398202, + "grad_norm": 0.002350196707993746, + "learning_rate": 1.247239223191127e-05, + "loss": 0.0402, + "step": 143180 + }, + { + "epoch": 31.00050912636083, + "grad_norm": 0.0018687181873247027, + "learning_rate": 1.2469383210866177e-05, + "loss": 0.0006, + "step": 143190 + }, + { + "epoch": 31.00056328873964, + "grad_norm": 0.003999791108071804, + "learning_rate": 1.2466374189821083e-05, + "loss": 0.0304, + "step": 143200 + }, + { + "epoch": 31.000617451118455, + "grad_norm": 0.17964355647563934, + "learning_rate": 1.2463365168775991e-05, + "loss": 0.0221, + "step": 143210 + }, + { + "epoch": 31.000671613497264, 
+ "grad_norm": 0.3621842563152313, + "learning_rate": 1.2460356147730898e-05, + "loss": 0.0117, + "step": 143220 + }, + { + "epoch": 31.000725775876077, + "grad_norm": 3.9818685054779053, + "learning_rate": 1.2457347126685806e-05, + "loss": 0.1529, + "step": 143230 + }, + { + "epoch": 31.000779938254887, + "grad_norm": 0.004773287568241358, + "learning_rate": 1.2454338105640712e-05, + "loss": 0.0161, + "step": 143240 + }, + { + "epoch": 31.0008341006337, + "grad_norm": 0.0024385687429457903, + "learning_rate": 1.2451329084595618e-05, + "loss": 0.086, + "step": 143250 + }, + { + "epoch": 31.000888263012513, + "grad_norm": 0.08096367865800858, + "learning_rate": 1.2448320063550526e-05, + "loss": 0.0471, + "step": 143260 + }, + { + "epoch": 31.000942425391322, + "grad_norm": 0.0020602026488631964, + "learning_rate": 1.2445311042505432e-05, + "loss": 0.0699, + "step": 143270 + }, + { + "epoch": 31.000996587770135, + "grad_norm": 1.0881155729293823, + "learning_rate": 1.2442302021460339e-05, + "loss": 0.0603, + "step": 143280 + }, + { + "epoch": 31.001050750148945, + "grad_norm": 0.002488489728420973, + "learning_rate": 1.2439293000415247e-05, + "loss": 0.0312, + "step": 143290 + }, + { + "epoch": 31.001104912527758, + "grad_norm": 0.051630258560180664, + "learning_rate": 1.2436283979370153e-05, + "loss": 0.0395, + "step": 143300 + }, + { + "epoch": 31.00115907490657, + "grad_norm": 0.001419394975528121, + "learning_rate": 1.2433274958325059e-05, + "loss": 0.0247, + "step": 143310 + }, + { + "epoch": 31.00121323728538, + "grad_norm": 0.11521053314208984, + "learning_rate": 1.2430265937279965e-05, + "loss": 0.0278, + "step": 143320 + }, + { + "epoch": 31.001267399664194, + "grad_norm": 0.010697532445192337, + "learning_rate": 1.2427256916234872e-05, + "loss": 0.0345, + "step": 143330 + }, + { + "epoch": 31.001321562043003, + "grad_norm": 0.5059794187545776, + "learning_rate": 1.242424789518978e-05, + "loss": 0.0641, + "step": 143340 + }, + { + "epoch": 31.001375724421816, + "grad_norm": 0.003644323907792568, + "learning_rate": 1.2421238874144686e-05, + "loss": 0.0022, + "step": 143350 + }, + { + "epoch": 31.00142988680063, + "grad_norm": 0.15150798857212067, + "learning_rate": 1.2418229853099594e-05, + "loss": 0.0416, + "step": 143360 + }, + { + "epoch": 31.00148404917944, + "grad_norm": 0.0019515702733770013, + "learning_rate": 1.24152208320545e-05, + "loss": 0.0726, + "step": 143370 + }, + { + "epoch": 31.001538211558252, + "grad_norm": 0.7288778424263, + "learning_rate": 1.2412211811009406e-05, + "loss": 0.0083, + "step": 143380 + }, + { + "epoch": 31.001592373937065, + "grad_norm": 0.001788172754459083, + "learning_rate": 1.2409202789964314e-05, + "loss": 0.1411, + "step": 143390 + }, + { + "epoch": 31.001646536315874, + "grad_norm": 0.0026792704593390226, + "learning_rate": 1.240619376891922e-05, + "loss": 0.007, + "step": 143400 + }, + { + "epoch": 31.001700698694687, + "grad_norm": 3.0546443462371826, + "learning_rate": 1.2403184747874127e-05, + "loss": 0.0302, + "step": 143410 + }, + { + "epoch": 31.001754861073497, + "grad_norm": 0.11497750133275986, + "learning_rate": 1.2400175726829035e-05, + "loss": 0.0796, + "step": 143420 + }, + { + "epoch": 31.00180902345231, + "grad_norm": 0.7878735065460205, + "learning_rate": 1.2397166705783941e-05, + "loss": 0.0175, + "step": 143430 + }, + { + "epoch": 31.001863185831123, + "grad_norm": 0.0018514110706746578, + "learning_rate": 1.2394157684738847e-05, + "loss": 0.0128, + "step": 143440 + }, + { + "epoch": 31.001917348209933, + "grad_norm": 
0.028198907151818275, + "learning_rate": 1.2391148663693755e-05, + "loss": 0.0197, + "step": 143450 + }, + { + "epoch": 31.001971510588746, + "grad_norm": 3.307246208190918, + "learning_rate": 1.238813964264866e-05, + "loss": 0.0493, + "step": 143460 + }, + { + "epoch": 31.002025672967555, + "grad_norm": 0.027657516300678253, + "learning_rate": 1.2385130621603568e-05, + "loss": 0.0144, + "step": 143470 + }, + { + "epoch": 31.00207983534637, + "grad_norm": 0.1827709823846817, + "learning_rate": 1.2382121600558474e-05, + "loss": 0.0042, + "step": 143480 + }, + { + "epoch": 31.00213399772518, + "grad_norm": 0.002345165004953742, + "learning_rate": 1.2379112579513382e-05, + "loss": 0.0518, + "step": 143490 + }, + { + "epoch": 31.00218816010399, + "grad_norm": 0.002726918552070856, + "learning_rate": 1.2376103558468288e-05, + "loss": 0.0654, + "step": 143500 + }, + { + "epoch": 31.002242322482804, + "grad_norm": 0.003467465750873089, + "learning_rate": 1.2373094537423195e-05, + "loss": 0.0665, + "step": 143510 + }, + { + "epoch": 31.002296484861613, + "grad_norm": 0.0019064201042056084, + "learning_rate": 1.2370085516378103e-05, + "loss": 0.0001, + "step": 143520 + }, + { + "epoch": 31.002350647240426, + "grad_norm": 0.005562477745115757, + "learning_rate": 1.2367076495333009e-05, + "loss": 0.0532, + "step": 143530 + }, + { + "epoch": 31.00240480961924, + "grad_norm": 0.001627300982363522, + "learning_rate": 1.2364067474287915e-05, + "loss": 0.0567, + "step": 143540 + }, + { + "epoch": 31.00245897199805, + "grad_norm": 0.003633118001744151, + "learning_rate": 1.2361058453242823e-05, + "loss": 0.0172, + "step": 143550 + }, + { + "epoch": 31.002513134376862, + "grad_norm": 0.006793707609176636, + "learning_rate": 1.235804943219773e-05, + "loss": 0.0358, + "step": 143560 + }, + { + "epoch": 31.002567296755675, + "grad_norm": 0.003536727512255311, + "learning_rate": 1.2355040411152636e-05, + "loss": 0.0194, + "step": 143570 + }, + { + "epoch": 31.002621459134485, + "grad_norm": 0.0013159004738554358, + "learning_rate": 1.2352031390107544e-05, + "loss": 0.0203, + "step": 143580 + }, + { + "epoch": 31.002675621513298, + "grad_norm": 0.2787741720676422, + "learning_rate": 1.234902236906245e-05, + "loss": 0.0763, + "step": 143590 + }, + { + "epoch": 31.002729783892107, + "grad_norm": 0.0022237738594412804, + "learning_rate": 1.2346013348017358e-05, + "loss": 0.0432, + "step": 143600 + }, + { + "epoch": 31.00278394627092, + "grad_norm": 0.037309277802705765, + "learning_rate": 1.2343004326972263e-05, + "loss": 0.0593, + "step": 143610 + }, + { + "epoch": 31.002838108649733, + "grad_norm": 0.03792416676878929, + "learning_rate": 1.233999530592717e-05, + "loss": 0.0843, + "step": 143620 + }, + { + "epoch": 31.002892271028543, + "grad_norm": 0.002571767894551158, + "learning_rate": 1.2336986284882077e-05, + "loss": 0.0717, + "step": 143630 + }, + { + "epoch": 31.002946433407356, + "grad_norm": 0.03260188177227974, + "learning_rate": 1.2333977263836983e-05, + "loss": 0.0301, + "step": 143640 + }, + { + "epoch": 31.003000595786165, + "grad_norm": 0.01851978339254856, + "learning_rate": 1.2330968242791891e-05, + "loss": 0.0147, + "step": 143650 + }, + { + "epoch": 31.00305475816498, + "grad_norm": 0.01209926325827837, + "learning_rate": 1.2327959221746797e-05, + "loss": 0.0643, + "step": 143660 + }, + { + "epoch": 31.00310892054379, + "grad_norm": 0.005080513656139374, + "learning_rate": 1.2324950200701704e-05, + "loss": 0.0402, + "step": 143670 + }, + { + "epoch": 31.0031630829226, + "grad_norm": 
0.025448936969041824, + "learning_rate": 1.2321941179656612e-05, + "loss": 0.0028, + "step": 143680 + }, + { + "epoch": 31.003217245301414, + "grad_norm": 0.44414883852005005, + "learning_rate": 1.2318932158611518e-05, + "loss": 0.0549, + "step": 143690 + }, + { + "epoch": 31.003271407680224, + "grad_norm": 0.04566964507102966, + "learning_rate": 1.2315923137566424e-05, + "loss": 0.1401, + "step": 143700 + }, + { + "epoch": 31.003325570059037, + "grad_norm": 0.6623637080192566, + "learning_rate": 1.2312914116521332e-05, + "loss": 0.0201, + "step": 143710 + }, + { + "epoch": 31.00337973243785, + "grad_norm": 0.004072579089552164, + "learning_rate": 1.2309905095476238e-05, + "loss": 0.0198, + "step": 143720 + }, + { + "epoch": 31.00343389481666, + "grad_norm": 0.0034719801042228937, + "learning_rate": 1.2306896074431146e-05, + "loss": 0.0156, + "step": 143730 + }, + { + "epoch": 31.003488057195472, + "grad_norm": 0.04950632527470589, + "learning_rate": 1.2303887053386053e-05, + "loss": 0.0147, + "step": 143740 + }, + { + "epoch": 31.003542219574285, + "grad_norm": 0.02761359140276909, + "learning_rate": 1.2300878032340959e-05, + "loss": 0.0516, + "step": 143750 + }, + { + "epoch": 31.003596381953095, + "grad_norm": 0.0013080372009426355, + "learning_rate": 1.2297869011295865e-05, + "loss": 0.0001, + "step": 143760 + }, + { + "epoch": 31.003650544331908, + "grad_norm": 0.0793391764163971, + "learning_rate": 1.2294859990250771e-05, + "loss": 0.0469, + "step": 143770 + }, + { + "epoch": 31.003704706710717, + "grad_norm": 0.011832769960165024, + "learning_rate": 1.229185096920568e-05, + "loss": 0.0154, + "step": 143780 + }, + { + "epoch": 31.00375886908953, + "grad_norm": 0.0015574733261018991, + "learning_rate": 1.2288841948160586e-05, + "loss": 0.0386, + "step": 143790 + }, + { + "epoch": 31.003813031468344, + "grad_norm": 1.1529819965362549, + "learning_rate": 1.2285832927115492e-05, + "loss": 0.0136, + "step": 143800 + }, + { + "epoch": 31.003867193847153, + "grad_norm": 0.008058109320700169, + "learning_rate": 1.22828239060704e-05, + "loss": 0.002, + "step": 143810 + }, + { + "epoch": 31.003921356225966, + "grad_norm": 0.0028838682919740677, + "learning_rate": 1.2279814885025306e-05, + "loss": 0.0088, + "step": 143820 + }, + { + "epoch": 31.003975518604776, + "grad_norm": 0.0015316500794142485, + "learning_rate": 1.2276805863980212e-05, + "loss": 0.0212, + "step": 143830 + }, + { + "epoch": 31.00402968098359, + "grad_norm": 0.006873460486531258, + "learning_rate": 1.227379684293512e-05, + "loss": 0.0262, + "step": 143840 + }, + { + "epoch": 31.0040838433624, + "grad_norm": 0.020268965512514114, + "learning_rate": 1.2270787821890027e-05, + "loss": 0.0436, + "step": 143850 + }, + { + "epoch": 31.00413800574121, + "grad_norm": 0.0011766896350309253, + "learning_rate": 1.2267778800844935e-05, + "loss": 0.0196, + "step": 143860 + }, + { + "epoch": 31.004192168120024, + "grad_norm": 8.701142311096191, + "learning_rate": 1.2264769779799841e-05, + "loss": 0.0886, + "step": 143870 + }, + { + "epoch": 31.004246330498834, + "grad_norm": 10.514815330505371, + "learning_rate": 1.2261760758754747e-05, + "loss": 0.0214, + "step": 143880 + }, + { + "epoch": 31.004300492877647, + "grad_norm": 1.0128589868545532, + "learning_rate": 1.2258751737709655e-05, + "loss": 0.042, + "step": 143890 + }, + { + "epoch": 31.00435465525646, + "grad_norm": 0.0011035938514396548, + "learning_rate": 1.225574271666456e-05, + "loss": 0.0233, + "step": 143900 + }, + { + "epoch": 31.00440881763527, + "grad_norm": 
0.07476267218589783, + "learning_rate": 1.2252733695619468e-05, + "loss": 0.0386, + "step": 143910 + }, + { + "epoch": 31.004462980014083, + "grad_norm": 0.05805163457989693, + "learning_rate": 1.2249724674574374e-05, + "loss": 0.0009, + "step": 143920 + }, + { + "epoch": 31.004517142392896, + "grad_norm": 0.0017900200327858329, + "learning_rate": 1.224671565352928e-05, + "loss": 0.0211, + "step": 143930 + }, + { + "epoch": 31.004571304771705, + "grad_norm": 0.056623343378305435, + "learning_rate": 1.2243706632484188e-05, + "loss": 0.0242, + "step": 143940 + }, + { + "epoch": 31.004625467150518, + "grad_norm": 1.779771327972412, + "learning_rate": 1.2240697611439095e-05, + "loss": 0.1481, + "step": 143950 + }, + { + "epoch": 31.004679629529328, + "grad_norm": 0.0015308067668229342, + "learning_rate": 1.2237688590394e-05, + "loss": 0.032, + "step": 143960 + }, + { + "epoch": 31.00473379190814, + "grad_norm": 0.1607745885848999, + "learning_rate": 1.2234679569348909e-05, + "loss": 0.0626, + "step": 143970 + }, + { + "epoch": 31.004787954286954, + "grad_norm": 0.0018102178582921624, + "learning_rate": 1.2231670548303815e-05, + "loss": 0.0139, + "step": 143980 + }, + { + "epoch": 31.004842116665763, + "grad_norm": 0.016199441626667976, + "learning_rate": 1.2228661527258723e-05, + "loss": 0.0016, + "step": 143990 + }, + { + "epoch": 31.004896279044576, + "grad_norm": 0.08114417642354965, + "learning_rate": 1.222565250621363e-05, + "loss": 0.0401, + "step": 144000 + }, + { + "epoch": 31.004950441423386, + "grad_norm": 0.5903763771057129, + "learning_rate": 1.2222643485168536e-05, + "loss": 0.0338, + "step": 144010 + }, + { + "epoch": 31.0050046038022, + "grad_norm": 0.004814891144633293, + "learning_rate": 1.2219634464123444e-05, + "loss": 0.0002, + "step": 144020 + }, + { + "epoch": 31.005058766181012, + "grad_norm": 2.6056904792785645, + "learning_rate": 1.221662544307835e-05, + "loss": 0.0755, + "step": 144030 + }, + { + "epoch": 31.00511292855982, + "grad_norm": 0.003660645103082061, + "learning_rate": 1.2213616422033256e-05, + "loss": 0.0009, + "step": 144040 + }, + { + "epoch": 31.005167090938635, + "grad_norm": 0.0011174720712006092, + "learning_rate": 1.2210607400988162e-05, + "loss": 0.0226, + "step": 144050 + }, + { + "epoch": 31.005221253317444, + "grad_norm": 0.0011344398371875286, + "learning_rate": 1.2207598379943069e-05, + "loss": 0.0141, + "step": 144060 + }, + { + "epoch": 31.005275415696257, + "grad_norm": 3.7767045497894287, + "learning_rate": 1.2204589358897977e-05, + "loss": 0.0752, + "step": 144070 + }, + { + "epoch": 31.00532957807507, + "grad_norm": 0.0011060342658311129, + "learning_rate": 1.2201580337852883e-05, + "loss": 0.0658, + "step": 144080 + }, + { + "epoch": 31.00538374045388, + "grad_norm": 0.3322433531284332, + "learning_rate": 1.2198571316807791e-05, + "loss": 0.0033, + "step": 144090 + }, + { + "epoch": 31.005437902832693, + "grad_norm": 1.0419496297836304, + "learning_rate": 1.2195562295762697e-05, + "loss": 0.0257, + "step": 144100 + }, + { + "epoch": 31.005492065211506, + "grad_norm": 0.41735517978668213, + "learning_rate": 1.2192553274717603e-05, + "loss": 0.0393, + "step": 144110 + }, + { + "epoch": 31.005546227590315, + "grad_norm": 0.0762975886464119, + "learning_rate": 1.2189544253672511e-05, + "loss": 0.0562, + "step": 144120 + }, + { + "epoch": 31.00560038996913, + "grad_norm": 0.942527711391449, + "learning_rate": 1.2186535232627418e-05, + "loss": 0.0728, + "step": 144130 + }, + { + "epoch": 31.005654552347938, + "grad_norm": 3.092737913131714, + 
"learning_rate": 1.2183526211582324e-05, + "loss": 0.0749, + "step": 144140 + }, + { + "epoch": 31.00570871472675, + "grad_norm": 0.06109573692083359, + "learning_rate": 1.2180517190537232e-05, + "loss": 0.0171, + "step": 144150 + }, + { + "epoch": 31.005762877105564, + "grad_norm": 2.0101733207702637, + "learning_rate": 1.2177508169492138e-05, + "loss": 0.0454, + "step": 144160 + }, + { + "epoch": 31.005817039484374, + "grad_norm": 0.02457900159060955, + "learning_rate": 1.2174499148447044e-05, + "loss": 0.0233, + "step": 144170 + }, + { + "epoch": 31.005871201863187, + "grad_norm": 0.0010963456006720662, + "learning_rate": 1.2171490127401952e-05, + "loss": 0.0149, + "step": 144180 + }, + { + "epoch": 31.005925364241996, + "grad_norm": 1.3745505809783936, + "learning_rate": 1.2168481106356859e-05, + "loss": 0.0155, + "step": 144190 + }, + { + "epoch": 31.00597952662081, + "grad_norm": 0.6356057524681091, + "learning_rate": 1.2165472085311765e-05, + "loss": 0.0398, + "step": 144200 + }, + { + "epoch": 31.006033688999622, + "grad_norm": 0.0012482606107369065, + "learning_rate": 1.2162463064266671e-05, + "loss": 0.0222, + "step": 144210 + }, + { + "epoch": 31.00608785137843, + "grad_norm": 0.0010920384665951133, + "learning_rate": 1.215945404322158e-05, + "loss": 0.021, + "step": 144220 + }, + { + "epoch": 31.006142013757245, + "grad_norm": 0.017230067402124405, + "learning_rate": 1.2156445022176485e-05, + "loss": 0.0001, + "step": 144230 + }, + { + "epoch": 31.006196176136054, + "grad_norm": 0.19375066459178925, + "learning_rate": 1.2153436001131392e-05, + "loss": 0.0107, + "step": 144240 + }, + { + "epoch": 31.006250338514867, + "grad_norm": 0.0029175193049013615, + "learning_rate": 1.21504269800863e-05, + "loss": 0.0057, + "step": 144250 + }, + { + "epoch": 31.00630450089368, + "grad_norm": 3.7144172191619873, + "learning_rate": 1.2147417959041206e-05, + "loss": 0.0885, + "step": 144260 + }, + { + "epoch": 31.00635866327249, + "grad_norm": 0.001258791540749371, + "learning_rate": 1.2144408937996112e-05, + "loss": 0.0337, + "step": 144270 + }, + { + "epoch": 31.006412825651303, + "grad_norm": 0.644011378288269, + "learning_rate": 1.214139991695102e-05, + "loss": 0.0359, + "step": 144280 + }, + { + "epoch": 31.006466988030116, + "grad_norm": 0.0010629039024934173, + "learning_rate": 1.2138390895905927e-05, + "loss": 0.0794, + "step": 144290 + }, + { + "epoch": 31.006521150408926, + "grad_norm": 0.04001335799694061, + "learning_rate": 1.2135381874860833e-05, + "loss": 0.0237, + "step": 144300 + }, + { + "epoch": 31.00657531278774, + "grad_norm": 1.4074630737304688, + "learning_rate": 1.213237285381574e-05, + "loss": 0.1086, + "step": 144310 + }, + { + "epoch": 31.006629475166548, + "grad_norm": 0.0013362009776756167, + "learning_rate": 1.2129363832770647e-05, + "loss": 0.0177, + "step": 144320 + }, + { + "epoch": 31.00668363754536, + "grad_norm": 0.0019352376693859696, + "learning_rate": 1.2126354811725555e-05, + "loss": 0.0229, + "step": 144330 + }, + { + "epoch": 31.006737799924174, + "grad_norm": 0.14719487726688385, + "learning_rate": 1.2123345790680461e-05, + "loss": 0.0325, + "step": 144340 + }, + { + "epoch": 31.006791962302984, + "grad_norm": 0.001171929994598031, + "learning_rate": 1.2120336769635368e-05, + "loss": 0.0458, + "step": 144350 + }, + { + "epoch": 31.006846124681797, + "grad_norm": 0.001018668757751584, + "learning_rate": 1.2117327748590274e-05, + "loss": 0.0139, + "step": 144360 + }, + { + "epoch": 31.006900287060606, + "grad_norm": 17.334556579589844, + 
"learning_rate": 1.211431872754518e-05, + "loss": 0.1352, + "step": 144370 + }, + { + "epoch": 31.00695444943942, + "grad_norm": 0.007613572757691145, + "learning_rate": 1.2111309706500088e-05, + "loss": 0.0084, + "step": 144380 + }, + { + "epoch": 31.007008611818232, + "grad_norm": 0.02558698132634163, + "learning_rate": 1.2108300685454994e-05, + "loss": 0.0336, + "step": 144390 + }, + { + "epoch": 31.007062774197042, + "grad_norm": 0.0014851848827674985, + "learning_rate": 1.21052916644099e-05, + "loss": 0.0013, + "step": 144400 + }, + { + "epoch": 31.007116936575855, + "grad_norm": 0.009175443090498447, + "learning_rate": 1.2102282643364809e-05, + "loss": 0.016, + "step": 144410 + }, + { + "epoch": 31.007171098954665, + "grad_norm": 0.0417487695813179, + "learning_rate": 1.2099273622319715e-05, + "loss": 0.0202, + "step": 144420 + }, + { + "epoch": 31.007225261333478, + "grad_norm": 0.08303428441286087, + "learning_rate": 1.2096264601274621e-05, + "loss": 0.1511, + "step": 144430 + }, + { + "epoch": 31.00727942371229, + "grad_norm": 0.8491005301475525, + "learning_rate": 1.2093255580229529e-05, + "loss": 0.007, + "step": 144440 + }, + { + "epoch": 31.0073335860911, + "grad_norm": 0.0012536130379885435, + "learning_rate": 1.2090246559184435e-05, + "loss": 0.0996, + "step": 144450 + }, + { + "epoch": 31.007387748469913, + "grad_norm": 0.7251456379890442, + "learning_rate": 1.2087237538139343e-05, + "loss": 0.0242, + "step": 144460 + }, + { + "epoch": 31.007441910848726, + "grad_norm": 0.0017610065406188369, + "learning_rate": 1.208422851709425e-05, + "loss": 0.0304, + "step": 144470 + }, + { + "epoch": 31.007496073227536, + "grad_norm": 0.5711960196495056, + "learning_rate": 1.2081219496049156e-05, + "loss": 0.0845, + "step": 144480 + }, + { + "epoch": 31.00755023560635, + "grad_norm": 0.0011984555749222636, + "learning_rate": 1.2078210475004064e-05, + "loss": 0.0387, + "step": 144490 + }, + { + "epoch": 31.00760439798516, + "grad_norm": 1.1814703941345215, + "learning_rate": 1.2075201453958968e-05, + "loss": 0.0601, + "step": 144500 + }, + { + "epoch": 31.00765856036397, + "grad_norm": 0.0011226891074329615, + "learning_rate": 1.2072192432913876e-05, + "loss": 0.0102, + "step": 144510 + }, + { + "epoch": 31.007712722742784, + "grad_norm": 0.030660316348075867, + "learning_rate": 1.2069183411868783e-05, + "loss": 0.0038, + "step": 144520 + }, + { + "epoch": 31.007766885121594, + "grad_norm": 0.34937554597854614, + "learning_rate": 1.2066174390823689e-05, + "loss": 0.0658, + "step": 144530 + }, + { + "epoch": 31.007821047500407, + "grad_norm": 0.03693534433841705, + "learning_rate": 1.2063165369778597e-05, + "loss": 0.0987, + "step": 144540 + }, + { + "epoch": 31.007875209879217, + "grad_norm": 0.6070881485939026, + "learning_rate": 1.2060156348733503e-05, + "loss": 0.0104, + "step": 144550 + }, + { + "epoch": 31.00792937225803, + "grad_norm": 0.1435161828994751, + "learning_rate": 1.205714732768841e-05, + "loss": 0.0027, + "step": 144560 + }, + { + "epoch": 31.007983534636843, + "grad_norm": 0.0010735675459727645, + "learning_rate": 1.2054138306643317e-05, + "loss": 0.0554, + "step": 144570 + }, + { + "epoch": 31.008037697015652, + "grad_norm": 2.3454489707946777, + "learning_rate": 1.2051129285598224e-05, + "loss": 0.1044, + "step": 144580 + }, + { + "epoch": 31.008091859394465, + "grad_norm": 0.0011082282289862633, + "learning_rate": 1.2048120264553132e-05, + "loss": 0.0235, + "step": 144590 + }, + { + "epoch": 31.008146021773275, + "grad_norm": 3.4428975582122803, + "learning_rate": 
1.2045111243508038e-05, + "loss": 0.1492, + "step": 144600 + }, + { + "epoch": 31.008200184152088, + "grad_norm": 0.004553291946649551, + "learning_rate": 1.2042102222462944e-05, + "loss": 0.0584, + "step": 144610 + }, + { + "epoch": 31.0082543465309, + "grad_norm": 0.40697285532951355, + "learning_rate": 1.2039093201417852e-05, + "loss": 0.0178, + "step": 144620 + }, + { + "epoch": 31.00830850890971, + "grad_norm": 1.5649272203445435, + "learning_rate": 1.2036084180372758e-05, + "loss": 0.022, + "step": 144630 + }, + { + "epoch": 31.008362671288523, + "grad_norm": 0.002478551585227251, + "learning_rate": 1.2033075159327665e-05, + "loss": 0.1092, + "step": 144640 + }, + { + "epoch": 31.008416833667333, + "grad_norm": 0.07006871700286865, + "learning_rate": 1.2030066138282571e-05, + "loss": 0.0039, + "step": 144650 + }, + { + "epoch": 31.008470996046146, + "grad_norm": 0.00817685853689909, + "learning_rate": 1.2027057117237477e-05, + "loss": 0.0033, + "step": 144660 + }, + { + "epoch": 31.00852515842496, + "grad_norm": 0.0023123533464968204, + "learning_rate": 1.2024048096192385e-05, + "loss": 0.0027, + "step": 144670 + }, + { + "epoch": 31.00857932080377, + "grad_norm": 0.0013326879125088453, + "learning_rate": 1.2021039075147292e-05, + "loss": 0.0031, + "step": 144680 + }, + { + "epoch": 31.00863348318258, + "grad_norm": 0.002864887472242117, + "learning_rate": 1.2018030054102198e-05, + "loss": 0.103, + "step": 144690 + }, + { + "epoch": 31.008687645561395, + "grad_norm": 0.12251321971416473, + "learning_rate": 1.2015021033057106e-05, + "loss": 0.0061, + "step": 144700 + }, + { + "epoch": 31.008741807940204, + "grad_norm": 1.639522910118103, + "learning_rate": 1.2012012012012012e-05, + "loss": 0.0455, + "step": 144710 + }, + { + "epoch": 31.008795970319017, + "grad_norm": 0.001216170028783381, + "learning_rate": 1.200900299096692e-05, + "loss": 0.0049, + "step": 144720 + }, + { + "epoch": 31.008850132697827, + "grad_norm": 0.0023059186059981585, + "learning_rate": 1.2005993969921826e-05, + "loss": 0.0013, + "step": 144730 + }, + { + "epoch": 31.00890429507664, + "grad_norm": 0.0027689277194440365, + "learning_rate": 1.2002984948876733e-05, + "loss": 0.0222, + "step": 144740 + }, + { + "epoch": 31.008958457455453, + "grad_norm": 0.5034531950950623, + "learning_rate": 1.199997592783164e-05, + "loss": 0.0222, + "step": 144750 + }, + { + "epoch": 31.009012619834262, + "grad_norm": 0.4369010329246521, + "learning_rate": 1.1996966906786547e-05, + "loss": 0.0205, + "step": 144760 + }, + { + "epoch": 31.009066782213075, + "grad_norm": 0.001158082508482039, + "learning_rate": 1.1993957885741453e-05, + "loss": 0.0677, + "step": 144770 + }, + { + "epoch": 31.009120944591885, + "grad_norm": 0.001957373460754752, + "learning_rate": 1.1990948864696361e-05, + "loss": 0.023, + "step": 144780 + }, + { + "epoch": 31.009175106970698, + "grad_norm": 0.0011812089942395687, + "learning_rate": 1.1987939843651267e-05, + "loss": 0.0192, + "step": 144790 + }, + { + "epoch": 31.00922926934951, + "grad_norm": 0.006632199045270681, + "learning_rate": 1.1984930822606174e-05, + "loss": 0.0005, + "step": 144800 + }, + { + "epoch": 31.00928343172832, + "grad_norm": 0.0012743468396365643, + "learning_rate": 1.198192180156108e-05, + "loss": 0.0061, + "step": 144810 + }, + { + "epoch": 31.009337594107134, + "grad_norm": 0.006665551569312811, + "learning_rate": 1.1978912780515986e-05, + "loss": 0.0012, + "step": 144820 + }, + { + "epoch": 31.009391756485943, + "grad_norm": 1.118801474571228, + "learning_rate": 
1.1975903759470894e-05, + "loss": 0.0113, + "step": 144830 + }, + { + "epoch": 31.009445918864756, + "grad_norm": 0.0016227021114900708, + "learning_rate": 1.19728947384258e-05, + "loss": 0.0478, + "step": 144840 + }, + { + "epoch": 31.00950008124357, + "grad_norm": 2.262399196624756, + "learning_rate": 1.1969885717380708e-05, + "loss": 0.0161, + "step": 144850 + }, + { + "epoch": 31.00955424362238, + "grad_norm": 3.365831136703491, + "learning_rate": 1.1966876696335615e-05, + "loss": 0.0427, + "step": 144860 + }, + { + "epoch": 31.009608406001192, + "grad_norm": 0.9015489220619202, + "learning_rate": 1.1963867675290521e-05, + "loss": 0.0104, + "step": 144870 + }, + { + "epoch": 31.009662568380005, + "grad_norm": 3.128680467605591, + "learning_rate": 1.1960858654245429e-05, + "loss": 0.0985, + "step": 144880 + }, + { + "epoch": 31.009716730758814, + "grad_norm": 0.001535340677946806, + "learning_rate": 1.1957849633200335e-05, + "loss": 0.016, + "step": 144890 + }, + { + "epoch": 31.009770893137627, + "grad_norm": 0.0021109464578330517, + "learning_rate": 1.1954840612155241e-05, + "loss": 0.0301, + "step": 144900 + }, + { + "epoch": 31.009825055516437, + "grad_norm": 0.0027553110849112272, + "learning_rate": 1.195183159111015e-05, + "loss": 0.012, + "step": 144910 + }, + { + "epoch": 31.00987921789525, + "grad_norm": 0.03878484666347504, + "learning_rate": 1.1948822570065056e-05, + "loss": 0.0715, + "step": 144920 + }, + { + "epoch": 31.009933380274063, + "grad_norm": 0.10350296646356583, + "learning_rate": 1.1945813549019964e-05, + "loss": 0.0462, + "step": 144930 + }, + { + "epoch": 31.009987542652873, + "grad_norm": 0.0018504020990803838, + "learning_rate": 1.194280452797487e-05, + "loss": 0.0217, + "step": 144940 + }, + { + "epoch": 31.010041705031686, + "grad_norm": 2.06630277633667, + "learning_rate": 1.1939795506929776e-05, + "loss": 0.0236, + "step": 144950 + }, + { + "epoch": 31.010095867410495, + "grad_norm": 0.0013008428504690528, + "learning_rate": 1.1936786485884683e-05, + "loss": 0.1504, + "step": 144960 + }, + { + "epoch": 31.01015002978931, + "grad_norm": 0.002409618813544512, + "learning_rate": 1.1933777464839589e-05, + "loss": 0.0226, + "step": 144970 + }, + { + "epoch": 31.01020419216812, + "grad_norm": 1.2376846075057983, + "learning_rate": 1.1930768443794497e-05, + "loss": 0.0169, + "step": 144980 + }, + { + "epoch": 31.01025835454693, + "grad_norm": 0.0019675614312291145, + "learning_rate": 1.1927759422749403e-05, + "loss": 0.0015, + "step": 144990 + }, + { + "epoch": 31.010312516925744, + "grad_norm": 0.001056990004144609, + "learning_rate": 1.192475040170431e-05, + "loss": 0.0041, + "step": 145000 + }, + { + "epoch": 31.010366679304553, + "grad_norm": 0.01368747092783451, + "learning_rate": 1.1921741380659217e-05, + "loss": 0.0569, + "step": 145010 + }, + { + "epoch": 31.010420841683366, + "grad_norm": 0.00122841726988554, + "learning_rate": 1.1918732359614124e-05, + "loss": 0.0345, + "step": 145020 + }, + { + "epoch": 31.01047500406218, + "grad_norm": 0.001275224145501852, + "learning_rate": 1.191572333856903e-05, + "loss": 0.0648, + "step": 145030 + }, + { + "epoch": 31.01052916644099, + "grad_norm": 0.06390224397182465, + "learning_rate": 1.1912714317523938e-05, + "loss": 0.0391, + "step": 145040 + }, + { + "epoch": 31.010583328819802, + "grad_norm": 0.0030088271014392376, + "learning_rate": 1.1909705296478844e-05, + "loss": 0.0432, + "step": 145050 + }, + { + "epoch": 31.010637491198615, + "grad_norm": 0.030532538890838623, + "learning_rate": 
1.1906696275433752e-05, + "loss": 0.072, + "step": 145060 + }, + { + "epoch": 31.010691653577425, + "grad_norm": 1.0978769063949585, + "learning_rate": 1.1903687254388658e-05, + "loss": 0.0128, + "step": 145070 + }, + { + "epoch": 31.010745815956238, + "grad_norm": 0.16368980705738068, + "learning_rate": 1.1900678233343565e-05, + "loss": 0.0008, + "step": 145080 + }, + { + "epoch": 31.010799978335047, + "grad_norm": 0.0012829017359763384, + "learning_rate": 1.1897669212298473e-05, + "loss": 0.077, + "step": 145090 + }, + { + "epoch": 31.01085414071386, + "grad_norm": 0.00214245542883873, + "learning_rate": 1.1894660191253377e-05, + "loss": 0.0386, + "step": 145100 + }, + { + "epoch": 31.010908303092673, + "grad_norm": 0.031696297228336334, + "learning_rate": 1.1891651170208285e-05, + "loss": 0.0444, + "step": 145110 + }, + { + "epoch": 31.010962465471483, + "grad_norm": 0.8997089862823486, + "learning_rate": 1.1888642149163191e-05, + "loss": 0.0873, + "step": 145120 + }, + { + "epoch": 31.011016627850296, + "grad_norm": 0.0010774293914437294, + "learning_rate": 1.1885633128118098e-05, + "loss": 0.045, + "step": 145130 + }, + { + "epoch": 31.011070790229105, + "grad_norm": 0.3493668735027313, + "learning_rate": 1.1882624107073006e-05, + "loss": 0.0344, + "step": 145140 + }, + { + "epoch": 31.01112495260792, + "grad_norm": 1.8062365055084229, + "learning_rate": 1.1879615086027912e-05, + "loss": 0.0117, + "step": 145150 + }, + { + "epoch": 31.01117911498673, + "grad_norm": 4.584301948547363, + "learning_rate": 1.1876606064982818e-05, + "loss": 0.011, + "step": 145160 + }, + { + "epoch": 31.01123327736554, + "grad_norm": 0.001801475533284247, + "learning_rate": 1.1873597043937726e-05, + "loss": 0.0183, + "step": 145170 + }, + { + "epoch": 31.011287439744354, + "grad_norm": 0.04331207275390625, + "learning_rate": 1.1870588022892632e-05, + "loss": 0.0054, + "step": 145180 + }, + { + "epoch": 31.011341602123164, + "grad_norm": 1.0838655233383179, + "learning_rate": 1.186757900184754e-05, + "loss": 0.0182, + "step": 145190 + }, + { + "epoch": 31.011395764501977, + "grad_norm": 0.0012677741469815373, + "learning_rate": 1.1864569980802447e-05, + "loss": 0.0072, + "step": 145200 + }, + { + "epoch": 31.01144992688079, + "grad_norm": 1.232276439666748, + "learning_rate": 1.1861560959757353e-05, + "loss": 0.0037, + "step": 145210 + }, + { + "epoch": 31.0115040892596, + "grad_norm": 0.002043773652985692, + "learning_rate": 1.1858551938712261e-05, + "loss": 0.0346, + "step": 145220 + }, + { + "epoch": 31.011558251638412, + "grad_norm": 0.0010675369994714856, + "learning_rate": 1.1855542917667167e-05, + "loss": 0.0002, + "step": 145230 + }, + { + "epoch": 31.011612414017225, + "grad_norm": 0.0036041024141013622, + "learning_rate": 1.1852533896622073e-05, + "loss": 0.0037, + "step": 145240 + }, + { + "epoch": 31.011666576396035, + "grad_norm": 0.0011563226580619812, + "learning_rate": 1.184952487557698e-05, + "loss": 0.0512, + "step": 145250 + }, + { + "epoch": 31.011720738774848, + "grad_norm": 0.0012814372312277555, + "learning_rate": 1.1846515854531886e-05, + "loss": 0.0067, + "step": 145260 + }, + { + "epoch": 31.011774901153657, + "grad_norm": 0.0019672284834086895, + "learning_rate": 1.1843506833486794e-05, + "loss": 0.0007, + "step": 145270 + }, + { + "epoch": 31.01182906353247, + "grad_norm": 0.004685147199779749, + "learning_rate": 1.18404978124417e-05, + "loss": 0.0283, + "step": 145280 + }, + { + "epoch": 31.011883225911284, + "grad_norm": 0.0009858421981334686, + "learning_rate": 
1.1837488791396607e-05, + "loss": 0.0026, + "step": 145290 + }, + { + "epoch": 31.011937388290093, + "grad_norm": 0.12363018095493317, + "learning_rate": 1.1834479770351514e-05, + "loss": 0.1581, + "step": 145300 + }, + { + "epoch": 31.011991550668906, + "grad_norm": 0.5228644013404846, + "learning_rate": 1.183147074930642e-05, + "loss": 0.0666, + "step": 145310 + }, + { + "epoch": 31.012045713047716, + "grad_norm": 0.0010154071496799588, + "learning_rate": 1.1828461728261329e-05, + "loss": 0.0063, + "step": 145320 + }, + { + "epoch": 31.01209987542653, + "grad_norm": 0.001153567573055625, + "learning_rate": 1.1825452707216235e-05, + "loss": 0.0589, + "step": 145330 + }, + { + "epoch": 31.01215403780534, + "grad_norm": 0.0012103813933208585, + "learning_rate": 1.1822443686171141e-05, + "loss": 0.0217, + "step": 145340 + }, + { + "epoch": 31.01220820018415, + "grad_norm": 0.001050209510140121, + "learning_rate": 1.181943466512605e-05, + "loss": 0.0068, + "step": 145350 + }, + { + "epoch": 31.012262362562964, + "grad_norm": 0.001330124563537538, + "learning_rate": 1.1816425644080956e-05, + "loss": 0.0526, + "step": 145360 + }, + { + "epoch": 31.012316524941774, + "grad_norm": 0.005929472856223583, + "learning_rate": 1.1813416623035862e-05, + "loss": 0.0345, + "step": 145370 + }, + { + "epoch": 31.012370687320587, + "grad_norm": 39.46073532104492, + "learning_rate": 1.181040760199077e-05, + "loss": 0.0318, + "step": 145380 + }, + { + "epoch": 31.0124248496994, + "grad_norm": 0.001268837251700461, + "learning_rate": 1.1807398580945676e-05, + "loss": 0.048, + "step": 145390 + }, + { + "epoch": 31.01247901207821, + "grad_norm": 1.8432589769363403, + "learning_rate": 1.1804389559900582e-05, + "loss": 0.0092, + "step": 145400 + }, + { + "epoch": 31.012533174457023, + "grad_norm": 0.0011794044403359294, + "learning_rate": 1.1801380538855489e-05, + "loss": 0.0872, + "step": 145410 + }, + { + "epoch": 31.012587336835836, + "grad_norm": 2.5264933109283447, + "learning_rate": 1.1798371517810395e-05, + "loss": 0.0975, + "step": 145420 + }, + { + "epoch": 31.012641499214645, + "grad_norm": 0.0010016511660069227, + "learning_rate": 1.1795362496765303e-05, + "loss": 0.0227, + "step": 145430 + }, + { + "epoch": 31.012695661593458, + "grad_norm": 0.001022564247250557, + "learning_rate": 1.1792353475720209e-05, + "loss": 0.0086, + "step": 145440 + }, + { + "epoch": 31.012749823972268, + "grad_norm": 0.8333533406257629, + "learning_rate": 1.1789344454675117e-05, + "loss": 0.0204, + "step": 145450 + }, + { + "epoch": 31.01280398635108, + "grad_norm": 0.17842057347297668, + "learning_rate": 1.1786335433630023e-05, + "loss": 0.0067, + "step": 145460 + }, + { + "epoch": 31.012858148729894, + "grad_norm": 0.6267008781433105, + "learning_rate": 1.178332641258493e-05, + "loss": 0.0063, + "step": 145470 + }, + { + "epoch": 31.012912311108703, + "grad_norm": 1.6525285243988037, + "learning_rate": 1.1780317391539838e-05, + "loss": 0.0164, + "step": 145480 + }, + { + "epoch": 31.012966473487516, + "grad_norm": 0.86359041929245, + "learning_rate": 1.1777308370494744e-05, + "loss": 0.0029, + "step": 145490 + }, + { + "epoch": 31.013020635866326, + "grad_norm": 0.001011671731248498, + "learning_rate": 1.177429934944965e-05, + "loss": 0.0072, + "step": 145500 + }, + { + "epoch": 31.01307479824514, + "grad_norm": 0.045349836349487305, + "learning_rate": 1.1771290328404558e-05, + "loss": 0.0694, + "step": 145510 + }, + { + "epoch": 31.013128960623952, + "grad_norm": 0.006311746314167976, + "learning_rate": 
1.1768281307359464e-05, + "loss": 0.0491, + "step": 145520 + }, + { + "epoch": 31.01318312300276, + "grad_norm": 0.0023083644919097424, + "learning_rate": 1.1765272286314372e-05, + "loss": 0.0144, + "step": 145530 + }, + { + "epoch": 31.013237285381575, + "grad_norm": 0.00115368259139359, + "learning_rate": 1.1762263265269279e-05, + "loss": 0.0379, + "step": 145540 + }, + { + "epoch": 31.013291447760384, + "grad_norm": 0.001004091463983059, + "learning_rate": 1.1759254244224183e-05, + "loss": 0.0023, + "step": 145550 + }, + { + "epoch": 31.013345610139197, + "grad_norm": 0.0018820528639480472, + "learning_rate": 1.1756245223179091e-05, + "loss": 0.0197, + "step": 145560 + }, + { + "epoch": 31.01339977251801, + "grad_norm": 2.4552059173583984, + "learning_rate": 1.1753236202133997e-05, + "loss": 0.1287, + "step": 145570 + }, + { + "epoch": 31.01345393489682, + "grad_norm": 5.661612510681152, + "learning_rate": 1.1750227181088905e-05, + "loss": 0.103, + "step": 145580 + }, + { + "epoch": 31.013508097275633, + "grad_norm": 0.000953196722548455, + "learning_rate": 1.1747218160043812e-05, + "loss": 0.0589, + "step": 145590 + }, + { + "epoch": 31.013562259654446, + "grad_norm": 3.9382565021514893, + "learning_rate": 1.1744209138998718e-05, + "loss": 0.0894, + "step": 145600 + }, + { + "epoch": 31.013616422033255, + "grad_norm": 0.0010061470093205571, + "learning_rate": 1.1741200117953626e-05, + "loss": 0.023, + "step": 145610 + }, + { + "epoch": 31.01367058441207, + "grad_norm": 0.06917504966259003, + "learning_rate": 1.1738191096908532e-05, + "loss": 0.0314, + "step": 145620 + }, + { + "epoch": 31.013724746790878, + "grad_norm": 0.0010515108006075025, + "learning_rate": 1.1735182075863438e-05, + "loss": 0.0098, + "step": 145630 + }, + { + "epoch": 31.01377890916969, + "grad_norm": 0.0015394283691421151, + "learning_rate": 1.1732173054818346e-05, + "loss": 0.035, + "step": 145640 + }, + { + "epoch": 31.013833071548504, + "grad_norm": 2.7997095584869385, + "learning_rate": 1.1729164033773253e-05, + "loss": 0.0877, + "step": 145650 + }, + { + "epoch": 31.013887233927314, + "grad_norm": 3.2873737812042236, + "learning_rate": 1.172615501272816e-05, + "loss": 0.0591, + "step": 145660 + }, + { + "epoch": 31.013941396306127, + "grad_norm": 0.0013411915861070156, + "learning_rate": 1.1723145991683067e-05, + "loss": 0.0546, + "step": 145670 + }, + { + "epoch": 31.013995558684936, + "grad_norm": 6.1214470863342285, + "learning_rate": 1.1720136970637973e-05, + "loss": 0.0327, + "step": 145680 + }, + { + "epoch": 31.01404972106375, + "grad_norm": 0.0015516700223088264, + "learning_rate": 1.1717127949592881e-05, + "loss": 0.0064, + "step": 145690 + }, + { + "epoch": 31.014103883442562, + "grad_norm": 0.49805882573127747, + "learning_rate": 1.1714118928547786e-05, + "loss": 0.0221, + "step": 145700 + }, + { + "epoch": 31.01415804582137, + "grad_norm": 0.0010464226361364126, + "learning_rate": 1.1711109907502694e-05, + "loss": 0.0214, + "step": 145710 + }, + { + "epoch": 31.014212208200185, + "grad_norm": 0.0009837057441473007, + "learning_rate": 1.17081008864576e-05, + "loss": 0.128, + "step": 145720 + }, + { + "epoch": 31.014266370578994, + "grad_norm": 2.8380136489868164, + "learning_rate": 1.1705091865412506e-05, + "loss": 0.0472, + "step": 145730 + }, + { + "epoch": 31.014320532957807, + "grad_norm": 0.004468211904168129, + "learning_rate": 1.1702082844367414e-05, + "loss": 0.1348, + "step": 145740 + }, + { + "epoch": 31.01437469533662, + "grad_norm": 0.0015900930156931281, + "learning_rate": 
1.169907382332232e-05, + "loss": 0.0294, + "step": 145750 + }, + { + "epoch": 31.01442885771543, + "grad_norm": 0.0016660437686368823, + "learning_rate": 1.1696064802277227e-05, + "loss": 0.0024, + "step": 145760 + }, + { + "epoch": 31.014483020094243, + "grad_norm": 0.0029007107950747013, + "learning_rate": 1.1693055781232135e-05, + "loss": 0.0003, + "step": 145770 + }, + { + "epoch": 31.014537182473052, + "grad_norm": 0.0010939118219539523, + "learning_rate": 1.1690046760187041e-05, + "loss": 0.0138, + "step": 145780 + }, + { + "epoch": 31.014591344851866, + "grad_norm": 0.004232531879097223, + "learning_rate": 1.1687037739141949e-05, + "loss": 0.0048, + "step": 145790 + }, + { + "epoch": 31.01464550723068, + "grad_norm": 0.0010826995130628347, + "learning_rate": 1.1684028718096855e-05, + "loss": 0.0036, + "step": 145800 + }, + { + "epoch": 31.014699669609488, + "grad_norm": 0.17365342378616333, + "learning_rate": 1.1681019697051762e-05, + "loss": 0.0643, + "step": 145810 + }, + { + "epoch": 31.0147538319883, + "grad_norm": 0.03571773320436478, + "learning_rate": 1.167801067600667e-05, + "loss": 0.0656, + "step": 145820 + }, + { + "epoch": 31.014807994367114, + "grad_norm": 1.1189652681350708, + "learning_rate": 1.1675001654961576e-05, + "loss": 0.0502, + "step": 145830 + }, + { + "epoch": 31.014862156745924, + "grad_norm": 0.0012951373355463147, + "learning_rate": 1.1671992633916482e-05, + "loss": 0.0331, + "step": 145840 + }, + { + "epoch": 31.014916319124737, + "grad_norm": 1.1116446256637573, + "learning_rate": 1.1668983612871388e-05, + "loss": 0.0221, + "step": 145850 + }, + { + "epoch": 31.014970481503546, + "grad_norm": 0.0018300999654456973, + "learning_rate": 1.1665974591826295e-05, + "loss": 0.0452, + "step": 145860 + }, + { + "epoch": 31.01502464388236, + "grad_norm": 0.0010972386226058006, + "learning_rate": 1.1662965570781203e-05, + "loss": 0.0024, + "step": 145870 + }, + { + "epoch": 31.015078806261172, + "grad_norm": 0.0010128449648618698, + "learning_rate": 1.1659956549736109e-05, + "loss": 0.0086, + "step": 145880 + }, + { + "epoch": 31.015132968639982, + "grad_norm": 0.0016276579117402434, + "learning_rate": 1.1656947528691015e-05, + "loss": 0.1022, + "step": 145890 + }, + { + "epoch": 31.015187131018795, + "grad_norm": 0.0014334071893244982, + "learning_rate": 1.1653938507645923e-05, + "loss": 0.0378, + "step": 145900 + }, + { + "epoch": 31.015241293397604, + "grad_norm": 1.911876916885376, + "learning_rate": 1.165092948660083e-05, + "loss": 0.0988, + "step": 145910 + }, + { + "epoch": 31.015295455776418, + "grad_norm": 0.0010798608418554068, + "learning_rate": 1.1647920465555737e-05, + "loss": 0.0602, + "step": 145920 + }, + { + "epoch": 31.01534961815523, + "grad_norm": 0.03300881385803223, + "learning_rate": 1.1644911444510644e-05, + "loss": 0.0395, + "step": 145930 + }, + { + "epoch": 31.01540378053404, + "grad_norm": 0.028761345893144608, + "learning_rate": 1.164190242346555e-05, + "loss": 0.0331, + "step": 145940 + }, + { + "epoch": 31.015457942912853, + "grad_norm": 1.9307732582092285, + "learning_rate": 1.1638893402420458e-05, + "loss": 0.0244, + "step": 145950 + }, + { + "epoch": 31.015512105291663, + "grad_norm": 0.0016121163498610258, + "learning_rate": 1.1635884381375364e-05, + "loss": 0.0222, + "step": 145960 + }, + { + "epoch": 31.015566267670476, + "grad_norm": 0.7160137295722961, + "learning_rate": 1.163287536033027e-05, + "loss": 0.0716, + "step": 145970 + }, + { + "epoch": 31.01562043004929, + "grad_norm": 0.06945274025201797, + "learning_rate": 
1.1629866339285178e-05, + "loss": 0.0125, + "step": 145980 + }, + { + "epoch": 31.0156745924281, + "grad_norm": 2.144043207168579, + "learning_rate": 1.1626857318240085e-05, + "loss": 0.0347, + "step": 145990 + }, + { + "epoch": 31.01572875480691, + "grad_norm": 0.001179181388579309, + "learning_rate": 1.1623848297194991e-05, + "loss": 0.0171, + "step": 146000 + }, + { + "epoch": 31.015782917185724, + "grad_norm": 0.0016957268817350268, + "learning_rate": 1.1620839276149897e-05, + "loss": 0.0249, + "step": 146010 + }, + { + "epoch": 31.015837079564534, + "grad_norm": 0.1787324696779251, + "learning_rate": 1.1617830255104804e-05, + "loss": 0.0922, + "step": 146020 + }, + { + "epoch": 31.015891241943347, + "grad_norm": 0.0012204180238768458, + "learning_rate": 1.1614821234059711e-05, + "loss": 0.0124, + "step": 146030 + }, + { + "epoch": 31.015945404322157, + "grad_norm": 0.008117370307445526, + "learning_rate": 1.1611812213014618e-05, + "loss": 0.0167, + "step": 146040 + }, + { + "epoch": 31.01599956670097, + "grad_norm": 0.0013003810308873653, + "learning_rate": 1.1608803191969526e-05, + "loss": 0.0005, + "step": 146050 + }, + { + "epoch": 31.016053729079783, + "grad_norm": 0.0015579527243971825, + "learning_rate": 1.1605794170924432e-05, + "loss": 0.0143, + "step": 146060 + }, + { + "epoch": 31.016107891458592, + "grad_norm": 0.001125988783314824, + "learning_rate": 1.1602785149879338e-05, + "loss": 0.0037, + "step": 146070 + }, + { + "epoch": 31.016162053837405, + "grad_norm": 0.0012548487866297364, + "learning_rate": 1.1599776128834246e-05, + "loss": 0.1247, + "step": 146080 + }, + { + "epoch": 31.016216216216215, + "grad_norm": 0.0023876531049609184, + "learning_rate": 1.1596767107789153e-05, + "loss": 0.0496, + "step": 146090 + }, + { + "epoch": 31.016270378595028, + "grad_norm": 0.11487380415201187, + "learning_rate": 1.1593758086744059e-05, + "loss": 0.0109, + "step": 146100 + }, + { + "epoch": 31.01632454097384, + "grad_norm": 0.2097969502210617, + "learning_rate": 1.1590749065698967e-05, + "loss": 0.0094, + "step": 146110 + }, + { + "epoch": 31.01637870335265, + "grad_norm": 0.012783543206751347, + "learning_rate": 1.1587740044653873e-05, + "loss": 0.0308, + "step": 146120 + }, + { + "epoch": 31.016432865731463, + "grad_norm": 0.05254229158163071, + "learning_rate": 1.1584731023608781e-05, + "loss": 0.0505, + "step": 146130 + }, + { + "epoch": 31.016487028110273, + "grad_norm": 0.0015376355731859803, + "learning_rate": 1.1581722002563687e-05, + "loss": 0.0048, + "step": 146140 + }, + { + "epoch": 31.016541190489086, + "grad_norm": 0.0016916915774345398, + "learning_rate": 1.1578712981518592e-05, + "loss": 0.0185, + "step": 146150 + }, + { + "epoch": 31.0165953528679, + "grad_norm": 0.10036426037549973, + "learning_rate": 1.15757039604735e-05, + "loss": 0.0194, + "step": 146160 + }, + { + "epoch": 31.01664951524671, + "grad_norm": 0.0009905928745865822, + "learning_rate": 1.1572694939428406e-05, + "loss": 0.0196, + "step": 146170 + }, + { + "epoch": 31.01670367762552, + "grad_norm": 0.002103877952322364, + "learning_rate": 1.1569685918383314e-05, + "loss": 0.0002, + "step": 146180 + }, + { + "epoch": 31.016757840004335, + "grad_norm": 0.0013507786206901073, + "learning_rate": 1.156667689733822e-05, + "loss": 0.0234, + "step": 146190 + }, + { + "epoch": 31.016812002383144, + "grad_norm": 1.6631025075912476, + "learning_rate": 1.1563667876293127e-05, + "loss": 0.0325, + "step": 146200 + }, + { + "epoch": 31.016866164761957, + "grad_norm": 0.0009353175992146134, + "learning_rate": 
1.1560658855248035e-05, + "loss": 0.0279, + "step": 146210 + }, + { + "epoch": 31.016920327140767, + "grad_norm": 0.0009532415424473584, + "learning_rate": 1.1557649834202941e-05, + "loss": 0.0039, + "step": 146220 + }, + { + "epoch": 31.01697448951958, + "grad_norm": 0.0010260945418849587, + "learning_rate": 1.1554640813157847e-05, + "loss": 0.0566, + "step": 146230 + }, + { + "epoch": 31.017028651898393, + "grad_norm": 1.5612245798110962, + "learning_rate": 1.1551631792112755e-05, + "loss": 0.0219, + "step": 146240 + }, + { + "epoch": 31.017082814277202, + "grad_norm": 0.0009717060602270067, + "learning_rate": 1.1548622771067661e-05, + "loss": 0.0232, + "step": 146250 + }, + { + "epoch": 31.017136976656015, + "grad_norm": 0.00785547774285078, + "learning_rate": 1.154561375002257e-05, + "loss": 0.0286, + "step": 146260 + }, + { + "epoch": 31.017191139034825, + "grad_norm": 2.748396158218384, + "learning_rate": 1.1542604728977476e-05, + "loss": 0.0444, + "step": 146270 + }, + { + "epoch": 31.017245301413638, + "grad_norm": 0.006047976668924093, + "learning_rate": 1.1539595707932382e-05, + "loss": 0.0003, + "step": 146280 + }, + { + "epoch": 31.01729946379245, + "grad_norm": 0.0009255941258743405, + "learning_rate": 1.153658668688729e-05, + "loss": 0.0028, + "step": 146290 + }, + { + "epoch": 31.01735362617126, + "grad_norm": 0.001342786941677332, + "learning_rate": 1.1533577665842194e-05, + "loss": 0.0258, + "step": 146300 + }, + { + "epoch": 31.017407788550074, + "grad_norm": 0.0009373169159516692, + "learning_rate": 1.1530568644797102e-05, + "loss": 0.0166, + "step": 146310 + }, + { + "epoch": 31.017461950928883, + "grad_norm": 0.0012140872422605753, + "learning_rate": 1.1527559623752009e-05, + "loss": 0.0515, + "step": 146320 + }, + { + "epoch": 31.017516113307696, + "grad_norm": 0.8112196326255798, + "learning_rate": 1.1524550602706915e-05, + "loss": 0.0307, + "step": 146330 + }, + { + "epoch": 31.01757027568651, + "grad_norm": 0.019719045609235764, + "learning_rate": 1.1521541581661823e-05, + "loss": 0.0547, + "step": 146340 + }, + { + "epoch": 31.01762443806532, + "grad_norm": 0.1444760113954544, + "learning_rate": 1.151853256061673e-05, + "loss": 0.0199, + "step": 146350 + }, + { + "epoch": 31.017678600444132, + "grad_norm": 0.0010201260447502136, + "learning_rate": 1.1515523539571635e-05, + "loss": 0.1459, + "step": 146360 + }, + { + "epoch": 31.017732762822945, + "grad_norm": 0.001804043771699071, + "learning_rate": 1.1512514518526543e-05, + "loss": 0.0753, + "step": 146370 + }, + { + "epoch": 31.017786925201754, + "grad_norm": 0.0012253449531272054, + "learning_rate": 1.150950549748145e-05, + "loss": 0.0869, + "step": 146380 + }, + { + "epoch": 31.017841087580567, + "grad_norm": 0.0010386720532551408, + "learning_rate": 1.1506496476436358e-05, + "loss": 0.0334, + "step": 146390 + }, + { + "epoch": 31.017895249959377, + "grad_norm": 0.6302465200424194, + "learning_rate": 1.1503487455391264e-05, + "loss": 0.0329, + "step": 146400 + }, + { + "epoch": 31.01794941233819, + "grad_norm": 0.0010349743533879519, + "learning_rate": 1.150047843434617e-05, + "loss": 0.0174, + "step": 146410 + }, + { + "epoch": 31.018003574717003, + "grad_norm": 0.5957885384559631, + "learning_rate": 1.1497469413301078e-05, + "loss": 0.0916, + "step": 146420 + }, + { + "epoch": 31.018057737095813, + "grad_norm": 0.001015939167700708, + "learning_rate": 1.1494460392255984e-05, + "loss": 0.0232, + "step": 146430 + }, + { + "epoch": 31.018111899474626, + "grad_norm": 0.0016928126569837332, + "learning_rate": 
1.149145137121089e-05, + "loss": 0.001, + "step": 146440 + }, + { + "epoch": 31.018166061853435, + "grad_norm": 0.022106122225522995, + "learning_rate": 1.1488442350165797e-05, + "loss": 0.0829, + "step": 146450 + }, + { + "epoch": 31.01822022423225, + "grad_norm": 1.6158580780029297, + "learning_rate": 1.1485433329120703e-05, + "loss": 0.0288, + "step": 146460 + }, + { + "epoch": 31.01827438661106, + "grad_norm": 0.001594059867784381, + "learning_rate": 1.1482424308075611e-05, + "loss": 0.028, + "step": 146470 + }, + { + "epoch": 31.01832854898987, + "grad_norm": 0.0010073930025100708, + "learning_rate": 1.1479415287030518e-05, + "loss": 0.0188, + "step": 146480 + }, + { + "epoch": 31.018382711368684, + "grad_norm": 0.0009872736409306526, + "learning_rate": 1.1476406265985424e-05, + "loss": 0.0024, + "step": 146490 + }, + { + "epoch": 31.018436873747493, + "grad_norm": 0.8545201420783997, + "learning_rate": 1.1473397244940332e-05, + "loss": 0.0089, + "step": 146500 + }, + { + "epoch": 31.018491036126306, + "grad_norm": 0.0012190861161798239, + "learning_rate": 1.1470388223895238e-05, + "loss": 0.0505, + "step": 146510 + }, + { + "epoch": 31.01854519850512, + "grad_norm": 0.0653570145368576, + "learning_rate": 1.1467379202850146e-05, + "loss": 0.0493, + "step": 146520 + }, + { + "epoch": 31.01859936088393, + "grad_norm": 0.5924845337867737, + "learning_rate": 1.1464370181805052e-05, + "loss": 0.0048, + "step": 146530 + }, + { + "epoch": 31.018653523262742, + "grad_norm": 0.0012114790733903646, + "learning_rate": 1.1461361160759959e-05, + "loss": 0.0026, + "step": 146540 + }, + { + "epoch": 31.018707685641555, + "grad_norm": 0.007636606227606535, + "learning_rate": 1.1458352139714867e-05, + "loss": 0.0857, + "step": 146550 + }, + { + "epoch": 31.018761848020365, + "grad_norm": 0.000985215650871396, + "learning_rate": 1.1455343118669773e-05, + "loss": 0.022, + "step": 146560 + }, + { + "epoch": 31.018816010399178, + "grad_norm": 0.0009477687417529523, + "learning_rate": 1.1452334097624679e-05, + "loss": 0.0358, + "step": 146570 + }, + { + "epoch": 31.018870172777987, + "grad_norm": 0.0009386851452291012, + "learning_rate": 1.1449325076579587e-05, + "loss": 0.0496, + "step": 146580 + }, + { + "epoch": 31.0189243351568, + "grad_norm": 0.5990067720413208, + "learning_rate": 1.1446316055534492e-05, + "loss": 0.0158, + "step": 146590 + }, + { + "epoch": 31.018978497535613, + "grad_norm": 0.3656524121761322, + "learning_rate": 1.14433070344894e-05, + "loss": 0.0497, + "step": 146600 + }, + { + "epoch": 31.019032659914423, + "grad_norm": 0.0009574647410772741, + "learning_rate": 1.1440298013444306e-05, + "loss": 0.0153, + "step": 146610 + }, + { + "epoch": 31.019086822293236, + "grad_norm": 0.0033022400457412004, + "learning_rate": 1.1437288992399212e-05, + "loss": 0.0364, + "step": 146620 + }, + { + "epoch": 31.019140984672045, + "grad_norm": 0.0009394686785526574, + "learning_rate": 1.143427997135412e-05, + "loss": 0.0729, + "step": 146630 + }, + { + "epoch": 31.01919514705086, + "grad_norm": 0.0009500145679339767, + "learning_rate": 1.1431270950309026e-05, + "loss": 0.0058, + "step": 146640 + }, + { + "epoch": 31.01924930942967, + "grad_norm": 0.001159012084826827, + "learning_rate": 1.1428261929263934e-05, + "loss": 0.0027, + "step": 146650 + }, + { + "epoch": 31.01930347180848, + "grad_norm": 3.2657246589660645, + "learning_rate": 1.142525290821884e-05, + "loss": 0.0799, + "step": 146660 + }, + { + "epoch": 31.019357634187294, + "grad_norm": 0.003948057070374489, + "learning_rate": 
1.1422243887173747e-05, + "loss": 0.0246, + "step": 146670 + }, + { + "epoch": 31.019411796566104, + "grad_norm": 0.0011678645387291908, + "learning_rate": 1.1419234866128655e-05, + "loss": 0.0103, + "step": 146680 + }, + { + "epoch": 31.019465958944917, + "grad_norm": 0.0009673136519268155, + "learning_rate": 1.1416225845083561e-05, + "loss": 0.0457, + "step": 146690 + }, + { + "epoch": 31.01952012132373, + "grad_norm": 0.7658808827400208, + "learning_rate": 1.1413216824038467e-05, + "loss": 0.0575, + "step": 146700 + }, + { + "epoch": 31.01957428370254, + "grad_norm": 0.49688488245010376, + "learning_rate": 1.1410207802993375e-05, + "loss": 0.0683, + "step": 146710 + }, + { + "epoch": 31.019628446081352, + "grad_norm": 0.001266247360035777, + "learning_rate": 1.1407198781948282e-05, + "loss": 0.0549, + "step": 146720 + }, + { + "epoch": 31.019682608460165, + "grad_norm": 0.0011955387890338898, + "learning_rate": 1.140418976090319e-05, + "loss": 0.0182, + "step": 146730 + }, + { + "epoch": 31.019736770838975, + "grad_norm": 0.0012275060871616006, + "learning_rate": 1.1401180739858094e-05, + "loss": 0.0133, + "step": 146740 + }, + { + "epoch": 31.019790933217788, + "grad_norm": 0.0009993626736104488, + "learning_rate": 1.1398171718813e-05, + "loss": 0.0128, + "step": 146750 + }, + { + "epoch": 31.019845095596597, + "grad_norm": 0.0032437534537166357, + "learning_rate": 1.1395162697767908e-05, + "loss": 0.0201, + "step": 146760 + }, + { + "epoch": 31.01989925797541, + "grad_norm": 0.6935446262359619, + "learning_rate": 1.1392153676722815e-05, + "loss": 0.0069, + "step": 146770 + }, + { + "epoch": 31.019953420354224, + "grad_norm": 1.2831898927688599, + "learning_rate": 1.1389144655677723e-05, + "loss": 0.0463, + "step": 146780 + }, + { + "epoch": 31.020007582733033, + "grad_norm": 0.0049192775040864944, + "learning_rate": 1.1386135634632629e-05, + "loss": 0.0704, + "step": 146790 + }, + { + "epoch": 31.020061745111846, + "grad_norm": 0.007953070104122162, + "learning_rate": 1.1383126613587535e-05, + "loss": 0.0258, + "step": 146800 + }, + { + "epoch": 31.020115907490656, + "grad_norm": 0.2099732905626297, + "learning_rate": 1.1380117592542443e-05, + "loss": 0.056, + "step": 146810 + }, + { + "epoch": 31.02017006986947, + "grad_norm": 0.08214042335748672, + "learning_rate": 1.137710857149735e-05, + "loss": 0.0039, + "step": 146820 + }, + { + "epoch": 31.02022423224828, + "grad_norm": 0.31769466400146484, + "learning_rate": 1.1374099550452256e-05, + "loss": 0.0211, + "step": 146830 + }, + { + "epoch": 31.02027839462709, + "grad_norm": 2.5574686527252197, + "learning_rate": 1.1371090529407164e-05, + "loss": 0.0712, + "step": 146840 + }, + { + "epoch": 31.020332557005904, + "grad_norm": 0.14085370302200317, + "learning_rate": 1.136808150836207e-05, + "loss": 0.0074, + "step": 146850 + }, + { + "epoch": 31.020386719384714, + "grad_norm": 0.002122668083757162, + "learning_rate": 1.1365072487316978e-05, + "loss": 0.0068, + "step": 146860 + }, + { + "epoch": 31.020440881763527, + "grad_norm": 2.083174705505371, + "learning_rate": 1.1362063466271884e-05, + "loss": 0.0758, + "step": 146870 + }, + { + "epoch": 31.02049504414234, + "grad_norm": 0.0009786197915673256, + "learning_rate": 1.135905444522679e-05, + "loss": 0.0041, + "step": 146880 + }, + { + "epoch": 31.02054920652115, + "grad_norm": 0.0009833350777626038, + "learning_rate": 1.1356045424181697e-05, + "loss": 0.0353, + "step": 146890 + }, + { + "epoch": 31.020603368899963, + "grad_norm": 0.002276867162436247, + "learning_rate": 
1.1353036403136603e-05, + "loss": 0.0001, + "step": 146900 + }, + { + "epoch": 31.020657531278772, + "grad_norm": 0.00127529411111027, + "learning_rate": 1.1350027382091511e-05, + "loss": 0.0373, + "step": 146910 + }, + { + "epoch": 31.020711693657585, + "grad_norm": 0.000981392222456634, + "learning_rate": 1.1347018361046417e-05, + "loss": 0.0431, + "step": 146920 + }, + { + "epoch": 31.020765856036398, + "grad_norm": 0.0010049928678199649, + "learning_rate": 1.1344009340001324e-05, + "loss": 0.0157, + "step": 146930 + }, + { + "epoch": 31.020820018415208, + "grad_norm": 0.0009917591232806444, + "learning_rate": 1.1341000318956232e-05, + "loss": 0.03, + "step": 146940 + }, + { + "epoch": 31.02087418079402, + "grad_norm": 5.104968070983887, + "learning_rate": 1.1337991297911138e-05, + "loss": 0.0517, + "step": 146950 + }, + { + "epoch": 31.020928343172834, + "grad_norm": 1.539172887802124, + "learning_rate": 1.1334982276866044e-05, + "loss": 0.0342, + "step": 146960 + }, + { + "epoch": 31.020982505551643, + "grad_norm": 0.0014489154564216733, + "learning_rate": 1.1331973255820952e-05, + "loss": 0.1271, + "step": 146970 + }, + { + "epoch": 31.021036667930456, + "grad_norm": 0.0016204979037865996, + "learning_rate": 1.1328964234775858e-05, + "loss": 0.0242, + "step": 146980 + }, + { + "epoch": 31.021090830309266, + "grad_norm": 2.744835615158081, + "learning_rate": 1.1325955213730766e-05, + "loss": 0.0797, + "step": 146990 + }, + { + "epoch": 31.02114499268808, + "grad_norm": 0.00103546935133636, + "learning_rate": 1.1322946192685673e-05, + "loss": 0.0164, + "step": 147000 + }, + { + "epoch": 31.021199155066892, + "grad_norm": 0.0013249025214463472, + "learning_rate": 1.1319937171640579e-05, + "loss": 0.0134, + "step": 147010 + }, + { + "epoch": 31.0212533174457, + "grad_norm": 0.7690743803977966, + "learning_rate": 1.1316928150595487e-05, + "loss": 0.1127, + "step": 147020 + }, + { + "epoch": 31.021307479824515, + "grad_norm": 0.5361955165863037, + "learning_rate": 1.1313919129550393e-05, + "loss": 0.0153, + "step": 147030 + }, + { + "epoch": 31.021361642203324, + "grad_norm": 0.2766215205192566, + "learning_rate": 1.13109101085053e-05, + "loss": 0.0248, + "step": 147040 + }, + { + "epoch": 31.021415804582137, + "grad_norm": 3.520308017730713, + "learning_rate": 1.1307901087460206e-05, + "loss": 0.0209, + "step": 147050 + }, + { + "epoch": 31.02146996696095, + "grad_norm": 0.32587364315986633, + "learning_rate": 1.1304892066415112e-05, + "loss": 0.0312, + "step": 147060 + }, + { + "epoch": 31.02152412933976, + "grad_norm": 0.0009946770733222365, + "learning_rate": 1.130188304537002e-05, + "loss": 0.0294, + "step": 147070 + }, + { + "epoch": 31.021578291718573, + "grad_norm": 0.3871113657951355, + "learning_rate": 1.1298874024324926e-05, + "loss": 0.018, + "step": 147080 + }, + { + "epoch": 31.021632454097382, + "grad_norm": 0.8303360939025879, + "learning_rate": 1.1295865003279833e-05, + "loss": 0.0867, + "step": 147090 + }, + { + "epoch": 31.021686616476195, + "grad_norm": 0.0013250584015622735, + "learning_rate": 1.129285598223474e-05, + "loss": 0.0117, + "step": 147100 + }, + { + "epoch": 31.02174077885501, + "grad_norm": 0.1202298104763031, + "learning_rate": 1.1289846961189647e-05, + "loss": 0.1048, + "step": 147110 + }, + { + "epoch": 31.021794941233818, + "grad_norm": 1.5692362785339355, + "learning_rate": 1.1286837940144555e-05, + "loss": 0.0376, + "step": 147120 + }, + { + "epoch": 31.02184910361263, + "grad_norm": 0.0009647909319028258, + "learning_rate": 1.1283828919099461e-05, + 
"loss": 0.0123, + "step": 147130 + }, + { + "epoch": 31.021903265991444, + "grad_norm": 0.08656778931617737, + "learning_rate": 1.1280819898054367e-05, + "loss": 0.0733, + "step": 147140 + }, + { + "epoch": 31.021957428370253, + "grad_norm": 1.4409397840499878, + "learning_rate": 1.1277810877009275e-05, + "loss": 0.0186, + "step": 147150 + }, + { + "epoch": 31.022011590749067, + "grad_norm": 0.2500858008861542, + "learning_rate": 1.1274801855964182e-05, + "loss": 0.0992, + "step": 147160 + }, + { + "epoch": 31.022065753127876, + "grad_norm": 2.4264633655548096, + "learning_rate": 1.1271792834919088e-05, + "loss": 0.1144, + "step": 147170 + }, + { + "epoch": 31.02211991550669, + "grad_norm": 0.0751953050494194, + "learning_rate": 1.1268783813873996e-05, + "loss": 0.0038, + "step": 147180 + }, + { + "epoch": 31.022174077885502, + "grad_norm": 0.0009632979054003954, + "learning_rate": 1.12657747928289e-05, + "loss": 0.0312, + "step": 147190 + }, + { + "epoch": 31.02222824026431, + "grad_norm": 0.0014909862075001001, + "learning_rate": 1.1262765771783808e-05, + "loss": 0.0284, + "step": 147200 + }, + { + "epoch": 31.022282402643125, + "grad_norm": 0.0009317368385381997, + "learning_rate": 1.1259756750738715e-05, + "loss": 0.0405, + "step": 147210 + }, + { + "epoch": 31.022336565021934, + "grad_norm": 0.005648954771459103, + "learning_rate": 1.125674772969362e-05, + "loss": 0.0425, + "step": 147220 + }, + { + "epoch": 31.022390727400747, + "grad_norm": 0.049551162868738174, + "learning_rate": 1.1253738708648529e-05, + "loss": 0.1228, + "step": 147230 + }, + { + "epoch": 31.02244488977956, + "grad_norm": 0.0010101220104843378, + "learning_rate": 1.1250729687603435e-05, + "loss": 0.0025, + "step": 147240 + }, + { + "epoch": 31.02249905215837, + "grad_norm": 0.0009763696580193937, + "learning_rate": 1.1247720666558343e-05, + "loss": 0.0043, + "step": 147250 + }, + { + "epoch": 31.022553214537183, + "grad_norm": 0.000957637035753578, + "learning_rate": 1.124471164551325e-05, + "loss": 0.0003, + "step": 147260 + }, + { + "epoch": 31.022607376915992, + "grad_norm": 5.10071325302124, + "learning_rate": 1.1241702624468156e-05, + "loss": 0.0701, + "step": 147270 + }, + { + "epoch": 31.022661539294806, + "grad_norm": 0.001830242807045579, + "learning_rate": 1.1238693603423064e-05, + "loss": 0.0647, + "step": 147280 + }, + { + "epoch": 31.02271570167362, + "grad_norm": 0.0014713405398651958, + "learning_rate": 1.123568458237797e-05, + "loss": 0.084, + "step": 147290 + }, + { + "epoch": 31.022769864052428, + "grad_norm": 0.0015087717911228538, + "learning_rate": 1.1232675561332876e-05, + "loss": 0.0111, + "step": 147300 + }, + { + "epoch": 31.02282402643124, + "grad_norm": 4.294197082519531, + "learning_rate": 1.1229666540287784e-05, + "loss": 0.1089, + "step": 147310 + }, + { + "epoch": 31.022878188810054, + "grad_norm": 0.9348607659339905, + "learning_rate": 1.122665751924269e-05, + "loss": 0.0558, + "step": 147320 + }, + { + "epoch": 31.022932351188864, + "grad_norm": 1.5871641635894775, + "learning_rate": 1.1223648498197598e-05, + "loss": 0.0328, + "step": 147330 + }, + { + "epoch": 31.022986513567677, + "grad_norm": 0.31009480357170105, + "learning_rate": 1.1220639477152503e-05, + "loss": 0.0138, + "step": 147340 + }, + { + "epoch": 31.023040675946486, + "grad_norm": 0.0016365436604246497, + "learning_rate": 1.121763045610741e-05, + "loss": 0.0349, + "step": 147350 + }, + { + "epoch": 31.0230948383253, + "grad_norm": 0.0017541777342557907, + "learning_rate": 1.1214621435062317e-05, + "loss": 0.0152, + 
"step": 147360 + }, + { + "epoch": 31.023149000704112, + "grad_norm": 0.0014855039771646261, + "learning_rate": 1.1211612414017223e-05, + "loss": 0.0503, + "step": 147370 + }, + { + "epoch": 31.023203163082922, + "grad_norm": 0.0011451717000454664, + "learning_rate": 1.1208603392972131e-05, + "loss": 0.1445, + "step": 147380 + }, + { + "epoch": 31.023257325461735, + "grad_norm": 0.003690296085551381, + "learning_rate": 1.1205594371927038e-05, + "loss": 0.0156, + "step": 147390 + }, + { + "epoch": 31.023311487840544, + "grad_norm": 0.1323416829109192, + "learning_rate": 1.1202585350881944e-05, + "loss": 0.0298, + "step": 147400 + }, + { + "epoch": 31.023365650219358, + "grad_norm": 4.363639831542969, + "learning_rate": 1.1199576329836852e-05, + "loss": 0.2043, + "step": 147410 + }, + { + "epoch": 31.02341981259817, + "grad_norm": 0.056549880653619766, + "learning_rate": 1.1196567308791758e-05, + "loss": 0.0016, + "step": 147420 + }, + { + "epoch": 31.02347397497698, + "grad_norm": 0.0016316180117428303, + "learning_rate": 1.1193558287746664e-05, + "loss": 0.0104, + "step": 147430 + }, + { + "epoch": 31.023528137355793, + "grad_norm": 0.018274953588843346, + "learning_rate": 1.1190549266701572e-05, + "loss": 0.0751, + "step": 147440 + }, + { + "epoch": 31.023582299734603, + "grad_norm": 19.999984741210938, + "learning_rate": 1.1187540245656479e-05, + "loss": 0.2033, + "step": 147450 + }, + { + "epoch": 31.023636462113416, + "grad_norm": 0.0021943235769867897, + "learning_rate": 1.1184531224611387e-05, + "loss": 0.0154, + "step": 147460 + }, + { + "epoch": 31.02369062449223, + "grad_norm": 0.006987429223954678, + "learning_rate": 1.1181522203566293e-05, + "loss": 0.0005, + "step": 147470 + }, + { + "epoch": 31.02374478687104, + "grad_norm": 0.009113864041864872, + "learning_rate": 1.11785131825212e-05, + "loss": 0.0182, + "step": 147480 + }, + { + "epoch": 31.02379894924985, + "grad_norm": 1.6050875186920166, + "learning_rate": 1.1175504161476106e-05, + "loss": 0.0305, + "step": 147490 + }, + { + "epoch": 31.023853111628664, + "grad_norm": 0.0016751891234889627, + "learning_rate": 1.1172495140431012e-05, + "loss": 0.0393, + "step": 147500 + }, + { + "epoch": 31.023907274007474, + "grad_norm": 0.5545037984848022, + "learning_rate": 1.116948611938592e-05, + "loss": 0.0561, + "step": 147510 + }, + { + "epoch": 31.023961436386287, + "grad_norm": 15.009427070617676, + "learning_rate": 1.1166477098340826e-05, + "loss": 0.049, + "step": 147520 + }, + { + "epoch": 31.024015598765097, + "grad_norm": 2.015507936477661, + "learning_rate": 1.1163468077295732e-05, + "loss": 0.0791, + "step": 147530 + }, + { + "epoch": 31.02406976114391, + "grad_norm": 0.0023024610709398985, + "learning_rate": 1.116045905625064e-05, + "loss": 0.0032, + "step": 147540 + }, + { + "epoch": 31.024123923522723, + "grad_norm": 1.0735702514648438, + "learning_rate": 1.1157450035205547e-05, + "loss": 0.0124, + "step": 147550 + }, + { + "epoch": 31.024178085901532, + "grad_norm": 20.87493896484375, + "learning_rate": 1.1154441014160453e-05, + "loss": 0.0416, + "step": 147560 + }, + { + "epoch": 31.024232248280345, + "grad_norm": 0.00246503297239542, + "learning_rate": 1.115143199311536e-05, + "loss": 0.0699, + "step": 147570 + }, + { + "epoch": 31.024286410659155, + "grad_norm": 0.1686190664768219, + "learning_rate": 1.1148422972070267e-05, + "loss": 0.0023, + "step": 147580 + }, + { + "epoch": 31.024340573037968, + "grad_norm": 0.004273759201169014, + "learning_rate": 1.1145413951025175e-05, + "loss": 0.0053, + "step": 147590 + }, 
+ { + "epoch": 31.02439473541678, + "grad_norm": 0.001687810872681439, + "learning_rate": 1.1142404929980081e-05, + "loss": 0.011, + "step": 147600 + }, + { + "epoch": 31.02444889779559, + "grad_norm": 0.858476459980011, + "learning_rate": 1.1139395908934988e-05, + "loss": 0.1274, + "step": 147610 + }, + { + "epoch": 31.024503060174403, + "grad_norm": 0.027361661195755005, + "learning_rate": 1.1136386887889896e-05, + "loss": 0.0501, + "step": 147620 + }, + { + "epoch": 31.024557222553213, + "grad_norm": 0.003026816062629223, + "learning_rate": 1.1133377866844802e-05, + "loss": 0.0394, + "step": 147630 + }, + { + "epoch": 31.024611384932026, + "grad_norm": 0.0016316507244482636, + "learning_rate": 1.1130368845799708e-05, + "loss": 0.0199, + "step": 147640 + }, + { + "epoch": 31.02466554731084, + "grad_norm": 0.0015529036754742265, + "learning_rate": 1.1127359824754614e-05, + "loss": 0.0474, + "step": 147650 + }, + { + "epoch": 31.02471970968965, + "grad_norm": 0.003337375819683075, + "learning_rate": 1.112435080370952e-05, + "loss": 0.0084, + "step": 147660 + }, + { + "epoch": 31.02477387206846, + "grad_norm": 0.0014163991436362267, + "learning_rate": 1.1121341782664429e-05, + "loss": 0.0453, + "step": 147670 + }, + { + "epoch": 31.024828034447275, + "grad_norm": 3.2850663661956787, + "learning_rate": 1.1118332761619335e-05, + "loss": 0.033, + "step": 147680 + }, + { + "epoch": 31.024882196826084, + "grad_norm": 0.0013301096623763442, + "learning_rate": 1.1115323740574241e-05, + "loss": 0.0897, + "step": 147690 + }, + { + "epoch": 31.024936359204897, + "grad_norm": 0.03537634015083313, + "learning_rate": 1.1112314719529149e-05, + "loss": 0.0148, + "step": 147700 + }, + { + "epoch": 31.024990521583707, + "grad_norm": 0.027231266722083092, + "learning_rate": 1.1109305698484055e-05, + "loss": 0.0888, + "step": 147710 + }, + { + "epoch": 31.02500135405947, + "eval_accuracy": 0.821685173089484, + "eval_loss": 0.9873241186141968, + "eval_runtime": 119.149, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 3.214, + "step": 147712 + }, + { + "epoch": 32.00004332990305, + "grad_norm": 0.002581594046205282, + "learning_rate": 1.1106296677438963e-05, + "loss": 0.004, + "step": 147720 + }, + { + "epoch": 32.00009749228186, + "grad_norm": 0.0018265669932588935, + "learning_rate": 1.110328765639387e-05, + "loss": 0.0205, + "step": 147730 + }, + { + "epoch": 32.00015165466067, + "grad_norm": 0.0018092712853103876, + "learning_rate": 1.1100278635348776e-05, + "loss": 0.0689, + "step": 147740 + }, + { + "epoch": 32.000205817039486, + "grad_norm": 0.0013388519873842597, + "learning_rate": 1.1097269614303684e-05, + "loss": 0.0901, + "step": 147750 + }, + { + "epoch": 32.000259979418296, + "grad_norm": 0.029020244255661964, + "learning_rate": 1.109426059325859e-05, + "loss": 0.01, + "step": 147760 + }, + { + "epoch": 32.000314141797105, + "grad_norm": 0.00526345893740654, + "learning_rate": 1.1091251572213496e-05, + "loss": 0.0127, + "step": 147770 + }, + { + "epoch": 32.00036830417592, + "grad_norm": 0.007177156396210194, + "learning_rate": 1.1088242551168404e-05, + "loss": 0.007, + "step": 147780 + }, + { + "epoch": 32.00042246655473, + "grad_norm": 0.04418333247303963, + "learning_rate": 1.1085233530123309e-05, + "loss": 0.0399, + "step": 147790 + }, + { + "epoch": 32.00047662893354, + "grad_norm": 0.03484117239713669, + "learning_rate": 1.1082224509078217e-05, + "loss": 0.0447, + "step": 147800 + }, + { + "epoch": 32.00053079131236, + "grad_norm": 0.0015696983318775892, + "learning_rate": 
1.1079215488033123e-05, + "loss": 0.026, + "step": 147810 + }, + { + "epoch": 32.00058495369117, + "grad_norm": 0.0022420347668230534, + "learning_rate": 1.107620646698803e-05, + "loss": 0.0276, + "step": 147820 + }, + { + "epoch": 32.000639116069976, + "grad_norm": 0.002888399874791503, + "learning_rate": 1.1073197445942937e-05, + "loss": 0.0224, + "step": 147830 + }, + { + "epoch": 32.00069327844879, + "grad_norm": 0.9497385621070862, + "learning_rate": 1.1070188424897844e-05, + "loss": 0.055, + "step": 147840 + }, + { + "epoch": 32.0007474408276, + "grad_norm": 0.004032154567539692, + "learning_rate": 1.1067179403852752e-05, + "loss": 0.0286, + "step": 147850 + }, + { + "epoch": 32.00080160320641, + "grad_norm": 0.008909028023481369, + "learning_rate": 1.1064170382807658e-05, + "loss": 0.0017, + "step": 147860 + }, + { + "epoch": 32.00085576558522, + "grad_norm": 0.0016406658105552197, + "learning_rate": 1.1061161361762564e-05, + "loss": 0.0183, + "step": 147870 + }, + { + "epoch": 32.00090992796404, + "grad_norm": 0.0015634230803698301, + "learning_rate": 1.1058152340717472e-05, + "loss": 0.0382, + "step": 147880 + }, + { + "epoch": 32.00096409034285, + "grad_norm": 0.001296662143431604, + "learning_rate": 1.1055143319672379e-05, + "loss": 0.0014, + "step": 147890 + }, + { + "epoch": 32.00101825272166, + "grad_norm": 0.036734744906425476, + "learning_rate": 1.1052134298627285e-05, + "loss": 0.0558, + "step": 147900 + }, + { + "epoch": 32.001072415100474, + "grad_norm": 7.257513999938965, + "learning_rate": 1.1049125277582193e-05, + "loss": 0.1154, + "step": 147910 + }, + { + "epoch": 32.00112657747928, + "grad_norm": 0.0023325507063418627, + "learning_rate": 1.1046116256537099e-05, + "loss": 0.0146, + "step": 147920 + }, + { + "epoch": 32.00118073985809, + "grad_norm": 0.001935803797096014, + "learning_rate": 1.1043107235492007e-05, + "loss": 0.0482, + "step": 147930 + }, + { + "epoch": 32.00123490223691, + "grad_norm": 1.652928352355957, + "learning_rate": 1.1040098214446912e-05, + "loss": 0.019, + "step": 147940 + }, + { + "epoch": 32.00128906461572, + "grad_norm": 0.0028869963716715574, + "learning_rate": 1.1037089193401818e-05, + "loss": 0.0391, + "step": 147950 + }, + { + "epoch": 32.00134322699453, + "grad_norm": 0.0013397658476606011, + "learning_rate": 1.1034080172356726e-05, + "loss": 0.0215, + "step": 147960 + }, + { + "epoch": 32.00139738937334, + "grad_norm": 0.07562387734651566, + "learning_rate": 1.1031071151311632e-05, + "loss": 0.0096, + "step": 147970 + }, + { + "epoch": 32.001451551752155, + "grad_norm": 0.002657161559909582, + "learning_rate": 1.102806213026654e-05, + "loss": 0.1105, + "step": 147980 + }, + { + "epoch": 32.001505714130964, + "grad_norm": 0.0677269697189331, + "learning_rate": 1.1025053109221446e-05, + "loss": 0.0115, + "step": 147990 + }, + { + "epoch": 32.00155987650977, + "grad_norm": 0.3952522873878479, + "learning_rate": 1.1022044088176353e-05, + "loss": 0.013, + "step": 148000 + }, + { + "epoch": 32.00161403888859, + "grad_norm": 0.02201959304511547, + "learning_rate": 1.101903506713126e-05, + "loss": 0.0366, + "step": 148010 + }, + { + "epoch": 32.0016682012674, + "grad_norm": 0.0029187826439738274, + "learning_rate": 1.1016026046086167e-05, + "loss": 0.0406, + "step": 148020 + }, + { + "epoch": 32.00172236364621, + "grad_norm": 0.21054452657699585, + "learning_rate": 1.1013017025041073e-05, + "loss": 0.0018, + "step": 148030 + }, + { + "epoch": 32.001776526025026, + "grad_norm": 0.028959842398762703, + "learning_rate": 1.1010008003995981e-05, 
+ "loss": 0.029, + "step": 148040 + }, + { + "epoch": 32.001830688403835, + "grad_norm": 0.0357905849814415, + "learning_rate": 1.1006998982950887e-05, + "loss": 0.0433, + "step": 148050 + }, + { + "epoch": 32.001884850782645, + "grad_norm": 0.002011343138292432, + "learning_rate": 1.1003989961905795e-05, + "loss": 0.0689, + "step": 148060 + }, + { + "epoch": 32.00193901316146, + "grad_norm": 0.08815915882587433, + "learning_rate": 1.1000980940860702e-05, + "loss": 0.005, + "step": 148070 + }, + { + "epoch": 32.00199317554027, + "grad_norm": 0.0013272752985358238, + "learning_rate": 1.0997971919815608e-05, + "loss": 0.0097, + "step": 148080 + }, + { + "epoch": 32.00204733791908, + "grad_norm": 0.001477169687859714, + "learning_rate": 1.0994962898770514e-05, + "loss": 0.0034, + "step": 148090 + }, + { + "epoch": 32.00210150029789, + "grad_norm": 0.004327022470533848, + "learning_rate": 1.099195387772542e-05, + "loss": 0.0337, + "step": 148100 + }, + { + "epoch": 32.00215566267671, + "grad_norm": 3.0826079845428467, + "learning_rate": 1.0988944856680328e-05, + "loss": 0.0631, + "step": 148110 + }, + { + "epoch": 32.002209825055516, + "grad_norm": 4.160189628601074, + "learning_rate": 1.0985935835635235e-05, + "loss": 0.0617, + "step": 148120 + }, + { + "epoch": 32.002263987434326, + "grad_norm": 4.032883167266846, + "learning_rate": 1.0982926814590141e-05, + "loss": 0.0315, + "step": 148130 + }, + { + "epoch": 32.00231814981314, + "grad_norm": 0.0033241184428334236, + "learning_rate": 1.0979917793545049e-05, + "loss": 0.0167, + "step": 148140 + }, + { + "epoch": 32.00237231219195, + "grad_norm": 0.0015562293119728565, + "learning_rate": 1.0976908772499955e-05, + "loss": 0.0019, + "step": 148150 + }, + { + "epoch": 32.00242647457076, + "grad_norm": 0.001720644417218864, + "learning_rate": 1.0973899751454861e-05, + "loss": 0.0009, + "step": 148160 + }, + { + "epoch": 32.00248063694958, + "grad_norm": 0.0016280231066048145, + "learning_rate": 1.097089073040977e-05, + "loss": 0.0179, + "step": 148170 + }, + { + "epoch": 32.00253479932839, + "grad_norm": 3.3309900760650635, + "learning_rate": 1.0967881709364676e-05, + "loss": 0.1047, + "step": 148180 + }, + { + "epoch": 32.0025889617072, + "grad_norm": 0.0013270614435896277, + "learning_rate": 1.0964872688319584e-05, + "loss": 0.0162, + "step": 148190 + }, + { + "epoch": 32.002643124086006, + "grad_norm": 0.014975343830883503, + "learning_rate": 1.096186366727449e-05, + "loss": 0.039, + "step": 148200 + }, + { + "epoch": 32.00269728646482, + "grad_norm": 0.056180257350206375, + "learning_rate": 1.0958854646229396e-05, + "loss": 0.0005, + "step": 148210 + }, + { + "epoch": 32.00275144884363, + "grad_norm": 0.0015318429796025157, + "learning_rate": 1.0955845625184304e-05, + "loss": 0.0112, + "step": 148220 + }, + { + "epoch": 32.00280561122244, + "grad_norm": 0.001121392473578453, + "learning_rate": 1.095283660413921e-05, + "loss": 0.0381, + "step": 148230 + }, + { + "epoch": 32.00285977360126, + "grad_norm": 0.5933691263198853, + "learning_rate": 1.0949827583094117e-05, + "loss": 0.0732, + "step": 148240 + }, + { + "epoch": 32.00291393598007, + "grad_norm": 4.377905368804932, + "learning_rate": 1.0946818562049023e-05, + "loss": 0.0894, + "step": 148250 + }, + { + "epoch": 32.00296809835888, + "grad_norm": 0.0012027432676404715, + "learning_rate": 1.094380954100393e-05, + "loss": 0.0092, + "step": 148260 + }, + { + "epoch": 32.003022260737694, + "grad_norm": 1.2375530004501343, + "learning_rate": 1.0940800519958837e-05, + "loss": 0.0106, + "step": 
148270 + }, + { + "epoch": 32.003076423116504, + "grad_norm": 0.0011257590958848596, + "learning_rate": 1.0937791498913744e-05, + "loss": 0.1003, + "step": 148280 + }, + { + "epoch": 32.00313058549531, + "grad_norm": 0.003460064297541976, + "learning_rate": 1.093478247786865e-05, + "loss": 0.0296, + "step": 148290 + }, + { + "epoch": 32.00318474787413, + "grad_norm": 0.05541830509901047, + "learning_rate": 1.0931773456823558e-05, + "loss": 0.1327, + "step": 148300 + }, + { + "epoch": 32.00323891025294, + "grad_norm": 0.7473239898681641, + "learning_rate": 1.0928764435778464e-05, + "loss": 0.0803, + "step": 148310 + }, + { + "epoch": 32.00329307263175, + "grad_norm": 1.0531936883926392, + "learning_rate": 1.0925755414733372e-05, + "loss": 0.0259, + "step": 148320 + }, + { + "epoch": 32.00334723501056, + "grad_norm": 0.4280773401260376, + "learning_rate": 1.0922746393688278e-05, + "loss": 0.0328, + "step": 148330 + }, + { + "epoch": 32.003401397389375, + "grad_norm": 0.0015754529740661383, + "learning_rate": 1.0919737372643185e-05, + "loss": 0.0133, + "step": 148340 + }, + { + "epoch": 32.003455559768184, + "grad_norm": 1.1856930255889893, + "learning_rate": 1.0916728351598093e-05, + "loss": 0.0114, + "step": 148350 + }, + { + "epoch": 32.003509722146994, + "grad_norm": 0.06882630288600922, + "learning_rate": 1.0913719330552999e-05, + "loss": 0.071, + "step": 148360 + }, + { + "epoch": 32.00356388452581, + "grad_norm": 0.9087584018707275, + "learning_rate": 1.0910710309507905e-05, + "loss": 0.0041, + "step": 148370 + }, + { + "epoch": 32.00361804690462, + "grad_norm": 0.9568824768066406, + "learning_rate": 1.0907701288462813e-05, + "loss": 0.05, + "step": 148380 + }, + { + "epoch": 32.00367220928343, + "grad_norm": 0.001521204481832683, + "learning_rate": 1.0904692267417718e-05, + "loss": 0.0346, + "step": 148390 + }, + { + "epoch": 32.003726371662246, + "grad_norm": 0.12097103893756866, + "learning_rate": 1.0901683246372626e-05, + "loss": 0.0198, + "step": 148400 + }, + { + "epoch": 32.003780534041056, + "grad_norm": 0.0261429101228714, + "learning_rate": 1.0898674225327532e-05, + "loss": 0.0148, + "step": 148410 + }, + { + "epoch": 32.003834696419865, + "grad_norm": 0.0011783677618950605, + "learning_rate": 1.0895665204282438e-05, + "loss": 0.0028, + "step": 148420 + }, + { + "epoch": 32.00388885879868, + "grad_norm": 0.14205174148082733, + "learning_rate": 1.0892656183237346e-05, + "loss": 0.0026, + "step": 148430 + }, + { + "epoch": 32.00394302117749, + "grad_norm": 0.0009900209261104465, + "learning_rate": 1.0889647162192252e-05, + "loss": 0.0189, + "step": 148440 + }, + { + "epoch": 32.0039971835563, + "grad_norm": 0.0011187875643372536, + "learning_rate": 1.088663814114716e-05, + "loss": 0.0006, + "step": 148450 + }, + { + "epoch": 32.00405134593511, + "grad_norm": 0.0058117215521633625, + "learning_rate": 1.0883629120102067e-05, + "loss": 0.044, + "step": 148460 + }, + { + "epoch": 32.00410550831393, + "grad_norm": 0.0009795461082831025, + "learning_rate": 1.0880620099056973e-05, + "loss": 0.0382, + "step": 148470 + }, + { + "epoch": 32.00415967069274, + "grad_norm": 0.0010656213853508234, + "learning_rate": 1.0877611078011881e-05, + "loss": 0.0312, + "step": 148480 + }, + { + "epoch": 32.004213833071546, + "grad_norm": 0.0032262338791042566, + "learning_rate": 1.0874602056966787e-05, + "loss": 0.087, + "step": 148490 + }, + { + "epoch": 32.00426799545036, + "grad_norm": 0.46266961097717285, + "learning_rate": 1.0871593035921693e-05, + "loss": 0.0168, + "step": 148500 + }, + { + 
"epoch": 32.00432215782917, + "grad_norm": 0.0010063968366011977, + "learning_rate": 1.0868584014876601e-05, + "loss": 0.0357, + "step": 148510 + }, + { + "epoch": 32.00437632020798, + "grad_norm": 0.000975727685727179, + "learning_rate": 1.0865574993831508e-05, + "loss": 0.0539, + "step": 148520 + }, + { + "epoch": 32.0044304825868, + "grad_norm": 0.0016509209526702762, + "learning_rate": 1.0862565972786416e-05, + "loss": 0.0006, + "step": 148530 + }, + { + "epoch": 32.00448464496561, + "grad_norm": 4.755137920379639, + "learning_rate": 1.085955695174132e-05, + "loss": 0.1099, + "step": 148540 + }, + { + "epoch": 32.00453880734442, + "grad_norm": 0.0011420950759202242, + "learning_rate": 1.0856547930696227e-05, + "loss": 0.0588, + "step": 148550 + }, + { + "epoch": 32.00459296972323, + "grad_norm": 0.001727652968838811, + "learning_rate": 1.0853538909651134e-05, + "loss": 0.0392, + "step": 148560 + }, + { + "epoch": 32.00464713210204, + "grad_norm": 0.4120330214500427, + "learning_rate": 1.085052988860604e-05, + "loss": 0.0948, + "step": 148570 + }, + { + "epoch": 32.00470129448085, + "grad_norm": 0.0010284713935106993, + "learning_rate": 1.0847520867560949e-05, + "loss": 0.0052, + "step": 148580 + }, + { + "epoch": 32.00475545685966, + "grad_norm": 0.0010764398612082005, + "learning_rate": 1.0844511846515855e-05, + "loss": 0.0779, + "step": 148590 + }, + { + "epoch": 32.00480961923848, + "grad_norm": 0.02109268307685852, + "learning_rate": 1.0841502825470761e-05, + "loss": 0.0283, + "step": 148600 + }, + { + "epoch": 32.00486378161729, + "grad_norm": 0.007110459264367819, + "learning_rate": 1.083849380442567e-05, + "loss": 0.0198, + "step": 148610 + }, + { + "epoch": 32.0049179439961, + "grad_norm": 0.0015744249103590846, + "learning_rate": 1.0835484783380576e-05, + "loss": 0.0124, + "step": 148620 + }, + { + "epoch": 32.004972106374915, + "grad_norm": 0.002973647089675069, + "learning_rate": 1.0832475762335482e-05, + "loss": 0.0157, + "step": 148630 + }, + { + "epoch": 32.005026268753724, + "grad_norm": 0.0010326994815841317, + "learning_rate": 1.082946674129039e-05, + "loss": 0.0062, + "step": 148640 + }, + { + "epoch": 32.005080431132534, + "grad_norm": 0.001083545503206551, + "learning_rate": 1.0826457720245296e-05, + "loss": 0.0237, + "step": 148650 + }, + { + "epoch": 32.00513459351135, + "grad_norm": 0.0010812543332576752, + "learning_rate": 1.0823448699200204e-05, + "loss": 0.0541, + "step": 148660 + }, + { + "epoch": 32.00518875589016, + "grad_norm": 0.677266001701355, + "learning_rate": 1.082043967815511e-05, + "loss": 0.0859, + "step": 148670 + }, + { + "epoch": 32.00524291826897, + "grad_norm": 0.06199921667575836, + "learning_rate": 1.0817430657110017e-05, + "loss": 0.025, + "step": 148680 + }, + { + "epoch": 32.00529708064778, + "grad_norm": 0.7132025361061096, + "learning_rate": 1.0814421636064923e-05, + "loss": 0.0173, + "step": 148690 + }, + { + "epoch": 32.005351243026595, + "grad_norm": 0.7390036582946777, + "learning_rate": 1.0811412615019829e-05, + "loss": 0.0283, + "step": 148700 + }, + { + "epoch": 32.005405405405405, + "grad_norm": 0.0036605133209377527, + "learning_rate": 1.0808403593974737e-05, + "loss": 0.0022, + "step": 148710 + }, + { + "epoch": 32.005459567784214, + "grad_norm": 0.0011188095668330789, + "learning_rate": 1.0805394572929643e-05, + "loss": 0.0205, + "step": 148720 + }, + { + "epoch": 32.00551373016303, + "grad_norm": 0.0011427897261455655, + "learning_rate": 1.080238555188455e-05, + "loss": 0.0388, + "step": 148730 + }, + { + "epoch": 
32.00556789254184, + "grad_norm": 0.0012029142817482352, + "learning_rate": 1.0799376530839458e-05, + "loss": 0.0017, + "step": 148740 + }, + { + "epoch": 32.00562205492065, + "grad_norm": 0.0011335519375279546, + "learning_rate": 1.0796367509794364e-05, + "loss": 0.0001, + "step": 148750 + }, + { + "epoch": 32.00567621729947, + "grad_norm": 0.0010155531344935298, + "learning_rate": 1.079335848874927e-05, + "loss": 0.0022, + "step": 148760 + }, + { + "epoch": 32.005730379678276, + "grad_norm": 0.001004363177344203, + "learning_rate": 1.0790349467704178e-05, + "loss": 0.0002, + "step": 148770 + }, + { + "epoch": 32.005784542057086, + "grad_norm": 0.001307277474552393, + "learning_rate": 1.0787340446659084e-05, + "loss": 0.0263, + "step": 148780 + }, + { + "epoch": 32.0058387044359, + "grad_norm": 0.0015019113197922707, + "learning_rate": 1.0784331425613992e-05, + "loss": 0.0855, + "step": 148790 + }, + { + "epoch": 32.00589286681471, + "grad_norm": 0.0015690489672124386, + "learning_rate": 1.0781322404568899e-05, + "loss": 0.0467, + "step": 148800 + }, + { + "epoch": 32.00594702919352, + "grad_norm": 0.0010849985992535949, + "learning_rate": 1.0778313383523805e-05, + "loss": 0.0333, + "step": 148810 + }, + { + "epoch": 32.00600119157233, + "grad_norm": 0.7282863259315491, + "learning_rate": 1.0775304362478713e-05, + "loss": 0.008, + "step": 148820 + }, + { + "epoch": 32.00605535395115, + "grad_norm": 0.0018835036316886544, + "learning_rate": 1.0772295341433619e-05, + "loss": 0.0053, + "step": 148830 + }, + { + "epoch": 32.00610951632996, + "grad_norm": 0.03208991512656212, + "learning_rate": 1.0769286320388525e-05, + "loss": 0.0078, + "step": 148840 + }, + { + "epoch": 32.006163678708766, + "grad_norm": 0.02118540368974209, + "learning_rate": 1.0766277299343432e-05, + "loss": 0.0206, + "step": 148850 + }, + { + "epoch": 32.00621784108758, + "grad_norm": 0.006505724973976612, + "learning_rate": 1.0763268278298338e-05, + "loss": 0.0543, + "step": 148860 + }, + { + "epoch": 32.00627200346639, + "grad_norm": 0.0012186168460175395, + "learning_rate": 1.0760259257253246e-05, + "loss": 0.1153, + "step": 148870 + }, + { + "epoch": 32.0063261658452, + "grad_norm": 0.7219786643981934, + "learning_rate": 1.0757250236208152e-05, + "loss": 0.0457, + "step": 148880 + }, + { + "epoch": 32.00638032822402, + "grad_norm": 0.0009827985195443034, + "learning_rate": 1.0754241215163058e-05, + "loss": 0.0655, + "step": 148890 + }, + { + "epoch": 32.00643449060283, + "grad_norm": 0.0016167048597708344, + "learning_rate": 1.0751232194117966e-05, + "loss": 0.0742, + "step": 148900 + }, + { + "epoch": 32.00648865298164, + "grad_norm": 0.012213717214763165, + "learning_rate": 1.0748223173072873e-05, + "loss": 0.0061, + "step": 148910 + }, + { + "epoch": 32.00654281536045, + "grad_norm": 0.0010184618877246976, + "learning_rate": 1.074521415202778e-05, + "loss": 0.0237, + "step": 148920 + }, + { + "epoch": 32.006596977739264, + "grad_norm": 0.46772223711013794, + "learning_rate": 1.0742205130982687e-05, + "loss": 0.055, + "step": 148930 + }, + { + "epoch": 32.00665114011807, + "grad_norm": 0.0012192060239613056, + "learning_rate": 1.0739196109937593e-05, + "loss": 0.003, + "step": 148940 + }, + { + "epoch": 32.00670530249688, + "grad_norm": 0.0010850594844669104, + "learning_rate": 1.0736187088892501e-05, + "loss": 0.0348, + "step": 148950 + }, + { + "epoch": 32.0067594648757, + "grad_norm": 1.8863945007324219, + "learning_rate": 1.0733178067847407e-05, + "loss": 0.0336, + "step": 148960 + }, + { + "epoch": 
32.00681362725451, + "grad_norm": 0.0010651719057932496, + "learning_rate": 1.0730169046802314e-05, + "loss": 0.0088, + "step": 148970 + }, + { + "epoch": 32.00686778963332, + "grad_norm": 0.0031520603224635124, + "learning_rate": 1.0727160025757222e-05, + "loss": 0.0179, + "step": 148980 + }, + { + "epoch": 32.006921952012135, + "grad_norm": 0.48189258575439453, + "learning_rate": 1.0724151004712126e-05, + "loss": 0.0228, + "step": 148990 + }, + { + "epoch": 32.006976114390945, + "grad_norm": 0.0010160437086597085, + "learning_rate": 1.0721141983667034e-05, + "loss": 0.0797, + "step": 149000 + }, + { + "epoch": 32.007030276769754, + "grad_norm": 0.0010047523537650704, + "learning_rate": 1.071813296262194e-05, + "loss": 0.0321, + "step": 149010 + }, + { + "epoch": 32.00708443914857, + "grad_norm": 0.004613724537193775, + "learning_rate": 1.0715123941576847e-05, + "loss": 0.1092, + "step": 149020 + }, + { + "epoch": 32.00713860152738, + "grad_norm": 0.0009787441231310368, + "learning_rate": 1.0712114920531755e-05, + "loss": 0.019, + "step": 149030 + }, + { + "epoch": 32.00719276390619, + "grad_norm": 2.236886501312256, + "learning_rate": 1.0709105899486661e-05, + "loss": 0.0741, + "step": 149040 + }, + { + "epoch": 32.007246926285, + "grad_norm": 0.0013561163796111941, + "learning_rate": 1.0706096878441569e-05, + "loss": 0.0165, + "step": 149050 + }, + { + "epoch": 32.007301088663816, + "grad_norm": 7.185647010803223, + "learning_rate": 1.0703087857396475e-05, + "loss": 0.0433, + "step": 149060 + }, + { + "epoch": 32.007355251042625, + "grad_norm": 0.001047912985086441, + "learning_rate": 1.0700078836351382e-05, + "loss": 0.0369, + "step": 149070 + }, + { + "epoch": 32.007409413421435, + "grad_norm": 0.0012855748645961285, + "learning_rate": 1.069706981530629e-05, + "loss": 0.0029, + "step": 149080 + }, + { + "epoch": 32.00746357580025, + "grad_norm": 0.004139617085456848, + "learning_rate": 1.0694060794261196e-05, + "loss": 0.0401, + "step": 149090 + }, + { + "epoch": 32.00751773817906, + "grad_norm": 0.001097002997994423, + "learning_rate": 1.0691051773216102e-05, + "loss": 0.0392, + "step": 149100 + }, + { + "epoch": 32.00757190055787, + "grad_norm": 0.001000372227281332, + "learning_rate": 1.068804275217101e-05, + "loss": 0.0102, + "step": 149110 + }, + { + "epoch": 32.00762606293669, + "grad_norm": 0.0009842339204624295, + "learning_rate": 1.0685033731125916e-05, + "loss": 0.0385, + "step": 149120 + }, + { + "epoch": 32.0076802253155, + "grad_norm": 0.0010270359925925732, + "learning_rate": 1.0682024710080824e-05, + "loss": 0.0452, + "step": 149130 + }, + { + "epoch": 32.007734387694306, + "grad_norm": 0.1462010145187378, + "learning_rate": 1.0679015689035729e-05, + "loss": 0.0165, + "step": 149140 + }, + { + "epoch": 32.00778855007312, + "grad_norm": 0.001111186807975173, + "learning_rate": 1.0676006667990635e-05, + "loss": 0.0656, + "step": 149150 + }, + { + "epoch": 32.00784271245193, + "grad_norm": 0.264287531375885, + "learning_rate": 1.0672997646945543e-05, + "loss": 0.0256, + "step": 149160 + }, + { + "epoch": 32.00789687483074, + "grad_norm": 0.0009858568664640188, + "learning_rate": 1.066998862590045e-05, + "loss": 0.0221, + "step": 149170 + }, + { + "epoch": 32.00795103720955, + "grad_norm": 1.3928728103637695, + "learning_rate": 1.0666979604855357e-05, + "loss": 0.0186, + "step": 149180 + }, + { + "epoch": 32.00800519958837, + "grad_norm": 0.0023563611321151257, + "learning_rate": 1.0663970583810264e-05, + "loss": 0.0579, + "step": 149190 + }, + { + "epoch": 
32.00805936196718, + "grad_norm": 2.617032051086426, + "learning_rate": 1.066096156276517e-05, + "loss": 0.0425, + "step": 149200 + }, + { + "epoch": 32.00811352434599, + "grad_norm": 0.0009770069736987352, + "learning_rate": 1.0657952541720078e-05, + "loss": 0.0121, + "step": 149210 + }, + { + "epoch": 32.0081676867248, + "grad_norm": 0.7265501618385315, + "learning_rate": 1.0654943520674984e-05, + "loss": 0.0181, + "step": 149220 + }, + { + "epoch": 32.00822184910361, + "grad_norm": 0.0010258933762088418, + "learning_rate": 1.065193449962989e-05, + "loss": 0.1166, + "step": 149230 + }, + { + "epoch": 32.00827601148242, + "grad_norm": 0.0010527046397328377, + "learning_rate": 1.0648925478584798e-05, + "loss": 0.0203, + "step": 149240 + }, + { + "epoch": 32.00833017386124, + "grad_norm": 0.0009769561002030969, + "learning_rate": 1.0645916457539705e-05, + "loss": 0.0262, + "step": 149250 + }, + { + "epoch": 32.00838433624005, + "grad_norm": 0.014198550023138523, + "learning_rate": 1.0642907436494613e-05, + "loss": 0.0477, + "step": 149260 + }, + { + "epoch": 32.00843849861886, + "grad_norm": 0.0013894259463995695, + "learning_rate": 1.0639898415449519e-05, + "loss": 0.0196, + "step": 149270 + }, + { + "epoch": 32.00849266099767, + "grad_norm": 0.0010060002095997334, + "learning_rate": 1.0636889394404425e-05, + "loss": 0.0637, + "step": 149280 + }, + { + "epoch": 32.008546823376484, + "grad_norm": 0.0011092870263382792, + "learning_rate": 1.0633880373359332e-05, + "loss": 0.0038, + "step": 149290 + }, + { + "epoch": 32.008600985755294, + "grad_norm": 0.0012588754761964083, + "learning_rate": 1.0630871352314238e-05, + "loss": 0.0244, + "step": 149300 + }, + { + "epoch": 32.0086551481341, + "grad_norm": 6.341516971588135, + "learning_rate": 1.0627862331269146e-05, + "loss": 0.056, + "step": 149310 + }, + { + "epoch": 32.00870931051292, + "grad_norm": 2.965808629989624, + "learning_rate": 1.0624853310224052e-05, + "loss": 0.0075, + "step": 149320 + }, + { + "epoch": 32.00876347289173, + "grad_norm": 1.4249180555343628, + "learning_rate": 1.0621844289178958e-05, + "loss": 0.0587, + "step": 149330 + }, + { + "epoch": 32.00881763527054, + "grad_norm": 0.02527838572859764, + "learning_rate": 1.0618835268133866e-05, + "loss": 0.0333, + "step": 149340 + }, + { + "epoch": 32.008871797649356, + "grad_norm": 0.7088559865951538, + "learning_rate": 1.0615826247088773e-05, + "loss": 0.0689, + "step": 149350 + }, + { + "epoch": 32.008925960028165, + "grad_norm": 0.0033114482648670673, + "learning_rate": 1.0612817226043679e-05, + "loss": 0.0347, + "step": 149360 + }, + { + "epoch": 32.008980122406975, + "grad_norm": 5.828704357147217, + "learning_rate": 1.0609808204998587e-05, + "loss": 0.0356, + "step": 149370 + }, + { + "epoch": 32.00903428478579, + "grad_norm": 50.590614318847656, + "learning_rate": 1.0606799183953493e-05, + "loss": 0.1638, + "step": 149380 + }, + { + "epoch": 32.0090884471646, + "grad_norm": 0.02248016931116581, + "learning_rate": 1.0603790162908401e-05, + "loss": 0.0445, + "step": 149390 + }, + { + "epoch": 32.00914260954341, + "grad_norm": 0.0014366440009325743, + "learning_rate": 1.0600781141863307e-05, + "loss": 0.0004, + "step": 149400 + }, + { + "epoch": 32.00919677192222, + "grad_norm": 0.006180120166391134, + "learning_rate": 1.0597772120818214e-05, + "loss": 0.0004, + "step": 149410 + }, + { + "epoch": 32.009250934301036, + "grad_norm": 0.001651780679821968, + "learning_rate": 1.0594763099773122e-05, + "loss": 0.0001, + "step": 149420 + }, + { + "epoch": 32.009305096679846, + 
"grad_norm": 3.561372756958008, + "learning_rate": 1.0591754078728026e-05, + "loss": 0.0451, + "step": 149430 + }, + { + "epoch": 32.009359259058655, + "grad_norm": 0.001295720343478024, + "learning_rate": 1.0588745057682934e-05, + "loss": 0.0006, + "step": 149440 + }, + { + "epoch": 32.00941342143747, + "grad_norm": 0.0011159168789163232, + "learning_rate": 1.058573603663784e-05, + "loss": 0.0391, + "step": 149450 + }, + { + "epoch": 32.00946758381628, + "grad_norm": 0.0012008780613541603, + "learning_rate": 1.0582727015592747e-05, + "loss": 0.0058, + "step": 149460 + }, + { + "epoch": 32.00952174619509, + "grad_norm": 0.0016717140097171068, + "learning_rate": 1.0579717994547655e-05, + "loss": 0.0563, + "step": 149470 + }, + { + "epoch": 32.00957590857391, + "grad_norm": 0.0011660223826766014, + "learning_rate": 1.0576708973502561e-05, + "loss": 0.0147, + "step": 149480 + }, + { + "epoch": 32.00963007095272, + "grad_norm": 1.9672287702560425, + "learning_rate": 1.0573699952457467e-05, + "loss": 0.0765, + "step": 149490 + }, + { + "epoch": 32.00968423333153, + "grad_norm": 0.058634448796510696, + "learning_rate": 1.0570690931412375e-05, + "loss": 0.015, + "step": 149500 + }, + { + "epoch": 32.009738395710336, + "grad_norm": 1.63728666305542, + "learning_rate": 1.0567681910367281e-05, + "loss": 0.0392, + "step": 149510 + }, + { + "epoch": 32.00979255808915, + "grad_norm": 0.07946424931287766, + "learning_rate": 1.056467288932219e-05, + "loss": 0.0652, + "step": 149520 + }, + { + "epoch": 32.00984672046796, + "grad_norm": 0.004911371972411871, + "learning_rate": 1.0561663868277096e-05, + "loss": 0.0084, + "step": 149530 + }, + { + "epoch": 32.00990088284677, + "grad_norm": 0.004837065003812313, + "learning_rate": 1.0558654847232002e-05, + "loss": 0.0207, + "step": 149540 + }, + { + "epoch": 32.00995504522559, + "grad_norm": 0.056092653423547745, + "learning_rate": 1.055564582618691e-05, + "loss": 0.0112, + "step": 149550 + }, + { + "epoch": 32.0100092076044, + "grad_norm": 0.0009614548762328923, + "learning_rate": 1.0552636805141816e-05, + "loss": 0.0004, + "step": 149560 + }, + { + "epoch": 32.01006336998321, + "grad_norm": 0.001017745933495462, + "learning_rate": 1.0549627784096722e-05, + "loss": 0.0014, + "step": 149570 + }, + { + "epoch": 32.010117532362024, + "grad_norm": 0.000968041829764843, + "learning_rate": 1.0546618763051629e-05, + "loss": 0.0759, + "step": 149580 + }, + { + "epoch": 32.01017169474083, + "grad_norm": 0.0013042830396443605, + "learning_rate": 1.0543609742006535e-05, + "loss": 0.0114, + "step": 149590 + }, + { + "epoch": 32.01022585711964, + "grad_norm": 0.0010295311221852899, + "learning_rate": 1.0540600720961443e-05, + "loss": 0.0476, + "step": 149600 + }, + { + "epoch": 32.01028001949846, + "grad_norm": 0.003289238316938281, + "learning_rate": 1.053759169991635e-05, + "loss": 0.0185, + "step": 149610 + }, + { + "epoch": 32.01033418187727, + "grad_norm": 0.0009922051103785634, + "learning_rate": 1.0534582678871256e-05, + "loss": 0.0211, + "step": 149620 + }, + { + "epoch": 32.01038834425608, + "grad_norm": 0.6105882525444031, + "learning_rate": 1.0531573657826163e-05, + "loss": 0.068, + "step": 149630 + }, + { + "epoch": 32.01044250663489, + "grad_norm": 0.0012170149711892009, + "learning_rate": 1.052856463678107e-05, + "loss": 0.035, + "step": 149640 + }, + { + "epoch": 32.010496669013705, + "grad_norm": 0.000986688770353794, + "learning_rate": 1.0525555615735978e-05, + "loss": 0.0179, + "step": 149650 + }, + { + "epoch": 32.010550831392514, + "grad_norm": 
0.0009843914303928614, + "learning_rate": 1.0522546594690884e-05, + "loss": 0.0193, + "step": 149660 + }, + { + "epoch": 32.010604993771324, + "grad_norm": 0.17054696381092072, + "learning_rate": 1.051953757364579e-05, + "loss": 0.0147, + "step": 149670 + }, + { + "epoch": 32.01065915615014, + "grad_norm": 1.1314483880996704, + "learning_rate": 1.0516528552600698e-05, + "loss": 0.0427, + "step": 149680 + }, + { + "epoch": 32.01071331852895, + "grad_norm": 0.00125992635730654, + "learning_rate": 1.0513519531555605e-05, + "loss": 0.0491, + "step": 149690 + }, + { + "epoch": 32.01076748090776, + "grad_norm": 1.607029676437378, + "learning_rate": 1.051051051051051e-05, + "loss": 0.0272, + "step": 149700 + }, + { + "epoch": 32.010821643286576, + "grad_norm": 0.31502339243888855, + "learning_rate": 1.0507501489465419e-05, + "loss": 0.0232, + "step": 149710 + }, + { + "epoch": 32.010875805665385, + "grad_norm": 0.22139152884483337, + "learning_rate": 1.0504492468420325e-05, + "loss": 0.0473, + "step": 149720 + }, + { + "epoch": 32.010929968044195, + "grad_norm": 0.09659809619188309, + "learning_rate": 1.0501483447375231e-05, + "loss": 0.0713, + "step": 149730 + }, + { + "epoch": 32.01098413042301, + "grad_norm": 1.4187604188919067, + "learning_rate": 1.0498474426330138e-05, + "loss": 0.0374, + "step": 149740 + }, + { + "epoch": 32.01103829280182, + "grad_norm": 0.0014357551699504256, + "learning_rate": 1.0495465405285044e-05, + "loss": 0.0361, + "step": 149750 + }, + { + "epoch": 32.01109245518063, + "grad_norm": 0.10545331239700317, + "learning_rate": 1.0492456384239952e-05, + "loss": 0.0657, + "step": 149760 + }, + { + "epoch": 32.01114661755944, + "grad_norm": 2.786069869995117, + "learning_rate": 1.0489447363194858e-05, + "loss": 0.0433, + "step": 149770 + }, + { + "epoch": 32.01120077993826, + "grad_norm": 0.0011058570817112923, + "learning_rate": 1.0486438342149766e-05, + "loss": 0.0093, + "step": 149780 + }, + { + "epoch": 32.011254942317066, + "grad_norm": 0.5598047375679016, + "learning_rate": 1.0483429321104672e-05, + "loss": 0.0095, + "step": 149790 + }, + { + "epoch": 32.011309104695876, + "grad_norm": 0.0016321943840011954, + "learning_rate": 1.0480420300059579e-05, + "loss": 0.0427, + "step": 149800 + }, + { + "epoch": 32.01136326707469, + "grad_norm": 0.0016083199298009276, + "learning_rate": 1.0477411279014487e-05, + "loss": 0.0274, + "step": 149810 + }, + { + "epoch": 32.0114174294535, + "grad_norm": 0.08916234970092773, + "learning_rate": 1.0474402257969393e-05, + "loss": 0.0285, + "step": 149820 + }, + { + "epoch": 32.01147159183231, + "grad_norm": 4.888732433319092, + "learning_rate": 1.0471393236924299e-05, + "loss": 0.1213, + "step": 149830 + }, + { + "epoch": 32.01152575421113, + "grad_norm": 0.0013071214780211449, + "learning_rate": 1.0468384215879207e-05, + "loss": 0.0215, + "step": 149840 + }, + { + "epoch": 32.01157991658994, + "grad_norm": 0.001388648757711053, + "learning_rate": 1.0465375194834113e-05, + "loss": 0.0332, + "step": 149850 + }, + { + "epoch": 32.01163407896875, + "grad_norm": 0.854225754737854, + "learning_rate": 1.0462366173789021e-05, + "loss": 0.0176, + "step": 149860 + }, + { + "epoch": 32.01168824134756, + "grad_norm": 0.966777503490448, + "learning_rate": 1.0459357152743928e-05, + "loss": 0.044, + "step": 149870 + }, + { + "epoch": 32.01174240372637, + "grad_norm": 1.6361445188522339, + "learning_rate": 1.0456348131698832e-05, + "loss": 0.0374, + "step": 149880 + }, + { + "epoch": 32.01179656610518, + "grad_norm": 0.012648342177271843, + 
"learning_rate": 1.045333911065374e-05, + "loss": 0.0338, + "step": 149890 + }, + { + "epoch": 32.01185072848399, + "grad_norm": 0.011470340192317963, + "learning_rate": 1.0450330089608646e-05, + "loss": 0.1319, + "step": 149900 + }, + { + "epoch": 32.01190489086281, + "grad_norm": 0.0010491348803043365, + "learning_rate": 1.0447321068563554e-05, + "loss": 0.0216, + "step": 149910 + }, + { + "epoch": 32.01195905324162, + "grad_norm": 0.0015496800187975168, + "learning_rate": 1.044431204751846e-05, + "loss": 0.01, + "step": 149920 + }, + { + "epoch": 32.01201321562043, + "grad_norm": 0.001739784493111074, + "learning_rate": 1.0441303026473367e-05, + "loss": 0.0322, + "step": 149930 + }, + { + "epoch": 32.012067377999244, + "grad_norm": 0.0012119454331696033, + "learning_rate": 1.0438294005428275e-05, + "loss": 0.022, + "step": 149940 + }, + { + "epoch": 32.012121540378054, + "grad_norm": 0.0013810541713610291, + "learning_rate": 1.0435284984383181e-05, + "loss": 0.0113, + "step": 149950 + }, + { + "epoch": 32.01217570275686, + "grad_norm": 0.00105301255825907, + "learning_rate": 1.0432275963338087e-05, + "loss": 0.0074, + "step": 149960 + }, + { + "epoch": 32.01222986513568, + "grad_norm": 0.00136244622990489, + "learning_rate": 1.0429266942292995e-05, + "loss": 0.1255, + "step": 149970 + }, + { + "epoch": 32.01228402751449, + "grad_norm": 0.0017043426632881165, + "learning_rate": 1.0426257921247902e-05, + "loss": 0.0105, + "step": 149980 + }, + { + "epoch": 32.0123381898933, + "grad_norm": 0.001081518828868866, + "learning_rate": 1.042324890020281e-05, + "loss": 0.0179, + "step": 149990 + }, + { + "epoch": 32.01239235227211, + "grad_norm": 0.0010386859066784382, + "learning_rate": 1.0420239879157716e-05, + "loss": 0.0091, + "step": 150000 + }, + { + "epoch": 32.012446514650925, + "grad_norm": 0.0012783295242115855, + "learning_rate": 1.0417230858112622e-05, + "loss": 0.0275, + "step": 150010 + }, + { + "epoch": 32.012500677029735, + "grad_norm": 0.20619787275791168, + "learning_rate": 1.041422183706753e-05, + "loss": 0.0176, + "step": 150020 + }, + { + "epoch": 32.012554839408544, + "grad_norm": 0.25553712248802185, + "learning_rate": 1.0411212816022435e-05, + "loss": 0.0022, + "step": 150030 + }, + { + "epoch": 32.01260900178736, + "grad_norm": 0.0010975536424666643, + "learning_rate": 1.0408203794977343e-05, + "loss": 0.0278, + "step": 150040 + }, + { + "epoch": 32.01266316416617, + "grad_norm": 0.0031279614195227623, + "learning_rate": 1.0405194773932249e-05, + "loss": 0.0711, + "step": 150050 + }, + { + "epoch": 32.01271732654498, + "grad_norm": 0.0009867938933894038, + "learning_rate": 1.0402185752887155e-05, + "loss": 0.0343, + "step": 150060 + }, + { + "epoch": 32.012771488923796, + "grad_norm": 0.0010174197377637029, + "learning_rate": 1.0399176731842063e-05, + "loss": 0.0399, + "step": 150070 + }, + { + "epoch": 32.012825651302606, + "grad_norm": 0.002484832191839814, + "learning_rate": 1.039616771079697e-05, + "loss": 0.0203, + "step": 150080 + }, + { + "epoch": 32.012879813681415, + "grad_norm": 0.08510618656873703, + "learning_rate": 1.0393158689751876e-05, + "loss": 0.1578, + "step": 150090 + }, + { + "epoch": 32.01293397606023, + "grad_norm": 1.5782231092453003, + "learning_rate": 1.0390149668706784e-05, + "loss": 0.1637, + "step": 150100 + }, + { + "epoch": 32.01298813843904, + "grad_norm": 1.1603314876556396, + "learning_rate": 1.038714064766169e-05, + "loss": 0.1506, + "step": 150110 + }, + { + "epoch": 32.01304230081785, + "grad_norm": 0.04689832776784897, + 
"learning_rate": 1.0384131626616598e-05, + "loss": 0.0066, + "step": 150120 + }, + { + "epoch": 32.01309646319666, + "grad_norm": 1.141291856765747, + "learning_rate": 1.0381122605571504e-05, + "loss": 0.0625, + "step": 150130 + }, + { + "epoch": 32.01315062557548, + "grad_norm": 0.0037554199807345867, + "learning_rate": 1.037811358452641e-05, + "loss": 0.006, + "step": 150140 + }, + { + "epoch": 32.01320478795429, + "grad_norm": 0.4802930951118469, + "learning_rate": 1.0375104563481319e-05, + "loss": 0.0157, + "step": 150150 + }, + { + "epoch": 32.013258950333096, + "grad_norm": 0.6999050378799438, + "learning_rate": 1.0372095542436225e-05, + "loss": 0.0509, + "step": 150160 + }, + { + "epoch": 32.01331311271191, + "grad_norm": 0.001037990441545844, + "learning_rate": 1.0369086521391131e-05, + "loss": 0.0117, + "step": 150170 + }, + { + "epoch": 32.01336727509072, + "grad_norm": 0.9986100792884827, + "learning_rate": 1.0366077500346037e-05, + "loss": 0.0155, + "step": 150180 + }, + { + "epoch": 32.01342143746953, + "grad_norm": 0.0054576643742620945, + "learning_rate": 1.0363068479300944e-05, + "loss": 0.023, + "step": 150190 + }, + { + "epoch": 32.01347559984835, + "grad_norm": 0.030215919017791748, + "learning_rate": 1.0360059458255852e-05, + "loss": 0.0065, + "step": 150200 + }, + { + "epoch": 32.01352976222716, + "grad_norm": 3.1698732376098633, + "learning_rate": 1.0357050437210758e-05, + "loss": 0.0574, + "step": 150210 + }, + { + "epoch": 32.01358392460597, + "grad_norm": 0.2343820184469223, + "learning_rate": 1.0354041416165664e-05, + "loss": 0.0263, + "step": 150220 + }, + { + "epoch": 32.01363808698478, + "grad_norm": 0.0897386372089386, + "learning_rate": 1.0351032395120572e-05, + "loss": 0.0185, + "step": 150230 + }, + { + "epoch": 32.013692249363594, + "grad_norm": 0.0013270446797832847, + "learning_rate": 1.0348023374075478e-05, + "loss": 0.0133, + "step": 150240 + }, + { + "epoch": 32.0137464117424, + "grad_norm": 0.0012189612025395036, + "learning_rate": 1.0345014353030386e-05, + "loss": 0.0461, + "step": 150250 + }, + { + "epoch": 32.01380057412121, + "grad_norm": 0.0012092523975297809, + "learning_rate": 1.0342005331985293e-05, + "loss": 0.0211, + "step": 150260 + }, + { + "epoch": 32.01385473650003, + "grad_norm": 0.001235402189195156, + "learning_rate": 1.0338996310940199e-05, + "loss": 0.0741, + "step": 150270 + }, + { + "epoch": 32.01390889887884, + "grad_norm": 0.6535447835922241, + "learning_rate": 1.0335987289895107e-05, + "loss": 0.0189, + "step": 150280 + }, + { + "epoch": 32.01396306125765, + "grad_norm": 1.0435354709625244, + "learning_rate": 1.0332978268850013e-05, + "loss": 0.0498, + "step": 150290 + }, + { + "epoch": 32.014017223636465, + "grad_norm": 0.5825850367546082, + "learning_rate": 1.032996924780492e-05, + "loss": 0.1875, + "step": 150300 + }, + { + "epoch": 32.014071386015274, + "grad_norm": 0.0014807666884735227, + "learning_rate": 1.0326960226759827e-05, + "loss": 0.0021, + "step": 150310 + }, + { + "epoch": 32.014125548394084, + "grad_norm": 0.0010653159115463495, + "learning_rate": 1.0323951205714734e-05, + "loss": 0.0441, + "step": 150320 + }, + { + "epoch": 32.0141797107729, + "grad_norm": 0.0012772540794685483, + "learning_rate": 1.032094218466964e-05, + "loss": 0.0212, + "step": 150330 + }, + { + "epoch": 32.01423387315171, + "grad_norm": 1.0102194547653198, + "learning_rate": 1.0317933163624546e-05, + "loss": 0.0262, + "step": 150340 + }, + { + "epoch": 32.01428803553052, + "grad_norm": 3.0589065551757812, + "learning_rate": 
1.0314924142579453e-05, + "loss": 0.1095, + "step": 150350 + }, + { + "epoch": 32.01434219790933, + "grad_norm": 0.12949974834918976, + "learning_rate": 1.031191512153436e-05, + "loss": 0.0324, + "step": 150360 + }, + { + "epoch": 32.014396360288146, + "grad_norm": 0.006947927176952362, + "learning_rate": 1.0308906100489267e-05, + "loss": 0.0112, + "step": 150370 + }, + { + "epoch": 32.014450522666955, + "grad_norm": 0.6393834352493286, + "learning_rate": 1.0305897079444175e-05, + "loss": 0.0721, + "step": 150380 + }, + { + "epoch": 32.014504685045765, + "grad_norm": 0.07776015996932983, + "learning_rate": 1.0302888058399081e-05, + "loss": 0.0585, + "step": 150390 + }, + { + "epoch": 32.01455884742458, + "grad_norm": 4.0183844566345215, + "learning_rate": 1.0299879037353987e-05, + "loss": 0.0559, + "step": 150400 + }, + { + "epoch": 32.01461300980339, + "grad_norm": 4.155232906341553, + "learning_rate": 1.0296870016308895e-05, + "loss": 0.0453, + "step": 150410 + }, + { + "epoch": 32.0146671721822, + "grad_norm": 0.001173982280306518, + "learning_rate": 1.0293860995263802e-05, + "loss": 0.0511, + "step": 150420 + }, + { + "epoch": 32.01472133456102, + "grad_norm": 0.37680596113204956, + "learning_rate": 1.0290851974218708e-05, + "loss": 0.1876, + "step": 150430 + }, + { + "epoch": 32.014775496939826, + "grad_norm": 0.0010651983320713043, + "learning_rate": 1.0287842953173616e-05, + "loss": 0.066, + "step": 150440 + }, + { + "epoch": 32.014829659318636, + "grad_norm": 0.0010907415999099612, + "learning_rate": 1.0284833932128522e-05, + "loss": 0.0302, + "step": 150450 + }, + { + "epoch": 32.01488382169745, + "grad_norm": 0.1142922043800354, + "learning_rate": 1.0281824911083428e-05, + "loss": 0.0801, + "step": 150460 + }, + { + "epoch": 32.01493798407626, + "grad_norm": 0.001599916722625494, + "learning_rate": 1.0278815890038336e-05, + "loss": 0.0303, + "step": 150470 + }, + { + "epoch": 32.01499214645507, + "grad_norm": 0.0013027817476540804, + "learning_rate": 1.0275806868993241e-05, + "loss": 0.016, + "step": 150480 + }, + { + "epoch": 32.01504630883388, + "grad_norm": 0.6121360659599304, + "learning_rate": 1.0272797847948149e-05, + "loss": 0.058, + "step": 150490 + }, + { + "epoch": 32.0151004712127, + "grad_norm": 1.6571346521377563, + "learning_rate": 1.0269788826903055e-05, + "loss": 0.0136, + "step": 150500 + }, + { + "epoch": 32.01515463359151, + "grad_norm": 0.5926203727722168, + "learning_rate": 1.0266779805857963e-05, + "loss": 0.0152, + "step": 150510 + }, + { + "epoch": 32.01520879597032, + "grad_norm": 0.006012402009218931, + "learning_rate": 1.026377078481287e-05, + "loss": 0.1296, + "step": 150520 + }, + { + "epoch": 32.01526295834913, + "grad_norm": 0.001977320061996579, + "learning_rate": 1.0260761763767776e-05, + "loss": 0.0088, + "step": 150530 + }, + { + "epoch": 32.01531712072794, + "grad_norm": 0.0011231936514377594, + "learning_rate": 1.0257752742722684e-05, + "loss": 0.0321, + "step": 150540 + }, + { + "epoch": 32.01537128310675, + "grad_norm": 1.1571321487426758, + "learning_rate": 1.025474372167759e-05, + "loss": 0.076, + "step": 150550 + }, + { + "epoch": 32.01542544548557, + "grad_norm": 0.5492844581604004, + "learning_rate": 1.0251734700632496e-05, + "loss": 0.0213, + "step": 150560 + }, + { + "epoch": 32.01547960786438, + "grad_norm": 0.0015119242016226053, + "learning_rate": 1.0248725679587404e-05, + "loss": 0.0865, + "step": 150570 + }, + { + "epoch": 32.01553377024319, + "grad_norm": 0.002281760098412633, + "learning_rate": 1.024571665854231e-05, + "loss": 
0.0577, + "step": 150580 + }, + { + "epoch": 32.015587932622, + "grad_norm": 0.7579674124717712, + "learning_rate": 1.0242707637497218e-05, + "loss": 0.0208, + "step": 150590 + }, + { + "epoch": 32.015642095000814, + "grad_norm": 0.4265110492706299, + "learning_rate": 1.0239698616452125e-05, + "loss": 0.0622, + "step": 150600 + }, + { + "epoch": 32.01569625737962, + "grad_norm": 0.002943153493106365, + "learning_rate": 1.0236689595407031e-05, + "loss": 0.0889, + "step": 150610 + }, + { + "epoch": 32.01575041975843, + "grad_norm": 3.841097116470337, + "learning_rate": 1.0233680574361939e-05, + "loss": 0.0406, + "step": 150620 + }, + { + "epoch": 32.01580458213725, + "grad_norm": 1.6697897911071777, + "learning_rate": 1.0230671553316843e-05, + "loss": 0.0536, + "step": 150630 + }, + { + "epoch": 32.01585874451606, + "grad_norm": 0.07232756912708282, + "learning_rate": 1.0227662532271751e-05, + "loss": 0.0305, + "step": 150640 + }, + { + "epoch": 32.01591290689487, + "grad_norm": 0.0014177234843373299, + "learning_rate": 1.0224653511226658e-05, + "loss": 0.0466, + "step": 150650 + }, + { + "epoch": 32.015967069273685, + "grad_norm": 0.0012496364070102572, + "learning_rate": 1.0221644490181564e-05, + "loss": 0.055, + "step": 150660 + }, + { + "epoch": 32.016021231652495, + "grad_norm": 0.027862094342708588, + "learning_rate": 1.0218635469136472e-05, + "loss": 0.0637, + "step": 150670 + }, + { + "epoch": 32.016075394031304, + "grad_norm": 0.001656052190810442, + "learning_rate": 1.0215626448091378e-05, + "loss": 0.0005, + "step": 150680 + }, + { + "epoch": 32.01612955641012, + "grad_norm": 0.9329699873924255, + "learning_rate": 1.0212617427046284e-05, + "loss": 0.0331, + "step": 150690 + }, + { + "epoch": 32.01618371878893, + "grad_norm": 0.0022359704598784447, + "learning_rate": 1.0209608406001192e-05, + "loss": 0.0005, + "step": 150700 + }, + { + "epoch": 32.01623788116774, + "grad_norm": 0.4700607657432556, + "learning_rate": 1.0206599384956099e-05, + "loss": 0.0129, + "step": 150710 + }, + { + "epoch": 32.01629204354655, + "grad_norm": 0.003035970265045762, + "learning_rate": 1.0203590363911007e-05, + "loss": 0.0542, + "step": 150720 + }, + { + "epoch": 32.016346205925366, + "grad_norm": 0.0023317362647503614, + "learning_rate": 1.0200581342865913e-05, + "loss": 0.1187, + "step": 150730 + }, + { + "epoch": 32.016400368304176, + "grad_norm": 0.0020045998971909285, + "learning_rate": 1.019757232182082e-05, + "loss": 0.0008, + "step": 150740 + }, + { + "epoch": 32.016454530682985, + "grad_norm": 0.0019432853441685438, + "learning_rate": 1.0194563300775727e-05, + "loss": 0.0127, + "step": 150750 + }, + { + "epoch": 32.0165086930618, + "grad_norm": 1.124664306640625, + "learning_rate": 1.0191554279730633e-05, + "loss": 0.0188, + "step": 150760 + }, + { + "epoch": 32.01656285544061, + "grad_norm": 0.004025730304419994, + "learning_rate": 1.018854525868554e-05, + "loss": 0.0147, + "step": 150770 + }, + { + "epoch": 32.01661701781942, + "grad_norm": 0.009224371053278446, + "learning_rate": 1.0185536237640446e-05, + "loss": 0.0236, + "step": 150780 + }, + { + "epoch": 32.01667118019824, + "grad_norm": 0.0014806822873651981, + "learning_rate": 1.0182527216595352e-05, + "loss": 0.0227, + "step": 150790 + }, + { + "epoch": 32.01672534257705, + "grad_norm": 0.03097394108772278, + "learning_rate": 1.017951819555026e-05, + "loss": 0.0253, + "step": 150800 + }, + { + "epoch": 32.016779504955856, + "grad_norm": 0.42758527398109436, + "learning_rate": 1.0176509174505167e-05, + "loss": 0.0811, + "step": 
150810 + }, + { + "epoch": 32.016833667334666, + "grad_norm": 0.07683314383029938, + "learning_rate": 1.0173500153460073e-05, + "loss": 0.0093, + "step": 150820 + }, + { + "epoch": 32.01688782971348, + "grad_norm": 0.11298303306102753, + "learning_rate": 1.017049113241498e-05, + "loss": 0.0268, + "step": 150830 + }, + { + "epoch": 32.01694199209229, + "grad_norm": 0.0014066521544009447, + "learning_rate": 1.0167482111369887e-05, + "loss": 0.0691, + "step": 150840 + }, + { + "epoch": 32.0169961544711, + "grad_norm": 0.0014755246229469776, + "learning_rate": 1.0164473090324795e-05, + "loss": 0.0361, + "step": 150850 + }, + { + "epoch": 32.01705031684992, + "grad_norm": 0.0012427523033693433, + "learning_rate": 1.0161464069279701e-05, + "loss": 0.0246, + "step": 150860 + }, + { + "epoch": 32.01710447922873, + "grad_norm": 0.001958659617230296, + "learning_rate": 1.0158455048234608e-05, + "loss": 0.0056, + "step": 150870 + }, + { + "epoch": 32.01715864160754, + "grad_norm": 0.004150379449129105, + "learning_rate": 1.0155446027189516e-05, + "loss": 0.0209, + "step": 150880 + }, + { + "epoch": 32.017212803986354, + "grad_norm": 0.0013094039168208838, + "learning_rate": 1.0152437006144422e-05, + "loss": 0.0375, + "step": 150890 + }, + { + "epoch": 32.01726696636516, + "grad_norm": 0.6278864741325378, + "learning_rate": 1.0149427985099328e-05, + "loss": 0.003, + "step": 150900 + }, + { + "epoch": 32.01732112874397, + "grad_norm": 0.0012164550134912133, + "learning_rate": 1.0146418964054236e-05, + "loss": 0.0269, + "step": 150910 + }, + { + "epoch": 32.01737529112279, + "grad_norm": 0.3098675012588501, + "learning_rate": 1.0143409943009142e-05, + "loss": 0.0561, + "step": 150920 + }, + { + "epoch": 32.0174294535016, + "grad_norm": 0.7202355265617371, + "learning_rate": 1.0140400921964049e-05, + "loss": 0.0415, + "step": 150930 + }, + { + "epoch": 32.01748361588041, + "grad_norm": 0.0024164640344679356, + "learning_rate": 1.0137391900918955e-05, + "loss": 0.0003, + "step": 150940 + }, + { + "epoch": 32.01753777825922, + "grad_norm": 0.001216954318806529, + "learning_rate": 1.0134382879873861e-05, + "loss": 0.0355, + "step": 150950 + }, + { + "epoch": 32.017591940638034, + "grad_norm": 0.29288074374198914, + "learning_rate": 1.0131373858828769e-05, + "loss": 0.0027, + "step": 150960 + }, + { + "epoch": 32.017646103016844, + "grad_norm": 1.7266459465026855, + "learning_rate": 1.0128364837783675e-05, + "loss": 0.0718, + "step": 150970 + }, + { + "epoch": 32.01770026539565, + "grad_norm": 2.3675453662872314, + "learning_rate": 1.0125355816738583e-05, + "loss": 0.0369, + "step": 150980 + }, + { + "epoch": 32.01775442777447, + "grad_norm": 0.0017562638968229294, + "learning_rate": 1.012234679569349e-05, + "loss": 0.0209, + "step": 150990 + }, + { + "epoch": 32.01780859015328, + "grad_norm": 0.002192465588450432, + "learning_rate": 1.0119337774648396e-05, + "loss": 0.009, + "step": 151000 + }, + { + "epoch": 32.01786275253209, + "grad_norm": 0.45617276430130005, + "learning_rate": 1.0116328753603304e-05, + "loss": 0.0464, + "step": 151010 + }, + { + "epoch": 32.017916914910906, + "grad_norm": 0.6628497838973999, + "learning_rate": 1.011331973255821e-05, + "loss": 0.0126, + "step": 151020 + }, + { + "epoch": 32.017971077289715, + "grad_norm": 0.0015246417606249452, + "learning_rate": 1.0110310711513116e-05, + "loss": 0.0039, + "step": 151030 + }, + { + "epoch": 32.018025239668525, + "grad_norm": 43.01822280883789, + "learning_rate": 1.0107301690468024e-05, + "loss": 0.0204, + "step": 151040 + }, + { + 
"epoch": 32.01807940204734, + "grad_norm": 3.079087734222412, + "learning_rate": 1.010429266942293e-05, + "loss": 0.0118, + "step": 151050 + }, + { + "epoch": 32.01813356442615, + "grad_norm": 0.05938435345888138, + "learning_rate": 1.0101283648377837e-05, + "loss": 0.0211, + "step": 151060 + }, + { + "epoch": 32.01818772680496, + "grad_norm": 0.001049390877597034, + "learning_rate": 1.0098274627332745e-05, + "loss": 0.0376, + "step": 151070 + }, + { + "epoch": 32.01824188918377, + "grad_norm": 0.0010732995579019189, + "learning_rate": 1.009526560628765e-05, + "loss": 0.0126, + "step": 151080 + }, + { + "epoch": 32.01829605156259, + "grad_norm": 0.00444684037938714, + "learning_rate": 1.0092256585242557e-05, + "loss": 0.0129, + "step": 151090 + }, + { + "epoch": 32.018350213941396, + "grad_norm": 0.0013802240137010813, + "learning_rate": 1.0089247564197464e-05, + "loss": 0.0459, + "step": 151100 + }, + { + "epoch": 32.018404376320206, + "grad_norm": 0.0010563102550804615, + "learning_rate": 1.0086238543152372e-05, + "loss": 0.0337, + "step": 151110 + }, + { + "epoch": 32.01845853869902, + "grad_norm": 0.0016060335328802466, + "learning_rate": 1.0083229522107278e-05, + "loss": 0.0889, + "step": 151120 + }, + { + "epoch": 32.01851270107783, + "grad_norm": 0.0013259872794151306, + "learning_rate": 1.0080220501062184e-05, + "loss": 0.0124, + "step": 151130 + }, + { + "epoch": 32.01856686345664, + "grad_norm": 0.020264845341444016, + "learning_rate": 1.0077211480017092e-05, + "loss": 0.1046, + "step": 151140 + }, + { + "epoch": 32.01862102583546, + "grad_norm": 0.0014541572891175747, + "learning_rate": 1.0074202458971999e-05, + "loss": 0.0856, + "step": 151150 + }, + { + "epoch": 32.01867518821427, + "grad_norm": 0.020499059930443764, + "learning_rate": 1.0071193437926905e-05, + "loss": 0.0281, + "step": 151160 + }, + { + "epoch": 32.01872935059308, + "grad_norm": 0.7353841662406921, + "learning_rate": 1.0068184416881813e-05, + "loss": 0.0128, + "step": 151170 + }, + { + "epoch": 32.018783512971886, + "grad_norm": 0.0021322991233319044, + "learning_rate": 1.0065175395836719e-05, + "loss": 0.09, + "step": 151180 + }, + { + "epoch": 32.0188376753507, + "grad_norm": 0.4787602722644806, + "learning_rate": 1.0062166374791625e-05, + "loss": 0.0194, + "step": 151190 + }, + { + "epoch": 32.01889183772951, + "grad_norm": 0.0014380788197740912, + "learning_rate": 1.0059157353746533e-05, + "loss": 0.0446, + "step": 151200 + }, + { + "epoch": 32.01894600010832, + "grad_norm": 0.48636236786842346, + "learning_rate": 1.005614833270144e-05, + "loss": 0.0165, + "step": 151210 + }, + { + "epoch": 32.01900016248714, + "grad_norm": 0.007360705174505711, + "learning_rate": 1.0053139311656348e-05, + "loss": 0.0056, + "step": 151220 + }, + { + "epoch": 32.01905432486595, + "grad_norm": 0.0014228638028725982, + "learning_rate": 1.0050130290611252e-05, + "loss": 0.0013, + "step": 151230 + }, + { + "epoch": 32.01910848724476, + "grad_norm": 0.5201848149299622, + "learning_rate": 1.004712126956616e-05, + "loss": 0.0271, + "step": 151240 + }, + { + "epoch": 32.019162649623574, + "grad_norm": 0.0012910212390124798, + "learning_rate": 1.0044112248521066e-05, + "loss": 0.0148, + "step": 151250 + }, + { + "epoch": 32.019216812002384, + "grad_norm": 0.036079369485378265, + "learning_rate": 1.0041103227475973e-05, + "loss": 0.1272, + "step": 151260 + }, + { + "epoch": 32.01927097438119, + "grad_norm": 0.008230281993746758, + "learning_rate": 1.003809420643088e-05, + "loss": 0.0294, + "step": 151270 + }, + { + "epoch": 
32.01932513676001, + "grad_norm": 0.0012853009393438697, + "learning_rate": 1.0035085185385787e-05, + "loss": 0.0083, + "step": 151280 + }, + { + "epoch": 32.01937929913882, + "grad_norm": 0.6737910509109497, + "learning_rate": 1.0032076164340693e-05, + "loss": 0.0407, + "step": 151290 + }, + { + "epoch": 32.01943346151763, + "grad_norm": 0.000987156294286251, + "learning_rate": 1.0029067143295601e-05, + "loss": 0.065, + "step": 151300 + }, + { + "epoch": 32.01948762389644, + "grad_norm": 0.0013838147278875113, + "learning_rate": 1.0026058122250507e-05, + "loss": 0.0053, + "step": 151310 + }, + { + "epoch": 32.019541786275255, + "grad_norm": 0.0015322075923904777, + "learning_rate": 1.0023049101205414e-05, + "loss": 0.0118, + "step": 151320 + }, + { + "epoch": 32.019595948654064, + "grad_norm": 0.0012373693753033876, + "learning_rate": 1.0020040080160322e-05, + "loss": 0.0027, + "step": 151330 + }, + { + "epoch": 32.019650111032874, + "grad_norm": 0.020043035969138145, + "learning_rate": 1.0017031059115228e-05, + "loss": 0.0125, + "step": 151340 + }, + { + "epoch": 32.01970427341169, + "grad_norm": 0.0009781933622434735, + "learning_rate": 1.0014022038070136e-05, + "loss": 0.0595, + "step": 151350 + }, + { + "epoch": 32.0197584357905, + "grad_norm": 0.03857807815074921, + "learning_rate": 1.0011013017025042e-05, + "loss": 0.0035, + "step": 151360 + }, + { + "epoch": 32.01981259816931, + "grad_norm": 3.561170816421509, + "learning_rate": 1.0008003995979948e-05, + "loss": 0.075, + "step": 151370 + }, + { + "epoch": 32.019866760548126, + "grad_norm": 0.9906697869300842, + "learning_rate": 1.0004994974934855e-05, + "loss": 0.0151, + "step": 151380 + }, + { + "epoch": 32.019920922926936, + "grad_norm": 0.620521068572998, + "learning_rate": 1.0001985953889761e-05, + "loss": 0.0101, + "step": 151390 + }, + { + "epoch": 32.019975085305745, + "grad_norm": 0.0009293495095334947, + "learning_rate": 9.998976932844669e-06, + "loss": 0.0138, + "step": 151400 + }, + { + "epoch": 32.02002924768456, + "grad_norm": 0.0012005312601104379, + "learning_rate": 9.995967911799575e-06, + "loss": 0.0775, + "step": 151410 + }, + { + "epoch": 32.02008341006337, + "grad_norm": 0.001283403835259378, + "learning_rate": 9.992958890754482e-06, + "loss": 0.0542, + "step": 151420 + }, + { + "epoch": 32.02013757244218, + "grad_norm": 0.001622192794457078, + "learning_rate": 9.98994986970939e-06, + "loss": 0.0063, + "step": 151430 + }, + { + "epoch": 32.02019173482099, + "grad_norm": 0.18444201350212097, + "learning_rate": 9.986940848664296e-06, + "loss": 0.0072, + "step": 151440 + }, + { + "epoch": 32.02024589719981, + "grad_norm": 0.0010998675134032965, + "learning_rate": 9.983931827619204e-06, + "loss": 0.0247, + "step": 151450 + }, + { + "epoch": 32.02030005957862, + "grad_norm": 1.114986777305603, + "learning_rate": 9.98092280657411e-06, + "loss": 0.0152, + "step": 151460 + }, + { + "epoch": 32.020354221957426, + "grad_norm": 0.0009099129820242524, + "learning_rate": 9.977913785529016e-06, + "loss": 0.0229, + "step": 151470 + }, + { + "epoch": 32.02040838433624, + "grad_norm": 0.001144351321272552, + "learning_rate": 9.974904764483924e-06, + "loss": 0.0052, + "step": 151480 + }, + { + "epoch": 32.02046254671505, + "grad_norm": 0.0009856666438281536, + "learning_rate": 9.97189574343883e-06, + "loss": 0.0164, + "step": 151490 + }, + { + "epoch": 32.02051670909386, + "grad_norm": 0.0008991335052996874, + "learning_rate": 9.968886722393737e-06, + "loss": 0.0005, + "step": 151500 + }, + { + "epoch": 32.02057087147268, + 
"grad_norm": 0.0017282344633713365, + "learning_rate": 9.965877701348645e-06, + "loss": 0.137, + "step": 151510 + }, + { + "epoch": 32.02062503385149, + "grad_norm": 0.0011840964434668422, + "learning_rate": 9.962868680303551e-06, + "loss": 0.009, + "step": 151520 + }, + { + "epoch": 32.0206791962303, + "grad_norm": 0.9238152503967285, + "learning_rate": 9.959859659258457e-06, + "loss": 0.04, + "step": 151530 + }, + { + "epoch": 32.02073335860911, + "grad_norm": 0.0056241098791360855, + "learning_rate": 9.956850638213364e-06, + "loss": 0.0967, + "step": 151540 + }, + { + "epoch": 32.02078752098792, + "grad_norm": 0.0009704431286081672, + "learning_rate": 9.95384161716827e-06, + "loss": 0.0053, + "step": 151550 + }, + { + "epoch": 32.02084168336673, + "grad_norm": 0.0009187583345919847, + "learning_rate": 9.950832596123178e-06, + "loss": 0.0106, + "step": 151560 + }, + { + "epoch": 32.02089584574554, + "grad_norm": 0.00158290087711066, + "learning_rate": 9.947823575078084e-06, + "loss": 0.0064, + "step": 151570 + }, + { + "epoch": 32.02095000812436, + "grad_norm": 0.0009227739064954221, + "learning_rate": 9.944814554032992e-06, + "loss": 0.0258, + "step": 151580 + }, + { + "epoch": 32.02100417050317, + "grad_norm": 1.979806900024414, + "learning_rate": 9.941805532987898e-06, + "loss": 0.0374, + "step": 151590 + }, + { + "epoch": 32.02105833288198, + "grad_norm": 0.0009079491137526929, + "learning_rate": 9.938796511942805e-06, + "loss": 0.0056, + "step": 151600 + }, + { + "epoch": 32.021112495260795, + "grad_norm": 0.91510409116745, + "learning_rate": 9.935787490897713e-06, + "loss": 0.0093, + "step": 151610 + }, + { + "epoch": 32.021166657639604, + "grad_norm": 0.11775677651166916, + "learning_rate": 9.932778469852619e-06, + "loss": 0.0211, + "step": 151620 + }, + { + "epoch": 32.021220820018414, + "grad_norm": 0.11534125357866287, + "learning_rate": 9.929769448807525e-06, + "loss": 0.0794, + "step": 151630 + }, + { + "epoch": 32.02127498239723, + "grad_norm": 0.0009262995445169508, + "learning_rate": 9.926760427762433e-06, + "loss": 0.0009, + "step": 151640 + }, + { + "epoch": 32.02132914477604, + "grad_norm": 1.0123540163040161, + "learning_rate": 9.92375140671734e-06, + "loss": 0.0456, + "step": 151650 + }, + { + "epoch": 32.02138330715485, + "grad_norm": 0.019636722281575203, + "learning_rate": 9.920742385672246e-06, + "loss": 0.0175, + "step": 151660 + }, + { + "epoch": 32.02143746953366, + "grad_norm": 0.003960346803069115, + "learning_rate": 9.917733364627154e-06, + "loss": 0.0031, + "step": 151670 + }, + { + "epoch": 32.021491631912475, + "grad_norm": 0.0008919375250115991, + "learning_rate": 9.914724343582058e-06, + "loss": 0.1109, + "step": 151680 + }, + { + "epoch": 32.021545794291285, + "grad_norm": 0.4785602390766144, + "learning_rate": 9.911715322536966e-06, + "loss": 0.0205, + "step": 151690 + }, + { + "epoch": 32.021599956670094, + "grad_norm": 0.0010798834264278412, + "learning_rate": 9.908706301491872e-06, + "loss": 0.0015, + "step": 151700 + }, + { + "epoch": 32.02165411904891, + "grad_norm": 0.0018674425082281232, + "learning_rate": 9.90569728044678e-06, + "loss": 0.0003, + "step": 151710 + }, + { + "epoch": 32.02170828142772, + "grad_norm": 0.0014883727999404073, + "learning_rate": 9.902688259401687e-06, + "loss": 0.0004, + "step": 151720 + }, + { + "epoch": 32.02176244380653, + "grad_norm": 0.0015092692337930202, + "learning_rate": 9.899679238356593e-06, + "loss": 0.0286, + "step": 151730 + }, + { + "epoch": 32.02181660618535, + "grad_norm": 1.0545239448547363, + 
"learning_rate": 9.896670217311501e-06, + "loss": 0.013, + "step": 151740 + }, + { + "epoch": 32.021870768564156, + "grad_norm": 0.0009110876708291471, + "learning_rate": 9.893661196266407e-06, + "loss": 0.0156, + "step": 151750 + }, + { + "epoch": 32.021924930942966, + "grad_norm": 5.49387264251709, + "learning_rate": 9.890652175221313e-06, + "loss": 0.0908, + "step": 151760 + }, + { + "epoch": 32.021979093321775, + "grad_norm": 3.9476380348205566, + "learning_rate": 9.887643154176221e-06, + "loss": 0.1074, + "step": 151770 + }, + { + "epoch": 32.02203325570059, + "grad_norm": 0.12297827750444412, + "learning_rate": 9.884634133131128e-06, + "loss": 0.0493, + "step": 151780 + }, + { + "epoch": 32.0220874180794, + "grad_norm": 0.5483043193817139, + "learning_rate": 9.881625112086034e-06, + "loss": 0.0216, + "step": 151790 + }, + { + "epoch": 32.02214158045821, + "grad_norm": 0.0011708444217219949, + "learning_rate": 9.878616091040942e-06, + "loss": 0.0491, + "step": 151800 + }, + { + "epoch": 32.02219574283703, + "grad_norm": 0.0009091958636417985, + "learning_rate": 9.875607069995848e-06, + "loss": 0.0179, + "step": 151810 + }, + { + "epoch": 32.02224990521584, + "grad_norm": 0.0009232585434801877, + "learning_rate": 9.872598048950756e-06, + "loss": 0.0471, + "step": 151820 + }, + { + "epoch": 32.022304067594646, + "grad_norm": 0.0015352547634392977, + "learning_rate": 9.86958902790566e-06, + "loss": 0.0213, + "step": 151830 + }, + { + "epoch": 32.02235822997346, + "grad_norm": 0.0011633997783064842, + "learning_rate": 9.866580006860569e-06, + "loss": 0.0109, + "step": 151840 + }, + { + "epoch": 32.02241239235227, + "grad_norm": 0.0009194331360049546, + "learning_rate": 9.863570985815475e-06, + "loss": 0.0399, + "step": 151850 + }, + { + "epoch": 32.02246655473108, + "grad_norm": 1.0256427526474, + "learning_rate": 9.860561964770381e-06, + "loss": 0.0593, + "step": 151860 + }, + { + "epoch": 32.0225207171099, + "grad_norm": 4.236121654510498, + "learning_rate": 9.85755294372529e-06, + "loss": 0.0527, + "step": 151870 + }, + { + "epoch": 32.02257487948871, + "grad_norm": 0.19572916626930237, + "learning_rate": 9.854543922680196e-06, + "loss": 0.0103, + "step": 151880 + }, + { + "epoch": 32.02262904186752, + "grad_norm": 0.0009431593935005367, + "learning_rate": 9.851534901635102e-06, + "loss": 0.0208, + "step": 151890 + }, + { + "epoch": 32.02268320424633, + "grad_norm": 0.6796891093254089, + "learning_rate": 9.84852588059001e-06, + "loss": 0.0171, + "step": 151900 + }, + { + "epoch": 32.022737366625144, + "grad_norm": 1.15977942943573, + "learning_rate": 9.845516859544916e-06, + "loss": 0.0411, + "step": 151910 + }, + { + "epoch": 32.02279152900395, + "grad_norm": 0.8783634305000305, + "learning_rate": 9.842507838499822e-06, + "loss": 0.0859, + "step": 151920 + }, + { + "epoch": 32.02284569138276, + "grad_norm": 0.0027051803190261126, + "learning_rate": 9.83949881745473e-06, + "loss": 0.0087, + "step": 151930 + }, + { + "epoch": 32.02289985376158, + "grad_norm": 0.010983333922922611, + "learning_rate": 9.836489796409637e-06, + "loss": 0.0497, + "step": 151940 + }, + { + "epoch": 32.02295401614039, + "grad_norm": 0.0012423311127349734, + "learning_rate": 9.833480775364545e-06, + "loss": 0.0103, + "step": 151950 + }, + { + "epoch": 32.0230081785192, + "grad_norm": 0.0009814264485612512, + "learning_rate": 9.83047175431945e-06, + "loss": 0.0702, + "step": 151960 + }, + { + "epoch": 32.023062340898015, + "grad_norm": 0.006377696990966797, + "learning_rate": 9.827462733274357e-06, + "loss": 
0.0397, + "step": 151970 + }, + { + "epoch": 32.023116503276825, + "grad_norm": 0.5362852215766907, + "learning_rate": 9.824453712229263e-06, + "loss": 0.0019, + "step": 151980 + }, + { + "epoch": 32.023170665655634, + "grad_norm": 0.0012170665431767702, + "learning_rate": 9.82144469118417e-06, + "loss": 0.1056, + "step": 151990 + }, + { + "epoch": 32.02322482803445, + "grad_norm": 0.0009363614954054356, + "learning_rate": 9.818435670139078e-06, + "loss": 0.0798, + "step": 152000 + }, + { + "epoch": 32.02327899041326, + "grad_norm": 0.0009322190890088677, + "learning_rate": 9.815426649093984e-06, + "loss": 0.0061, + "step": 152010 + }, + { + "epoch": 32.02333315279207, + "grad_norm": 0.0009690177976153791, + "learning_rate": 9.81241762804889e-06, + "loss": 0.05, + "step": 152020 + }, + { + "epoch": 32.02338731517088, + "grad_norm": 0.0009497384307906032, + "learning_rate": 9.809408607003798e-06, + "loss": 0.0202, + "step": 152030 + }, + { + "epoch": 32.023441477549696, + "grad_norm": 0.00155552732758224, + "learning_rate": 9.806399585958704e-06, + "loss": 0.0493, + "step": 152040 + }, + { + "epoch": 32.023495639928505, + "grad_norm": 0.0009340150281786919, + "learning_rate": 9.80339056491361e-06, + "loss": 0.0727, + "step": 152050 + }, + { + "epoch": 32.023549802307315, + "grad_norm": 0.0878886878490448, + "learning_rate": 9.800381543868519e-06, + "loss": 0.0607, + "step": 152060 + }, + { + "epoch": 32.02360396468613, + "grad_norm": 0.0016728565096855164, + "learning_rate": 9.797372522823425e-06, + "loss": 0.0035, + "step": 152070 + }, + { + "epoch": 32.02365812706494, + "grad_norm": 0.0013151895254850388, + "learning_rate": 9.794363501778333e-06, + "loss": 0.0875, + "step": 152080 + }, + { + "epoch": 32.02371228944375, + "grad_norm": 1.478830099105835, + "learning_rate": 9.79135448073324e-06, + "loss": 0.0318, + "step": 152090 + }, + { + "epoch": 32.02376645182257, + "grad_norm": 0.6897132396697998, + "learning_rate": 9.788345459688145e-06, + "loss": 0.0107, + "step": 152100 + }, + { + "epoch": 32.02382061420138, + "grad_norm": 0.3223443329334259, + "learning_rate": 9.785336438643053e-06, + "loss": 0.0825, + "step": 152110 + }, + { + "epoch": 32.023874776580186, + "grad_norm": 0.001947410637512803, + "learning_rate": 9.78232741759796e-06, + "loss": 0.1269, + "step": 152120 + }, + { + "epoch": 32.023928938958996, + "grad_norm": 0.0009981190087273717, + "learning_rate": 9.779318396552866e-06, + "loss": 0.0031, + "step": 152130 + }, + { + "epoch": 32.02398310133781, + "grad_norm": 0.009443036280572414, + "learning_rate": 9.776309375507772e-06, + "loss": 0.0167, + "step": 152140 + }, + { + "epoch": 32.02403726371662, + "grad_norm": 0.000985184800811112, + "learning_rate": 9.773300354462679e-06, + "loss": 0.0425, + "step": 152150 + }, + { + "epoch": 32.02409142609543, + "grad_norm": 0.0015859048580750823, + "learning_rate": 9.770291333417586e-06, + "loss": 0.0065, + "step": 152160 + }, + { + "epoch": 32.02414558847425, + "grad_norm": 0.3335256576538086, + "learning_rate": 9.767282312372493e-06, + "loss": 0.0058, + "step": 152170 + }, + { + "epoch": 32.02419975085306, + "grad_norm": 0.001151342410594225, + "learning_rate": 9.764273291327399e-06, + "loss": 0.1282, + "step": 152180 + }, + { + "epoch": 32.02425391323187, + "grad_norm": 0.7311741709709167, + "learning_rate": 9.761264270282307e-06, + "loss": 0.0215, + "step": 152190 + }, + { + "epoch": 32.02430807561068, + "grad_norm": 0.0008952836506068707, + "learning_rate": 9.758255249237213e-06, + "loss": 0.0309, + "step": 152200 + }, + { + 
"epoch": 32.02436223798949, + "grad_norm": 0.2500004470348358, + "learning_rate": 9.755246228192121e-06, + "loss": 0.0006, + "step": 152210 + }, + { + "epoch": 32.0244164003683, + "grad_norm": 0.5964955687522888, + "learning_rate": 9.752237207147028e-06, + "loss": 0.0254, + "step": 152220 + }, + { + "epoch": 32.02447056274712, + "grad_norm": 0.0009282988030463457, + "learning_rate": 9.749228186101934e-06, + "loss": 0.025, + "step": 152230 + }, + { + "epoch": 32.02452472512593, + "grad_norm": 0.0020121661946177483, + "learning_rate": 9.746219165056842e-06, + "loss": 0.0945, + "step": 152240 + }, + { + "epoch": 32.02457888750474, + "grad_norm": 0.07660901546478271, + "learning_rate": 9.743210144011748e-06, + "loss": 0.0341, + "step": 152250 + }, + { + "epoch": 32.02463304988355, + "grad_norm": 0.0009285096311941743, + "learning_rate": 9.740201122966654e-06, + "loss": 0.1122, + "step": 152260 + }, + { + "epoch": 32.024687212262364, + "grad_norm": 0.0021729059517383575, + "learning_rate": 9.73719210192156e-06, + "loss": 0.0256, + "step": 152270 + }, + { + "epoch": 32.024741374641174, + "grad_norm": 0.0012826431775465608, + "learning_rate": 9.734183080876467e-06, + "loss": 0.171, + "step": 152280 + }, + { + "epoch": 32.02479553701998, + "grad_norm": 0.001219842815771699, + "learning_rate": 9.731174059831375e-06, + "loss": 0.0652, + "step": 152290 + }, + { + "epoch": 32.0248496993988, + "grad_norm": 0.0010586661519482732, + "learning_rate": 9.728165038786281e-06, + "loss": 0.0281, + "step": 152300 + }, + { + "epoch": 32.02490386177761, + "grad_norm": 0.0009570816182531416, + "learning_rate": 9.725156017741189e-06, + "loss": 0.0764, + "step": 152310 + }, + { + "epoch": 32.02495802415642, + "grad_norm": 0.023409781977534294, + "learning_rate": 9.722146996696095e-06, + "loss": 0.0045, + "step": 152320 + }, + { + "epoch": 32.02500135405947, + "eval_accuracy": 0.8184193337687786, + "eval_loss": 1.0976042747497559, + "eval_runtime": 117.6562, + "eval_samples_per_second": 26.025, + "eval_steps_per_second": 3.255, + "step": 152328 + }, + { + "epoch": 33.00001083247576, + "grad_norm": 0.02288762293756008, + "learning_rate": 9.719137975651002e-06, + "loss": 0.0024, + "step": 152330 + }, + { + "epoch": 33.000064994854576, + "grad_norm": 0.0012561480980366468, + "learning_rate": 9.71612895460591e-06, + "loss": 0.1117, + "step": 152340 + }, + { + "epoch": 33.000119157233385, + "grad_norm": 0.9799814224243164, + "learning_rate": 9.713119933560816e-06, + "loss": 0.1547, + "step": 152350 + }, + { + "epoch": 33.000173319612195, + "grad_norm": 0.10737193375825882, + "learning_rate": 9.710110912515722e-06, + "loss": 0.0178, + "step": 152360 + }, + { + "epoch": 33.00022748199101, + "grad_norm": 0.9854555130004883, + "learning_rate": 9.70710189147063e-06, + "loss": 0.0218, + "step": 152370 + }, + { + "epoch": 33.00028164436982, + "grad_norm": 0.031602438539266586, + "learning_rate": 9.704092870425536e-06, + "loss": 0.0341, + "step": 152380 + }, + { + "epoch": 33.00033580674863, + "grad_norm": 0.9937012195587158, + "learning_rate": 9.701083849380443e-06, + "loss": 0.0218, + "step": 152390 + }, + { + "epoch": 33.00038996912745, + "grad_norm": 0.0015023205196484923, + "learning_rate": 9.69807482833535e-06, + "loss": 0.0002, + "step": 152400 + }, + { + "epoch": 33.000444131506256, + "grad_norm": 0.0010684923036023974, + "learning_rate": 9.695065807290257e-06, + "loss": 0.0015, + "step": 152410 + }, + { + "epoch": 33.000498293885066, + "grad_norm": 1.2042732238769531, + "learning_rate": 9.692056786245163e-06, + "loss": 
0.0152, + "step": 152420 + }, + { + "epoch": 33.00055245626388, + "grad_norm": 0.2414310872554779, + "learning_rate": 9.68904776520007e-06, + "loss": 0.0162, + "step": 152430 + }, + { + "epoch": 33.00060661864269, + "grad_norm": 0.22966133058071136, + "learning_rate": 9.686038744154977e-06, + "loss": 0.0492, + "step": 152440 + }, + { + "epoch": 33.0006607810215, + "grad_norm": 0.0020675957202911377, + "learning_rate": 9.683029723109884e-06, + "loss": 0.0429, + "step": 152450 + }, + { + "epoch": 33.00071494340031, + "grad_norm": 0.0013694562949240208, + "learning_rate": 9.68002070206479e-06, + "loss": 0.0036, + "step": 152460 + }, + { + "epoch": 33.00076910577913, + "grad_norm": 0.0015062728198245168, + "learning_rate": 9.677011681019698e-06, + "loss": 0.0052, + "step": 152470 + }, + { + "epoch": 33.00082326815794, + "grad_norm": 0.048504509031772614, + "learning_rate": 9.674002659974604e-06, + "loss": 0.2238, + "step": 152480 + }, + { + "epoch": 33.00087743053675, + "grad_norm": 1.990997552871704, + "learning_rate": 9.67099363892951e-06, + "loss": 0.0367, + "step": 152490 + }, + { + "epoch": 33.00093159291556, + "grad_norm": 0.003984028473496437, + "learning_rate": 9.667984617884418e-06, + "loss": 0.008, + "step": 152500 + }, + { + "epoch": 33.00098575529437, + "grad_norm": 0.0059555647894740105, + "learning_rate": 9.664975596839325e-06, + "loss": 0.0023, + "step": 152510 + }, + { + "epoch": 33.00103991767318, + "grad_norm": 0.0015343941049650311, + "learning_rate": 9.661966575794231e-06, + "loss": 0.0631, + "step": 152520 + }, + { + "epoch": 33.001094080052, + "grad_norm": 0.04486417770385742, + "learning_rate": 9.658957554749139e-06, + "loss": 0.0975, + "step": 152530 + }, + { + "epoch": 33.00114824243081, + "grad_norm": 0.1650315374135971, + "learning_rate": 9.655948533704045e-06, + "loss": 0.0088, + "step": 152540 + }, + { + "epoch": 33.00120240480962, + "grad_norm": 0.11499453336000443, + "learning_rate": 9.652939512658953e-06, + "loss": 0.0561, + "step": 152550 + }, + { + "epoch": 33.00125656718843, + "grad_norm": 0.0034955039154738188, + "learning_rate": 9.64993049161386e-06, + "loss": 0.0059, + "step": 152560 + }, + { + "epoch": 33.001310729567244, + "grad_norm": 0.0011320504127070308, + "learning_rate": 9.646921470568766e-06, + "loss": 0.0083, + "step": 152570 + }, + { + "epoch": 33.001364891946054, + "grad_norm": 0.0028230296447873116, + "learning_rate": 9.643912449523672e-06, + "loss": 0.0091, + "step": 152580 + }, + { + "epoch": 33.00141905432486, + "grad_norm": 0.001189225702546537, + "learning_rate": 9.640903428478578e-06, + "loss": 0.0064, + "step": 152590 + }, + { + "epoch": 33.00147321670368, + "grad_norm": 0.012375527992844582, + "learning_rate": 9.637894407433486e-06, + "loss": 0.0058, + "step": 152600 + }, + { + "epoch": 33.00152737908249, + "grad_norm": 0.002667429856956005, + "learning_rate": 9.634885386388393e-06, + "loss": 0.001, + "step": 152610 + }, + { + "epoch": 33.0015815414613, + "grad_norm": 0.0024095193948596716, + "learning_rate": 9.631876365343299e-06, + "loss": 0.0946, + "step": 152620 + }, + { + "epoch": 33.001635703840115, + "grad_norm": 0.0013510518474504352, + "learning_rate": 9.628867344298207e-06, + "loss": 0.0499, + "step": 152630 + }, + { + "epoch": 33.001689866218925, + "grad_norm": 0.0016529485583305359, + "learning_rate": 9.625858323253113e-06, + "loss": 0.0217, + "step": 152640 + }, + { + "epoch": 33.001744028597734, + "grad_norm": 0.003187149064615369, + "learning_rate": 9.62284930220802e-06, + "loss": 0.0086, + "step": 152650 + }, + { + 
"epoch": 33.00179819097655, + "grad_norm": 0.009756914339959621, + "learning_rate": 9.619840281162927e-06, + "loss": 0.0035, + "step": 152660 + }, + { + "epoch": 33.00185235335536, + "grad_norm": 0.0014317968161776662, + "learning_rate": 9.616831260117834e-06, + "loss": 0.0834, + "step": 152670 + }, + { + "epoch": 33.00190651573417, + "grad_norm": 0.0014178149867802858, + "learning_rate": 9.613822239072742e-06, + "loss": 0.0237, + "step": 152680 + }, + { + "epoch": 33.00196067811298, + "grad_norm": 0.001303797122091055, + "learning_rate": 9.610813218027648e-06, + "loss": 0.012, + "step": 152690 + }, + { + "epoch": 33.002014840491796, + "grad_norm": 0.001039669499732554, + "learning_rate": 9.607804196982554e-06, + "loss": 0.0056, + "step": 152700 + }, + { + "epoch": 33.002069002870606, + "grad_norm": 0.05621890351176262, + "learning_rate": 9.604795175937462e-06, + "loss": 0.0028, + "step": 152710 + }, + { + "epoch": 33.002123165249415, + "grad_norm": 0.001073374878615141, + "learning_rate": 9.601786154892367e-06, + "loss": 0.0043, + "step": 152720 + }, + { + "epoch": 33.00217732762823, + "grad_norm": 0.0009615437011234462, + "learning_rate": 9.598777133847275e-06, + "loss": 0.0139, + "step": 152730 + }, + { + "epoch": 33.00223149000704, + "grad_norm": 0.0010070537682622671, + "learning_rate": 9.595768112802181e-06, + "loss": 0.0003, + "step": 152740 + }, + { + "epoch": 33.00228565238585, + "grad_norm": 0.0012355707585811615, + "learning_rate": 9.592759091757087e-06, + "loss": 0.0142, + "step": 152750 + }, + { + "epoch": 33.00233981476467, + "grad_norm": 5.638260364532471, + "learning_rate": 9.589750070711995e-06, + "loss": 0.1002, + "step": 152760 + }, + { + "epoch": 33.00239397714348, + "grad_norm": 0.0010660510743036866, + "learning_rate": 9.586741049666901e-06, + "loss": 0.0013, + "step": 152770 + }, + { + "epoch": 33.002448139522286, + "grad_norm": 2.931410312652588, + "learning_rate": 9.583732028621808e-06, + "loss": 0.0701, + "step": 152780 + }, + { + "epoch": 33.0025023019011, + "grad_norm": 0.0010047713294625282, + "learning_rate": 9.580723007576716e-06, + "loss": 0.0072, + "step": 152790 + }, + { + "epoch": 33.00255646427991, + "grad_norm": 0.029197342693805695, + "learning_rate": 9.577713986531622e-06, + "loss": 0.0283, + "step": 152800 + }, + { + "epoch": 33.00261062665872, + "grad_norm": 0.0011793579906225204, + "learning_rate": 9.57470496548653e-06, + "loss": 0.0236, + "step": 152810 + }, + { + "epoch": 33.00266478903753, + "grad_norm": 0.029252924025058746, + "learning_rate": 9.571695944441436e-06, + "loss": 0.0064, + "step": 152820 + }, + { + "epoch": 33.00271895141635, + "grad_norm": 0.001231931964866817, + "learning_rate": 9.568686923396342e-06, + "loss": 0.0007, + "step": 152830 + }, + { + "epoch": 33.00277311379516, + "grad_norm": 0.0010258577531203628, + "learning_rate": 9.56567790235125e-06, + "loss": 0.0039, + "step": 152840 + }, + { + "epoch": 33.00282727617397, + "grad_norm": 0.0009693374740891159, + "learning_rate": 9.562668881306157e-06, + "loss": 0.0004, + "step": 152850 + }, + { + "epoch": 33.002881438552784, + "grad_norm": 0.0014381436631083488, + "learning_rate": 9.559659860261063e-06, + "loss": 0.0412, + "step": 152860 + }, + { + "epoch": 33.00293560093159, + "grad_norm": 0.001474415068514645, + "learning_rate": 9.55665083921597e-06, + "loss": 0.045, + "step": 152870 + }, + { + "epoch": 33.0029897633104, + "grad_norm": 2.4726669788360596, + "learning_rate": 9.553641818170876e-06, + "loss": 0.0706, + "step": 152880 + }, + { + "epoch": 33.00304392568922, + 
"grad_norm": 0.030241239815950394, + "learning_rate": 9.550632797125783e-06, + "loss": 0.1121, + "step": 152890 + }, + { + "epoch": 33.00309808806803, + "grad_norm": 1.7764478921890259, + "learning_rate": 9.54762377608069e-06, + "loss": 0.0394, + "step": 152900 + }, + { + "epoch": 33.00315225044684, + "grad_norm": 0.8964686393737793, + "learning_rate": 9.544614755035596e-06, + "loss": 0.0494, + "step": 152910 + }, + { + "epoch": 33.00320641282565, + "grad_norm": 0.0010239975526928902, + "learning_rate": 9.541605733990504e-06, + "loss": 0.0345, + "step": 152920 + }, + { + "epoch": 33.003260575204465, + "grad_norm": 0.18940173089504242, + "learning_rate": 9.53859671294541e-06, + "loss": 0.0392, + "step": 152930 + }, + { + "epoch": 33.003314737583274, + "grad_norm": 0.0009452024241909385, + "learning_rate": 9.535587691900318e-06, + "loss": 0.0335, + "step": 152940 + }, + { + "epoch": 33.003368899962084, + "grad_norm": 0.0010514057939872146, + "learning_rate": 9.532578670855225e-06, + "loss": 0.0353, + "step": 152950 + }, + { + "epoch": 33.0034230623409, + "grad_norm": 0.0017750210827216506, + "learning_rate": 9.52956964981013e-06, + "loss": 0.0476, + "step": 152960 + }, + { + "epoch": 33.00347722471971, + "grad_norm": 0.0016761088045313954, + "learning_rate": 9.526560628765039e-06, + "loss": 0.0008, + "step": 152970 + }, + { + "epoch": 33.00353138709852, + "grad_norm": 0.001156403566710651, + "learning_rate": 9.523551607719945e-06, + "loss": 0.128, + "step": 152980 + }, + { + "epoch": 33.003585549477336, + "grad_norm": 0.0012471032096073031, + "learning_rate": 9.520542586674851e-06, + "loss": 0.0534, + "step": 152990 + }, + { + "epoch": 33.003639711856145, + "grad_norm": 0.0013656908413395286, + "learning_rate": 9.51753356562976e-06, + "loss": 0.0193, + "step": 153000 + }, + { + "epoch": 33.003693874234955, + "grad_norm": 0.015253155492246151, + "learning_rate": 9.514524544584666e-06, + "loss": 0.1611, + "step": 153010 + }, + { + "epoch": 33.00374803661377, + "grad_norm": 0.0012461142614483833, + "learning_rate": 9.511515523539572e-06, + "loss": 0.1306, + "step": 153020 + }, + { + "epoch": 33.00380219899258, + "grad_norm": 4.606529712677002, + "learning_rate": 9.508506502494478e-06, + "loss": 0.0234, + "step": 153030 + }, + { + "epoch": 33.00385636137139, + "grad_norm": 3.480440378189087, + "learning_rate": 9.505497481449384e-06, + "loss": 0.0683, + "step": 153040 + }, + { + "epoch": 33.0039105237502, + "grad_norm": 0.4570764899253845, + "learning_rate": 9.502488460404292e-06, + "loss": 0.0308, + "step": 153050 + }, + { + "epoch": 33.00396468612902, + "grad_norm": 0.0010212667984887958, + "learning_rate": 9.499479439359199e-06, + "loss": 0.0224, + "step": 153060 + }, + { + "epoch": 33.004018848507826, + "grad_norm": 0.0014774729497730732, + "learning_rate": 9.496470418314107e-06, + "loss": 0.0005, + "step": 153070 + }, + { + "epoch": 33.004073010886636, + "grad_norm": 0.03147656470537186, + "learning_rate": 9.493461397269013e-06, + "loss": 0.0074, + "step": 153080 + }, + { + "epoch": 33.00412717326545, + "grad_norm": 0.0018356689251959324, + "learning_rate": 9.490452376223919e-06, + "loss": 0.1242, + "step": 153090 + }, + { + "epoch": 33.00418133564426, + "grad_norm": 0.0010734011884778738, + "learning_rate": 9.487443355178827e-06, + "loss": 0.0238, + "step": 153100 + }, + { + "epoch": 33.00423549802307, + "grad_norm": 0.43916061520576477, + "learning_rate": 9.484434334133733e-06, + "loss": 0.0642, + "step": 153110 + }, + { + "epoch": 33.00428966040189, + "grad_norm": 0.0045577287673950195, + 
"learning_rate": 9.48142531308864e-06, + "loss": 0.0683, + "step": 153120 + }, + { + "epoch": 33.0043438227807, + "grad_norm": 0.0018753311596810818, + "learning_rate": 9.478416292043548e-06, + "loss": 0.0341, + "step": 153130 + }, + { + "epoch": 33.00439798515951, + "grad_norm": 2.2457191944122314, + "learning_rate": 9.475407270998454e-06, + "loss": 0.054, + "step": 153140 + }, + { + "epoch": 33.004452147538316, + "grad_norm": 1.9478281736373901, + "learning_rate": 9.472398249953362e-06, + "loss": 0.0376, + "step": 153150 + }, + { + "epoch": 33.00450630991713, + "grad_norm": 0.0073913270607590675, + "learning_rate": 9.469389228908268e-06, + "loss": 0.008, + "step": 153160 + }, + { + "epoch": 33.00456047229594, + "grad_norm": 1.5390849113464355, + "learning_rate": 9.466380207863174e-06, + "loss": 0.0212, + "step": 153170 + }, + { + "epoch": 33.00461463467475, + "grad_norm": 0.0018076248234137893, + "learning_rate": 9.46337118681808e-06, + "loss": 0.0289, + "step": 153180 + }, + { + "epoch": 33.00466879705357, + "grad_norm": 0.001283663441427052, + "learning_rate": 9.460362165772987e-06, + "loss": 0.0785, + "step": 153190 + }, + { + "epoch": 33.00472295943238, + "grad_norm": 0.0013101720251142979, + "learning_rate": 9.457353144727895e-06, + "loss": 0.0483, + "step": 153200 + }, + { + "epoch": 33.00477712181119, + "grad_norm": 0.1891421526670456, + "learning_rate": 9.454344123682801e-06, + "loss": 0.0465, + "step": 153210 + }, + { + "epoch": 33.004831284190004, + "grad_norm": 0.9927645921707153, + "learning_rate": 9.451335102637707e-06, + "loss": 0.0083, + "step": 153220 + }, + { + "epoch": 33.004885446568814, + "grad_norm": 0.0015894955722615123, + "learning_rate": 9.448326081592615e-06, + "loss": 0.0023, + "step": 153230 + }, + { + "epoch": 33.00493960894762, + "grad_norm": 0.37691929936408997, + "learning_rate": 9.445317060547522e-06, + "loss": 0.0434, + "step": 153240 + }, + { + "epoch": 33.00499377132644, + "grad_norm": 0.0015628928085789084, + "learning_rate": 9.442308039502428e-06, + "loss": 0.0091, + "step": 153250 + }, + { + "epoch": 33.00504793370525, + "grad_norm": 0.47543981671333313, + "learning_rate": 9.439299018457336e-06, + "loss": 0.0284, + "step": 153260 + }, + { + "epoch": 33.00510209608406, + "grad_norm": 0.002981411060318351, + "learning_rate": 9.436289997412242e-06, + "loss": 0.0743, + "step": 153270 + }, + { + "epoch": 33.00515625846287, + "grad_norm": 0.029049351811408997, + "learning_rate": 9.43328097636715e-06, + "loss": 0.0098, + "step": 153280 + }, + { + "epoch": 33.005210420841685, + "grad_norm": 0.007788658607751131, + "learning_rate": 9.430271955322056e-06, + "loss": 0.0058, + "step": 153290 + }, + { + "epoch": 33.005264583220495, + "grad_norm": 0.001048170612193644, + "learning_rate": 9.427262934276963e-06, + "loss": 0.0362, + "step": 153300 + }, + { + "epoch": 33.005318745599304, + "grad_norm": 0.6017438173294067, + "learning_rate": 9.42425391323187e-06, + "loss": 0.0217, + "step": 153310 + }, + { + "epoch": 33.00537290797812, + "grad_norm": 11.167346000671387, + "learning_rate": 9.421244892186775e-06, + "loss": 0.0663, + "step": 153320 + }, + { + "epoch": 33.00542707035693, + "grad_norm": 0.010583030059933662, + "learning_rate": 9.418235871141683e-06, + "loss": 0.0121, + "step": 153330 + }, + { + "epoch": 33.00548123273574, + "grad_norm": 0.22477686405181885, + "learning_rate": 9.41522685009659e-06, + "loss": 0.1321, + "step": 153340 + }, + { + "epoch": 33.005535395114556, + "grad_norm": 0.001366630312986672, + "learning_rate": 9.412217829051496e-06, + 
"loss": 0.0611, + "step": 153350 + }, + { + "epoch": 33.005589557493366, + "grad_norm": 0.590827465057373, + "learning_rate": 9.409208808006404e-06, + "loss": 0.0153, + "step": 153360 + }, + { + "epoch": 33.005643719872175, + "grad_norm": 0.0012150985421612859, + "learning_rate": 9.40619978696131e-06, + "loss": 0.01, + "step": 153370 + }, + { + "epoch": 33.00569788225099, + "grad_norm": 0.0031868473161011934, + "learning_rate": 9.403190765916216e-06, + "loss": 0.0229, + "step": 153380 + }, + { + "epoch": 33.0057520446298, + "grad_norm": 0.0016674950020387769, + "learning_rate": 9.400181744871124e-06, + "loss": 0.0077, + "step": 153390 + }, + { + "epoch": 33.00580620700861, + "grad_norm": 0.0031344599556177855, + "learning_rate": 9.39717272382603e-06, + "loss": 0.0148, + "step": 153400 + }, + { + "epoch": 33.00586036938742, + "grad_norm": 0.9854575991630554, + "learning_rate": 9.394163702780939e-06, + "loss": 0.0189, + "step": 153410 + }, + { + "epoch": 33.00591453176624, + "grad_norm": 2.2110347747802734, + "learning_rate": 9.391154681735845e-06, + "loss": 0.0281, + "step": 153420 + }, + { + "epoch": 33.00596869414505, + "grad_norm": 2.846496820449829, + "learning_rate": 9.388145660690751e-06, + "loss": 0.04, + "step": 153430 + }, + { + "epoch": 33.006022856523856, + "grad_norm": 0.0712607130408287, + "learning_rate": 9.385136639645659e-06, + "loss": 0.0302, + "step": 153440 + }, + { + "epoch": 33.00607701890267, + "grad_norm": 0.0035567334853112698, + "learning_rate": 9.382127618600565e-06, + "loss": 0.0151, + "step": 153450 + }, + { + "epoch": 33.00613118128148, + "grad_norm": 0.0014622382586821914, + "learning_rate": 9.379118597555472e-06, + "loss": 0.049, + "step": 153460 + }, + { + "epoch": 33.00618534366029, + "grad_norm": 0.004164962098002434, + "learning_rate": 9.376109576510378e-06, + "loss": 0.0109, + "step": 153470 + }, + { + "epoch": 33.00623950603911, + "grad_norm": 0.002644872758537531, + "learning_rate": 9.373100555465284e-06, + "loss": 0.0162, + "step": 153480 + }, + { + "epoch": 33.00629366841792, + "grad_norm": 0.0015606696251779795, + "learning_rate": 9.370091534420192e-06, + "loss": 0.0528, + "step": 153490 + }, + { + "epoch": 33.00634783079673, + "grad_norm": 0.0012282449752092361, + "learning_rate": 9.367082513375098e-06, + "loss": 0.065, + "step": 153500 + }, + { + "epoch": 33.00640199317554, + "grad_norm": 0.0016004728386178613, + "learning_rate": 9.364073492330005e-06, + "loss": 0.1064, + "step": 153510 + }, + { + "epoch": 33.00645615555435, + "grad_norm": 0.13898275792598724, + "learning_rate": 9.361064471284913e-06, + "loss": 0.0379, + "step": 153520 + }, + { + "epoch": 33.00651031793316, + "grad_norm": 0.13982166349887848, + "learning_rate": 9.358055450239819e-06, + "loss": 0.0081, + "step": 153530 + }, + { + "epoch": 33.00656448031197, + "grad_norm": 0.001448080874979496, + "learning_rate": 9.355046429194727e-06, + "loss": 0.0132, + "step": 153540 + }, + { + "epoch": 33.00661864269079, + "grad_norm": 0.005135736893862486, + "learning_rate": 9.352037408149633e-06, + "loss": 0.0162, + "step": 153550 + }, + { + "epoch": 33.0066728050696, + "grad_norm": 1.4799612760543823, + "learning_rate": 9.34902838710454e-06, + "loss": 0.0206, + "step": 153560 + }, + { + "epoch": 33.00672696744841, + "grad_norm": 0.37656837701797485, + "learning_rate": 9.346019366059447e-06, + "loss": 0.0384, + "step": 153570 + }, + { + "epoch": 33.006781129827225, + "grad_norm": 0.0041399127803742886, + "learning_rate": 9.343010345014354e-06, + "loss": 0.0178, + "step": 153580 + }, + { + 
"epoch": 33.006835292206034, + "grad_norm": 0.015306166373193264, + "learning_rate": 9.34000132396926e-06, + "loss": 0.0108, + "step": 153590 + }, + { + "epoch": 33.006889454584844, + "grad_norm": 0.08826585114002228, + "learning_rate": 9.336992302924168e-06, + "loss": 0.0291, + "step": 153600 + }, + { + "epoch": 33.00694361696366, + "grad_norm": 0.0010681950952857733, + "learning_rate": 9.333983281879074e-06, + "loss": 0.1401, + "step": 153610 + }, + { + "epoch": 33.00699777934247, + "grad_norm": 0.006485847756266594, + "learning_rate": 9.33097426083398e-06, + "loss": 0.0113, + "step": 153620 + }, + { + "epoch": 33.00705194172128, + "grad_norm": 0.0010719551937654614, + "learning_rate": 9.327965239788887e-06, + "loss": 0.0145, + "step": 153630 + }, + { + "epoch": 33.00710610410009, + "grad_norm": 0.7690951228141785, + "learning_rate": 9.324956218743793e-06, + "loss": 0.0087, + "step": 153640 + }, + { + "epoch": 33.007160266478905, + "grad_norm": 0.0022815752308815718, + "learning_rate": 9.321947197698701e-06, + "loss": 0.0628, + "step": 153650 + }, + { + "epoch": 33.007214428857715, + "grad_norm": 0.01850752718746662, + "learning_rate": 9.318938176653607e-06, + "loss": 0.0129, + "step": 153660 + }, + { + "epoch": 33.007268591236524, + "grad_norm": 0.001787869376130402, + "learning_rate": 9.315929155608515e-06, + "loss": 0.0714, + "step": 153670 + }, + { + "epoch": 33.00732275361534, + "grad_norm": 0.0013032716233283281, + "learning_rate": 9.312920134563422e-06, + "loss": 0.0917, + "step": 153680 + }, + { + "epoch": 33.00737691599415, + "grad_norm": 0.0017795913154259324, + "learning_rate": 9.309911113518328e-06, + "loss": 0.045, + "step": 153690 + }, + { + "epoch": 33.00743107837296, + "grad_norm": 0.8616999387741089, + "learning_rate": 9.306902092473236e-06, + "loss": 0.0632, + "step": 153700 + }, + { + "epoch": 33.00748524075178, + "grad_norm": 2.091846466064453, + "learning_rate": 9.303893071428142e-06, + "loss": 0.0615, + "step": 153710 + }, + { + "epoch": 33.007539403130586, + "grad_norm": 0.009213155135512352, + "learning_rate": 9.300884050383048e-06, + "loss": 0.0303, + "step": 153720 + }, + { + "epoch": 33.007593565509396, + "grad_norm": 0.7544044256210327, + "learning_rate": 9.297875029337956e-06, + "loss": 0.0676, + "step": 153730 + }, + { + "epoch": 33.00764772788821, + "grad_norm": 0.6988968849182129, + "learning_rate": 9.294866008292863e-06, + "loss": 0.0154, + "step": 153740 + }, + { + "epoch": 33.00770189026702, + "grad_norm": 5.620532035827637, + "learning_rate": 9.29185698724777e-06, + "loss": 0.1122, + "step": 153750 + }, + { + "epoch": 33.00775605264583, + "grad_norm": 0.04597204551100731, + "learning_rate": 9.288847966202677e-06, + "loss": 0.0079, + "step": 153760 + }, + { + "epoch": 33.00781021502464, + "grad_norm": 4.919663906097412, + "learning_rate": 9.285838945157581e-06, + "loss": 0.0532, + "step": 153770 + }, + { + "epoch": 33.00786437740346, + "grad_norm": 0.2766071557998657, + "learning_rate": 9.28282992411249e-06, + "loss": 0.0266, + "step": 153780 + }, + { + "epoch": 33.00791853978227, + "grad_norm": 0.0014403032837435603, + "learning_rate": 9.279820903067396e-06, + "loss": 0.1142, + "step": 153790 + }, + { + "epoch": 33.00797270216108, + "grad_norm": 0.022442733868956566, + "learning_rate": 9.276811882022304e-06, + "loss": 0.039, + "step": 153800 + }, + { + "epoch": 33.00802686453989, + "grad_norm": 0.01681920886039734, + "learning_rate": 9.27380286097721e-06, + "loss": 0.0225, + "step": 153810 + }, + { + "epoch": 33.0080810269187, + "grad_norm": 
0.001875699614174664, + "learning_rate": 9.270793839932116e-06, + "loss": 0.0489, + "step": 153820 + }, + { + "epoch": 33.00813518929751, + "grad_norm": 0.07023913413286209, + "learning_rate": 9.267784818887024e-06, + "loss": 0.0052, + "step": 153830 + }, + { + "epoch": 33.00818935167633, + "grad_norm": 0.002300682244822383, + "learning_rate": 9.26477579784193e-06, + "loss": 0.051, + "step": 153840 + }, + { + "epoch": 33.00824351405514, + "grad_norm": 0.007420539855957031, + "learning_rate": 9.261766776796837e-06, + "loss": 0.0425, + "step": 153850 + }, + { + "epoch": 33.00829767643395, + "grad_norm": 0.5626431703567505, + "learning_rate": 9.258757755751745e-06, + "loss": 0.021, + "step": 153860 + }, + { + "epoch": 33.00835183881276, + "grad_norm": 0.0012516237329691648, + "learning_rate": 9.255748734706651e-06, + "loss": 0.0246, + "step": 153870 + }, + { + "epoch": 33.008406001191574, + "grad_norm": 0.0017965188017114997, + "learning_rate": 9.252739713661559e-06, + "loss": 0.0483, + "step": 153880 + }, + { + "epoch": 33.00846016357038, + "grad_norm": 2.024224281311035, + "learning_rate": 9.249730692616465e-06, + "loss": 0.0165, + "step": 153890 + }, + { + "epoch": 33.00851432594919, + "grad_norm": 0.0019227161537855864, + "learning_rate": 9.246721671571371e-06, + "loss": 0.0568, + "step": 153900 + }, + { + "epoch": 33.00856848832801, + "grad_norm": 0.0016160531667992473, + "learning_rate": 9.24371265052628e-06, + "loss": 0.0262, + "step": 153910 + }, + { + "epoch": 33.00862265070682, + "grad_norm": 0.04152408987283707, + "learning_rate": 9.240703629481184e-06, + "loss": 0.0151, + "step": 153920 + }, + { + "epoch": 33.00867681308563, + "grad_norm": 0.002231733873486519, + "learning_rate": 9.237694608436092e-06, + "loss": 0.0459, + "step": 153930 + }, + { + "epoch": 33.008730975464445, + "grad_norm": 0.355380654335022, + "learning_rate": 9.234685587390998e-06, + "loss": 0.0559, + "step": 153940 + }, + { + "epoch": 33.008785137843255, + "grad_norm": 0.0330415703356266, + "learning_rate": 9.231676566345905e-06, + "loss": 0.0626, + "step": 153950 + }, + { + "epoch": 33.008839300222064, + "grad_norm": 0.8176736235618591, + "learning_rate": 9.228667545300812e-06, + "loss": 0.0484, + "step": 153960 + }, + { + "epoch": 33.00889346260088, + "grad_norm": 0.0012203368823975325, + "learning_rate": 9.225658524255719e-06, + "loss": 0.0657, + "step": 153970 + }, + { + "epoch": 33.00894762497969, + "grad_norm": 0.014136960729956627, + "learning_rate": 9.222649503210625e-06, + "loss": 0.0251, + "step": 153980 + }, + { + "epoch": 33.0090017873585, + "grad_norm": 0.0469941571354866, + "learning_rate": 9.219640482165533e-06, + "loss": 0.0225, + "step": 153990 + }, + { + "epoch": 33.00905594973731, + "grad_norm": 0.0012559943133965135, + "learning_rate": 9.21663146112044e-06, + "loss": 0.0243, + "step": 154000 + }, + { + "epoch": 33.009110112116126, + "grad_norm": 0.8553053736686707, + "learning_rate": 9.213622440075347e-06, + "loss": 0.0175, + "step": 154010 + }, + { + "epoch": 33.009164274494935, + "grad_norm": 0.0011626516934484243, + "learning_rate": 9.210613419030254e-06, + "loss": 0.0124, + "step": 154020 + }, + { + "epoch": 33.009218436873745, + "grad_norm": 2.111506462097168, + "learning_rate": 9.20760439798516e-06, + "loss": 0.0567, + "step": 154030 + }, + { + "epoch": 33.00927259925256, + "grad_norm": 0.6232417821884155, + "learning_rate": 9.204595376940068e-06, + "loss": 0.0766, + "step": 154040 + }, + { + "epoch": 33.00932676163137, + "grad_norm": 0.0012881114380434155, + "learning_rate": 
9.201586355894974e-06, + "loss": 0.0001, + "step": 154050 + }, + { + "epoch": 33.00938092401018, + "grad_norm": 1.4219129085540771, + "learning_rate": 9.19857733484988e-06, + "loss": 0.076, + "step": 154060 + }, + { + "epoch": 33.009435086389, + "grad_norm": 0.002453767228871584, + "learning_rate": 9.195568313804787e-06, + "loss": 0.0297, + "step": 154070 + }, + { + "epoch": 33.00948924876781, + "grad_norm": 1.0113162994384766, + "learning_rate": 9.192559292759693e-06, + "loss": 0.0408, + "step": 154080 + }, + { + "epoch": 33.009543411146616, + "grad_norm": 0.0011323030339553952, + "learning_rate": 9.1895502717146e-06, + "loss": 0.0291, + "step": 154090 + }, + { + "epoch": 33.00959757352543, + "grad_norm": 0.001525149797089398, + "learning_rate": 9.186541250669507e-06, + "loss": 0.005, + "step": 154100 + }, + { + "epoch": 33.00965173590424, + "grad_norm": 1.5278034210205078, + "learning_rate": 9.183532229624413e-06, + "loss": 0.0123, + "step": 154110 + }, + { + "epoch": 33.00970589828305, + "grad_norm": 0.0011446181451901793, + "learning_rate": 9.180523208579321e-06, + "loss": 0.0346, + "step": 154120 + }, + { + "epoch": 33.00976006066186, + "grad_norm": 1.0672050714492798, + "learning_rate": 9.177514187534228e-06, + "loss": 0.023, + "step": 154130 + }, + { + "epoch": 33.00981422304068, + "grad_norm": 0.0018770602764561772, + "learning_rate": 9.174505166489136e-06, + "loss": 0.0062, + "step": 154140 + }, + { + "epoch": 33.00986838541949, + "grad_norm": 0.055001161992549896, + "learning_rate": 9.171496145444042e-06, + "loss": 0.0428, + "step": 154150 + }, + { + "epoch": 33.0099225477983, + "grad_norm": 0.52874755859375, + "learning_rate": 9.168487124398948e-06, + "loss": 0.0279, + "step": 154160 + }, + { + "epoch": 33.00997671017711, + "grad_norm": 0.001087316544726491, + "learning_rate": 9.165478103353856e-06, + "loss": 0.0417, + "step": 154170 + }, + { + "epoch": 33.01003087255592, + "grad_norm": 0.3824273943901062, + "learning_rate": 9.162469082308762e-06, + "loss": 0.0706, + "step": 154180 + }, + { + "epoch": 33.01008503493473, + "grad_norm": 0.003398188855499029, + "learning_rate": 9.159460061263669e-06, + "loss": 0.0405, + "step": 154190 + }, + { + "epoch": 33.01013919731355, + "grad_norm": 0.03955509886145592, + "learning_rate": 9.156451040218577e-06, + "loss": 0.0176, + "step": 154200 + }, + { + "epoch": 33.01019335969236, + "grad_norm": 3.1365556716918945, + "learning_rate": 9.153442019173483e-06, + "loss": 0.0403, + "step": 154210 + }, + { + "epoch": 33.01024752207117, + "grad_norm": 0.001467923866584897, + "learning_rate": 9.15043299812839e-06, + "loss": 0.0524, + "step": 154220 + }, + { + "epoch": 33.01030168444998, + "grad_norm": 0.004118509590625763, + "learning_rate": 9.147423977083295e-06, + "loss": 0.0236, + "step": 154230 + }, + { + "epoch": 33.010355846828794, + "grad_norm": 0.058822911232709885, + "learning_rate": 9.144414956038202e-06, + "loss": 0.0993, + "step": 154240 + }, + { + "epoch": 33.010410009207604, + "grad_norm": 0.0010715287644416094, + "learning_rate": 9.14140593499311e-06, + "loss": 0.0368, + "step": 154250 + }, + { + "epoch": 33.01046417158641, + "grad_norm": 0.04905927926301956, + "learning_rate": 9.138396913948016e-06, + "loss": 0.0484, + "step": 154260 + }, + { + "epoch": 33.01051833396523, + "grad_norm": 0.23340429365634918, + "learning_rate": 9.135387892902924e-06, + "loss": 0.0036, + "step": 154270 + }, + { + "epoch": 33.01057249634404, + "grad_norm": 1.0742366313934326, + "learning_rate": 9.13237887185783e-06, + "loss": 0.0323, + "step": 154280 + 
}, + { + "epoch": 33.01062665872285, + "grad_norm": 0.038558948785066605, + "learning_rate": 9.129369850812736e-06, + "loss": 0.0721, + "step": 154290 + }, + { + "epoch": 33.010680821101666, + "grad_norm": 8.276334762573242, + "learning_rate": 9.126360829767644e-06, + "loss": 0.0515, + "step": 154300 + }, + { + "epoch": 33.010734983480475, + "grad_norm": 0.0011547558242455125, + "learning_rate": 9.12335180872255e-06, + "loss": 0.0399, + "step": 154310 + }, + { + "epoch": 33.010789145859285, + "grad_norm": 0.011979245580732822, + "learning_rate": 9.120342787677457e-06, + "loss": 0.0098, + "step": 154320 + }, + { + "epoch": 33.0108433082381, + "grad_norm": 0.0015821554698050022, + "learning_rate": 9.117333766632365e-06, + "loss": 0.1057, + "step": 154330 + }, + { + "epoch": 33.01089747061691, + "grad_norm": 25.98789405822754, + "learning_rate": 9.114324745587271e-06, + "loss": 0.0383, + "step": 154340 + }, + { + "epoch": 33.01095163299572, + "grad_norm": 0.0014052002225071192, + "learning_rate": 9.11131572454218e-06, + "loss": 0.03, + "step": 154350 + }, + { + "epoch": 33.01100579537453, + "grad_norm": 28.98552131652832, + "learning_rate": 9.108306703497085e-06, + "loss": 0.041, + "step": 154360 + }, + { + "epoch": 33.011059957753346, + "grad_norm": 0.0013426237273961306, + "learning_rate": 9.10529768245199e-06, + "loss": 0.0015, + "step": 154370 + }, + { + "epoch": 33.011114120132156, + "grad_norm": 0.0010719425044953823, + "learning_rate": 9.102288661406898e-06, + "loss": 0.0348, + "step": 154380 + }, + { + "epoch": 33.011168282510965, + "grad_norm": 0.0023776497691869736, + "learning_rate": 9.099279640361804e-06, + "loss": 0.0207, + "step": 154390 + }, + { + "epoch": 33.01122244488978, + "grad_norm": 1.0877928733825684, + "learning_rate": 9.096270619316712e-06, + "loss": 0.1168, + "step": 154400 + }, + { + "epoch": 33.01127660726859, + "grad_norm": 0.0010437463643029332, + "learning_rate": 9.093261598271619e-06, + "loss": 0.0189, + "step": 154410 + }, + { + "epoch": 33.0113307696474, + "grad_norm": 0.6885427832603455, + "learning_rate": 9.090252577226525e-06, + "loss": 0.0483, + "step": 154420 + }, + { + "epoch": 33.01138493202622, + "grad_norm": 0.22037380933761597, + "learning_rate": 9.087243556181433e-06, + "loss": 0.0029, + "step": 154430 + }, + { + "epoch": 33.01143909440503, + "grad_norm": 0.0011029073502868414, + "learning_rate": 9.084234535136339e-06, + "loss": 0.0657, + "step": 154440 + }, + { + "epoch": 33.01149325678384, + "grad_norm": 0.009812003001570702, + "learning_rate": 9.081225514091245e-06, + "loss": 0.0049, + "step": 154450 + }, + { + "epoch": 33.011547419162646, + "grad_norm": 0.004924652632325888, + "learning_rate": 9.078216493046153e-06, + "loss": 0.0036, + "step": 154460 + }, + { + "epoch": 33.01160158154146, + "grad_norm": 0.0011417163768783212, + "learning_rate": 9.07520747200106e-06, + "loss": 0.0486, + "step": 154470 + }, + { + "epoch": 33.01165574392027, + "grad_norm": 0.0019159754738211632, + "learning_rate": 9.072198450955968e-06, + "loss": 0.0167, + "step": 154480 + }, + { + "epoch": 33.01170990629908, + "grad_norm": 0.01851576194167137, + "learning_rate": 9.069189429910874e-06, + "loss": 0.043, + "step": 154490 + }, + { + "epoch": 33.0117640686779, + "grad_norm": 0.0076746102422475815, + "learning_rate": 9.06618040886578e-06, + "loss": 0.0533, + "step": 154500 + }, + { + "epoch": 33.01181823105671, + "grad_norm": 0.0010176439536735415, + "learning_rate": 9.063171387820688e-06, + "loss": 0.0335, + "step": 154510 + }, + { + "epoch": 33.01187239343552, + 
"grad_norm": 2.938124656677246, + "learning_rate": 9.060162366775593e-06, + "loss": 0.0168, + "step": 154520 + }, + { + "epoch": 33.011926555814334, + "grad_norm": 0.5372026562690735, + "learning_rate": 9.0571533457305e-06, + "loss": 0.0308, + "step": 154530 + }, + { + "epoch": 33.01198071819314, + "grad_norm": 0.0009826167952269316, + "learning_rate": 9.054144324685407e-06, + "loss": 0.0514, + "step": 154540 + }, + { + "epoch": 33.01203488057195, + "grad_norm": 3.7235686779022217, + "learning_rate": 9.051135303640313e-06, + "loss": 0.0294, + "step": 154550 + }, + { + "epoch": 33.01208904295077, + "grad_norm": 0.0010799471056088805, + "learning_rate": 9.048126282595221e-06, + "loss": 0.0637, + "step": 154560 + }, + { + "epoch": 33.01214320532958, + "grad_norm": 0.0014923008857294917, + "learning_rate": 9.045117261550127e-06, + "loss": 0.0061, + "step": 154570 + }, + { + "epoch": 33.01219736770839, + "grad_norm": 3.538323402404785, + "learning_rate": 9.042108240505034e-06, + "loss": 0.0626, + "step": 154580 + }, + { + "epoch": 33.0122515300872, + "grad_norm": 0.0016834544949233532, + "learning_rate": 9.039099219459942e-06, + "loss": 0.016, + "step": 154590 + }, + { + "epoch": 33.012305692466015, + "grad_norm": 0.0012653061421588063, + "learning_rate": 9.036090198414848e-06, + "loss": 0.0043, + "step": 154600 + }, + { + "epoch": 33.012359854844824, + "grad_norm": 0.6858632564544678, + "learning_rate": 9.033081177369756e-06, + "loss": 0.0072, + "step": 154610 + }, + { + "epoch": 33.012414017223634, + "grad_norm": 0.001033080043271184, + "learning_rate": 9.030072156324662e-06, + "loss": 0.0356, + "step": 154620 + }, + { + "epoch": 33.01246817960245, + "grad_norm": 0.0012773623457178473, + "learning_rate": 9.027063135279568e-06, + "loss": 0.0126, + "step": 154630 + }, + { + "epoch": 33.01252234198126, + "grad_norm": 1.0749655961990356, + "learning_rate": 9.024054114234476e-06, + "loss": 0.0403, + "step": 154640 + }, + { + "epoch": 33.01257650436007, + "grad_norm": 0.004449633415788412, + "learning_rate": 9.021045093189383e-06, + "loss": 0.0799, + "step": 154650 + }, + { + "epoch": 33.012630666738886, + "grad_norm": 0.0015734975459054112, + "learning_rate": 9.018036072144289e-06, + "loss": 0.0089, + "step": 154660 + }, + { + "epoch": 33.012684829117696, + "grad_norm": 0.0011519698891788721, + "learning_rate": 9.015027051099195e-06, + "loss": 0.0097, + "step": 154670 + }, + { + "epoch": 33.012738991496505, + "grad_norm": 6.59876823425293, + "learning_rate": 9.012018030054102e-06, + "loss": 0.0596, + "step": 154680 + }, + { + "epoch": 33.01279315387532, + "grad_norm": 0.0011453236220404506, + "learning_rate": 9.00900900900901e-06, + "loss": 0.0449, + "step": 154690 + }, + { + "epoch": 33.01284731625413, + "grad_norm": 1.5374236106872559, + "learning_rate": 9.005999987963916e-06, + "loss": 0.0647, + "step": 154700 + }, + { + "epoch": 33.01290147863294, + "grad_norm": 0.06983186304569244, + "learning_rate": 9.002990966918822e-06, + "loss": 0.0872, + "step": 154710 + }, + { + "epoch": 33.01295564101175, + "grad_norm": 2.6041338443756104, + "learning_rate": 8.99998194587373e-06, + "loss": 0.0741, + "step": 154720 + }, + { + "epoch": 33.01300980339057, + "grad_norm": 0.0011240686289966106, + "learning_rate": 8.996972924828636e-06, + "loss": 0.0121, + "step": 154730 + }, + { + "epoch": 33.013063965769376, + "grad_norm": 0.002933866111561656, + "learning_rate": 8.993963903783544e-06, + "loss": 0.0001, + "step": 154740 + }, + { + "epoch": 33.013118128148186, + "grad_norm": 0.0022034223657101393, + 
"learning_rate": 8.99095488273845e-06, + "loss": 0.0132, + "step": 154750 + }, + { + "epoch": 33.013172290527, + "grad_norm": 1.5897353887557983, + "learning_rate": 8.987945861693357e-06, + "loss": 0.0265, + "step": 154760 + }, + { + "epoch": 33.01322645290581, + "grad_norm": 0.0010697317775338888, + "learning_rate": 8.984936840648265e-06, + "loss": 0.0309, + "step": 154770 + }, + { + "epoch": 33.01328061528462, + "grad_norm": 0.0011697837617248297, + "learning_rate": 8.981927819603171e-06, + "loss": 0.0029, + "step": 154780 + }, + { + "epoch": 33.01333477766344, + "grad_norm": 1.023632526397705, + "learning_rate": 8.978918798558077e-06, + "loss": 0.0841, + "step": 154790 + }, + { + "epoch": 33.01338894004225, + "grad_norm": 0.0014423808315768838, + "learning_rate": 8.975909777512985e-06, + "loss": 0.0016, + "step": 154800 + }, + { + "epoch": 33.01344310242106, + "grad_norm": 0.0011083536082878709, + "learning_rate": 8.972900756467892e-06, + "loss": 0.0785, + "step": 154810 + }, + { + "epoch": 33.01349726479987, + "grad_norm": 0.0027079551946371794, + "learning_rate": 8.969891735422798e-06, + "loss": 0.0252, + "step": 154820 + }, + { + "epoch": 33.01355142717868, + "grad_norm": 0.47422629594802856, + "learning_rate": 8.966882714377704e-06, + "loss": 0.0368, + "step": 154830 + }, + { + "epoch": 33.01360558955749, + "grad_norm": 0.0013823056360706687, + "learning_rate": 8.96387369333261e-06, + "loss": 0.0308, + "step": 154840 + }, + { + "epoch": 33.0136597519363, + "grad_norm": 0.14472737908363342, + "learning_rate": 8.960864672287518e-06, + "loss": 0.0136, + "step": 154850 + }, + { + "epoch": 33.01371391431512, + "grad_norm": 0.8297941088676453, + "learning_rate": 8.957855651242425e-06, + "loss": 0.0231, + "step": 154860 + }, + { + "epoch": 33.01376807669393, + "grad_norm": 0.002080128528177738, + "learning_rate": 8.954846630197333e-06, + "loss": 0.0048, + "step": 154870 + }, + { + "epoch": 33.01382223907274, + "grad_norm": 0.7335353493690491, + "learning_rate": 8.951837609152239e-06, + "loss": 0.0282, + "step": 154880 + }, + { + "epoch": 33.013876401451554, + "grad_norm": 0.0017439030343666673, + "learning_rate": 8.948828588107145e-06, + "loss": 0.0098, + "step": 154890 + }, + { + "epoch": 33.013930563830364, + "grad_norm": 0.017269765958189964, + "learning_rate": 8.945819567062053e-06, + "loss": 0.0007, + "step": 154900 + }, + { + "epoch": 33.01398472620917, + "grad_norm": 0.049667272716760635, + "learning_rate": 8.94281054601696e-06, + "loss": 0.1587, + "step": 154910 + }, + { + "epoch": 33.01403888858799, + "grad_norm": 0.0009927082573994994, + "learning_rate": 8.939801524971866e-06, + "loss": 0.0527, + "step": 154920 + }, + { + "epoch": 33.0140930509668, + "grad_norm": 0.0012655856553465128, + "learning_rate": 8.936792503926774e-06, + "loss": 0.0452, + "step": 154930 + }, + { + "epoch": 33.01414721334561, + "grad_norm": 2.8316473960876465, + "learning_rate": 8.93378348288168e-06, + "loss": 0.059, + "step": 154940 + }, + { + "epoch": 33.01420137572442, + "grad_norm": 1.8204110860824585, + "learning_rate": 8.930774461836588e-06, + "loss": 0.0115, + "step": 154950 + }, + { + "epoch": 33.014255538103235, + "grad_norm": 0.23687410354614258, + "learning_rate": 8.927765440791492e-06, + "loss": 0.0047, + "step": 154960 + }, + { + "epoch": 33.014309700482045, + "grad_norm": 0.05268815532326698, + "learning_rate": 8.924756419746399e-06, + "loss": 0.0583, + "step": 154970 + }, + { + "epoch": 33.014363862860854, + "grad_norm": 0.001034628483466804, + "learning_rate": 8.921747398701307e-06, + 
"loss": 0.0081, + "step": 154980 + }, + { + "epoch": 33.01441802523967, + "grad_norm": 0.0009967600926756859, + "learning_rate": 8.918738377656213e-06, + "loss": 0.0192, + "step": 154990 + }, + { + "epoch": 33.01447218761848, + "grad_norm": 0.0009994425345212221, + "learning_rate": 8.915729356611121e-06, + "loss": 0.0244, + "step": 155000 + }, + { + "epoch": 33.01452634999729, + "grad_norm": 0.0010568563593551517, + "learning_rate": 8.912720335566027e-06, + "loss": 0.0203, + "step": 155010 + }, + { + "epoch": 33.01458051237611, + "grad_norm": 0.001124799600802362, + "learning_rate": 8.909711314520933e-06, + "loss": 0.0003, + "step": 155020 + }, + { + "epoch": 33.014634674754916, + "grad_norm": 0.001996639184653759, + "learning_rate": 8.906702293475841e-06, + "loss": 0.1119, + "step": 155030 + }, + { + "epoch": 33.014688837133725, + "grad_norm": 0.0012824882287532091, + "learning_rate": 8.903693272430748e-06, + "loss": 0.0093, + "step": 155040 + }, + { + "epoch": 33.01474299951254, + "grad_norm": 1.3982112407684326, + "learning_rate": 8.900684251385654e-06, + "loss": 0.0204, + "step": 155050 + }, + { + "epoch": 33.01479716189135, + "grad_norm": 0.001374818617478013, + "learning_rate": 8.897675230340562e-06, + "loss": 0.0619, + "step": 155060 + }, + { + "epoch": 33.01485132427016, + "grad_norm": 0.0010879960609599948, + "learning_rate": 8.894666209295468e-06, + "loss": 0.0188, + "step": 155070 + }, + { + "epoch": 33.01490548664897, + "grad_norm": 0.0013932165456935763, + "learning_rate": 8.891657188250376e-06, + "loss": 0.0666, + "step": 155080 + }, + { + "epoch": 33.01495964902779, + "grad_norm": 0.0016948474803939462, + "learning_rate": 8.888648167205282e-06, + "loss": 0.0001, + "step": 155090 + }, + { + "epoch": 33.0150138114066, + "grad_norm": 0.0010451098205521703, + "learning_rate": 8.885639146160189e-06, + "loss": 0.0269, + "step": 155100 + }, + { + "epoch": 33.015067973785406, + "grad_norm": 0.19670425355434418, + "learning_rate": 8.882630125115095e-06, + "loss": 0.0857, + "step": 155110 + }, + { + "epoch": 33.01512213616422, + "grad_norm": 0.6010565757751465, + "learning_rate": 8.879621104070001e-06, + "loss": 0.019, + "step": 155120 + }, + { + "epoch": 33.01517629854303, + "grad_norm": 0.004687504842877388, + "learning_rate": 8.87661208302491e-06, + "loss": 0.0083, + "step": 155130 + }, + { + "epoch": 33.01523046092184, + "grad_norm": 3.0773632526397705, + "learning_rate": 8.873603061979816e-06, + "loss": 0.0253, + "step": 155140 + }, + { + "epoch": 33.01528462330066, + "grad_norm": 0.0035094167105853558, + "learning_rate": 8.870594040934722e-06, + "loss": 0.0044, + "step": 155150 + }, + { + "epoch": 33.01533878567947, + "grad_norm": 0.0020151908975094557, + "learning_rate": 8.86758501988963e-06, + "loss": 0.0264, + "step": 155160 + }, + { + "epoch": 33.01539294805828, + "grad_norm": 0.002773093292489648, + "learning_rate": 8.864575998844536e-06, + "loss": 0.02, + "step": 155170 + }, + { + "epoch": 33.01544711043709, + "grad_norm": 1.7454702854156494, + "learning_rate": 8.861566977799442e-06, + "loss": 0.0376, + "step": 155180 + }, + { + "epoch": 33.015501272815904, + "grad_norm": 0.002937096869572997, + "learning_rate": 8.85855795675435e-06, + "loss": 0.0113, + "step": 155190 + }, + { + "epoch": 33.01555543519471, + "grad_norm": 0.0014075561193749309, + "learning_rate": 8.855548935709257e-06, + "loss": 0.028, + "step": 155200 + }, + { + "epoch": 33.01560959757352, + "grad_norm": 0.0013121847296133637, + "learning_rate": 8.852539914664165e-06, + "loss": 0.0555, + "step": 155210 + 
}, + { + "epoch": 33.01566375995234, + "grad_norm": 0.009985709562897682, + "learning_rate": 8.84953089361907e-06, + "loss": 0.0176, + "step": 155220 + }, + { + "epoch": 33.01571792233115, + "grad_norm": 0.0010923799127340317, + "learning_rate": 8.846521872573977e-06, + "loss": 0.0509, + "step": 155230 + }, + { + "epoch": 33.01577208470996, + "grad_norm": 0.0011271970579400659, + "learning_rate": 8.843512851528885e-06, + "loss": 0.0963, + "step": 155240 + }, + { + "epoch": 33.015826247088775, + "grad_norm": 0.001078873174265027, + "learning_rate": 8.840503830483791e-06, + "loss": 0.0288, + "step": 155250 + }, + { + "epoch": 33.015880409467584, + "grad_norm": 0.001689083524979651, + "learning_rate": 8.837494809438698e-06, + "loss": 0.0002, + "step": 155260 + }, + { + "epoch": 33.015934571846394, + "grad_norm": 0.0013507213443517685, + "learning_rate": 8.834485788393604e-06, + "loss": 0.0041, + "step": 155270 + }, + { + "epoch": 33.01598873422521, + "grad_norm": 0.005955953616648912, + "learning_rate": 8.83147676734851e-06, + "loss": 0.0703, + "step": 155280 + }, + { + "epoch": 33.01604289660402, + "grad_norm": 0.0015772313345223665, + "learning_rate": 8.828467746303418e-06, + "loss": 0.0243, + "step": 155290 + }, + { + "epoch": 33.01609705898283, + "grad_norm": 0.34626153111457825, + "learning_rate": 8.825458725258324e-06, + "loss": 0.007, + "step": 155300 + }, + { + "epoch": 33.01615122136164, + "grad_norm": 0.025319436565041542, + "learning_rate": 8.82244970421323e-06, + "loss": 0.1092, + "step": 155310 + }, + { + "epoch": 33.016205383740456, + "grad_norm": 0.001356462249532342, + "learning_rate": 8.819440683168139e-06, + "loss": 0.0144, + "step": 155320 + }, + { + "epoch": 33.016259546119265, + "grad_norm": 0.0017973092617467046, + "learning_rate": 8.816431662123045e-06, + "loss": 0.0585, + "step": 155330 + }, + { + "epoch": 33.016313708498075, + "grad_norm": 0.009738939814269543, + "learning_rate": 8.813422641077953e-06, + "loss": 0.0253, + "step": 155340 + }, + { + "epoch": 33.01636787087689, + "grad_norm": 0.010397092439234257, + "learning_rate": 8.81041362003286e-06, + "loss": 0.0165, + "step": 155350 + }, + { + "epoch": 33.0164220332557, + "grad_norm": 0.001169124967418611, + "learning_rate": 8.807404598987765e-06, + "loss": 0.0317, + "step": 155360 + }, + { + "epoch": 33.01647619563451, + "grad_norm": 0.0015113700646907091, + "learning_rate": 8.804395577942673e-06, + "loss": 0.0183, + "step": 155370 + }, + { + "epoch": 33.01653035801333, + "grad_norm": 0.0013954354217275977, + "learning_rate": 8.80138655689758e-06, + "loss": 0.014, + "step": 155380 + }, + { + "epoch": 33.016584520392136, + "grad_norm": 0.09648401290178299, + "learning_rate": 8.798377535852486e-06, + "loss": 0.0205, + "step": 155390 + }, + { + "epoch": 33.016638682770946, + "grad_norm": 1.2116620540618896, + "learning_rate": 8.795368514807394e-06, + "loss": 0.0162, + "step": 155400 + }, + { + "epoch": 33.016692845149755, + "grad_norm": 0.0011997572146356106, + "learning_rate": 8.792359493762299e-06, + "loss": 0.0019, + "step": 155410 + }, + { + "epoch": 33.01674700752857, + "grad_norm": 0.005118073429912329, + "learning_rate": 8.789350472717206e-06, + "loss": 0.0116, + "step": 155420 + }, + { + "epoch": 33.01680116990738, + "grad_norm": 1.2496410608291626, + "learning_rate": 8.786341451672113e-06, + "loss": 0.0728, + "step": 155430 + }, + { + "epoch": 33.01685533228619, + "grad_norm": 3.92498517036438, + "learning_rate": 8.783332430627019e-06, + "loss": 0.1022, + "step": 155440 + }, + { + "epoch": 33.01690949466501, 
+ "grad_norm": 0.0010817943839356303, + "learning_rate": 8.780323409581927e-06, + "loss": 0.0137, + "step": 155450 + }, + { + "epoch": 33.01696365704382, + "grad_norm": 0.001815181109122932, + "learning_rate": 8.777314388536833e-06, + "loss": 0.0046, + "step": 155460 + }, + { + "epoch": 33.01701781942263, + "grad_norm": 0.08012010902166367, + "learning_rate": 8.774305367491741e-06, + "loss": 0.0363, + "step": 155470 + }, + { + "epoch": 33.01707198180144, + "grad_norm": 0.02226715162396431, + "learning_rate": 8.771296346446648e-06, + "loss": 0.0333, + "step": 155480 + }, + { + "epoch": 33.01712614418025, + "grad_norm": 0.009448456577956676, + "learning_rate": 8.768287325401554e-06, + "loss": 0.015, + "step": 155490 + }, + { + "epoch": 33.01718030655906, + "grad_norm": 0.12708602845668793, + "learning_rate": 8.765278304356462e-06, + "loss": 0.0272, + "step": 155500 + }, + { + "epoch": 33.01723446893788, + "grad_norm": 1.126020073890686, + "learning_rate": 8.762269283311368e-06, + "loss": 0.0173, + "step": 155510 + }, + { + "epoch": 33.01728863131669, + "grad_norm": 0.0013463555369526148, + "learning_rate": 8.759260262266274e-06, + "loss": 0.039, + "step": 155520 + }, + { + "epoch": 33.0173427936955, + "grad_norm": 0.001877983333542943, + "learning_rate": 8.756251241221182e-06, + "loss": 0.0241, + "step": 155530 + }, + { + "epoch": 33.01739695607431, + "grad_norm": 0.048087526112794876, + "learning_rate": 8.753242220176089e-06, + "loss": 0.0142, + "step": 155540 + }, + { + "epoch": 33.017451118453124, + "grad_norm": 0.10155703872442245, + "learning_rate": 8.750233199130997e-06, + "loss": 0.0065, + "step": 155550 + }, + { + "epoch": 33.017505280831934, + "grad_norm": 8.989927291870117, + "learning_rate": 8.747224178085901e-06, + "loss": 0.2046, + "step": 155560 + }, + { + "epoch": 33.01755944321074, + "grad_norm": 0.5365561246871948, + "learning_rate": 8.744215157040807e-06, + "loss": 0.0114, + "step": 155570 + }, + { + "epoch": 33.01761360558956, + "grad_norm": 0.01808842457830906, + "learning_rate": 8.741206135995715e-06, + "loss": 0.0574, + "step": 155580 + }, + { + "epoch": 33.01766776796837, + "grad_norm": 0.0014040290843695402, + "learning_rate": 8.738197114950622e-06, + "loss": 0.0041, + "step": 155590 + }, + { + "epoch": 33.01772193034718, + "grad_norm": 0.0010452597634866834, + "learning_rate": 8.73518809390553e-06, + "loss": 0.0238, + "step": 155600 + }, + { + "epoch": 33.017776092725995, + "grad_norm": 0.0010081975487992167, + "learning_rate": 8.732179072860436e-06, + "loss": 0.021, + "step": 155610 + }, + { + "epoch": 33.017830255104805, + "grad_norm": 1.206376314163208, + "learning_rate": 8.729170051815342e-06, + "loss": 0.0635, + "step": 155620 + }, + { + "epoch": 33.017884417483614, + "grad_norm": 0.0012700965162366629, + "learning_rate": 8.72616103077025e-06, + "loss": 0.1701, + "step": 155630 + }, + { + "epoch": 33.01793857986243, + "grad_norm": 0.9503816366195679, + "learning_rate": 8.723152009725156e-06, + "loss": 0.0381, + "step": 155640 + }, + { + "epoch": 33.01799274224124, + "grad_norm": 0.9056456089019775, + "learning_rate": 8.720142988680063e-06, + "loss": 0.0691, + "step": 155650 + }, + { + "epoch": 33.01804690462005, + "grad_norm": 0.0035231157671660185, + "learning_rate": 8.71713396763497e-06, + "loss": 0.0608, + "step": 155660 + }, + { + "epoch": 33.01810106699886, + "grad_norm": 0.002927775727584958, + "learning_rate": 8.714124946589877e-06, + "loss": 0.045, + "step": 155670 + }, + { + "epoch": 33.018155229377676, + "grad_norm": 0.028085248544812202, + 
"learning_rate": 8.711115925544785e-06, + "loss": 0.0226, + "step": 155680 + }, + { + "epoch": 33.018209391756486, + "grad_norm": 0.4771071970462799, + "learning_rate": 8.708106904499691e-06, + "loss": 0.0215, + "step": 155690 + }, + { + "epoch": 33.018263554135295, + "grad_norm": 0.7846824526786804, + "learning_rate": 8.705097883454597e-06, + "loss": 0.0068, + "step": 155700 + }, + { + "epoch": 33.01831771651411, + "grad_norm": 0.16814793646335602, + "learning_rate": 8.702088862409504e-06, + "loss": 0.0124, + "step": 155710 + }, + { + "epoch": 33.01837187889292, + "grad_norm": 0.8840909600257874, + "learning_rate": 8.69907984136441e-06, + "loss": 0.0347, + "step": 155720 + }, + { + "epoch": 33.01842604127173, + "grad_norm": 0.0011688083177432418, + "learning_rate": 8.696070820319318e-06, + "loss": 0.0322, + "step": 155730 + }, + { + "epoch": 33.01848020365055, + "grad_norm": 2.213303327560425, + "learning_rate": 8.693061799274224e-06, + "loss": 0.0233, + "step": 155740 + }, + { + "epoch": 33.01853436602936, + "grad_norm": 0.0013776447158306837, + "learning_rate": 8.69005277822913e-06, + "loss": 0.0441, + "step": 155750 + }, + { + "epoch": 33.018588528408166, + "grad_norm": 0.38571247458457947, + "learning_rate": 8.687043757184038e-06, + "loss": 0.0763, + "step": 155760 + }, + { + "epoch": 33.018642690786976, + "grad_norm": 0.0014368678675964475, + "learning_rate": 8.684034736138945e-06, + "loss": 0.0645, + "step": 155770 + }, + { + "epoch": 33.01869685316579, + "grad_norm": 0.0011230692034587264, + "learning_rate": 8.681025715093851e-06, + "loss": 0.0308, + "step": 155780 + }, + { + "epoch": 33.0187510155446, + "grad_norm": 0.41840285062789917, + "learning_rate": 8.678016694048759e-06, + "loss": 0.0085, + "step": 155790 + }, + { + "epoch": 33.01880517792341, + "grad_norm": 0.004701285623013973, + "learning_rate": 8.675007673003665e-06, + "loss": 0.0334, + "step": 155800 + }, + { + "epoch": 33.01885934030223, + "grad_norm": 1.9290192127227783, + "learning_rate": 8.671998651958573e-06, + "loss": 0.0748, + "step": 155810 + }, + { + "epoch": 33.01891350268104, + "grad_norm": 0.11555343121290207, + "learning_rate": 8.66898963091348e-06, + "loss": 0.0956, + "step": 155820 + }, + { + "epoch": 33.01896766505985, + "grad_norm": 0.0015561989275738597, + "learning_rate": 8.665980609868386e-06, + "loss": 0.0281, + "step": 155830 + }, + { + "epoch": 33.019021827438664, + "grad_norm": 0.0011363578960299492, + "learning_rate": 8.662971588823294e-06, + "loss": 0.0232, + "step": 155840 + }, + { + "epoch": 33.01907598981747, + "grad_norm": 0.001350009348243475, + "learning_rate": 8.6599625677782e-06, + "loss": 0.0591, + "step": 155850 + }, + { + "epoch": 33.01913015219628, + "grad_norm": 0.0010702608851715922, + "learning_rate": 8.656953546733106e-06, + "loss": 0.0833, + "step": 155860 + }, + { + "epoch": 33.0191843145751, + "grad_norm": 0.0017023615073412657, + "learning_rate": 8.653944525688013e-06, + "loss": 0.0261, + "step": 155870 + }, + { + "epoch": 33.01923847695391, + "grad_norm": 0.0012587280943989754, + "learning_rate": 8.650935504642919e-06, + "loss": 0.0356, + "step": 155880 + }, + { + "epoch": 33.01929263933272, + "grad_norm": 0.0033758359495550394, + "learning_rate": 8.647926483597827e-06, + "loss": 0.0365, + "step": 155890 + }, + { + "epoch": 33.01934680171153, + "grad_norm": 0.0012755142524838448, + "learning_rate": 8.644917462552733e-06, + "loss": 0.0095, + "step": 155900 + }, + { + "epoch": 33.019400964090345, + "grad_norm": 0.0012404690496623516, + "learning_rate": 8.64190844150764e-06, 
+ "loss": 0.072, + "step": 155910 + }, + { + "epoch": 33.019455126469154, + "grad_norm": 0.0018047133926302195, + "learning_rate": 8.638899420462547e-06, + "loss": 0.0348, + "step": 155920 + }, + { + "epoch": 33.01950928884796, + "grad_norm": 0.004598159343004227, + "learning_rate": 8.635890399417454e-06, + "loss": 0.0692, + "step": 155930 + }, + { + "epoch": 33.01956345122678, + "grad_norm": 0.4954577386379242, + "learning_rate": 8.632881378372362e-06, + "loss": 0.0329, + "step": 155940 + }, + { + "epoch": 33.01961761360559, + "grad_norm": 0.05213632062077522, + "learning_rate": 8.629872357327268e-06, + "loss": 0.0071, + "step": 155950 + }, + { + "epoch": 33.0196717759844, + "grad_norm": 0.004772063344717026, + "learning_rate": 8.626863336282174e-06, + "loss": 0.0045, + "step": 155960 + }, + { + "epoch": 33.019725938363216, + "grad_norm": 0.0021258704364299774, + "learning_rate": 8.623854315237082e-06, + "loss": 0.01, + "step": 155970 + }, + { + "epoch": 33.019780100742025, + "grad_norm": 0.002187000121921301, + "learning_rate": 8.620845294191988e-06, + "loss": 0.098, + "step": 155980 + }, + { + "epoch": 33.019834263120835, + "grad_norm": 0.0032624169252812862, + "learning_rate": 8.617836273146895e-06, + "loss": 0.0053, + "step": 155990 + }, + { + "epoch": 33.01988842549965, + "grad_norm": 0.0018335342174395919, + "learning_rate": 8.614827252101803e-06, + "loss": 0.0829, + "step": 156000 + }, + { + "epoch": 33.01994258787846, + "grad_norm": 0.19448351860046387, + "learning_rate": 8.611818231056707e-06, + "loss": 0.0385, + "step": 156010 + }, + { + "epoch": 33.01999675025727, + "grad_norm": 0.39044734835624695, + "learning_rate": 8.608809210011615e-06, + "loss": 0.0106, + "step": 156020 + }, + { + "epoch": 33.02005091263608, + "grad_norm": 0.0011086140293627977, + "learning_rate": 8.605800188966521e-06, + "loss": 0.0278, + "step": 156030 + }, + { + "epoch": 33.0201050750149, + "grad_norm": 0.740889310836792, + "learning_rate": 8.602791167921428e-06, + "loss": 0.0835, + "step": 156040 + }, + { + "epoch": 33.020159237393706, + "grad_norm": 0.0015726630808785558, + "learning_rate": 8.599782146876336e-06, + "loss": 0.014, + "step": 156050 + }, + { + "epoch": 33.020213399772516, + "grad_norm": 0.0013759901048615575, + "learning_rate": 8.596773125831242e-06, + "loss": 0.145, + "step": 156060 + }, + { + "epoch": 33.02026756215133, + "grad_norm": 0.0012908155331388116, + "learning_rate": 8.59376410478615e-06, + "loss": 0.0419, + "step": 156070 + }, + { + "epoch": 33.02032172453014, + "grad_norm": 0.2468561828136444, + "learning_rate": 8.590755083741056e-06, + "loss": 0.0305, + "step": 156080 + }, + { + "epoch": 33.02037588690895, + "grad_norm": 0.0011705821380019188, + "learning_rate": 8.587746062695962e-06, + "loss": 0.0, + "step": 156090 + }, + { + "epoch": 33.02043004928777, + "grad_norm": 0.0017096757655963302, + "learning_rate": 8.58473704165087e-06, + "loss": 0.0201, + "step": 156100 + }, + { + "epoch": 33.02048421166658, + "grad_norm": 0.001087329932488501, + "learning_rate": 8.581728020605777e-06, + "loss": 0.0755, + "step": 156110 + }, + { + "epoch": 33.02053837404539, + "grad_norm": 1.1718419790267944, + "learning_rate": 8.578718999560683e-06, + "loss": 0.0207, + "step": 156120 + }, + { + "epoch": 33.020592536424196, + "grad_norm": 0.001140799024142325, + "learning_rate": 8.575709978515591e-06, + "loss": 0.016, + "step": 156130 + }, + { + "epoch": 33.02064669880301, + "grad_norm": 0.0013426435180008411, + "learning_rate": 8.572700957470497e-06, + "loss": 0.0177, + "step": 156140 + }, + { 
+ "epoch": 33.02070086118182, + "grad_norm": 0.0014170515350997448, + "learning_rate": 8.569691936425405e-06, + "loss": 0.0306, + "step": 156150 + }, + { + "epoch": 33.02075502356063, + "grad_norm": 0.0011879439698532224, + "learning_rate": 8.56668291538031e-06, + "loss": 0.1168, + "step": 156160 + }, + { + "epoch": 33.02080918593945, + "grad_norm": 0.0022015937138348818, + "learning_rate": 8.563673894335216e-06, + "loss": 0.0098, + "step": 156170 + }, + { + "epoch": 33.02086334831826, + "grad_norm": 0.0011490706820040941, + "learning_rate": 8.560664873290124e-06, + "loss": 0.0313, + "step": 156180 + }, + { + "epoch": 33.02091751069707, + "grad_norm": 0.4062493145465851, + "learning_rate": 8.55765585224503e-06, + "loss": 0.0023, + "step": 156190 + }, + { + "epoch": 33.020971673075884, + "grad_norm": 0.003929082304239273, + "learning_rate": 8.554646831199938e-06, + "loss": 0.0098, + "step": 156200 + }, + { + "epoch": 33.021025835454694, + "grad_norm": 0.47349271178245544, + "learning_rate": 8.551637810154845e-06, + "loss": 0.0127, + "step": 156210 + }, + { + "epoch": 33.0210799978335, + "grad_norm": 0.004006197210401297, + "learning_rate": 8.54862878910975e-06, + "loss": 0.0007, + "step": 156220 + }, + { + "epoch": 33.02113416021232, + "grad_norm": 0.0015090596862137318, + "learning_rate": 8.545619768064659e-06, + "loss": 0.0368, + "step": 156230 + }, + { + "epoch": 33.02118832259113, + "grad_norm": 0.0011346421670168638, + "learning_rate": 8.542610747019565e-06, + "loss": 0.0008, + "step": 156240 + }, + { + "epoch": 33.02124248496994, + "grad_norm": 0.0011790822027251124, + "learning_rate": 8.539601725974471e-06, + "loss": 0.0182, + "step": 156250 + }, + { + "epoch": 33.02129664734875, + "grad_norm": 0.8055948615074158, + "learning_rate": 8.53659270492938e-06, + "loss": 0.0058, + "step": 156260 + }, + { + "epoch": 33.021350809727565, + "grad_norm": 0.0016086301766335964, + "learning_rate": 8.533583683884286e-06, + "loss": 0.0523, + "step": 156270 + }, + { + "epoch": 33.021404972106374, + "grad_norm": 0.07067129760980606, + "learning_rate": 8.530574662839194e-06, + "loss": 0.0081, + "step": 156280 + }, + { + "epoch": 33.021459134485184, + "grad_norm": 0.022256067022681236, + "learning_rate": 8.5275656417941e-06, + "loss": 0.0278, + "step": 156290 + }, + { + "epoch": 33.021513296864, + "grad_norm": 0.0018219148041680455, + "learning_rate": 8.524556620749006e-06, + "loss": 0.0147, + "step": 156300 + }, + { + "epoch": 33.02156745924281, + "grad_norm": 1.0028634071350098, + "learning_rate": 8.521547599703912e-06, + "loss": 0.0378, + "step": 156310 + }, + { + "epoch": 33.02162162162162, + "grad_norm": 0.6276468634605408, + "learning_rate": 8.518538578658819e-06, + "loss": 0.0064, + "step": 156320 + }, + { + "epoch": 33.021675784000436, + "grad_norm": 0.0010626196162775159, + "learning_rate": 8.515529557613727e-06, + "loss": 0.0758, + "step": 156330 + }, + { + "epoch": 33.021729946379246, + "grad_norm": 0.0011153300292789936, + "learning_rate": 8.512520536568633e-06, + "loss": 0.0148, + "step": 156340 + }, + { + "epoch": 33.021784108758055, + "grad_norm": 0.0015351579058915377, + "learning_rate": 8.50951151552354e-06, + "loss": 0.0801, + "step": 156350 + }, + { + "epoch": 33.02183827113687, + "grad_norm": 0.0014033811166882515, + "learning_rate": 8.506502494478447e-06, + "loss": 0.0214, + "step": 156360 + }, + { + "epoch": 33.02189243351568, + "grad_norm": 0.0013987342827022076, + "learning_rate": 8.503493473433353e-06, + "loss": 0.0225, + "step": 156370 + }, + { + "epoch": 33.02194659589449, + 
"grad_norm": 0.0010397416772320867, + "learning_rate": 8.50048445238826e-06, + "loss": 0.0234, + "step": 156380 + }, + { + "epoch": 33.0220007582733, + "grad_norm": 2.8629884719848633, + "learning_rate": 8.497475431343168e-06, + "loss": 0.0897, + "step": 156390 + }, + { + "epoch": 33.02205492065212, + "grad_norm": 0.0011383924866095185, + "learning_rate": 8.494466410298074e-06, + "loss": 0.013, + "step": 156400 + }, + { + "epoch": 33.02210908303093, + "grad_norm": 0.09154537320137024, + "learning_rate": 8.491457389252982e-06, + "loss": 0.0002, + "step": 156410 + }, + { + "epoch": 33.022163245409736, + "grad_norm": 0.06823217123746872, + "learning_rate": 8.488448368207888e-06, + "loss": 0.0293, + "step": 156420 + }, + { + "epoch": 33.02221740778855, + "grad_norm": 0.0026885410770773888, + "learning_rate": 8.485439347162794e-06, + "loss": 0.0132, + "step": 156430 + }, + { + "epoch": 33.02227157016736, + "grad_norm": 0.0011089384788647294, + "learning_rate": 8.482430326117702e-06, + "loss": 0.0021, + "step": 156440 + }, + { + "epoch": 33.02232573254617, + "grad_norm": 2.9806931018829346, + "learning_rate": 8.479421305072609e-06, + "loss": 0.0311, + "step": 156450 + }, + { + "epoch": 33.02237989492499, + "grad_norm": 0.0010768044739961624, + "learning_rate": 8.476412284027515e-06, + "loss": 0.0701, + "step": 156460 + }, + { + "epoch": 33.0224340573038, + "grad_norm": 0.005519277416169643, + "learning_rate": 8.473403262982421e-06, + "loss": 0.01, + "step": 156470 + }, + { + "epoch": 33.02248821968261, + "grad_norm": 0.0027488789055496454, + "learning_rate": 8.470394241937328e-06, + "loss": 0.0946, + "step": 156480 + }, + { + "epoch": 33.02254238206142, + "grad_norm": 0.8374418020248413, + "learning_rate": 8.467385220892235e-06, + "loss": 0.0393, + "step": 156490 + }, + { + "epoch": 33.02259654444023, + "grad_norm": 0.0012650489807128906, + "learning_rate": 8.464376199847142e-06, + "loss": 0.0848, + "step": 156500 + }, + { + "epoch": 33.02265070681904, + "grad_norm": 0.003219562815502286, + "learning_rate": 8.461367178802048e-06, + "loss": 0.0544, + "step": 156510 + }, + { + "epoch": 33.02270486919785, + "grad_norm": 0.001077149878256023, + "learning_rate": 8.458358157756956e-06, + "loss": 0.0626, + "step": 156520 + }, + { + "epoch": 33.02275903157667, + "grad_norm": 2.669661045074463, + "learning_rate": 8.455349136711862e-06, + "loss": 0.0102, + "step": 156530 + }, + { + "epoch": 33.02281319395548, + "grad_norm": 0.0017841635271906853, + "learning_rate": 8.45234011566677e-06, + "loss": 0.0314, + "step": 156540 + }, + { + "epoch": 33.02286735633429, + "grad_norm": 0.001162472297437489, + "learning_rate": 8.449331094621677e-06, + "loss": 0.0078, + "step": 156550 + }, + { + "epoch": 33.022921518713105, + "grad_norm": 0.0013595765922218561, + "learning_rate": 8.446322073576583e-06, + "loss": 0.01, + "step": 156560 + }, + { + "epoch": 33.022975681091914, + "grad_norm": 0.9821156859397888, + "learning_rate": 8.44331305253149e-06, + "loss": 0.0428, + "step": 156570 + }, + { + "epoch": 33.023029843470724, + "grad_norm": 2.627094268798828, + "learning_rate": 8.440304031486397e-06, + "loss": 0.1506, + "step": 156580 + }, + { + "epoch": 33.02308400584954, + "grad_norm": 0.0015048347413539886, + "learning_rate": 8.437295010441303e-06, + "loss": 0.0535, + "step": 156590 + }, + { + "epoch": 33.02313816822835, + "grad_norm": 0.004612297285348177, + "learning_rate": 8.434285989396211e-06, + "loss": 0.0055, + "step": 156600 + }, + { + "epoch": 33.02319233060716, + "grad_norm": 0.09501682221889496, + 
"learning_rate": 8.431276968351116e-06, + "loss": 0.0965, + "step": 156610 + }, + { + "epoch": 33.02324649298597, + "grad_norm": 0.001473184791393578, + "learning_rate": 8.428267947306024e-06, + "loss": 0.1067, + "step": 156620 + }, + { + "epoch": 33.023300655364785, + "grad_norm": 0.2528242766857147, + "learning_rate": 8.42525892626093e-06, + "loss": 0.0828, + "step": 156630 + }, + { + "epoch": 33.023354817743595, + "grad_norm": 0.026612108573317528, + "learning_rate": 8.422249905215836e-06, + "loss": 0.0413, + "step": 156640 + }, + { + "epoch": 33.023408980122404, + "grad_norm": 0.014167298562824726, + "learning_rate": 8.419240884170744e-06, + "loss": 0.0503, + "step": 156650 + }, + { + "epoch": 33.02346314250122, + "grad_norm": 0.010815496556460857, + "learning_rate": 8.41623186312565e-06, + "loss": 0.044, + "step": 156660 + }, + { + "epoch": 33.02351730488003, + "grad_norm": 0.29643669724464417, + "learning_rate": 8.413222842080559e-06, + "loss": 0.017, + "step": 156670 + }, + { + "epoch": 33.02357146725884, + "grad_norm": 0.0015146612422540784, + "learning_rate": 8.410213821035465e-06, + "loss": 0.0251, + "step": 156680 + }, + { + "epoch": 33.02362562963766, + "grad_norm": 0.09578081965446472, + "learning_rate": 8.407204799990371e-06, + "loss": 0.1188, + "step": 156690 + }, + { + "epoch": 33.023679792016466, + "grad_norm": 0.0014152855146676302, + "learning_rate": 8.404195778945279e-06, + "loss": 0.0298, + "step": 156700 + }, + { + "epoch": 33.023733954395276, + "grad_norm": 0.0013146139681339264, + "learning_rate": 8.401186757900185e-06, + "loss": 0.0127, + "step": 156710 + }, + { + "epoch": 33.023788116774085, + "grad_norm": 1.2176058292388916, + "learning_rate": 8.398177736855092e-06, + "loss": 0.062, + "step": 156720 + }, + { + "epoch": 33.0238422791529, + "grad_norm": 0.17754188179969788, + "learning_rate": 8.39516871581e-06, + "loss": 0.0014, + "step": 156730 + }, + { + "epoch": 33.02389644153171, + "grad_norm": 1.116268277168274, + "learning_rate": 8.392159694764906e-06, + "loss": 0.029, + "step": 156740 + }, + { + "epoch": 33.02395060391052, + "grad_norm": 0.0012735105119645596, + "learning_rate": 8.389150673719814e-06, + "loss": 0.0008, + "step": 156750 + }, + { + "epoch": 33.02400476628934, + "grad_norm": 0.4930126368999481, + "learning_rate": 8.386141652674718e-06, + "loss": 0.0145, + "step": 156760 + }, + { + "epoch": 33.02405892866815, + "grad_norm": 0.009168807417154312, + "learning_rate": 8.383132631629625e-06, + "loss": 0.0508, + "step": 156770 + }, + { + "epoch": 33.02411309104696, + "grad_norm": 0.00131101137958467, + "learning_rate": 8.380123610584533e-06, + "loss": 0.0146, + "step": 156780 + }, + { + "epoch": 33.02416725342577, + "grad_norm": 0.00125223770737648, + "learning_rate": 8.377114589539439e-06, + "loss": 0.0211, + "step": 156790 + }, + { + "epoch": 33.02422141580458, + "grad_norm": 0.2889016568660736, + "learning_rate": 8.374105568494347e-06, + "loss": 0.0098, + "step": 156800 + }, + { + "epoch": 33.02427557818339, + "grad_norm": 0.00181673897895962, + "learning_rate": 8.371096547449253e-06, + "loss": 0.0223, + "step": 156810 + }, + { + "epoch": 33.02432974056221, + "grad_norm": 0.13573616743087769, + "learning_rate": 8.36808752640416e-06, + "loss": 0.0156, + "step": 156820 + }, + { + "epoch": 33.02438390294102, + "grad_norm": 0.0037795654498040676, + "learning_rate": 8.365078505359067e-06, + "loss": 0.009, + "step": 156830 + }, + { + "epoch": 33.02443806531983, + "grad_norm": 0.0014021655078977346, + "learning_rate": 8.362069484313974e-06, + "loss": 
0.0423, + "step": 156840 + }, + { + "epoch": 33.02449222769864, + "grad_norm": 0.13325634598731995, + "learning_rate": 8.35906046326888e-06, + "loss": 0.0211, + "step": 156850 + }, + { + "epoch": 33.024546390077454, + "grad_norm": 0.001074914587661624, + "learning_rate": 8.356051442223788e-06, + "loss": 0.0087, + "step": 156860 + }, + { + "epoch": 33.02460055245626, + "grad_norm": 0.01853512041270733, + "learning_rate": 8.353042421178694e-06, + "loss": 0.0337, + "step": 156870 + }, + { + "epoch": 33.02465471483507, + "grad_norm": 1.152685523033142, + "learning_rate": 8.350033400133602e-06, + "loss": 0.0683, + "step": 156880 + }, + { + "epoch": 33.02470887721389, + "grad_norm": 0.9270173907279968, + "learning_rate": 8.347024379088508e-06, + "loss": 0.1121, + "step": 156890 + }, + { + "epoch": 33.0247630395927, + "grad_norm": 0.0011058374075219035, + "learning_rate": 8.344015358043415e-06, + "loss": 0.0213, + "step": 156900 + }, + { + "epoch": 33.02481720197151, + "grad_norm": 0.0010770739754661918, + "learning_rate": 8.341006336998321e-06, + "loss": 0.0105, + "step": 156910 + }, + { + "epoch": 33.024871364350325, + "grad_norm": 0.0016725638415664434, + "learning_rate": 8.337997315953227e-06, + "loss": 0.0695, + "step": 156920 + }, + { + "epoch": 33.024925526729135, + "grad_norm": 0.0010723730083554983, + "learning_rate": 8.334988294908135e-06, + "loss": 0.0556, + "step": 156930 + }, + { + "epoch": 33.024979689107944, + "grad_norm": 0.0010837131412699819, + "learning_rate": 8.331979273863042e-06, + "loss": 0.08, + "step": 156940 + }, + { + "epoch": 33.02500135405947, + "eval_accuracy": 0.8442194644023514, + "eval_loss": 0.9634324908256531, + "eval_runtime": 114.5822, + "eval_samples_per_second": 26.723, + "eval_steps_per_second": 3.343, + "step": 156944 + }, + { + "epoch": 34.000032497427284, + "grad_norm": 0.001208704081363976, + "learning_rate": 8.328970252817948e-06, + "loss": 0.0432, + "step": 156950 + }, + { + "epoch": 34.0000866598061, + "grad_norm": 0.222971111536026, + "learning_rate": 8.325961231772856e-06, + "loss": 0.0106, + "step": 156960 + }, + { + "epoch": 34.00014082218491, + "grad_norm": 0.0011508826864883304, + "learning_rate": 8.322952210727762e-06, + "loss": 0.0198, + "step": 156970 + }, + { + "epoch": 34.00019498456372, + "grad_norm": 0.45924824476242065, + "learning_rate": 8.319943189682668e-06, + "loss": 0.0031, + "step": 156980 + }, + { + "epoch": 34.00024914694254, + "grad_norm": 0.24631763994693756, + "learning_rate": 8.316934168637576e-06, + "loss": 0.011, + "step": 156990 + }, + { + "epoch": 34.000303309321346, + "grad_norm": 1.8709511756896973, + "learning_rate": 8.313925147592483e-06, + "loss": 0.0381, + "step": 157000 + }, + { + "epoch": 34.000357471700156, + "grad_norm": 0.0012777387164533138, + "learning_rate": 8.31091612654739e-06, + "loss": 0.0194, + "step": 157010 + }, + { + "epoch": 34.00041163407897, + "grad_norm": 1.150405764579773, + "learning_rate": 8.307907105502297e-06, + "loss": 0.0244, + "step": 157020 + }, + { + "epoch": 34.00046579645778, + "grad_norm": 0.012916764244437218, + "learning_rate": 8.304898084457203e-06, + "loss": 0.0247, + "step": 157030 + }, + { + "epoch": 34.00051995883659, + "grad_norm": 0.0015123678604140878, + "learning_rate": 8.301889063412111e-06, + "loss": 0.0137, + "step": 157040 + }, + { + "epoch": 34.0005741212154, + "grad_norm": 0.0653136745095253, + "learning_rate": 8.298880042367017e-06, + "loss": 0.0257, + "step": 157050 + }, + { + "epoch": 34.00062828359422, + "grad_norm": 0.0011100443080067635, + "learning_rate": 
8.295871021321924e-06, + "loss": 0.0439, + "step": 157060 + }, + { + "epoch": 34.00068244597303, + "grad_norm": 0.001104629714973271, + "learning_rate": 8.29286200027683e-06, + "loss": 0.0076, + "step": 157070 + }, + { + "epoch": 34.000736608351836, + "grad_norm": 0.001447056420147419, + "learning_rate": 8.289852979231736e-06, + "loss": 0.0359, + "step": 157080 + }, + { + "epoch": 34.00079077073065, + "grad_norm": 0.004648244008421898, + "learning_rate": 8.286843958186644e-06, + "loss": 0.002, + "step": 157090 + }, + { + "epoch": 34.00084493310946, + "grad_norm": 0.002023633336648345, + "learning_rate": 8.28383493714155e-06, + "loss": 0.0595, + "step": 157100 + }, + { + "epoch": 34.00089909548827, + "grad_norm": 0.0014520770637318492, + "learning_rate": 8.280825916096457e-06, + "loss": 0.0244, + "step": 157110 + }, + { + "epoch": 34.00095325786709, + "grad_norm": 0.0017898324877023697, + "learning_rate": 8.277816895051365e-06, + "loss": 0.043, + "step": 157120 + }, + { + "epoch": 34.0010074202459, + "grad_norm": 0.4429274797439575, + "learning_rate": 8.274807874006271e-06, + "loss": 0.0102, + "step": 157130 + }, + { + "epoch": 34.00106158262471, + "grad_norm": 0.0011357757030054927, + "learning_rate": 8.271798852961179e-06, + "loss": 0.0326, + "step": 157140 + }, + { + "epoch": 34.00111574500352, + "grad_norm": 0.05879081040620804, + "learning_rate": 8.268789831916085e-06, + "loss": 0.0089, + "step": 157150 + }, + { + "epoch": 34.001169907382334, + "grad_norm": 0.001340515329502523, + "learning_rate": 8.265780810870991e-06, + "loss": 0.0754, + "step": 157160 + }, + { + "epoch": 34.00122406976114, + "grad_norm": 0.21243099868297577, + "learning_rate": 8.2627717898259e-06, + "loss": 0.0195, + "step": 157170 + }, + { + "epoch": 34.00127823213995, + "grad_norm": 0.0016058309702202678, + "learning_rate": 8.259762768780806e-06, + "loss": 0.0966, + "step": 157180 + }, + { + "epoch": 34.00133239451877, + "grad_norm": 0.0020329405087977648, + "learning_rate": 8.256753747735712e-06, + "loss": 0.0712, + "step": 157190 + }, + { + "epoch": 34.00138655689758, + "grad_norm": 0.0013773535611107945, + "learning_rate": 8.25374472669062e-06, + "loss": 0.033, + "step": 157200 + }, + { + "epoch": 34.00144071927639, + "grad_norm": 0.04842551797628403, + "learning_rate": 8.250735705645525e-06, + "loss": 0.0506, + "step": 157210 + }, + { + "epoch": 34.001494881655205, + "grad_norm": 0.684447705745697, + "learning_rate": 8.247726684600432e-06, + "loss": 0.0248, + "step": 157220 + }, + { + "epoch": 34.001549044034014, + "grad_norm": 1.0055009126663208, + "learning_rate": 8.244717663555339e-06, + "loss": 0.0211, + "step": 157230 + }, + { + "epoch": 34.001603206412824, + "grad_norm": 0.021354373544454575, + "learning_rate": 8.241708642510245e-06, + "loss": 0.0086, + "step": 157240 + }, + { + "epoch": 34.00165736879164, + "grad_norm": 0.971040666103363, + "learning_rate": 8.238699621465153e-06, + "loss": 0.0185, + "step": 157250 + }, + { + "epoch": 34.00171153117045, + "grad_norm": 0.0010542983654886484, + "learning_rate": 8.23569060042006e-06, + "loss": 0.1206, + "step": 157260 + }, + { + "epoch": 34.00176569354926, + "grad_norm": 0.004877476487308741, + "learning_rate": 8.232681579374967e-06, + "loss": 0.079, + "step": 157270 + }, + { + "epoch": 34.00181985592807, + "grad_norm": 0.015715302899479866, + "learning_rate": 8.229672558329874e-06, + "loss": 0.0245, + "step": 157280 + }, + { + "epoch": 34.001874018306886, + "grad_norm": 0.0026044584810733795, + "learning_rate": 8.22666353728478e-06, + "loss": 0.012, + 
"step": 157290 + }, + { + "epoch": 34.001928180685695, + "grad_norm": 0.07016323506832123, + "learning_rate": 8.223654516239688e-06, + "loss": 0.0042, + "step": 157300 + }, + { + "epoch": 34.001982343064505, + "grad_norm": 0.40812262892723083, + "learning_rate": 8.220645495194594e-06, + "loss": 0.0123, + "step": 157310 + }, + { + "epoch": 34.00203650544332, + "grad_norm": 0.0010428472887724638, + "learning_rate": 8.2176364741495e-06, + "loss": 0.0016, + "step": 157320 + }, + { + "epoch": 34.00209066782213, + "grad_norm": 0.004058642778545618, + "learning_rate": 8.214627453104408e-06, + "loss": 0.0216, + "step": 157330 + }, + { + "epoch": 34.00214483020094, + "grad_norm": 0.001330976141616702, + "learning_rate": 8.211618432059315e-06, + "loss": 0.0224, + "step": 157340 + }, + { + "epoch": 34.00219899257976, + "grad_norm": 1.2363561391830444, + "learning_rate": 8.208609411014223e-06, + "loss": 0.0349, + "step": 157350 + }, + { + "epoch": 34.00225315495857, + "grad_norm": 0.1940627545118332, + "learning_rate": 8.205600389969127e-06, + "loss": 0.02, + "step": 157360 + }, + { + "epoch": 34.002307317337376, + "grad_norm": 0.0027253609150648117, + "learning_rate": 8.202591368924033e-06, + "loss": 0.0121, + "step": 157370 + }, + { + "epoch": 34.00236147971619, + "grad_norm": 5.864095211029053, + "learning_rate": 8.199582347878941e-06, + "loss": 0.0229, + "step": 157380 + }, + { + "epoch": 34.002415642095, + "grad_norm": 0.4225933253765106, + "learning_rate": 8.196573326833848e-06, + "loss": 0.0175, + "step": 157390 + }, + { + "epoch": 34.00246980447381, + "grad_norm": 0.0010163628030568361, + "learning_rate": 8.193564305788756e-06, + "loss": 0.0851, + "step": 157400 + }, + { + "epoch": 34.00252396685262, + "grad_norm": 0.0010399699676781893, + "learning_rate": 8.190555284743662e-06, + "loss": 0.0669, + "step": 157410 + }, + { + "epoch": 34.00257812923144, + "grad_norm": 0.001050427439622581, + "learning_rate": 8.187546263698568e-06, + "loss": 0.0001, + "step": 157420 + }, + { + "epoch": 34.00263229161025, + "grad_norm": 0.18314868211746216, + "learning_rate": 8.184537242653476e-06, + "loss": 0.0256, + "step": 157430 + }, + { + "epoch": 34.00268645398906, + "grad_norm": 3.8960838317871094, + "learning_rate": 8.181528221608382e-06, + "loss": 0.0773, + "step": 157440 + }, + { + "epoch": 34.00274061636787, + "grad_norm": 0.0010850047692656517, + "learning_rate": 8.178519200563289e-06, + "loss": 0.0048, + "step": 157450 + }, + { + "epoch": 34.00279477874668, + "grad_norm": 0.6309173703193665, + "learning_rate": 8.175510179518197e-06, + "loss": 0.0814, + "step": 157460 + }, + { + "epoch": 34.00284894112549, + "grad_norm": 0.1381339579820633, + "learning_rate": 8.172501158473103e-06, + "loss": 0.0024, + "step": 157470 + }, + { + "epoch": 34.00290310350431, + "grad_norm": 0.0020952520426362753, + "learning_rate": 8.169492137428011e-06, + "loss": 0.0309, + "step": 157480 + }, + { + "epoch": 34.00295726588312, + "grad_norm": 0.06951489299535751, + "learning_rate": 8.166483116382917e-06, + "loss": 0.2474, + "step": 157490 + }, + { + "epoch": 34.00301142826193, + "grad_norm": 0.0012436488177627325, + "learning_rate": 8.163474095337823e-06, + "loss": 0.0015, + "step": 157500 + }, + { + "epoch": 34.00306559064074, + "grad_norm": 0.00104038929566741, + "learning_rate": 8.16046507429273e-06, + "loss": 0.0036, + "step": 157510 + }, + { + "epoch": 34.003119753019554, + "grad_norm": 0.6261703968048096, + "learning_rate": 8.157456053247636e-06, + "loss": 0.0265, + "step": 157520 + }, + { + "epoch": 
34.003173915398364, + "grad_norm": 2.2462568283081055, + "learning_rate": 8.154447032202544e-06, + "loss": 0.0506, + "step": 157530 + }, + { + "epoch": 34.00322807777717, + "grad_norm": 0.0012560468167066574, + "learning_rate": 8.15143801115745e-06, + "loss": 0.009, + "step": 157540 + }, + { + "epoch": 34.00328224015599, + "grad_norm": 0.039514653384685516, + "learning_rate": 8.148428990112356e-06, + "loss": 0.0405, + "step": 157550 + }, + { + "epoch": 34.0033364025348, + "grad_norm": 1.0174956321716309, + "learning_rate": 8.145419969067264e-06, + "loss": 0.0176, + "step": 157560 + }, + { + "epoch": 34.00339056491361, + "grad_norm": 0.0010511046275496483, + "learning_rate": 8.14241094802217e-06, + "loss": 0.0571, + "step": 157570 + }, + { + "epoch": 34.003444727292425, + "grad_norm": 0.074413001537323, + "learning_rate": 8.139401926977077e-06, + "loss": 0.1225, + "step": 157580 + }, + { + "epoch": 34.003498889671235, + "grad_norm": 0.001139449654147029, + "learning_rate": 8.136392905931985e-06, + "loss": 0.0023, + "step": 157590 + }, + { + "epoch": 34.003553052050044, + "grad_norm": 0.0013726594625040889, + "learning_rate": 8.133383884886891e-06, + "loss": 0.0193, + "step": 157600 + }, + { + "epoch": 34.00360721442886, + "grad_norm": 0.7079969644546509, + "learning_rate": 8.1303748638418e-06, + "loss": 0.0615, + "step": 157610 + }, + { + "epoch": 34.00366137680767, + "grad_norm": 1.771307349205017, + "learning_rate": 8.127365842796706e-06, + "loss": 0.0201, + "step": 157620 + }, + { + "epoch": 34.00371553918648, + "grad_norm": 0.0016632332699373364, + "learning_rate": 8.124356821751612e-06, + "loss": 0.0609, + "step": 157630 + }, + { + "epoch": 34.00376970156529, + "grad_norm": 0.0014906729338690639, + "learning_rate": 8.12134780070652e-06, + "loss": 0.0923, + "step": 157640 + }, + { + "epoch": 34.003823863944106, + "grad_norm": 0.002834745915606618, + "learning_rate": 8.118338779661426e-06, + "loss": 0.0313, + "step": 157650 + }, + { + "epoch": 34.003878026322916, + "grad_norm": 1.09282648563385, + "learning_rate": 8.115329758616332e-06, + "loss": 0.0091, + "step": 157660 + }, + { + "epoch": 34.003932188701725, + "grad_norm": 0.0010320888832211494, + "learning_rate": 8.112320737571239e-06, + "loss": 0.0684, + "step": 157670 + }, + { + "epoch": 34.00398635108054, + "grad_norm": 0.0032277246937155724, + "learning_rate": 8.109311716526145e-06, + "loss": 0.0082, + "step": 157680 + }, + { + "epoch": 34.00404051345935, + "grad_norm": 0.0022079618647694588, + "learning_rate": 8.106302695481053e-06, + "loss": 0.0236, + "step": 157690 + }, + { + "epoch": 34.00409467583816, + "grad_norm": 0.0025700274854898453, + "learning_rate": 8.103293674435959e-06, + "loss": 0.0189, + "step": 157700 + }, + { + "epoch": 34.00414883821698, + "grad_norm": 0.001433135592378676, + "learning_rate": 8.100284653390865e-06, + "loss": 0.0582, + "step": 157710 + }, + { + "epoch": 34.00420300059579, + "grad_norm": 0.5345391631126404, + "learning_rate": 8.097275632345773e-06, + "loss": 0.0763, + "step": 157720 + }, + { + "epoch": 34.004257162974596, + "grad_norm": 0.0010545453988015652, + "learning_rate": 8.09426661130068e-06, + "loss": 0.0835, + "step": 157730 + }, + { + "epoch": 34.00431132535341, + "grad_norm": 0.0013037376338616014, + "learning_rate": 8.091257590255588e-06, + "loss": 0.0252, + "step": 157740 + }, + { + "epoch": 34.00436548773222, + "grad_norm": 0.414190411567688, + "learning_rate": 8.088248569210494e-06, + "loss": 0.0305, + "step": 157750 + }, + { + "epoch": 34.00441965011103, + "grad_norm": 
0.0018719560466706753, + "learning_rate": 8.0852395481654e-06, + "loss": 0.0003, + "step": 157760 + }, + { + "epoch": 34.00447381248984, + "grad_norm": 1.1243488788604736, + "learning_rate": 8.082230527120308e-06, + "loss": 0.0697, + "step": 157770 + }, + { + "epoch": 34.00452797486866, + "grad_norm": 0.5743622183799744, + "learning_rate": 8.079221506075214e-06, + "loss": 0.0798, + "step": 157780 + }, + { + "epoch": 34.00458213724747, + "grad_norm": 0.0011258251033723354, + "learning_rate": 8.07621248503012e-06, + "loss": 0.0316, + "step": 157790 + }, + { + "epoch": 34.00463629962628, + "grad_norm": 0.0017625957261770964, + "learning_rate": 8.073203463985027e-06, + "loss": 0.0225, + "step": 157800 + }, + { + "epoch": 34.004690462005094, + "grad_norm": 0.5159770846366882, + "learning_rate": 8.070194442939933e-06, + "loss": 0.0741, + "step": 157810 + }, + { + "epoch": 34.0047446243839, + "grad_norm": 0.0010371271055191755, + "learning_rate": 8.067185421894841e-06, + "loss": 0.0839, + "step": 157820 + }, + { + "epoch": 34.00479878676271, + "grad_norm": 1.1638176441192627, + "learning_rate": 8.064176400849747e-06, + "loss": 0.0314, + "step": 157830 + }, + { + "epoch": 34.00485294914153, + "grad_norm": 0.0011564861051738262, + "learning_rate": 8.061167379804654e-06, + "loss": 0.0533, + "step": 157840 + }, + { + "epoch": 34.00490711152034, + "grad_norm": 0.8904253244400024, + "learning_rate": 8.058158358759562e-06, + "loss": 0.0028, + "step": 157850 + }, + { + "epoch": 34.00496127389915, + "grad_norm": 1.368506908416748, + "learning_rate": 8.055149337714468e-06, + "loss": 0.0333, + "step": 157860 + }, + { + "epoch": 34.00501543627796, + "grad_norm": 0.010454509407281876, + "learning_rate": 8.052140316669376e-06, + "loss": 0.0002, + "step": 157870 + }, + { + "epoch": 34.005069598656775, + "grad_norm": 0.9347416162490845, + "learning_rate": 8.049131295624282e-06, + "loss": 0.0071, + "step": 157880 + }, + { + "epoch": 34.005123761035584, + "grad_norm": 0.005901617929339409, + "learning_rate": 8.046122274579188e-06, + "loss": 0.0048, + "step": 157890 + }, + { + "epoch": 34.005177923414394, + "grad_norm": 0.001212650560773909, + "learning_rate": 8.043113253534096e-06, + "loss": 0.0239, + "step": 157900 + }, + { + "epoch": 34.00523208579321, + "grad_norm": 0.0016715178498998284, + "learning_rate": 8.040104232489003e-06, + "loss": 0.001, + "step": 157910 + }, + { + "epoch": 34.00528624817202, + "grad_norm": 0.0011462550610303879, + "learning_rate": 8.037095211443909e-06, + "loss": 0.0015, + "step": 157920 + }, + { + "epoch": 34.00534041055083, + "grad_norm": 0.0030903073493391275, + "learning_rate": 8.034086190398817e-06, + "loss": 0.0064, + "step": 157930 + }, + { + "epoch": 34.005394572929646, + "grad_norm": 0.010937764309346676, + "learning_rate": 8.031077169353723e-06, + "loss": 0.0102, + "step": 157940 + }, + { + "epoch": 34.005448735308455, + "grad_norm": 0.001539433840662241, + "learning_rate": 8.02806814830863e-06, + "loss": 0.0255, + "step": 157950 + }, + { + "epoch": 34.005502897687265, + "grad_norm": 3.5542311668395996, + "learning_rate": 8.025059127263536e-06, + "loss": 0.0711, + "step": 157960 + }, + { + "epoch": 34.00555706006608, + "grad_norm": 1.0898809432983398, + "learning_rate": 8.022050106218442e-06, + "loss": 0.0086, + "step": 157970 + }, + { + "epoch": 34.00561122244489, + "grad_norm": 0.8800159096717834, + "learning_rate": 8.01904108517335e-06, + "loss": 0.0136, + "step": 157980 + }, + { + "epoch": 34.0056653848237, + "grad_norm": 2.484523057937622, + "learning_rate": 
8.016032064128256e-06, + "loss": 0.0289, + "step": 157990 + }, + { + "epoch": 34.00571954720251, + "grad_norm": 0.14715972542762756, + "learning_rate": 8.013023043083164e-06, + "loss": 0.0565, + "step": 158000 + }, + { + "epoch": 34.00577370958133, + "grad_norm": 0.007080270443111658, + "learning_rate": 8.01001402203807e-06, + "loss": 0.0143, + "step": 158010 + }, + { + "epoch": 34.005827871960136, + "grad_norm": 0.12442964315414429, + "learning_rate": 8.007005000992977e-06, + "loss": 0.0008, + "step": 158020 + }, + { + "epoch": 34.005882034338946, + "grad_norm": 0.0014508186141029, + "learning_rate": 8.003995979947885e-06, + "loss": 0.002, + "step": 158030 + }, + { + "epoch": 34.00593619671776, + "grad_norm": 1.067327618598938, + "learning_rate": 8.000986958902791e-06, + "loss": 0.1128, + "step": 158040 + }, + { + "epoch": 34.00599035909657, + "grad_norm": 0.0013213185593485832, + "learning_rate": 7.997977937857697e-06, + "loss": 0.0058, + "step": 158050 + }, + { + "epoch": 34.00604452147538, + "grad_norm": 0.0020751121919602156, + "learning_rate": 7.994968916812605e-06, + "loss": 0.0603, + "step": 158060 + }, + { + "epoch": 34.0060986838542, + "grad_norm": 0.0009980598697438836, + "learning_rate": 7.991959895767512e-06, + "loss": 0.2033, + "step": 158070 + }, + { + "epoch": 34.00615284623301, + "grad_norm": 0.9148399233818054, + "learning_rate": 7.98895087472242e-06, + "loss": 0.0693, + "step": 158080 + }, + { + "epoch": 34.00620700861182, + "grad_norm": 0.0014589462662115693, + "learning_rate": 7.985941853677326e-06, + "loss": 0.039, + "step": 158090 + }, + { + "epoch": 34.006261170990626, + "grad_norm": 4.920996189117432, + "learning_rate": 7.98293283263223e-06, + "loss": 0.0253, + "step": 158100 + }, + { + "epoch": 34.00631533336944, + "grad_norm": 0.00103968963958323, + "learning_rate": 7.979923811587138e-06, + "loss": 0.0057, + "step": 158110 + }, + { + "epoch": 34.00636949574825, + "grad_norm": 0.43729451298713684, + "learning_rate": 7.976914790542045e-06, + "loss": 0.0356, + "step": 158120 + }, + { + "epoch": 34.00642365812706, + "grad_norm": 0.001721117296256125, + "learning_rate": 7.973905769496953e-06, + "loss": 0.0315, + "step": 158130 + }, + { + "epoch": 34.00647782050588, + "grad_norm": 0.15581345558166504, + "learning_rate": 7.970896748451859e-06, + "loss": 0.0219, + "step": 158140 + }, + { + "epoch": 34.00653198288469, + "grad_norm": 0.0010494656162336469, + "learning_rate": 7.967887727406765e-06, + "loss": 0.0697, + "step": 158150 + }, + { + "epoch": 34.0065861452635, + "grad_norm": 1.0216915607452393, + "learning_rate": 7.964878706361673e-06, + "loss": 0.0397, + "step": 158160 + }, + { + "epoch": 34.006640307642314, + "grad_norm": 0.559333324432373, + "learning_rate": 7.96186968531658e-06, + "loss": 0.0576, + "step": 158170 + }, + { + "epoch": 34.006694470021124, + "grad_norm": 0.17186053097248077, + "learning_rate": 7.958860664271486e-06, + "loss": 0.04, + "step": 158180 + }, + { + "epoch": 34.00674863239993, + "grad_norm": 0.0012634812155738473, + "learning_rate": 7.955851643226394e-06, + "loss": 0.0052, + "step": 158190 + }, + { + "epoch": 34.00680279477875, + "grad_norm": 0.0012952120741829276, + "learning_rate": 7.9528426221813e-06, + "loss": 0.0072, + "step": 158200 + }, + { + "epoch": 34.00685695715756, + "grad_norm": 0.37707623839378357, + "learning_rate": 7.949833601136208e-06, + "loss": 0.0948, + "step": 158210 + }, + { + "epoch": 34.00691111953637, + "grad_norm": 0.019737843424081802, + "learning_rate": 7.946824580091114e-06, + "loss": 0.0023, + "step": 158220 
+ }, + { + "epoch": 34.00696528191518, + "grad_norm": 0.006738210562616587, + "learning_rate": 7.94381555904602e-06, + "loss": 0.0403, + "step": 158230 + }, + { + "epoch": 34.007019444293995, + "grad_norm": 0.001023398246616125, + "learning_rate": 7.940806538000928e-06, + "loss": 0.0535, + "step": 158240 + }, + { + "epoch": 34.007073606672805, + "grad_norm": 0.002350271213799715, + "learning_rate": 7.937797516955833e-06, + "loss": 0.0707, + "step": 158250 + }, + { + "epoch": 34.007127769051614, + "grad_norm": 0.0015269429422914982, + "learning_rate": 7.934788495910741e-06, + "loss": 0.0103, + "step": 158260 + }, + { + "epoch": 34.00718193143043, + "grad_norm": 0.003090689657256007, + "learning_rate": 7.931779474865647e-06, + "loss": 0.0003, + "step": 158270 + }, + { + "epoch": 34.00723609380924, + "grad_norm": 0.445303350687027, + "learning_rate": 7.928770453820554e-06, + "loss": 0.0055, + "step": 158280 + }, + { + "epoch": 34.00729025618805, + "grad_norm": 0.001013042638078332, + "learning_rate": 7.925761432775461e-06, + "loss": 0.0028, + "step": 158290 + }, + { + "epoch": 34.007344418566866, + "grad_norm": 0.0020030790474265814, + "learning_rate": 7.922752411730368e-06, + "loss": 0.0323, + "step": 158300 + }, + { + "epoch": 34.007398580945676, + "grad_norm": 0.0010225595906376839, + "learning_rate": 7.919743390685274e-06, + "loss": 0.0109, + "step": 158310 + }, + { + "epoch": 34.007452743324485, + "grad_norm": 0.0016673820791766047, + "learning_rate": 7.916734369640182e-06, + "loss": 0.0747, + "step": 158320 + }, + { + "epoch": 34.0075069057033, + "grad_norm": 1.2752245664596558, + "learning_rate": 7.913725348595088e-06, + "loss": 0.0371, + "step": 158330 + }, + { + "epoch": 34.00756106808211, + "grad_norm": 0.0010053551523014903, + "learning_rate": 7.910716327549996e-06, + "loss": 0.0211, + "step": 158340 + }, + { + "epoch": 34.00761523046092, + "grad_norm": 0.0010200508404523134, + "learning_rate": 7.907707306504903e-06, + "loss": 0.0012, + "step": 158350 + }, + { + "epoch": 34.00766939283973, + "grad_norm": 0.0036507819313555956, + "learning_rate": 7.904698285459809e-06, + "loss": 0.0642, + "step": 158360 + }, + { + "epoch": 34.00772355521855, + "grad_norm": 0.0014281170442700386, + "learning_rate": 7.901689264414717e-06, + "loss": 0.0507, + "step": 158370 + }, + { + "epoch": 34.00777771759736, + "grad_norm": 0.0012128683738410473, + "learning_rate": 7.898680243369623e-06, + "loss": 0.0378, + "step": 158380 + }, + { + "epoch": 34.007831879976166, + "grad_norm": 0.0010120332008227706, + "learning_rate": 7.89567122232453e-06, + "loss": 0.0005, + "step": 158390 + }, + { + "epoch": 34.00788604235498, + "grad_norm": 0.0014893636107444763, + "learning_rate": 7.892662201279436e-06, + "loss": 0.0963, + "step": 158400 + }, + { + "epoch": 34.00794020473379, + "grad_norm": 0.004627875983715057, + "learning_rate": 7.889653180234342e-06, + "loss": 0.0751, + "step": 158410 + }, + { + "epoch": 34.0079943671126, + "grad_norm": 0.0011776742758229375, + "learning_rate": 7.88664415918925e-06, + "loss": 0.0018, + "step": 158420 + }, + { + "epoch": 34.00804852949142, + "grad_norm": 0.0010758249554783106, + "learning_rate": 7.883635138144156e-06, + "loss": 0.0004, + "step": 158430 + }, + { + "epoch": 34.00810269187023, + "grad_norm": 0.09253310412168503, + "learning_rate": 7.880626117099062e-06, + "loss": 0.1242, + "step": 158440 + }, + { + "epoch": 34.00815685424904, + "grad_norm": 0.0025274031795561314, + "learning_rate": 7.87761709605397e-06, + "loss": 0.03, + "step": 158450 + }, + { + "epoch": 
34.00821101662785, + "grad_norm": 0.0011392806190997362, + "learning_rate": 7.874608075008877e-06, + "loss": 0.0209, + "step": 158460 + }, + { + "epoch": 34.00826517900666, + "grad_norm": 0.002204369055107236, + "learning_rate": 7.871599053963785e-06, + "loss": 0.0117, + "step": 158470 + }, + { + "epoch": 34.00831934138547, + "grad_norm": 0.0015467897756025195, + "learning_rate": 7.868590032918691e-06, + "loss": 0.0019, + "step": 158480 + }, + { + "epoch": 34.00837350376428, + "grad_norm": 0.0010677189566195011, + "learning_rate": 7.865581011873597e-06, + "loss": 0.0093, + "step": 158490 + }, + { + "epoch": 34.0084276661431, + "grad_norm": 0.026645002886652946, + "learning_rate": 7.862571990828505e-06, + "loss": 0.0068, + "step": 158500 + }, + { + "epoch": 34.00848182852191, + "grad_norm": 3.9584639072418213, + "learning_rate": 7.859562969783411e-06, + "loss": 0.0571, + "step": 158510 + }, + { + "epoch": 34.00853599090072, + "grad_norm": 0.001056249369867146, + "learning_rate": 7.856553948738318e-06, + "loss": 0.0463, + "step": 158520 + }, + { + "epoch": 34.008590153279535, + "grad_norm": 0.001081750262528658, + "learning_rate": 7.853544927693226e-06, + "loss": 0.0021, + "step": 158530 + }, + { + "epoch": 34.008644315658344, + "grad_norm": 0.04186325892806053, + "learning_rate": 7.850535906648132e-06, + "loss": 0.0047, + "step": 158540 + }, + { + "epoch": 34.008698478037154, + "grad_norm": 0.2118605673313141, + "learning_rate": 7.847526885603038e-06, + "loss": 0.0228, + "step": 158550 + }, + { + "epoch": 34.00875264041597, + "grad_norm": 0.0013233169447630644, + "learning_rate": 7.844517864557944e-06, + "loss": 0.08, + "step": 158560 + }, + { + "epoch": 34.00880680279478, + "grad_norm": 0.2708684802055359, + "learning_rate": 7.84150884351285e-06, + "loss": 0.1038, + "step": 158570 + }, + { + "epoch": 34.00886096517359, + "grad_norm": 0.6543224453926086, + "learning_rate": 7.838499822467759e-06, + "loss": 0.061, + "step": 158580 + }, + { + "epoch": 34.0089151275524, + "grad_norm": 0.8801040053367615, + "learning_rate": 7.835490801422665e-06, + "loss": 0.0581, + "step": 158590 + }, + { + "epoch": 34.008969289931215, + "grad_norm": 3.7114341259002686, + "learning_rate": 7.832481780377573e-06, + "loss": 0.0892, + "step": 158600 + }, + { + "epoch": 34.009023452310025, + "grad_norm": 1.5007774829864502, + "learning_rate": 7.82947275933248e-06, + "loss": 0.022, + "step": 158610 + }, + { + "epoch": 34.009077614688835, + "grad_norm": 2.1185970306396484, + "learning_rate": 7.826463738287385e-06, + "loss": 0.0392, + "step": 158620 + }, + { + "epoch": 34.00913177706765, + "grad_norm": 0.0011174156097695231, + "learning_rate": 7.823454717242293e-06, + "loss": 0.0354, + "step": 158630 + }, + { + "epoch": 34.00918593944646, + "grad_norm": 0.004015900194644928, + "learning_rate": 7.8204456961972e-06, + "loss": 0.0332, + "step": 158640 + }, + { + "epoch": 34.00924010182527, + "grad_norm": 0.060886453837156296, + "learning_rate": 7.817436675152106e-06, + "loss": 0.0876, + "step": 158650 + }, + { + "epoch": 34.00929426420409, + "grad_norm": 0.0015944552142173052, + "learning_rate": 7.814427654107014e-06, + "loss": 0.0427, + "step": 158660 + }, + { + "epoch": 34.009348426582896, + "grad_norm": 0.4056573510169983, + "learning_rate": 7.81141863306192e-06, + "loss": 0.0169, + "step": 158670 + }, + { + "epoch": 34.009402588961706, + "grad_norm": 1.3432813882827759, + "learning_rate": 7.808409612016827e-06, + "loss": 0.0199, + "step": 158680 + }, + { + "epoch": 34.00945675134052, + "grad_norm": 5.503861904144287, 
+ "learning_rate": 7.805400590971734e-06, + "loss": 0.0888, + "step": 158690 + }, + { + "epoch": 34.00951091371933, + "grad_norm": 0.0020281102042645216, + "learning_rate": 7.802391569926639e-06, + "loss": 0.1542, + "step": 158700 + }, + { + "epoch": 34.00956507609814, + "grad_norm": 0.6766178011894226, + "learning_rate": 7.799382548881547e-06, + "loss": 0.0069, + "step": 158710 + }, + { + "epoch": 34.00961923847695, + "grad_norm": 0.0015961858443915844, + "learning_rate": 7.796373527836453e-06, + "loss": 0.0087, + "step": 158720 + }, + { + "epoch": 34.00967340085577, + "grad_norm": 0.4475398063659668, + "learning_rate": 7.793364506791361e-06, + "loss": 0.0279, + "step": 158730 + }, + { + "epoch": 34.00972756323458, + "grad_norm": 0.032374266535043716, + "learning_rate": 7.790355485746268e-06, + "loss": 0.0135, + "step": 158740 + }, + { + "epoch": 34.00978172561339, + "grad_norm": 0.0037260642275214195, + "learning_rate": 7.787346464701174e-06, + "loss": 0.0568, + "step": 158750 + }, + { + "epoch": 34.0098358879922, + "grad_norm": 5.684840679168701, + "learning_rate": 7.784337443656082e-06, + "loss": 0.1311, + "step": 158760 + }, + { + "epoch": 34.00989005037101, + "grad_norm": 0.04905104264616966, + "learning_rate": 7.781328422610988e-06, + "loss": 0.0391, + "step": 158770 + }, + { + "epoch": 34.00994421274982, + "grad_norm": 0.003085602540522814, + "learning_rate": 7.778319401565894e-06, + "loss": 0.0128, + "step": 158780 + }, + { + "epoch": 34.00999837512864, + "grad_norm": 0.0017507505835965276, + "learning_rate": 7.775310380520802e-06, + "loss": 0.0992, + "step": 158790 + }, + { + "epoch": 34.01005253750745, + "grad_norm": 0.0016228773165494204, + "learning_rate": 7.772301359475709e-06, + "loss": 0.0194, + "step": 158800 + }, + { + "epoch": 34.01010669988626, + "grad_norm": 3.0416290760040283, + "learning_rate": 7.769292338430617e-06, + "loss": 0.0411, + "step": 158810 + }, + { + "epoch": 34.01016086226507, + "grad_norm": 0.0012978498125448823, + "learning_rate": 7.766283317385523e-06, + "loss": 0.013, + "step": 158820 + }, + { + "epoch": 34.010215024643884, + "grad_norm": 0.0017299832543358207, + "learning_rate": 7.763274296340429e-06, + "loss": 0.1164, + "step": 158830 + }, + { + "epoch": 34.01026918702269, + "grad_norm": 1.0348985195159912, + "learning_rate": 7.760265275295337e-06, + "loss": 0.0295, + "step": 158840 + }, + { + "epoch": 34.0103233494015, + "grad_norm": 0.6745988726615906, + "learning_rate": 7.757256254250242e-06, + "loss": 0.0227, + "step": 158850 + }, + { + "epoch": 34.01037751178032, + "grad_norm": 0.10444007068872452, + "learning_rate": 7.75424723320515e-06, + "loss": 0.0442, + "step": 158860 + }, + { + "epoch": 34.01043167415913, + "grad_norm": 0.08162540942430496, + "learning_rate": 7.751238212160056e-06, + "loss": 0.0629, + "step": 158870 + }, + { + "epoch": 34.01048583653794, + "grad_norm": 0.7980866432189941, + "learning_rate": 7.748229191114962e-06, + "loss": 0.0414, + "step": 158880 + }, + { + "epoch": 34.010539998916755, + "grad_norm": 0.010193288326263428, + "learning_rate": 7.74522017006987e-06, + "loss": 0.0041, + "step": 158890 + }, + { + "epoch": 34.010594161295565, + "grad_norm": 0.0012798503739759326, + "learning_rate": 7.742211149024776e-06, + "loss": 0.038, + "step": 158900 + }, + { + "epoch": 34.010648323674374, + "grad_norm": 0.18921782076358795, + "learning_rate": 7.739202127979683e-06, + "loss": 0.0033, + "step": 158910 + }, + { + "epoch": 34.01070248605319, + "grad_norm": 0.0035608168691396713, + "learning_rate": 7.73619310693459e-06, + 
"loss": 0.0209, + "step": 158920 + }, + { + "epoch": 34.010756648432, + "grad_norm": 0.0014111138880252838, + "learning_rate": 7.733184085889497e-06, + "loss": 0.0306, + "step": 158930 + }, + { + "epoch": 34.01081081081081, + "grad_norm": 0.2271648347377777, + "learning_rate": 7.730175064844405e-06, + "loss": 0.002, + "step": 158940 + }, + { + "epoch": 34.01086497318962, + "grad_norm": 0.0011569318594411016, + "learning_rate": 7.727166043799311e-06, + "loss": 0.0857, + "step": 158950 + }, + { + "epoch": 34.010919135568436, + "grad_norm": 0.0014974152436479926, + "learning_rate": 7.724157022754217e-06, + "loss": 0.0018, + "step": 158960 + }, + { + "epoch": 34.010973297947245, + "grad_norm": 0.0016959606437012553, + "learning_rate": 7.721148001709125e-06, + "loss": 0.009, + "step": 158970 + }, + { + "epoch": 34.011027460326055, + "grad_norm": 0.8590657711029053, + "learning_rate": 7.718138980664032e-06, + "loss": 0.0186, + "step": 158980 + }, + { + "epoch": 34.01108162270487, + "grad_norm": 7.56602668762207, + "learning_rate": 7.715129959618938e-06, + "loss": 0.0742, + "step": 158990 + }, + { + "epoch": 34.01113578508368, + "grad_norm": 0.0011030214373022318, + "learning_rate": 7.712120938573844e-06, + "loss": 0.0136, + "step": 159000 + }, + { + "epoch": 34.01118994746249, + "grad_norm": 6.312027454376221, + "learning_rate": 7.70911191752875e-06, + "loss": 0.0376, + "step": 159010 + }, + { + "epoch": 34.01124410984131, + "grad_norm": 0.6092588901519775, + "learning_rate": 7.706102896483658e-06, + "loss": 0.0408, + "step": 159020 + }, + { + "epoch": 34.01129827222012, + "grad_norm": 0.0153201287612319, + "learning_rate": 7.703093875438565e-06, + "loss": 0.0226, + "step": 159030 + }, + { + "epoch": 34.011352434598926, + "grad_norm": 0.13736392557621002, + "learning_rate": 7.700084854393471e-06, + "loss": 0.0153, + "step": 159040 + }, + { + "epoch": 34.011406596977736, + "grad_norm": 0.0024867712054401636, + "learning_rate": 7.697075833348379e-06, + "loss": 0.0001, + "step": 159050 + }, + { + "epoch": 34.01146075935655, + "grad_norm": 0.0010502777295187116, + "learning_rate": 7.694066812303285e-06, + "loss": 0.0721, + "step": 159060 + }, + { + "epoch": 34.01151492173536, + "grad_norm": 1.0285851955413818, + "learning_rate": 7.691057791258193e-06, + "loss": 0.0673, + "step": 159070 + }, + { + "epoch": 34.01156908411417, + "grad_norm": 0.033308688551187515, + "learning_rate": 7.6880487702131e-06, + "loss": 0.0065, + "step": 159080 + }, + { + "epoch": 34.01162324649299, + "grad_norm": 0.9238662123680115, + "learning_rate": 7.685039749168006e-06, + "loss": 0.0574, + "step": 159090 + }, + { + "epoch": 34.0116774088718, + "grad_norm": 1.0690008401870728, + "learning_rate": 7.682030728122914e-06, + "loss": 0.0694, + "step": 159100 + }, + { + "epoch": 34.01173157125061, + "grad_norm": 0.0012801331467926502, + "learning_rate": 7.67902170707782e-06, + "loss": 0.1341, + "step": 159110 + }, + { + "epoch": 34.011785733629424, + "grad_norm": 3.5251424312591553, + "learning_rate": 7.676012686032726e-06, + "loss": 0.087, + "step": 159120 + }, + { + "epoch": 34.01183989600823, + "grad_norm": 4.751280784606934, + "learning_rate": 7.673003664987634e-06, + "loss": 0.0927, + "step": 159130 + }, + { + "epoch": 34.01189405838704, + "grad_norm": 1.0497225522994995, + "learning_rate": 7.66999464394254e-06, + "loss": 0.0477, + "step": 159140 + }, + { + "epoch": 34.01194822076586, + "grad_norm": 0.002341305138543248, + "learning_rate": 7.666985622897447e-06, + "loss": 0.0079, + "step": 159150 + }, + { + "epoch": 
34.01200238314467, + "grad_norm": 0.0013796964194625616, + "learning_rate": 7.663976601852353e-06, + "loss": 0.0292, + "step": 159160 + }, + { + "epoch": 34.01205654552348, + "grad_norm": 1.7231367826461792, + "learning_rate": 7.66096758080726e-06, + "loss": 0.0425, + "step": 159170 + }, + { + "epoch": 34.01211070790229, + "grad_norm": 0.0011561318533495069, + "learning_rate": 7.657958559762167e-06, + "loss": 0.0081, + "step": 159180 + }, + { + "epoch": 34.012164870281104, + "grad_norm": 0.022832704707980156, + "learning_rate": 7.654949538717074e-06, + "loss": 0.0633, + "step": 159190 + }, + { + "epoch": 34.012219032659914, + "grad_norm": 0.6375668048858643, + "learning_rate": 7.651940517671982e-06, + "loss": 0.0191, + "step": 159200 + }, + { + "epoch": 34.01227319503872, + "grad_norm": 0.01943417452275753, + "learning_rate": 7.648931496626888e-06, + "loss": 0.0057, + "step": 159210 + }, + { + "epoch": 34.01232735741754, + "grad_norm": 0.0024469648487865925, + "learning_rate": 7.645922475581794e-06, + "loss": 0.018, + "step": 159220 + }, + { + "epoch": 34.01238151979635, + "grad_norm": 0.85723477602005, + "learning_rate": 7.642913454536702e-06, + "loss": 0.0206, + "step": 159230 + }, + { + "epoch": 34.01243568217516, + "grad_norm": 0.001157450140453875, + "learning_rate": 7.639904433491608e-06, + "loss": 0.0061, + "step": 159240 + }, + { + "epoch": 34.012489844553976, + "grad_norm": 0.11244355887174606, + "learning_rate": 7.636895412446515e-06, + "loss": 0.046, + "step": 159250 + }, + { + "epoch": 34.012544006932785, + "grad_norm": 0.0017242225585505366, + "learning_rate": 7.633886391401423e-06, + "loss": 0.0095, + "step": 159260 + }, + { + "epoch": 34.012598169311595, + "grad_norm": 0.001483156462199986, + "learning_rate": 7.630877370356329e-06, + "loss": 0.005, + "step": 159270 + }, + { + "epoch": 34.01265233169041, + "grad_norm": 0.014208323322236538, + "learning_rate": 7.627868349311236e-06, + "loss": 0.0002, + "step": 159280 + }, + { + "epoch": 34.01270649406922, + "grad_norm": 0.0016504410887137055, + "learning_rate": 7.624859328266143e-06, + "loss": 0.009, + "step": 159290 + }, + { + "epoch": 34.01276065644803, + "grad_norm": 0.0011050639441236854, + "learning_rate": 7.6218503072210486e-06, + "loss": 0.0163, + "step": 159300 + }, + { + "epoch": 34.01281481882684, + "grad_norm": 0.7949632406234741, + "learning_rate": 7.618841286175956e-06, + "loss": 0.0243, + "step": 159310 + }, + { + "epoch": 34.012868981205656, + "grad_norm": 0.004894562065601349, + "learning_rate": 7.615832265130862e-06, + "loss": 0.0104, + "step": 159320 + }, + { + "epoch": 34.012923143584466, + "grad_norm": 0.0017246479401364923, + "learning_rate": 7.612823244085769e-06, + "loss": 0.0185, + "step": 159330 + }, + { + "epoch": 34.012977305963275, + "grad_norm": 0.0015627131797373295, + "learning_rate": 7.609814223040676e-06, + "loss": 0.0151, + "step": 159340 + }, + { + "epoch": 34.01303146834209, + "grad_norm": 0.0012826838064938784, + "learning_rate": 7.606805201995583e-06, + "loss": 0.0078, + "step": 159350 + }, + { + "epoch": 34.0130856307209, + "grad_norm": 0.0017866907874122262, + "learning_rate": 7.60379618095049e-06, + "loss": 0.011, + "step": 159360 + }, + { + "epoch": 34.01313979309971, + "grad_norm": 0.0010974769247695804, + "learning_rate": 7.600787159905397e-06, + "loss": 0.0003, + "step": 159370 + }, + { + "epoch": 34.01319395547853, + "grad_norm": 0.8730658292770386, + "learning_rate": 7.597778138860304e-06, + "loss": 0.0604, + "step": 159380 + }, + { + "epoch": 34.01324811785734, + "grad_norm": 
0.00248124310746789, + "learning_rate": 7.594769117815211e-06, + "loss": 0.1041, + "step": 159390 + }, + { + "epoch": 34.01330228023615, + "grad_norm": 0.005295139737427235, + "learning_rate": 7.591760096770117e-06, + "loss": 0.0286, + "step": 159400 + }, + { + "epoch": 34.013356442614956, + "grad_norm": 0.3793982267379761, + "learning_rate": 7.588751075725024e-06, + "loss": 0.0221, + "step": 159410 + }, + { + "epoch": 34.01341060499377, + "grad_norm": 0.0013708516489714384, + "learning_rate": 7.5857420546799315e-06, + "loss": 0.0132, + "step": 159420 + }, + { + "epoch": 34.01346476737258, + "grad_norm": 0.0014686745125800371, + "learning_rate": 7.582733033634838e-06, + "loss": 0.0687, + "step": 159430 + }, + { + "epoch": 34.01351892975139, + "grad_norm": 0.3341088593006134, + "learning_rate": 7.579724012589745e-06, + "loss": 0.0016, + "step": 159440 + }, + { + "epoch": 34.01357309213021, + "grad_norm": 0.0010706832399591804, + "learning_rate": 7.57671499154465e-06, + "loss": 0.0401, + "step": 159450 + }, + { + "epoch": 34.01362725450902, + "grad_norm": 0.0015882485313341022, + "learning_rate": 7.5737059704995574e-06, + "loss": 0.0785, + "step": 159460 + }, + { + "epoch": 34.01368141688783, + "grad_norm": 0.0014031323371455073, + "learning_rate": 7.5706969494544646e-06, + "loss": 0.0497, + "step": 159470 + }, + { + "epoch": 34.013735579266644, + "grad_norm": 0.05028209462761879, + "learning_rate": 7.567687928409372e-06, + "loss": 0.0588, + "step": 159480 + }, + { + "epoch": 34.01378974164545, + "grad_norm": 0.027003809809684753, + "learning_rate": 7.564678907364278e-06, + "loss": 0.0026, + "step": 159490 + }, + { + "epoch": 34.01384390402426, + "grad_norm": 0.8831090331077576, + "learning_rate": 7.561669886319185e-06, + "loss": 0.0408, + "step": 159500 + }, + { + "epoch": 34.01389806640308, + "grad_norm": 0.0046325670555233955, + "learning_rate": 7.558660865274092e-06, + "loss": 0.0535, + "step": 159510 + }, + { + "epoch": 34.01395222878189, + "grad_norm": 0.1690148264169693, + "learning_rate": 7.555651844228999e-06, + "loss": 0.0726, + "step": 159520 + }, + { + "epoch": 34.0140063911607, + "grad_norm": 0.0018253033049404621, + "learning_rate": 7.552642823183906e-06, + "loss": 0.0297, + "step": 159530 + }, + { + "epoch": 34.01406055353951, + "grad_norm": 0.03142261877655983, + "learning_rate": 7.549633802138813e-06, + "loss": 0.0009, + "step": 159540 + }, + { + "epoch": 34.014114715918325, + "grad_norm": 0.0034051539842039347, + "learning_rate": 7.54662478109372e-06, + "loss": 0.0002, + "step": 159550 + }, + { + "epoch": 34.014168878297134, + "grad_norm": 0.0011452308390289545, + "learning_rate": 7.543615760048626e-06, + "loss": 0.0187, + "step": 159560 + }, + { + "epoch": 34.014223040675944, + "grad_norm": 0.00277312402613461, + "learning_rate": 7.540606739003533e-06, + "loss": 0.0296, + "step": 159570 + }, + { + "epoch": 34.01427720305476, + "grad_norm": 0.0016881724586710334, + "learning_rate": 7.53759771795844e-06, + "loss": 0.0269, + "step": 159580 + }, + { + "epoch": 34.01433136543357, + "grad_norm": 0.001077491557225585, + "learning_rate": 7.5345886969133475e-06, + "loss": 0.0626, + "step": 159590 + }, + { + "epoch": 34.01438552781238, + "grad_norm": 0.002048599999397993, + "learning_rate": 7.531579675868253e-06, + "loss": 0.0641, + "step": 159600 + }, + { + "epoch": 34.014439690191196, + "grad_norm": 0.0017423706594854593, + "learning_rate": 7.52857065482316e-06, + "loss": 0.0073, + "step": 159610 + }, + { + "epoch": 34.014493852570006, + "grad_norm": 0.0017289647366851568, + 
"learning_rate": 7.525561633778066e-06, + "loss": 0.0145, + "step": 159620 + }, + { + "epoch": 34.014548014948815, + "grad_norm": 0.09573765844106674, + "learning_rate": 7.5225526127329734e-06, + "loss": 0.0227, + "step": 159630 + }, + { + "epoch": 34.01460217732763, + "grad_norm": 0.0010619384702295065, + "learning_rate": 7.5195435916878805e-06, + "loss": 0.061, + "step": 159640 + }, + { + "epoch": 34.01465633970644, + "grad_norm": 2.6326863765716553, + "learning_rate": 7.516534570642788e-06, + "loss": 0.0304, + "step": 159650 + }, + { + "epoch": 34.01471050208525, + "grad_norm": 0.0013370151864364743, + "learning_rate": 7.513525549597694e-06, + "loss": 0.0619, + "step": 159660 + }, + { + "epoch": 34.01476466446406, + "grad_norm": 0.0022734233643859625, + "learning_rate": 7.510516528552601e-06, + "loss": 0.0058, + "step": 159670 + }, + { + "epoch": 34.01481882684288, + "grad_norm": 0.0011839504586532712, + "learning_rate": 7.507507507507508e-06, + "loss": 0.0971, + "step": 159680 + }, + { + "epoch": 34.014872989221686, + "grad_norm": 0.0020638976711779833, + "learning_rate": 7.5044984864624145e-06, + "loss": 0.0138, + "step": 159690 + }, + { + "epoch": 34.014927151600496, + "grad_norm": 0.4317655861377716, + "learning_rate": 7.501489465417322e-06, + "loss": 0.0442, + "step": 159700 + }, + { + "epoch": 34.01498131397931, + "grad_norm": 1.2655788660049438, + "learning_rate": 7.498480444372229e-06, + "loss": 0.028, + "step": 159710 + }, + { + "epoch": 34.01503547635812, + "grad_norm": 0.0022211303003132343, + "learning_rate": 7.495471423327136e-06, + "loss": 0.0016, + "step": 159720 + }, + { + "epoch": 34.01508963873693, + "grad_norm": 0.0021419143304228783, + "learning_rate": 7.492462402282042e-06, + "loss": 0.0305, + "step": 159730 + }, + { + "epoch": 34.01514380111575, + "grad_norm": 0.002011561766266823, + "learning_rate": 7.489453381236949e-06, + "loss": 0.0244, + "step": 159740 + }, + { + "epoch": 34.01519796349456, + "grad_norm": 0.0016535629983991385, + "learning_rate": 7.486444360191855e-06, + "loss": 0.0001, + "step": 159750 + }, + { + "epoch": 34.01525212587337, + "grad_norm": 0.0011396671179682016, + "learning_rate": 7.483435339146762e-06, + "loss": 0.0022, + "step": 159760 + }, + { + "epoch": 34.01530628825218, + "grad_norm": 9.377391815185547, + "learning_rate": 7.480426318101669e-06, + "loss": 0.0532, + "step": 159770 + }, + { + "epoch": 34.01536045063099, + "grad_norm": 0.0017988004256039858, + "learning_rate": 7.477417297056576e-06, + "loss": 0.0111, + "step": 159780 + }, + { + "epoch": 34.0154146130098, + "grad_norm": 0.0010606002761051059, + "learning_rate": 7.474408276011482e-06, + "loss": 0.0002, + "step": 159790 + }, + { + "epoch": 34.01546877538861, + "grad_norm": 0.003212760202586651, + "learning_rate": 7.471399254966389e-06, + "loss": 0.0177, + "step": 159800 + }, + { + "epoch": 34.01552293776743, + "grad_norm": 0.05825715512037277, + "learning_rate": 7.4683902339212965e-06, + "loss": 0.0003, + "step": 159810 + }, + { + "epoch": 34.01557710014624, + "grad_norm": 0.6676540374755859, + "learning_rate": 7.465381212876204e-06, + "loss": 0.044, + "step": 159820 + }, + { + "epoch": 34.01563126252505, + "grad_norm": 0.0030618072487413883, + "learning_rate": 7.46237219183111e-06, + "loss": 0.0142, + "step": 159830 + }, + { + "epoch": 34.015685424903864, + "grad_norm": 0.09377098828554153, + "learning_rate": 7.459363170786017e-06, + "loss": 0.0655, + "step": 159840 + }, + { + "epoch": 34.015739587282674, + "grad_norm": 0.0010038117179647088, + "learning_rate": 
7.456354149740924e-06, + "loss": 0.0092, + "step": 159850 + }, + { + "epoch": 34.01579374966148, + "grad_norm": 2.46450138092041, + "learning_rate": 7.4533451286958304e-06, + "loss": 0.0974, + "step": 159860 + }, + { + "epoch": 34.0158479120403, + "grad_norm": 0.004041695035994053, + "learning_rate": 7.4503361076507376e-06, + "loss": 0.0063, + "step": 159870 + }, + { + "epoch": 34.01590207441911, + "grad_norm": 0.001039921771734953, + "learning_rate": 7.447327086605645e-06, + "loss": 0.0521, + "step": 159880 + }, + { + "epoch": 34.01595623679792, + "grad_norm": 0.001094689592719078, + "learning_rate": 7.444318065560552e-06, + "loss": 0.0021, + "step": 159890 + }, + { + "epoch": 34.01601039917673, + "grad_norm": 0.00102247204631567, + "learning_rate": 7.441309044515457e-06, + "loss": 0.0, + "step": 159900 + }, + { + "epoch": 34.016064561555545, + "grad_norm": 0.0013295894023030996, + "learning_rate": 7.438300023470364e-06, + "loss": 0.0879, + "step": 159910 + }, + { + "epoch": 34.016118723934355, + "grad_norm": 2.8926773071289062, + "learning_rate": 7.435291002425271e-06, + "loss": 0.0514, + "step": 159920 + }, + { + "epoch": 34.016172886313164, + "grad_norm": 0.001803962397389114, + "learning_rate": 7.432281981380178e-06, + "loss": 0.0171, + "step": 159930 + }, + { + "epoch": 34.01622704869198, + "grad_norm": 1.5274500846862793, + "learning_rate": 7.429272960335085e-06, + "loss": 0.0308, + "step": 159940 + }, + { + "epoch": 34.01628121107079, + "grad_norm": 0.0013026727829128504, + "learning_rate": 7.426263939289992e-06, + "loss": 0.0322, + "step": 159950 + }, + { + "epoch": 34.0163353734496, + "grad_norm": 0.7697742581367493, + "learning_rate": 7.423254918244898e-06, + "loss": 0.0072, + "step": 159960 + }, + { + "epoch": 34.01638953582842, + "grad_norm": 0.0013097964692860842, + "learning_rate": 7.420245897199805e-06, + "loss": 0.0279, + "step": 159970 + }, + { + "epoch": 34.016443698207226, + "grad_norm": 0.23760850727558136, + "learning_rate": 7.4172368761547125e-06, + "loss": 0.018, + "step": 159980 + }, + { + "epoch": 34.016497860586036, + "grad_norm": 0.001150784082710743, + "learning_rate": 7.414227855109619e-06, + "loss": 0.0079, + "step": 159990 + }, + { + "epoch": 34.01655202296485, + "grad_norm": 3.2122998237609863, + "learning_rate": 7.411218834064526e-06, + "loss": 0.0576, + "step": 160000 + }, + { + "epoch": 34.01660618534366, + "grad_norm": 0.0017172322841361165, + "learning_rate": 7.408209813019433e-06, + "loss": 0.0029, + "step": 160010 + }, + { + "epoch": 34.01666034772247, + "grad_norm": 0.9287866950035095, + "learning_rate": 7.40520079197434e-06, + "loss": 0.0161, + "step": 160020 + }, + { + "epoch": 34.01671451010128, + "grad_norm": 0.015080994926393032, + "learning_rate": 7.4021917709292464e-06, + "loss": 0.0722, + "step": 160030 + }, + { + "epoch": 34.0167686724801, + "grad_norm": 2.1804137229919434, + "learning_rate": 7.3991827498841536e-06, + "loss": 0.0919, + "step": 160040 + }, + { + "epoch": 34.01682283485891, + "grad_norm": 0.0012083571637049317, + "learning_rate": 7.396173728839059e-06, + "loss": 0.0469, + "step": 160050 + }, + { + "epoch": 34.016876997237716, + "grad_norm": 0.05324240401387215, + "learning_rate": 7.393164707793966e-06, + "loss": 0.0677, + "step": 160060 + }, + { + "epoch": 34.01693115961653, + "grad_norm": 0.0024103051982820034, + "learning_rate": 7.390155686748873e-06, + "loss": 0.0013, + "step": 160070 + }, + { + "epoch": 34.01698532199534, + "grad_norm": 1.9432703256607056, + "learning_rate": 7.38714666570378e-06, + "loss": 0.0084, + 
"step": 160080 + }, + { + "epoch": 34.01703948437415, + "grad_norm": 0.001122900634072721, + "learning_rate": 7.384137644658687e-06, + "loss": 0.0254, + "step": 160090 + }, + { + "epoch": 34.01709364675297, + "grad_norm": 0.018902922049164772, + "learning_rate": 7.381128623613594e-06, + "loss": 0.0412, + "step": 160100 + }, + { + "epoch": 34.01714780913178, + "grad_norm": 0.002389948582276702, + "learning_rate": 7.378119602568501e-06, + "loss": 0.0682, + "step": 160110 + }, + { + "epoch": 34.01720197151059, + "grad_norm": 0.017387637868523598, + "learning_rate": 7.375110581523407e-06, + "loss": 0.037, + "step": 160120 + }, + { + "epoch": 34.0172561338894, + "grad_norm": 0.2038542926311493, + "learning_rate": 7.372101560478314e-06, + "loss": 0.0173, + "step": 160130 + }, + { + "epoch": 34.017310296268214, + "grad_norm": 0.0014629188226535916, + "learning_rate": 7.369092539433221e-06, + "loss": 0.0252, + "step": 160140 + }, + { + "epoch": 34.01736445864702, + "grad_norm": 0.0014503146521747112, + "learning_rate": 7.3660835183881285e-06, + "loss": 0.0091, + "step": 160150 + }, + { + "epoch": 34.01741862102583, + "grad_norm": 0.0010369695955887437, + "learning_rate": 7.363074497343035e-06, + "loss": 0.0215, + "step": 160160 + }, + { + "epoch": 34.01747278340465, + "grad_norm": 0.6734838485717773, + "learning_rate": 7.360065476297942e-06, + "loss": 0.0191, + "step": 160170 + }, + { + "epoch": 34.01752694578346, + "grad_norm": 0.0027500137221068144, + "learning_rate": 7.357056455252849e-06, + "loss": 0.0257, + "step": 160180 + }, + { + "epoch": 34.01758110816227, + "grad_norm": 0.034631967544555664, + "learning_rate": 7.354047434207756e-06, + "loss": 0.0876, + "step": 160190 + }, + { + "epoch": 34.017635270541085, + "grad_norm": 0.0010852599516510963, + "learning_rate": 7.3510384131626616e-06, + "loss": 0.0125, + "step": 160200 + }, + { + "epoch": 34.017689432919894, + "grad_norm": 0.001040023984387517, + "learning_rate": 7.348029392117569e-06, + "loss": 0.0496, + "step": 160210 + }, + { + "epoch": 34.017743595298704, + "grad_norm": 0.05824693292379379, + "learning_rate": 7.345020371072475e-06, + "loss": 0.0141, + "step": 160220 + }, + { + "epoch": 34.01779775767752, + "grad_norm": 0.7580000758171082, + "learning_rate": 7.342011350027382e-06, + "loss": 0.0456, + "step": 160230 + }, + { + "epoch": 34.01785192005633, + "grad_norm": 0.0012344319839030504, + "learning_rate": 7.339002328982289e-06, + "loss": 0.0002, + "step": 160240 + }, + { + "epoch": 34.01790608243514, + "grad_norm": 0.01898929663002491, + "learning_rate": 7.335993307937196e-06, + "loss": 0.0179, + "step": 160250 + }, + { + "epoch": 34.01796024481395, + "grad_norm": 0.0015962577890604734, + "learning_rate": 7.332984286892103e-06, + "loss": 0.0555, + "step": 160260 + }, + { + "epoch": 34.018014407192766, + "grad_norm": 0.001290945685468614, + "learning_rate": 7.32997526584701e-06, + "loss": 0.0221, + "step": 160270 + }, + { + "epoch": 34.018068569571575, + "grad_norm": 0.0010609597666189075, + "learning_rate": 7.326966244801917e-06, + "loss": 0.0393, + "step": 160280 + }, + { + "epoch": 34.018122731950385, + "grad_norm": 9.752299308776855, + "learning_rate": 7.323957223756823e-06, + "loss": 0.1339, + "step": 160290 + }, + { + "epoch": 34.0181768943292, + "grad_norm": 0.001238293363712728, + "learning_rate": 7.32094820271173e-06, + "loss": 0.0333, + "step": 160300 + }, + { + "epoch": 34.01823105670801, + "grad_norm": 0.4496288299560547, + "learning_rate": 7.317939181666637e-06, + "loss": 0.172, + "step": 160310 + }, + { + "epoch": 
34.01828521908682, + "grad_norm": 0.0013058667536824942, + "learning_rate": 7.3149301606215445e-06, + "loss": 0.0238, + "step": 160320 + }, + { + "epoch": 34.01833938146564, + "grad_norm": 0.0011502186534926295, + "learning_rate": 7.311921139576451e-06, + "loss": 0.0043, + "step": 160330 + }, + { + "epoch": 34.01839354384445, + "grad_norm": 0.01566176861524582, + "learning_rate": 7.308912118531358e-06, + "loss": 0.0539, + "step": 160340 + }, + { + "epoch": 34.018447706223256, + "grad_norm": 0.0021289715077728033, + "learning_rate": 7.305903097486263e-06, + "loss": 0.0888, + "step": 160350 + }, + { + "epoch": 34.018501868602065, + "grad_norm": 0.0010527330450713634, + "learning_rate": 7.3028940764411704e-06, + "loss": 0.0437, + "step": 160360 + }, + { + "epoch": 34.01855603098088, + "grad_norm": 0.003078813198953867, + "learning_rate": 7.2998850553960776e-06, + "loss": 0.0878, + "step": 160370 + }, + { + "epoch": 34.01861019335969, + "grad_norm": 0.817530632019043, + "learning_rate": 7.296876034350985e-06, + "loss": 0.0594, + "step": 160380 + }, + { + "epoch": 34.0186643557385, + "grad_norm": 0.0022694251965731382, + "learning_rate": 7.293867013305891e-06, + "loss": 0.0145, + "step": 160390 + }, + { + "epoch": 34.01871851811732, + "grad_norm": 0.27025261521339417, + "learning_rate": 7.290857992260798e-06, + "loss": 0.0598, + "step": 160400 + }, + { + "epoch": 34.01877268049613, + "grad_norm": 0.0012683062814176083, + "learning_rate": 7.287848971215705e-06, + "loss": 0.0409, + "step": 160410 + }, + { + "epoch": 34.01882684287494, + "grad_norm": 5.365365505218506, + "learning_rate": 7.2848399501706115e-06, + "loss": 0.0891, + "step": 160420 + }, + { + "epoch": 34.01888100525375, + "grad_norm": 0.34596702456474304, + "learning_rate": 7.281830929125519e-06, + "loss": 0.0599, + "step": 160430 + }, + { + "epoch": 34.01893516763256, + "grad_norm": 0.001685267430730164, + "learning_rate": 7.278821908080426e-06, + "loss": 0.0079, + "step": 160440 + }, + { + "epoch": 34.01898933001137, + "grad_norm": 0.0010888274991884828, + "learning_rate": 7.275812887035333e-06, + "loss": 0.0357, + "step": 160450 + }, + { + "epoch": 34.01904349239019, + "grad_norm": 0.8599717020988464, + "learning_rate": 7.272803865990239e-06, + "loss": 0.0792, + "step": 160460 + }, + { + "epoch": 34.019097654769, + "grad_norm": 1.4259259700775146, + "learning_rate": 7.269794844945146e-06, + "loss": 0.0696, + "step": 160470 + }, + { + "epoch": 34.01915181714781, + "grad_norm": 0.28948289155960083, + "learning_rate": 7.266785823900053e-06, + "loss": 0.024, + "step": 160480 + }, + { + "epoch": 34.01920597952662, + "grad_norm": 0.010182971134781837, + "learning_rate": 7.2637768028549605e-06, + "loss": 0.0204, + "step": 160490 + }, + { + "epoch": 34.019260141905434, + "grad_norm": 0.39846858382225037, + "learning_rate": 7.260767781809866e-06, + "loss": 0.0911, + "step": 160500 + }, + { + "epoch": 34.019314304284244, + "grad_norm": 0.0010603173868730664, + "learning_rate": 7.257758760764773e-06, + "loss": 0.0116, + "step": 160510 + }, + { + "epoch": 34.01936846666305, + "grad_norm": 0.001200477359816432, + "learning_rate": 7.254749739719679e-06, + "loss": 0.0851, + "step": 160520 + }, + { + "epoch": 34.01942262904187, + "grad_norm": 0.7084013223648071, + "learning_rate": 7.2517407186745864e-06, + "loss": 0.0202, + "step": 160530 + }, + { + "epoch": 34.01947679142068, + "grad_norm": 0.009593065828084946, + "learning_rate": 7.2487316976294935e-06, + "loss": 0.0097, + "step": 160540 + }, + { + "epoch": 34.01953095379949, + "grad_norm": 
0.3492724299430847, + "learning_rate": 7.2457226765844e-06, + "loss": 0.0494, + "step": 160550 + }, + { + "epoch": 34.019585116178305, + "grad_norm": 0.007059519644826651, + "learning_rate": 7.242713655539307e-06, + "loss": 0.0552, + "step": 160560 + }, + { + "epoch": 34.019639278557115, + "grad_norm": 2.9351847171783447, + "learning_rate": 7.239704634494214e-06, + "loss": 0.0508, + "step": 160570 + }, + { + "epoch": 34.019693440935924, + "grad_norm": 0.007901427336037159, + "learning_rate": 7.236695613449121e-06, + "loss": 0.0023, + "step": 160580 + }, + { + "epoch": 34.01974760331474, + "grad_norm": 0.007399384398013353, + "learning_rate": 7.2336865924040275e-06, + "loss": 0.0124, + "step": 160590 + }, + { + "epoch": 34.01980176569355, + "grad_norm": 1.951495885848999, + "learning_rate": 7.230677571358935e-06, + "loss": 0.0257, + "step": 160600 + }, + { + "epoch": 34.01985592807236, + "grad_norm": 0.0018250951543450356, + "learning_rate": 7.227668550313842e-06, + "loss": 0.011, + "step": 160610 + }, + { + "epoch": 34.01991009045117, + "grad_norm": 1.915316104888916, + "learning_rate": 7.224659529268749e-06, + "loss": 0.0223, + "step": 160620 + }, + { + "epoch": 34.019964252829986, + "grad_norm": 0.001314076827839017, + "learning_rate": 7.221650508223655e-06, + "loss": 0.0072, + "step": 160630 + }, + { + "epoch": 34.020018415208796, + "grad_norm": 0.0017673143884167075, + "learning_rate": 7.218641487178561e-06, + "loss": 0.0335, + "step": 160640 + }, + { + "epoch": 34.020072577587605, + "grad_norm": 0.0013116521295160055, + "learning_rate": 7.215632466133468e-06, + "loss": 0.0484, + "step": 160650 + }, + { + "epoch": 34.02012673996642, + "grad_norm": 0.0010454350849613547, + "learning_rate": 7.212623445088375e-06, + "loss": 0.052, + "step": 160660 + }, + { + "epoch": 34.02018090234523, + "grad_norm": 0.001323054893873632, + "learning_rate": 7.209614424043282e-06, + "loss": 0.0086, + "step": 160670 + }, + { + "epoch": 34.02023506472404, + "grad_norm": 0.001161792897619307, + "learning_rate": 7.206605402998189e-06, + "loss": 0.0004, + "step": 160680 + }, + { + "epoch": 34.02028922710286, + "grad_norm": 0.004764409735798836, + "learning_rate": 7.203596381953095e-06, + "loss": 0.0454, + "step": 160690 + }, + { + "epoch": 34.02034338948167, + "grad_norm": 0.0013817897997796535, + "learning_rate": 7.200587360908002e-06, + "loss": 0.0165, + "step": 160700 + }, + { + "epoch": 34.020397551860476, + "grad_norm": 0.004421991296112537, + "learning_rate": 7.1975783398629095e-06, + "loss": 0.0006, + "step": 160710 + }, + { + "epoch": 34.020451714239286, + "grad_norm": 1.6324942111968994, + "learning_rate": 7.194569318817816e-06, + "loss": 0.0272, + "step": 160720 + }, + { + "epoch": 34.0205058766181, + "grad_norm": 0.13588590919971466, + "learning_rate": 7.191560297772723e-06, + "loss": 0.0052, + "step": 160730 + }, + { + "epoch": 34.02056003899691, + "grad_norm": 0.0015122572658583522, + "learning_rate": 7.18855127672763e-06, + "loss": 0.0926, + "step": 160740 + }, + { + "epoch": 34.02061420137572, + "grad_norm": 2.323833465576172, + "learning_rate": 7.185542255682537e-06, + "loss": 0.0292, + "step": 160750 + }, + { + "epoch": 34.02066836375454, + "grad_norm": 0.005830323789268732, + "learning_rate": 7.1825332346374434e-06, + "loss": 0.0064, + "step": 160760 + }, + { + "epoch": 34.02072252613335, + "grad_norm": 0.0015376080991700292, + "learning_rate": 7.1795242135923506e-06, + "loss": 0.0203, + "step": 160770 + }, + { + "epoch": 34.02077668851216, + "grad_norm": 0.0009788143215700984, + 
"learning_rate": 7.176515192547258e-06, + "loss": 0.0092, + "step": 160780 + }, + { + "epoch": 34.020830850890974, + "grad_norm": 0.0035480561200529337, + "learning_rate": 7.173506171502163e-06, + "loss": 0.0023, + "step": 160790 + }, + { + "epoch": 34.02088501326978, + "grad_norm": 0.0018566349754109979, + "learning_rate": 7.17049715045707e-06, + "loss": 0.0559, + "step": 160800 + }, + { + "epoch": 34.02093917564859, + "grad_norm": 0.06108725070953369, + "learning_rate": 7.167488129411977e-06, + "loss": 0.0615, + "step": 160810 + }, + { + "epoch": 34.02099333802741, + "grad_norm": 0.002227331046015024, + "learning_rate": 7.164479108366884e-06, + "loss": 0.0255, + "step": 160820 + }, + { + "epoch": 34.02104750040622, + "grad_norm": 0.00484723225235939, + "learning_rate": 7.161470087321791e-06, + "loss": 0.0257, + "step": 160830 + }, + { + "epoch": 34.02110166278503, + "grad_norm": 0.0013301573926582932, + "learning_rate": 7.158461066276698e-06, + "loss": 0.057, + "step": 160840 + }, + { + "epoch": 34.02115582516384, + "grad_norm": 0.7705066204071045, + "learning_rate": 7.155452045231604e-06, + "loss": 0.0189, + "step": 160850 + }, + { + "epoch": 34.021209987542655, + "grad_norm": 0.5313911437988281, + "learning_rate": 7.152443024186511e-06, + "loss": 0.0684, + "step": 160860 + }, + { + "epoch": 34.021264149921464, + "grad_norm": 4.3164753913879395, + "learning_rate": 7.149434003141418e-06, + "loss": 0.0614, + "step": 160870 + }, + { + "epoch": 34.021318312300274, + "grad_norm": 0.0010036731837317348, + "learning_rate": 7.1464249820963255e-06, + "loss": 0.0088, + "step": 160880 + }, + { + "epoch": 34.02137247467909, + "grad_norm": 1.5859357118606567, + "learning_rate": 7.143415961051232e-06, + "loss": 0.0463, + "step": 160890 + }, + { + "epoch": 34.0214266370579, + "grad_norm": 0.8345617055892944, + "learning_rate": 7.140406940006139e-06, + "loss": 0.0351, + "step": 160900 + }, + { + "epoch": 34.02148079943671, + "grad_norm": 0.0028327072504907846, + "learning_rate": 7.137397918961046e-06, + "loss": 0.0391, + "step": 160910 + }, + { + "epoch": 34.021534961815526, + "grad_norm": 0.0015581678599119186, + "learning_rate": 7.134388897915953e-06, + "loss": 0.0702, + "step": 160920 + }, + { + "epoch": 34.021589124194335, + "grad_norm": 0.03524164855480194, + "learning_rate": 7.1313798768708594e-06, + "loss": 0.0213, + "step": 160930 + }, + { + "epoch": 34.021643286573145, + "grad_norm": 0.0024613449349999428, + "learning_rate": 7.128370855825766e-06, + "loss": 0.0747, + "step": 160940 + }, + { + "epoch": 34.02169744895196, + "grad_norm": 1.2000540494918823, + "learning_rate": 7.125361834780672e-06, + "loss": 0.0681, + "step": 160950 + }, + { + "epoch": 34.02175161133077, + "grad_norm": 0.0010264826705679297, + "learning_rate": 7.122352813735579e-06, + "loss": 0.0255, + "step": 160960 + }, + { + "epoch": 34.02180577370958, + "grad_norm": 0.0017725479556247592, + "learning_rate": 7.119343792690486e-06, + "loss": 0.0569, + "step": 160970 + }, + { + "epoch": 34.02185993608839, + "grad_norm": 0.0016410043463110924, + "learning_rate": 7.1163347716453925e-06, + "loss": 0.0188, + "step": 160980 + }, + { + "epoch": 34.02191409846721, + "grad_norm": 0.001751969219185412, + "learning_rate": 7.1133257506003e-06, + "loss": 0.002, + "step": 160990 + }, + { + "epoch": 34.021968260846016, + "grad_norm": 0.09410140663385391, + "learning_rate": 7.110316729555207e-06, + "loss": 0.0662, + "step": 161000 + }, + { + "epoch": 34.022022423224826, + "grad_norm": 0.059194229543209076, + "learning_rate": 
7.107307708510114e-06, + "loss": 0.0184, + "step": 161010 + }, + { + "epoch": 34.02207658560364, + "grad_norm": 1.8991724252700806, + "learning_rate": 7.10429868746502e-06, + "loss": 0.0852, + "step": 161020 + }, + { + "epoch": 34.02213074798245, + "grad_norm": 0.001984626753255725, + "learning_rate": 7.101289666419927e-06, + "loss": 0.0034, + "step": 161030 + }, + { + "epoch": 34.02218491036126, + "grad_norm": 0.0021084602922201157, + "learning_rate": 7.098280645374834e-06, + "loss": 0.0277, + "step": 161040 + }, + { + "epoch": 34.02223907274008, + "grad_norm": 2.8197240829467773, + "learning_rate": 7.0952716243297415e-06, + "loss": 0.0287, + "step": 161050 + }, + { + "epoch": 34.02229323511889, + "grad_norm": 0.004903275519609451, + "learning_rate": 7.092262603284648e-06, + "loss": 0.0115, + "step": 161060 + }, + { + "epoch": 34.0223473974977, + "grad_norm": 0.0016455617733299732, + "learning_rate": 7.089253582239555e-06, + "loss": 0.0438, + "step": 161070 + }, + { + "epoch": 34.022401559876506, + "grad_norm": 0.00412128446623683, + "learning_rate": 7.086244561194462e-06, + "loss": 0.019, + "step": 161080 + }, + { + "epoch": 34.02245572225532, + "grad_norm": 0.0012656361795961857, + "learning_rate": 7.0832355401493674e-06, + "loss": 0.04, + "step": 161090 + }, + { + "epoch": 34.02250988463413, + "grad_norm": 0.0013984033139422536, + "learning_rate": 7.0802265191042746e-06, + "loss": 0.0272, + "step": 161100 + }, + { + "epoch": 34.02256404701294, + "grad_norm": 0.0010124812833964825, + "learning_rate": 7.077217498059182e-06, + "loss": 0.0184, + "step": 161110 + }, + { + "epoch": 34.02261820939176, + "grad_norm": 0.0069670225493609905, + "learning_rate": 7.074208477014088e-06, + "loss": 0.0436, + "step": 161120 + }, + { + "epoch": 34.02267237177057, + "grad_norm": 0.0009920018492266536, + "learning_rate": 7.071199455968995e-06, + "loss": 0.0172, + "step": 161130 + }, + { + "epoch": 34.02272653414938, + "grad_norm": 0.08369561284780502, + "learning_rate": 7.068190434923902e-06, + "loss": 0.0246, + "step": 161140 + }, + { + "epoch": 34.022780696528194, + "grad_norm": 0.0009939376031979918, + "learning_rate": 7.0651814138788085e-06, + "loss": 0.0139, + "step": 161150 + }, + { + "epoch": 34.022834858907004, + "grad_norm": 0.6412469744682312, + "learning_rate": 7.062172392833716e-06, + "loss": 0.016, + "step": 161160 + }, + { + "epoch": 34.02288902128581, + "grad_norm": 0.00104769435711205, + "learning_rate": 7.059163371788623e-06, + "loss": 0.0057, + "step": 161170 + }, + { + "epoch": 34.02294318366463, + "grad_norm": 0.051848556846380234, + "learning_rate": 7.05615435074353e-06, + "loss": 0.0284, + "step": 161180 + }, + { + "epoch": 34.02299734604344, + "grad_norm": 0.0013797198189422488, + "learning_rate": 7.053145329698436e-06, + "loss": 0.0102, + "step": 161190 + }, + { + "epoch": 34.02305150842225, + "grad_norm": 0.0015181743074208498, + "learning_rate": 7.050136308653343e-06, + "loss": 0.0275, + "step": 161200 + }, + { + "epoch": 34.02310567080106, + "grad_norm": 0.0013743072049692273, + "learning_rate": 7.04712728760825e-06, + "loss": 0.0299, + "step": 161210 + }, + { + "epoch": 34.023159833179875, + "grad_norm": 0.0010099300416186452, + "learning_rate": 7.0441182665631575e-06, + "loss": 0.0504, + "step": 161220 + }, + { + "epoch": 34.023213995558685, + "grad_norm": 0.0013302987208589911, + "learning_rate": 7.041109245518064e-06, + "loss": 0.0617, + "step": 161230 + }, + { + "epoch": 34.023268157937494, + "grad_norm": 0.0010818841401487589, + "learning_rate": 7.03810022447297e-06, + 
"loss": 0.0157, + "step": 161240 + }, + { + "epoch": 34.02332232031631, + "grad_norm": 0.0016118973726406693, + "learning_rate": 7.035091203427876e-06, + "loss": 0.0278, + "step": 161250 + }, + { + "epoch": 34.02337648269512, + "grad_norm": 0.0013438812457025051, + "learning_rate": 7.0320821823827834e-06, + "loss": 0.0001, + "step": 161260 + }, + { + "epoch": 34.02343064507393, + "grad_norm": 1.656896948814392, + "learning_rate": 7.0290731613376906e-06, + "loss": 0.0518, + "step": 161270 + }, + { + "epoch": 34.023484807452746, + "grad_norm": 0.011553856544196606, + "learning_rate": 7.026064140292597e-06, + "loss": 0.0028, + "step": 161280 + }, + { + "epoch": 34.023538969831556, + "grad_norm": 0.4757416844367981, + "learning_rate": 7.023055119247504e-06, + "loss": 0.095, + "step": 161290 + }, + { + "epoch": 34.023593132210365, + "grad_norm": 0.005603067576885223, + "learning_rate": 7.020046098202411e-06, + "loss": 0.059, + "step": 161300 + }, + { + "epoch": 34.02364729458918, + "grad_norm": 0.0012901572044938803, + "learning_rate": 7.017037077157318e-06, + "loss": 0.0229, + "step": 161310 + }, + { + "epoch": 34.02370145696799, + "grad_norm": 0.001428426243364811, + "learning_rate": 7.0140280561122245e-06, + "loss": 0.0006, + "step": 161320 + }, + { + "epoch": 34.0237556193468, + "grad_norm": 0.3827001750469208, + "learning_rate": 7.011019035067132e-06, + "loss": 0.0397, + "step": 161330 + }, + { + "epoch": 34.02380978172561, + "grad_norm": 0.001294175279326737, + "learning_rate": 7.008010014022039e-06, + "loss": 0.0749, + "step": 161340 + }, + { + "epoch": 34.02386394410443, + "grad_norm": 0.20381814241409302, + "learning_rate": 7.005000992976946e-06, + "loss": 0.012, + "step": 161350 + }, + { + "epoch": 34.02391810648324, + "grad_norm": 0.15539297461509705, + "learning_rate": 7.001991971931852e-06, + "loss": 0.0068, + "step": 161360 + }, + { + "epoch": 34.023972268862046, + "grad_norm": 0.0010384180350229144, + "learning_rate": 6.998982950886759e-06, + "loss": 0.0206, + "step": 161370 + }, + { + "epoch": 34.02402643124086, + "grad_norm": 8.42757797241211, + "learning_rate": 6.995973929841666e-06, + "loss": 0.0454, + "step": 161380 + }, + { + "epoch": 34.02408059361967, + "grad_norm": 0.0016772057861089706, + "learning_rate": 6.992964908796572e-06, + "loss": 0.013, + "step": 161390 + }, + { + "epoch": 34.02413475599848, + "grad_norm": 2.770693063735962, + "learning_rate": 6.989955887751479e-06, + "loss": 0.0486, + "step": 161400 + }, + { + "epoch": 34.0241889183773, + "grad_norm": 0.003136077895760536, + "learning_rate": 6.986946866706385e-06, + "loss": 0.0214, + "step": 161410 + }, + { + "epoch": 34.02424308075611, + "grad_norm": 0.0013763966271653771, + "learning_rate": 6.983937845661292e-06, + "loss": 0.0087, + "step": 161420 + }, + { + "epoch": 34.02429724313492, + "grad_norm": 1.9763755798339844, + "learning_rate": 6.980928824616199e-06, + "loss": 0.0191, + "step": 161430 + }, + { + "epoch": 34.02435140551373, + "grad_norm": 0.37855467200279236, + "learning_rate": 6.9779198035711065e-06, + "loss": 0.03, + "step": 161440 + }, + { + "epoch": 34.02440556789254, + "grad_norm": 16.352689743041992, + "learning_rate": 6.974910782526013e-06, + "loss": 0.04, + "step": 161450 + }, + { + "epoch": 34.02445973027135, + "grad_norm": 0.6066820621490479, + "learning_rate": 6.97190176148092e-06, + "loss": 0.0474, + "step": 161460 + }, + { + "epoch": 34.02451389265016, + "grad_norm": 0.0011090044863522053, + "learning_rate": 6.968892740435827e-06, + "loss": 0.0019, + "step": 161470 + }, + { + "epoch": 
34.02456805502898, + "grad_norm": 0.006244789808988571, + "learning_rate": 6.965883719390734e-06, + "loss": 0.0312, + "step": 161480 + }, + { + "epoch": 34.02462221740779, + "grad_norm": 0.07881326228380203, + "learning_rate": 6.9628746983456405e-06, + "loss": 0.095, + "step": 161490 + }, + { + "epoch": 34.0246763797866, + "grad_norm": 0.012346627190709114, + "learning_rate": 6.959865677300548e-06, + "loss": 0.0077, + "step": 161500 + }, + { + "epoch": 34.024730542165415, + "grad_norm": 0.9715369343757629, + "learning_rate": 6.956856656255455e-06, + "loss": 0.0541, + "step": 161510 + }, + { + "epoch": 34.024784704544224, + "grad_norm": 0.17327935993671417, + "learning_rate": 6.953847635210362e-06, + "loss": 0.0179, + "step": 161520 + }, + { + "epoch": 34.024838866923034, + "grad_norm": 7.650644302368164, + "learning_rate": 6.950838614165268e-06, + "loss": 0.1288, + "step": 161530 + }, + { + "epoch": 34.02489302930185, + "grad_norm": 0.0011568263871595263, + "learning_rate": 6.947829593120174e-06, + "loss": 0.0204, + "step": 161540 + }, + { + "epoch": 34.02494719168066, + "grad_norm": 0.0009294657502323389, + "learning_rate": 6.944820572075081e-06, + "loss": 0.0472, + "step": 161550 + }, + { + "epoch": 34.02500135405947, + "grad_norm": 0.0009363247081637383, + "learning_rate": 6.941811551029988e-06, + "loss": 0.0005, + "step": 161560 + }, + { + "epoch": 34.02500135405947, + "eval_accuracy": 0.8347485303723057, + "eval_loss": 1.0482841730117798, + "eval_runtime": 117.929, + "eval_samples_per_second": 25.965, + "eval_steps_per_second": 3.248, + "step": 161560 + }, + { + "epoch": 35.00005416237881, + "grad_norm": 1.3240336179733276, + "learning_rate": 6.938802529984895e-06, + "loss": 0.0248, + "step": 161570 + }, + { + "epoch": 35.000108324757626, + "grad_norm": 0.11751689016819, + "learning_rate": 6.935793508939801e-06, + "loss": 0.108, + "step": 161580 + }, + { + "epoch": 35.000162487136436, + "grad_norm": 0.0012302880641072989, + "learning_rate": 6.932784487894708e-06, + "loss": 0.0078, + "step": 161590 + }, + { + "epoch": 35.000216649515245, + "grad_norm": 1.0639818906784058, + "learning_rate": 6.929775466849615e-06, + "loss": 0.0614, + "step": 161600 + }, + { + "epoch": 35.00027081189406, + "grad_norm": 0.0714239701628685, + "learning_rate": 6.9267664458045225e-06, + "loss": 0.0486, + "step": 161610 + }, + { + "epoch": 35.00032497427287, + "grad_norm": 0.0009396829991601408, + "learning_rate": 6.923757424759429e-06, + "loss": 0.0151, + "step": 161620 + }, + { + "epoch": 35.00037913665168, + "grad_norm": 0.758962869644165, + "learning_rate": 6.920748403714336e-06, + "loss": 0.0218, + "step": 161630 + }, + { + "epoch": 35.00043329903049, + "grad_norm": 0.0013324967585504055, + "learning_rate": 6.917739382669243e-06, + "loss": 0.0129, + "step": 161640 + }, + { + "epoch": 35.00048746140931, + "grad_norm": 0.001243988866917789, + "learning_rate": 6.91473036162415e-06, + "loss": 0.0063, + "step": 161650 + }, + { + "epoch": 35.000541623788116, + "grad_norm": 0.600338339805603, + "learning_rate": 6.9117213405790564e-06, + "loss": 0.0694, + "step": 161660 + }, + { + "epoch": 35.000595786166926, + "grad_norm": 1.1376971006393433, + "learning_rate": 6.9087123195339636e-06, + "loss": 0.0617, + "step": 161670 + }, + { + "epoch": 35.00064994854574, + "grad_norm": 0.0009714727057144046, + "learning_rate": 6.905703298488871e-06, + "loss": 0.0193, + "step": 161680 + }, + { + "epoch": 35.00070411092455, + "grad_norm": 0.0012684548273682594, + "learning_rate": 6.902694277443776e-06, + "loss": 0.0294, + 
"step": 161690 + }, + { + "epoch": 35.00075827330336, + "grad_norm": 0.001176777295768261, + "learning_rate": 6.899685256398683e-06, + "loss": 0.0235, + "step": 161700 + }, + { + "epoch": 35.00081243568218, + "grad_norm": 0.0028982474468648434, + "learning_rate": 6.8966762353535895e-06, + "loss": 0.055, + "step": 161710 + }, + { + "epoch": 35.00086659806099, + "grad_norm": 0.000979590811766684, + "learning_rate": 6.893667214308497e-06, + "loss": 0.1062, + "step": 161720 + }, + { + "epoch": 35.0009207604398, + "grad_norm": 0.0010009643156081438, + "learning_rate": 6.890658193263404e-06, + "loss": 0.0118, + "step": 161730 + }, + { + "epoch": 35.00097492281861, + "grad_norm": 0.0034780448768287897, + "learning_rate": 6.887649172218311e-06, + "loss": 0.0234, + "step": 161740 + }, + { + "epoch": 35.00102908519742, + "grad_norm": 0.37421393394470215, + "learning_rate": 6.884640151173217e-06, + "loss": 0.0248, + "step": 161750 + }, + { + "epoch": 35.00108324757623, + "grad_norm": 4.086302757263184, + "learning_rate": 6.881631130128124e-06, + "loss": 0.0511, + "step": 161760 + }, + { + "epoch": 35.00113740995504, + "grad_norm": 0.2092210352420807, + "learning_rate": 6.878622109083031e-06, + "loss": 0.0313, + "step": 161770 + }, + { + "epoch": 35.00119157233386, + "grad_norm": 0.0015184883959591389, + "learning_rate": 6.8756130880379385e-06, + "loss": 0.0386, + "step": 161780 + }, + { + "epoch": 35.00124573471267, + "grad_norm": 0.004193149507045746, + "learning_rate": 6.872604066992845e-06, + "loss": 0.078, + "step": 161790 + }, + { + "epoch": 35.00129989709148, + "grad_norm": 0.0009563368512317538, + "learning_rate": 6.869595045947752e-06, + "loss": 0.0183, + "step": 161800 + }, + { + "epoch": 35.001354059470295, + "grad_norm": 0.0012608396355062723, + "learning_rate": 6.866586024902659e-06, + "loss": 0.0266, + "step": 161810 + }, + { + "epoch": 35.001408221849104, + "grad_norm": 0.009346726350486279, + "learning_rate": 6.863577003857566e-06, + "loss": 0.1239, + "step": 161820 + }, + { + "epoch": 35.001462384227914, + "grad_norm": 0.4918944239616394, + "learning_rate": 6.8605679828124724e-06, + "loss": 0.0904, + "step": 161830 + }, + { + "epoch": 35.00151654660673, + "grad_norm": 0.07370750606060028, + "learning_rate": 6.857558961767378e-06, + "loss": 0.0265, + "step": 161840 + }, + { + "epoch": 35.00157070898554, + "grad_norm": 0.0010758950375020504, + "learning_rate": 6.854549940722285e-06, + "loss": 0.0002, + "step": 161850 + }, + { + "epoch": 35.00162487136435, + "grad_norm": 1.4044933319091797, + "learning_rate": 6.851540919677192e-06, + "loss": 0.0177, + "step": 161860 + }, + { + "epoch": 35.00167903374316, + "grad_norm": 0.0010749957291409373, + "learning_rate": 6.848531898632099e-06, + "loss": 0.014, + "step": 161870 + }, + { + "epoch": 35.001733196121975, + "grad_norm": 0.0013597040669992566, + "learning_rate": 6.8455228775870055e-06, + "loss": 0.0411, + "step": 161880 + }, + { + "epoch": 35.001787358500785, + "grad_norm": 0.001465488807298243, + "learning_rate": 6.842513856541913e-06, + "loss": 0.0166, + "step": 161890 + }, + { + "epoch": 35.001841520879594, + "grad_norm": 0.0009436412365175784, + "learning_rate": 6.83950483549682e-06, + "loss": 0.0226, + "step": 161900 + }, + { + "epoch": 35.00189568325841, + "grad_norm": 0.0014265322824940085, + "learning_rate": 6.836495814451727e-06, + "loss": 0.0046, + "step": 161910 + }, + { + "epoch": 35.00194984563722, + "grad_norm": 0.001078714383766055, + "learning_rate": 6.833486793406633e-06, + "loss": 0.0078, + "step": 161920 + }, + { + 
"epoch": 35.00200400801603, + "grad_norm": 0.000993568217381835, + "learning_rate": 6.83047777236154e-06, + "loss": 0.0355, + "step": 161930 + }, + { + "epoch": 35.00205817039485, + "grad_norm": 0.0014037492219358683, + "learning_rate": 6.827468751316447e-06, + "loss": 0.0099, + "step": 161940 + }, + { + "epoch": 35.002112332773656, + "grad_norm": 0.0009299474768340588, + "learning_rate": 6.8244597302713545e-06, + "loss": 0.0116, + "step": 161950 + }, + { + "epoch": 35.002166495152466, + "grad_norm": 0.986409068107605, + "learning_rate": 6.821450709226261e-06, + "loss": 0.0285, + "step": 161960 + }, + { + "epoch": 35.00222065753128, + "grad_norm": 0.0013941129436716437, + "learning_rate": 6.818441688181168e-06, + "loss": 0.0007, + "step": 161970 + }, + { + "epoch": 35.00227481991009, + "grad_norm": 0.0011343320365995169, + "learning_rate": 6.815432667136075e-06, + "loss": 0.0282, + "step": 161980 + }, + { + "epoch": 35.0023289822889, + "grad_norm": 0.09781330078840256, + "learning_rate": 6.8124236460909804e-06, + "loss": 0.0611, + "step": 161990 + }, + { + "epoch": 35.00238314466771, + "grad_norm": 0.0012020849389955401, + "learning_rate": 6.8094146250458876e-06, + "loss": 0.0241, + "step": 162000 + }, + { + "epoch": 35.00243730704653, + "grad_norm": 0.019591767340898514, + "learning_rate": 6.806405604000794e-06, + "loss": 0.0001, + "step": 162010 + }, + { + "epoch": 35.00249146942534, + "grad_norm": 0.0009045650949701667, + "learning_rate": 6.803396582955701e-06, + "loss": 0.0153, + "step": 162020 + }, + { + "epoch": 35.002545631804146, + "grad_norm": 0.0009017541306093335, + "learning_rate": 6.800387561910608e-06, + "loss": 0.003, + "step": 162030 + }, + { + "epoch": 35.00259979418296, + "grad_norm": 2.914250612258911, + "learning_rate": 6.797378540865515e-06, + "loss": 0.08, + "step": 162040 + }, + { + "epoch": 35.00265395656177, + "grad_norm": 9.676069259643555, + "learning_rate": 6.7943695198204215e-06, + "loss": 0.0688, + "step": 162050 + }, + { + "epoch": 35.00270811894058, + "grad_norm": 0.6598367094993591, + "learning_rate": 6.791360498775329e-06, + "loss": 0.0042, + "step": 162060 + }, + { + "epoch": 35.0027622813194, + "grad_norm": 0.0011567604960873723, + "learning_rate": 6.788351477730236e-06, + "loss": 0.0014, + "step": 162070 + }, + { + "epoch": 35.00281644369821, + "grad_norm": 0.000921902246773243, + "learning_rate": 6.785342456685143e-06, + "loss": 0.0372, + "step": 162080 + }, + { + "epoch": 35.00287060607702, + "grad_norm": 0.0008950043702498078, + "learning_rate": 6.782333435640049e-06, + "loss": 0.0014, + "step": 162090 + }, + { + "epoch": 35.00292476845583, + "grad_norm": 0.0009189894190058112, + "learning_rate": 6.779324414594956e-06, + "loss": 0.0371, + "step": 162100 + }, + { + "epoch": 35.002978930834644, + "grad_norm": 1.7831707000732422, + "learning_rate": 6.776315393549863e-06, + "loss": 0.0013, + "step": 162110 + }, + { + "epoch": 35.00303309321345, + "grad_norm": 1.0358339548110962, + "learning_rate": 6.7733063725047705e-06, + "loss": 0.0207, + "step": 162120 + }, + { + "epoch": 35.00308725559226, + "grad_norm": 0.0010226914891973138, + "learning_rate": 6.770297351459677e-06, + "loss": 0.0113, + "step": 162130 + }, + { + "epoch": 35.00314141797108, + "grad_norm": 0.0011167037300765514, + "learning_rate": 6.767288330414582e-06, + "loss": 0.0293, + "step": 162140 + }, + { + "epoch": 35.00319558034989, + "grad_norm": 5.981983661651611, + "learning_rate": 6.764279309369489e-06, + "loss": 0.0508, + "step": 162150 + }, + { + "epoch": 35.0032497427287, + 
"grad_norm": 0.0015337057411670685, + "learning_rate": 6.7612702883243964e-06, + "loss": 0.0292, + "step": 162160 + }, + { + "epoch": 35.003303905107515, + "grad_norm": 0.0016641088295727968, + "learning_rate": 6.7582612672793036e-06, + "loss": 0.0851, + "step": 162170 + }, + { + "epoch": 35.003358067486325, + "grad_norm": 2.2099416255950928, + "learning_rate": 6.75525224623421e-06, + "loss": 0.0378, + "step": 162180 + }, + { + "epoch": 35.003412229865134, + "grad_norm": 0.001550330314785242, + "learning_rate": 6.752243225189117e-06, + "loss": 0.0364, + "step": 162190 + }, + { + "epoch": 35.00346639224395, + "grad_norm": 0.0009537818841636181, + "learning_rate": 6.749234204144024e-06, + "loss": 0.043, + "step": 162200 + }, + { + "epoch": 35.00352055462276, + "grad_norm": 0.5475096106529236, + "learning_rate": 6.746225183098931e-06, + "loss": 0.0392, + "step": 162210 + }, + { + "epoch": 35.00357471700157, + "grad_norm": 0.0010614576749503613, + "learning_rate": 6.7432161620538375e-06, + "loss": 0.0707, + "step": 162220 + }, + { + "epoch": 35.00362887938038, + "grad_norm": 0.0033933573868125677, + "learning_rate": 6.740207141008745e-06, + "loss": 0.012, + "step": 162230 + }, + { + "epoch": 35.003683041759196, + "grad_norm": 0.3388301432132721, + "learning_rate": 6.737198119963652e-06, + "loss": 0.0182, + "step": 162240 + }, + { + "epoch": 35.003737204138005, + "grad_norm": 0.0011759396875277162, + "learning_rate": 6.734189098918559e-06, + "loss": 0.0071, + "step": 162250 + }, + { + "epoch": 35.003791366516815, + "grad_norm": 0.0009174234583042562, + "learning_rate": 6.731180077873465e-06, + "loss": 0.0174, + "step": 162260 + }, + { + "epoch": 35.00384552889563, + "grad_norm": 0.8770433068275452, + "learning_rate": 6.728171056828372e-06, + "loss": 0.0111, + "step": 162270 + }, + { + "epoch": 35.00389969127444, + "grad_norm": 1.2337100505828857, + "learning_rate": 6.725162035783279e-06, + "loss": 0.0162, + "step": 162280 + }, + { + "epoch": 35.00395385365325, + "grad_norm": 0.0011170555371791124, + "learning_rate": 6.722153014738185e-06, + "loss": 0.0176, + "step": 162290 + }, + { + "epoch": 35.00400801603207, + "grad_norm": 0.0009536564466543496, + "learning_rate": 6.719143993693092e-06, + "loss": 0.0043, + "step": 162300 + }, + { + "epoch": 35.00406217841088, + "grad_norm": 0.0011957871029153466, + "learning_rate": 6.716134972647998e-06, + "loss": 0.0161, + "step": 162310 + }, + { + "epoch": 35.004116340789686, + "grad_norm": 0.0011698331218212843, + "learning_rate": 6.713125951602905e-06, + "loss": 0.0006, + "step": 162320 + }, + { + "epoch": 35.0041705031685, + "grad_norm": 0.0008837698260322213, + "learning_rate": 6.710116930557812e-06, + "loss": 0.0007, + "step": 162330 + }, + { + "epoch": 35.00422466554731, + "grad_norm": 0.4269762635231018, + "learning_rate": 6.7071079095127195e-06, + "loss": 0.0771, + "step": 162340 + }, + { + "epoch": 35.00427882792612, + "grad_norm": 0.9956657886505127, + "learning_rate": 6.704098888467626e-06, + "loss": 0.0944, + "step": 162350 + }, + { + "epoch": 35.00433299030493, + "grad_norm": 0.0010958723723888397, + "learning_rate": 6.701089867422533e-06, + "loss": 0.0114, + "step": 162360 + }, + { + "epoch": 35.00438715268375, + "grad_norm": 2.559356689453125, + "learning_rate": 6.69808084637744e-06, + "loss": 0.0672, + "step": 162370 + }, + { + "epoch": 35.00444131506256, + "grad_norm": 0.0009000705904327333, + "learning_rate": 6.695071825332347e-06, + "loss": 0.0092, + "step": 162380 + }, + { + "epoch": 35.00449547744137, + "grad_norm": 0.8887849450111389, 
+ "learning_rate": 6.6920628042872535e-06, + "loss": 0.053, + "step": 162390 + }, + { + "epoch": 35.00454963982018, + "grad_norm": 0.0009398856200277805, + "learning_rate": 6.6890537832421606e-06, + "loss": 0.0036, + "step": 162400 + }, + { + "epoch": 35.00460380219899, + "grad_norm": 0.821107029914856, + "learning_rate": 6.686044762197068e-06, + "loss": 0.0515, + "step": 162410 + }, + { + "epoch": 35.0046579645778, + "grad_norm": 0.0010730252834036946, + "learning_rate": 6.683035741151975e-06, + "loss": 0.0006, + "step": 162420 + }, + { + "epoch": 35.00471212695662, + "grad_norm": 0.4118429720401764, + "learning_rate": 6.680026720106881e-06, + "loss": 0.0153, + "step": 162430 + }, + { + "epoch": 35.00476628933543, + "grad_norm": 0.0009069142979569733, + "learning_rate": 6.6770176990617865e-06, + "loss": 0.0812, + "step": 162440 + }, + { + "epoch": 35.00482045171424, + "grad_norm": 0.0009480870212428272, + "learning_rate": 6.674008678016694e-06, + "loss": 0.1018, + "step": 162450 + }, + { + "epoch": 35.00487461409305, + "grad_norm": 2.1429977416992188, + "learning_rate": 6.670999656971601e-06, + "loss": 0.0235, + "step": 162460 + }, + { + "epoch": 35.004928776471864, + "grad_norm": 0.4990878999233246, + "learning_rate": 6.667990635926508e-06, + "loss": 0.0112, + "step": 162470 + }, + { + "epoch": 35.004982938850674, + "grad_norm": 0.0012150187976658344, + "learning_rate": 6.664981614881414e-06, + "loss": 0.0613, + "step": 162480 + }, + { + "epoch": 35.00503710122948, + "grad_norm": 0.1374993771314621, + "learning_rate": 6.661972593836321e-06, + "loss": 0.0018, + "step": 162490 + }, + { + "epoch": 35.0050912636083, + "grad_norm": 0.0021010565105825663, + "learning_rate": 6.658963572791228e-06, + "loss": 0.1381, + "step": 162500 + }, + { + "epoch": 35.00514542598711, + "grad_norm": 0.20544175803661346, + "learning_rate": 6.6559545517461355e-06, + "loss": 0.0903, + "step": 162510 + }, + { + "epoch": 35.00519958836592, + "grad_norm": 0.0019915858283638954, + "learning_rate": 6.652945530701042e-06, + "loss": 0.012, + "step": 162520 + }, + { + "epoch": 35.005253750744735, + "grad_norm": 0.024388253688812256, + "learning_rate": 6.649936509655949e-06, + "loss": 0.0826, + "step": 162530 + }, + { + "epoch": 35.005307913123545, + "grad_norm": 0.0011490731267258525, + "learning_rate": 6.646927488610856e-06, + "loss": 0.0054, + "step": 162540 + }, + { + "epoch": 35.005362075502354, + "grad_norm": 0.00219715409912169, + "learning_rate": 6.643918467565763e-06, + "loss": 0.0085, + "step": 162550 + }, + { + "epoch": 35.00541623788117, + "grad_norm": 0.0014450028538703918, + "learning_rate": 6.6409094465206694e-06, + "loss": 0.0138, + "step": 162560 + }, + { + "epoch": 35.00547040025998, + "grad_norm": 1.021752953529358, + "learning_rate": 6.6379004254755766e-06, + "loss": 0.0221, + "step": 162570 + }, + { + "epoch": 35.00552456263879, + "grad_norm": 0.09782081097364426, + "learning_rate": 6.634891404430484e-06, + "loss": 0.0117, + "step": 162580 + }, + { + "epoch": 35.0055787250176, + "grad_norm": 1.3037211894989014, + "learning_rate": 6.631882383385389e-06, + "loss": 0.0123, + "step": 162590 + }, + { + "epoch": 35.005632887396416, + "grad_norm": 2.0859546661376953, + "learning_rate": 6.628873362340296e-06, + "loss": 0.0139, + "step": 162600 + }, + { + "epoch": 35.005687049775226, + "grad_norm": 0.09013711661100388, + "learning_rate": 6.6258643412952025e-06, + "loss": 0.0885, + "step": 162610 + }, + { + "epoch": 35.005741212154035, + "grad_norm": 0.0016369008226320148, + "learning_rate": 
6.62285532025011e-06, + "loss": 0.0078, + "step": 162620 + }, + { + "epoch": 35.00579537453285, + "grad_norm": 0.0458466075360775, + "learning_rate": 6.619846299205017e-06, + "loss": 0.064, + "step": 162630 + }, + { + "epoch": 35.00584953691166, + "grad_norm": 2.459933280944824, + "learning_rate": 6.616837278159924e-06, + "loss": 0.016, + "step": 162640 + }, + { + "epoch": 35.00590369929047, + "grad_norm": 0.07070112228393555, + "learning_rate": 6.61382825711483e-06, + "loss": 0.0208, + "step": 162650 + }, + { + "epoch": 35.00595786166929, + "grad_norm": 0.0008926515001803637, + "learning_rate": 6.610819236069737e-06, + "loss": 0.0141, + "step": 162660 + }, + { + "epoch": 35.0060120240481, + "grad_norm": 1.750472903251648, + "learning_rate": 6.607810215024644e-06, + "loss": 0.0274, + "step": 162670 + }, + { + "epoch": 35.00606618642691, + "grad_norm": 0.001425126800313592, + "learning_rate": 6.6048011939795515e-06, + "loss": 0.0254, + "step": 162680 + }, + { + "epoch": 35.00612034880572, + "grad_norm": 0.0011151900980621576, + "learning_rate": 6.601792172934458e-06, + "loss": 0.0101, + "step": 162690 + }, + { + "epoch": 35.00617451118453, + "grad_norm": 0.5143992304801941, + "learning_rate": 6.598783151889365e-06, + "loss": 0.0789, + "step": 162700 + }, + { + "epoch": 35.00622867356334, + "grad_norm": 1.0508085489273071, + "learning_rate": 6.595774130844272e-06, + "loss": 0.024, + "step": 162710 + }, + { + "epoch": 35.00628283594215, + "grad_norm": 0.0011483636917546391, + "learning_rate": 6.592765109799179e-06, + "loss": 0.0094, + "step": 162720 + }, + { + "epoch": 35.00633699832097, + "grad_norm": 0.0012424209853634238, + "learning_rate": 6.5897560887540854e-06, + "loss": 0.0321, + "step": 162730 + }, + { + "epoch": 35.00639116069978, + "grad_norm": 0.007801178842782974, + "learning_rate": 6.586747067708991e-06, + "loss": 0.0088, + "step": 162740 + }, + { + "epoch": 35.00644532307859, + "grad_norm": 0.0014296245062723756, + "learning_rate": 6.583738046663898e-06, + "loss": 0.0239, + "step": 162750 + }, + { + "epoch": 35.006499485457404, + "grad_norm": 2.108018636703491, + "learning_rate": 6.580729025618805e-06, + "loss": 0.0341, + "step": 162760 + }, + { + "epoch": 35.00655364783621, + "grad_norm": 0.0009250030270777643, + "learning_rate": 6.577720004573712e-06, + "loss": 0.0116, + "step": 162770 + }, + { + "epoch": 35.00660781021502, + "grad_norm": 0.0008846770506352186, + "learning_rate": 6.5747109835286185e-06, + "loss": 0.0359, + "step": 162780 + }, + { + "epoch": 35.00666197259384, + "grad_norm": 0.03301270678639412, + "learning_rate": 6.571701962483526e-06, + "loss": 0.0195, + "step": 162790 + }, + { + "epoch": 35.00671613497265, + "grad_norm": 0.11479897052049637, + "learning_rate": 6.568692941438433e-06, + "loss": 0.0383, + "step": 162800 + }, + { + "epoch": 35.00677029735146, + "grad_norm": 0.0030917758122086525, + "learning_rate": 6.56568392039334e-06, + "loss": 0.0488, + "step": 162810 + }, + { + "epoch": 35.00682445973027, + "grad_norm": 0.7038829326629639, + "learning_rate": 6.562674899348246e-06, + "loss": 0.0081, + "step": 162820 + }, + { + "epoch": 35.006878622109085, + "grad_norm": 0.0008727723034098744, + "learning_rate": 6.559665878303153e-06, + "loss": 0.0357, + "step": 162830 + }, + { + "epoch": 35.006932784487894, + "grad_norm": 0.009262317791581154, + "learning_rate": 6.55665685725806e-06, + "loss": 0.1585, + "step": 162840 + }, + { + "epoch": 35.006986946866704, + "grad_norm": 0.0011135195381939411, + "learning_rate": 6.5536478362129675e-06, + "loss": 0.011, + 
"step": 162850 + }, + { + "epoch": 35.00704110924552, + "grad_norm": 0.5036118626594543, + "learning_rate": 6.550638815167874e-06, + "loss": 0.0303, + "step": 162860 + }, + { + "epoch": 35.00709527162433, + "grad_norm": 0.0015250291908159852, + "learning_rate": 6.547629794122781e-06, + "loss": 0.0044, + "step": 162870 + }, + { + "epoch": 35.00714943400314, + "grad_norm": 0.4386574923992157, + "learning_rate": 6.544620773077688e-06, + "loss": 0.0365, + "step": 162880 + }, + { + "epoch": 35.007203596381956, + "grad_norm": 0.0008807082776911557, + "learning_rate": 6.5416117520325934e-06, + "loss": 0.0001, + "step": 162890 + }, + { + "epoch": 35.007257758760765, + "grad_norm": 4.182064056396484, + "learning_rate": 6.5386027309875006e-06, + "loss": 0.0144, + "step": 162900 + }, + { + "epoch": 35.007311921139575, + "grad_norm": 1.500488519668579, + "learning_rate": 6.535593709942407e-06, + "loss": 0.0349, + "step": 162910 + }, + { + "epoch": 35.00736608351839, + "grad_norm": 0.0031798696145415306, + "learning_rate": 6.532584688897314e-06, + "loss": 0.0168, + "step": 162920 + }, + { + "epoch": 35.0074202458972, + "grad_norm": 0.0009223063243553042, + "learning_rate": 6.529575667852221e-06, + "loss": 0.0003, + "step": 162930 + }, + { + "epoch": 35.00747440827601, + "grad_norm": 0.001081653288565576, + "learning_rate": 6.526566646807128e-06, + "loss": 0.0236, + "step": 162940 + }, + { + "epoch": 35.00752857065482, + "grad_norm": 0.0012470940127968788, + "learning_rate": 6.5235576257620345e-06, + "loss": 0.0524, + "step": 162950 + }, + { + "epoch": 35.00758273303364, + "grad_norm": 0.0014545650919899344, + "learning_rate": 6.520548604716942e-06, + "loss": 0.0197, + "step": 162960 + }, + { + "epoch": 35.007636895412446, + "grad_norm": 0.0011575659736990929, + "learning_rate": 6.517539583671849e-06, + "loss": 0.0274, + "step": 162970 + }, + { + "epoch": 35.007691057791256, + "grad_norm": 0.0012093250406906009, + "learning_rate": 6.514530562626756e-06, + "loss": 0.0803, + "step": 162980 + }, + { + "epoch": 35.00774522017007, + "grad_norm": 0.6530835628509521, + "learning_rate": 6.511521541581662e-06, + "loss": 0.0312, + "step": 162990 + }, + { + "epoch": 35.00779938254888, + "grad_norm": 0.004904730711132288, + "learning_rate": 6.508512520536569e-06, + "loss": 0.0215, + "step": 163000 + }, + { + "epoch": 35.00785354492769, + "grad_norm": 0.001179384533315897, + "learning_rate": 6.505503499491476e-06, + "loss": 0.023, + "step": 163010 + }, + { + "epoch": 35.00790770730651, + "grad_norm": 0.25271061062812805, + "learning_rate": 6.5024944784463835e-06, + "loss": 0.1448, + "step": 163020 + }, + { + "epoch": 35.00796186968532, + "grad_norm": 0.0010651060147210956, + "learning_rate": 6.49948545740129e-06, + "loss": 0.0131, + "step": 163030 + }, + { + "epoch": 35.00801603206413, + "grad_norm": 1.0353128910064697, + "learning_rate": 6.496476436356195e-06, + "loss": 0.1247, + "step": 163040 + }, + { + "epoch": 35.008070194442936, + "grad_norm": 0.03477271646261215, + "learning_rate": 6.493467415311102e-06, + "loss": 0.0284, + "step": 163050 + }, + { + "epoch": 35.00812435682175, + "grad_norm": 0.0009108966332860291, + "learning_rate": 6.4904583942660094e-06, + "loss": 0.022, + "step": 163060 + }, + { + "epoch": 35.00817851920056, + "grad_norm": 0.02557283453643322, + "learning_rate": 6.4874493732209166e-06, + "loss": 0.0249, + "step": 163070 + }, + { + "epoch": 35.00823268157937, + "grad_norm": 0.0008588893688283861, + "learning_rate": 6.484440352175823e-06, + "loss": 0.0518, + "step": 163080 + }, + { + 
"epoch": 35.00828684395819, + "grad_norm": 0.0011077586095780134, + "learning_rate": 6.48143133113073e-06, + "loss": 0.0214, + "step": 163090 + }, + { + "epoch": 35.008341006337, + "grad_norm": 0.0011136907851323485, + "learning_rate": 6.478422310085637e-06, + "loss": 0.0313, + "step": 163100 + }, + { + "epoch": 35.00839516871581, + "grad_norm": 0.3727080821990967, + "learning_rate": 6.475413289040544e-06, + "loss": 0.0276, + "step": 163110 + }, + { + "epoch": 35.008449331094624, + "grad_norm": 3.478999376296997, + "learning_rate": 6.4724042679954505e-06, + "loss": 0.1055, + "step": 163120 + }, + { + "epoch": 35.008503493473434, + "grad_norm": 0.0009019350982271135, + "learning_rate": 6.469395246950358e-06, + "loss": 0.1203, + "step": 163130 + }, + { + "epoch": 35.00855765585224, + "grad_norm": 0.0008778584306128323, + "learning_rate": 6.466386225905265e-06, + "loss": 0.0246, + "step": 163140 + }, + { + "epoch": 35.00861181823106, + "grad_norm": 0.32354363799095154, + "learning_rate": 6.463377204860172e-06, + "loss": 0.0077, + "step": 163150 + }, + { + "epoch": 35.00866598060987, + "grad_norm": 0.9792607426643372, + "learning_rate": 6.460368183815078e-06, + "loss": 0.0217, + "step": 163160 + }, + { + "epoch": 35.00872014298868, + "grad_norm": 0.05229372903704643, + "learning_rate": 6.457359162769985e-06, + "loss": 0.0354, + "step": 163170 + }, + { + "epoch": 35.00877430536749, + "grad_norm": 0.19947828352451324, + "learning_rate": 6.454350141724892e-06, + "loss": 0.0129, + "step": 163180 + }, + { + "epoch": 35.008828467746305, + "grad_norm": 0.12670741975307465, + "learning_rate": 6.451341120679798e-06, + "loss": 0.0114, + "step": 163190 + }, + { + "epoch": 35.008882630125115, + "grad_norm": 0.6466854810714722, + "learning_rate": 6.448332099634705e-06, + "loss": 0.0279, + "step": 163200 + }, + { + "epoch": 35.008936792503924, + "grad_norm": 5.976950645446777, + "learning_rate": 6.445323078589611e-06, + "loss": 0.1072, + "step": 163210 + }, + { + "epoch": 35.00899095488274, + "grad_norm": 0.0008740520570427179, + "learning_rate": 6.442314057544518e-06, + "loss": 0.0308, + "step": 163220 + }, + { + "epoch": 35.00904511726155, + "grad_norm": 0.0008628572104498744, + "learning_rate": 6.439305036499425e-06, + "loss": 0.0206, + "step": 163230 + }, + { + "epoch": 35.00909927964036, + "grad_norm": 2.61448335647583, + "learning_rate": 6.4362960154543325e-06, + "loss": 0.0422, + "step": 163240 + }, + { + "epoch": 35.009153442019176, + "grad_norm": 0.0014249414671212435, + "learning_rate": 6.433286994409239e-06, + "loss": 0.0181, + "step": 163250 + }, + { + "epoch": 35.009207604397986, + "grad_norm": 3.5110175609588623, + "learning_rate": 6.430277973364146e-06, + "loss": 0.008, + "step": 163260 + }, + { + "epoch": 35.009261766776795, + "grad_norm": 0.13502322137355804, + "learning_rate": 6.427268952319053e-06, + "loss": 0.005, + "step": 163270 + }, + { + "epoch": 35.00931592915561, + "grad_norm": 0.002224334981292486, + "learning_rate": 6.42425993127396e-06, + "loss": 0.0467, + "step": 163280 + }, + { + "epoch": 35.00937009153442, + "grad_norm": 3.3931374549865723, + "learning_rate": 6.4212509102288665e-06, + "loss": 0.0367, + "step": 163290 + }, + { + "epoch": 35.00942425391323, + "grad_norm": 3.38659930229187, + "learning_rate": 6.4182418891837736e-06, + "loss": 0.0639, + "step": 163300 + }, + { + "epoch": 35.00947841629204, + "grad_norm": 0.028522459790110588, + "learning_rate": 6.415232868138681e-06, + "loss": 0.0052, + "step": 163310 + }, + { + "epoch": 35.00953257867086, + "grad_norm": 
0.42727726697921753, + "learning_rate": 6.412223847093588e-06, + "loss": 0.0219, + "step": 163320 + }, + { + "epoch": 35.00958674104967, + "grad_norm": 0.0011943286517634988, + "learning_rate": 6.409214826048494e-06, + "loss": 0.013, + "step": 163330 + }, + { + "epoch": 35.009640903428476, + "grad_norm": 4.090538024902344, + "learning_rate": 6.4062058050033995e-06, + "loss": 0.1066, + "step": 163340 + }, + { + "epoch": 35.00969506580729, + "grad_norm": 1.0550556182861328, + "learning_rate": 6.403196783958307e-06, + "loss": 0.0318, + "step": 163350 + }, + { + "epoch": 35.0097492281861, + "grad_norm": 0.7219516634941101, + "learning_rate": 6.400187762913214e-06, + "loss": 0.0141, + "step": 163360 + }, + { + "epoch": 35.00980339056491, + "grad_norm": 0.48919761180877686, + "learning_rate": 6.397178741868121e-06, + "loss": 0.0549, + "step": 163370 + }, + { + "epoch": 35.00985755294373, + "grad_norm": 1.042809009552002, + "learning_rate": 6.394169720823027e-06, + "loss": 0.0329, + "step": 163380 + }, + { + "epoch": 35.00991171532254, + "grad_norm": 0.0008856372442096472, + "learning_rate": 6.391160699777934e-06, + "loss": 0.0136, + "step": 163390 + }, + { + "epoch": 35.00996587770135, + "grad_norm": 0.0017828949494287372, + "learning_rate": 6.388151678732841e-06, + "loss": 0.0559, + "step": 163400 + }, + { + "epoch": 35.01002004008016, + "grad_norm": 0.41810357570648193, + "learning_rate": 6.3851426576877485e-06, + "loss": 0.0843, + "step": 163410 + }, + { + "epoch": 35.01007420245897, + "grad_norm": 0.3988931477069855, + "learning_rate": 6.382133636642655e-06, + "loss": 0.0112, + "step": 163420 + }, + { + "epoch": 35.01012836483778, + "grad_norm": 0.0011460221139714122, + "learning_rate": 6.379124615597562e-06, + "loss": 0.0458, + "step": 163430 + }, + { + "epoch": 35.01018252721659, + "grad_norm": 0.007778727449476719, + "learning_rate": 6.376115594552469e-06, + "loss": 0.0277, + "step": 163440 + }, + { + "epoch": 35.01023668959541, + "grad_norm": 0.0029946905560791492, + "learning_rate": 6.373106573507376e-06, + "loss": 0.0103, + "step": 163450 + }, + { + "epoch": 35.01029085197422, + "grad_norm": 14.132522583007812, + "learning_rate": 6.3700975524622824e-06, + "loss": 0.0594, + "step": 163460 + }, + { + "epoch": 35.01034501435303, + "grad_norm": 0.0008806312689557672, + "learning_rate": 6.3670885314171896e-06, + "loss": 0.0029, + "step": 163470 + }, + { + "epoch": 35.010399176731845, + "grad_norm": 0.6186717748641968, + "learning_rate": 6.364079510372095e-06, + "loss": 0.0044, + "step": 163480 + }, + { + "epoch": 35.010453339110654, + "grad_norm": 0.0011790653225034475, + "learning_rate": 6.361070489327002e-06, + "loss": 0.0437, + "step": 163490 + }, + { + "epoch": 35.010507501489464, + "grad_norm": 0.7737136483192444, + "learning_rate": 6.358061468281909e-06, + "loss": 0.0482, + "step": 163500 + }, + { + "epoch": 35.01056166386828, + "grad_norm": 0.008873190730810165, + "learning_rate": 6.3550524472368155e-06, + "loss": 0.0098, + "step": 163510 + }, + { + "epoch": 35.01061582624709, + "grad_norm": 0.8562555909156799, + "learning_rate": 6.352043426191723e-06, + "loss": 0.0875, + "step": 163520 + }, + { + "epoch": 35.0106699886259, + "grad_norm": 0.001366963959299028, + "learning_rate": 6.34903440514663e-06, + "loss": 0.0163, + "step": 163530 + }, + { + "epoch": 35.01072415100471, + "grad_norm": 0.001056704786606133, + "learning_rate": 6.346025384101537e-06, + "loss": 0.0157, + "step": 163540 + }, + { + "epoch": 35.010778313383526, + "grad_norm": 0.0018265257822349668, + "learning_rate": 
6.343016363056443e-06, + "loss": 0.0319, + "step": 163550 + }, + { + "epoch": 35.010832475762335, + "grad_norm": 0.0008988476474769413, + "learning_rate": 6.34000734201135e-06, + "loss": 0.0336, + "step": 163560 + }, + { + "epoch": 35.010886638141145, + "grad_norm": 0.0008515636436641216, + "learning_rate": 6.336998320966257e-06, + "loss": 0.0259, + "step": 163570 + }, + { + "epoch": 35.01094080051996, + "grad_norm": 0.1972447633743286, + "learning_rate": 6.3339892999211645e-06, + "loss": 0.045, + "step": 163580 + }, + { + "epoch": 35.01099496289877, + "grad_norm": 0.051068224012851715, + "learning_rate": 6.330980278876071e-06, + "loss": 0.0297, + "step": 163590 + }, + { + "epoch": 35.01104912527758, + "grad_norm": 2.1425957679748535, + "learning_rate": 6.327971257830978e-06, + "loss": 0.0301, + "step": 163600 + }, + { + "epoch": 35.0111032876564, + "grad_norm": 1.0301251411437988, + "learning_rate": 6.324962236785885e-06, + "loss": 0.027, + "step": 163610 + }, + { + "epoch": 35.011157450035206, + "grad_norm": 0.0009706023265607655, + "learning_rate": 6.321953215740792e-06, + "loss": 0.0396, + "step": 163620 + }, + { + "epoch": 35.011211612414016, + "grad_norm": 0.0008583295857533813, + "learning_rate": 6.318944194695698e-06, + "loss": 0.0479, + "step": 163630 + }, + { + "epoch": 35.01126577479283, + "grad_norm": 0.0013334720861166716, + "learning_rate": 6.315935173650604e-06, + "loss": 0.0636, + "step": 163640 + }, + { + "epoch": 35.01131993717164, + "grad_norm": 0.0008728119428269565, + "learning_rate": 6.312926152605511e-06, + "loss": 0.0, + "step": 163650 + }, + { + "epoch": 35.01137409955045, + "grad_norm": 0.6549242734909058, + "learning_rate": 6.309917131560418e-06, + "loss": 0.0504, + "step": 163660 + }, + { + "epoch": 35.01142826192926, + "grad_norm": 0.0009457406704314053, + "learning_rate": 6.306908110515325e-06, + "loss": 0.0074, + "step": 163670 + }, + { + "epoch": 35.01148242430808, + "grad_norm": 0.0008674184209667146, + "learning_rate": 6.3038990894702315e-06, + "loss": 0.0105, + "step": 163680 + }, + { + "epoch": 35.01153658668689, + "grad_norm": 0.0008666341309435666, + "learning_rate": 6.300890068425139e-06, + "loss": 0.0515, + "step": 163690 + }, + { + "epoch": 35.0115907490657, + "grad_norm": 5.686366558074951, + "learning_rate": 6.297881047380046e-06, + "loss": 0.0689, + "step": 163700 + }, + { + "epoch": 35.01164491144451, + "grad_norm": 0.000992598826996982, + "learning_rate": 6.294872026334953e-06, + "loss": 0.0202, + "step": 163710 + }, + { + "epoch": 35.01169907382332, + "grad_norm": 0.001108393887989223, + "learning_rate": 6.291863005289859e-06, + "loss": 0.0096, + "step": 163720 + }, + { + "epoch": 35.01175323620213, + "grad_norm": 0.0045509920455515385, + "learning_rate": 6.288853984244766e-06, + "loss": 0.0032, + "step": 163730 + }, + { + "epoch": 35.01180739858095, + "grad_norm": 0.5816704034805298, + "learning_rate": 6.285844963199673e-06, + "loss": 0.0047, + "step": 163740 + }, + { + "epoch": 35.01186156095976, + "grad_norm": 0.5530796051025391, + "learning_rate": 6.2828359421545805e-06, + "loss": 0.0251, + "step": 163750 + }, + { + "epoch": 35.01191572333857, + "grad_norm": 0.0010595869971439242, + "learning_rate": 6.279826921109487e-06, + "loss": 0.0305, + "step": 163760 + }, + { + "epoch": 35.01196988571738, + "grad_norm": 0.10826289653778076, + "learning_rate": 6.276817900064394e-06, + "loss": 0.0008, + "step": 163770 + }, + { + "epoch": 35.012024048096194, + "grad_norm": 1.1545965671539307, + "learning_rate": 6.273808879019299e-06, + "loss": 0.0102, + 
"step": 163780 + }, + { + "epoch": 35.012078210475, + "grad_norm": 0.001067734556272626, + "learning_rate": 6.2707998579742064e-06, + "loss": 0.0431, + "step": 163790 + }, + { + "epoch": 35.01213237285381, + "grad_norm": 0.0010821870528161526, + "learning_rate": 6.2677908369291136e-06, + "loss": 0.0742, + "step": 163800 + }, + { + "epoch": 35.01218653523263, + "grad_norm": 0.00173045857809484, + "learning_rate": 6.26478181588402e-06, + "loss": 0.0091, + "step": 163810 + }, + { + "epoch": 35.01224069761144, + "grad_norm": 0.009196235798299313, + "learning_rate": 6.261772794838927e-06, + "loss": 0.0032, + "step": 163820 + }, + { + "epoch": 35.01229485999025, + "grad_norm": 0.27445095777511597, + "learning_rate": 6.258763773793834e-06, + "loss": 0.1043, + "step": 163830 + }, + { + "epoch": 35.012349022369065, + "grad_norm": 0.42462313175201416, + "learning_rate": 6.255754752748741e-06, + "loss": 0.0167, + "step": 163840 + }, + { + "epoch": 35.012403184747875, + "grad_norm": 0.0667504072189331, + "learning_rate": 6.2527457317036475e-06, + "loss": 0.0312, + "step": 163850 + }, + { + "epoch": 35.012457347126684, + "grad_norm": 2.555194139480591, + "learning_rate": 6.249736710658555e-06, + "loss": 0.0241, + "step": 163860 + }, + { + "epoch": 35.0125115095055, + "grad_norm": 1.9863858222961426, + "learning_rate": 6.246727689613462e-06, + "loss": 0.0205, + "step": 163870 + }, + { + "epoch": 35.01256567188431, + "grad_norm": 0.08001742511987686, + "learning_rate": 6.243718668568369e-06, + "loss": 0.0233, + "step": 163880 + }, + { + "epoch": 35.01261983426312, + "grad_norm": 0.0008480778196826577, + "learning_rate": 6.240709647523275e-06, + "loss": 0.0601, + "step": 163890 + }, + { + "epoch": 35.01267399664193, + "grad_norm": 0.0019344660686329007, + "learning_rate": 6.237700626478181e-06, + "loss": 0.0067, + "step": 163900 + }, + { + "epoch": 35.012728159020746, + "grad_norm": 0.31349194049835205, + "learning_rate": 6.2346916054330885e-06, + "loss": 0.0289, + "step": 163910 + }, + { + "epoch": 35.012782321399555, + "grad_norm": 0.0008492623455822468, + "learning_rate": 6.231682584387996e-06, + "loss": 0.0263, + "step": 163920 + }, + { + "epoch": 35.012836483778365, + "grad_norm": 1.4301961660385132, + "learning_rate": 6.228673563342903e-06, + "loss": 0.0623, + "step": 163930 + }, + { + "epoch": 35.01289064615718, + "grad_norm": 0.0008773687877692282, + "learning_rate": 6.225664542297809e-06, + "loss": 0.006, + "step": 163940 + }, + { + "epoch": 35.01294480853599, + "grad_norm": 0.9927545189857483, + "learning_rate": 6.222655521252716e-06, + "loss": 0.064, + "step": 163950 + }, + { + "epoch": 35.0129989709148, + "grad_norm": 0.001168656861409545, + "learning_rate": 6.219646500207623e-06, + "loss": 0.0176, + "step": 163960 + }, + { + "epoch": 35.01305313329362, + "grad_norm": 0.007212364114820957, + "learning_rate": 6.2166374791625295e-06, + "loss": 0.0606, + "step": 163970 + }, + { + "epoch": 35.01310729567243, + "grad_norm": 0.0013755602994933724, + "learning_rate": 6.213628458117436e-06, + "loss": 0.0003, + "step": 163980 + }, + { + "epoch": 35.013161458051236, + "grad_norm": 0.002090088790282607, + "learning_rate": 6.210619437072343e-06, + "loss": 0.0264, + "step": 163990 + }, + { + "epoch": 35.013215620430046, + "grad_norm": 0.0012353246565908194, + "learning_rate": 6.20761041602725e-06, + "loss": 0.0097, + "step": 164000 + }, + { + "epoch": 35.01326978280886, + "grad_norm": 0.1893192082643509, + "learning_rate": 6.204601394982157e-06, + "loss": 0.0497, + "step": 164010 + }, + { + "epoch": 
35.01332394518767, + "grad_norm": 0.020633049309253693, + "learning_rate": 6.2015923739370635e-06, + "loss": 0.0005, + "step": 164020 + }, + { + "epoch": 35.01337810756648, + "grad_norm": 0.0012939533917233348, + "learning_rate": 6.198583352891971e-06, + "loss": 0.0114, + "step": 164030 + }, + { + "epoch": 35.0134322699453, + "grad_norm": 0.000924178515560925, + "learning_rate": 6.195574331846878e-06, + "loss": 0.0302, + "step": 164040 + }, + { + "epoch": 35.01348643232411, + "grad_norm": 0.001900447765365243, + "learning_rate": 6.192565310801784e-06, + "loss": 0.0029, + "step": 164050 + }, + { + "epoch": 35.01354059470292, + "grad_norm": 0.0017374904127791524, + "learning_rate": 6.189556289756691e-06, + "loss": 0.0642, + "step": 164060 + }, + { + "epoch": 35.013594757081734, + "grad_norm": 0.23625370860099792, + "learning_rate": 6.186547268711597e-06, + "loss": 0.0389, + "step": 164070 + }, + { + "epoch": 35.01364891946054, + "grad_norm": 0.0013701149728149176, + "learning_rate": 6.1835382476665045e-06, + "loss": 0.0261, + "step": 164080 + }, + { + "epoch": 35.01370308183935, + "grad_norm": 0.05985535681247711, + "learning_rate": 6.180529226621412e-06, + "loss": 0.0146, + "step": 164090 + }, + { + "epoch": 35.01375724421817, + "grad_norm": 0.008791310712695122, + "learning_rate": 6.177520205576318e-06, + "loss": 0.0052, + "step": 164100 + }, + { + "epoch": 35.01381140659698, + "grad_norm": 0.001828819396905601, + "learning_rate": 6.174511184531225e-06, + "loss": 0.0205, + "step": 164110 + }, + { + "epoch": 35.01386556897579, + "grad_norm": 0.0010086449328809977, + "learning_rate": 6.171502163486131e-06, + "loss": 0.0, + "step": 164120 + }, + { + "epoch": 35.0139197313546, + "grad_norm": 0.22563078999519348, + "learning_rate": 6.168493142441038e-06, + "loss": 0.0157, + "step": 164130 + }, + { + "epoch": 35.013973893733414, + "grad_norm": 0.0011533878277987242, + "learning_rate": 6.1654841213959455e-06, + "loss": 0.0433, + "step": 164140 + }, + { + "epoch": 35.014028056112224, + "grad_norm": 4.640152931213379, + "learning_rate": 6.162475100350852e-06, + "loss": 0.0981, + "step": 164150 + }, + { + "epoch": 35.01408221849103, + "grad_norm": 1.7837581634521484, + "learning_rate": 6.159466079305759e-06, + "loss": 0.0522, + "step": 164160 + }, + { + "epoch": 35.01413638086985, + "grad_norm": 0.0010370417730882764, + "learning_rate": 6.156457058260666e-06, + "loss": 0.0403, + "step": 164170 + }, + { + "epoch": 35.01419054324866, + "grad_norm": 0.0018155797151848674, + "learning_rate": 6.153448037215573e-06, + "loss": 0.0139, + "step": 164180 + }, + { + "epoch": 35.01424470562747, + "grad_norm": 0.0008430928573943675, + "learning_rate": 6.1504390161704794e-06, + "loss": 0.0383, + "step": 164190 + }, + { + "epoch": 35.014298868006286, + "grad_norm": 0.4851064682006836, + "learning_rate": 6.147429995125386e-06, + "loss": 0.047, + "step": 164200 + }, + { + "epoch": 35.014353030385095, + "grad_norm": 0.0010692916112020612, + "learning_rate": 6.144420974080293e-06, + "loss": 0.0009, + "step": 164210 + }, + { + "epoch": 35.014407192763905, + "grad_norm": 0.003231313079595566, + "learning_rate": 6.1414119530352e-06, + "loss": 0.0261, + "step": 164220 + }, + { + "epoch": 35.01446135514272, + "grad_norm": 0.0030178253073245287, + "learning_rate": 6.138402931990106e-06, + "loss": 0.1381, + "step": 164230 + }, + { + "epoch": 35.01451551752153, + "grad_norm": 0.002050436567515135, + "learning_rate": 6.135393910945013e-06, + "loss": 0.0, + "step": 164240 + }, + { + "epoch": 35.01456967990034, + "grad_norm": 
0.0010877307504415512, + "learning_rate": 6.1323848898999205e-06, + "loss": 0.052, + "step": 164250 + }, + { + "epoch": 35.01462384227915, + "grad_norm": 2.404799222946167, + "learning_rate": 6.129375868854828e-06, + "loss": 0.0209, + "step": 164260 + }, + { + "epoch": 35.014678004657966, + "grad_norm": 0.16562582552433014, + "learning_rate": 6.126366847809734e-06, + "loss": 0.0219, + "step": 164270 + }, + { + "epoch": 35.014732167036776, + "grad_norm": 0.006918993778526783, + "learning_rate": 6.12335782676464e-06, + "loss": 0.0291, + "step": 164280 + }, + { + "epoch": 35.014786329415585, + "grad_norm": 0.0011656853603199124, + "learning_rate": 6.120348805719547e-06, + "loss": 0.0283, + "step": 164290 + }, + { + "epoch": 35.0148404917944, + "grad_norm": 0.0008925202419050038, + "learning_rate": 6.117339784674454e-06, + "loss": 0.0694, + "step": 164300 + }, + { + "epoch": 35.01489465417321, + "grad_norm": 0.0011527262395247817, + "learning_rate": 6.1143307636293615e-06, + "loss": 0.005, + "step": 164310 + }, + { + "epoch": 35.01494881655202, + "grad_norm": 7.3522820472717285, + "learning_rate": 6.111321742584268e-06, + "loss": 0.1027, + "step": 164320 + }, + { + "epoch": 35.01500297893084, + "grad_norm": 0.0008990111527964473, + "learning_rate": 6.108312721539175e-06, + "loss": 0.1314, + "step": 164330 + }, + { + "epoch": 35.01505714130965, + "grad_norm": 0.002149339998140931, + "learning_rate": 6.105303700494081e-06, + "loss": 0.0861, + "step": 164340 + }, + { + "epoch": 35.01511130368846, + "grad_norm": 0.05980107560753822, + "learning_rate": 6.102294679448988e-06, + "loss": 0.004, + "step": 164350 + }, + { + "epoch": 35.015165466067266, + "grad_norm": 1.428114414215088, + "learning_rate": 6.0992856584038954e-06, + "loss": 0.0246, + "step": 164360 + }, + { + "epoch": 35.01521962844608, + "grad_norm": 3.9196102619171143, + "learning_rate": 6.096276637358802e-06, + "loss": 0.013, + "step": 164370 + }, + { + "epoch": 35.01527379082489, + "grad_norm": 0.0008671802352182567, + "learning_rate": 6.093267616313709e-06, + "loss": 0.0014, + "step": 164380 + }, + { + "epoch": 35.0153279532037, + "grad_norm": 0.6964612007141113, + "learning_rate": 6.090258595268616e-06, + "loss": 0.0152, + "step": 164390 + }, + { + "epoch": 35.01538211558252, + "grad_norm": 0.0009135348373092711, + "learning_rate": 6.087249574223522e-06, + "loss": 0.0053, + "step": 164400 + }, + { + "epoch": 35.01543627796133, + "grad_norm": 0.0008376957848668098, + "learning_rate": 6.084240553178429e-06, + "loss": 0.0121, + "step": 164410 + }, + { + "epoch": 35.01549044034014, + "grad_norm": 0.0010626454604789615, + "learning_rate": 6.081231532133336e-06, + "loss": 0.0244, + "step": 164420 + }, + { + "epoch": 35.015544602718954, + "grad_norm": 0.0441637746989727, + "learning_rate": 6.078222511088243e-06, + "loss": 0.0248, + "step": 164430 + }, + { + "epoch": 35.015598765097764, + "grad_norm": 0.0008798002381809056, + "learning_rate": 6.07521349004315e-06, + "loss": 0.0022, + "step": 164440 + }, + { + "epoch": 35.01565292747657, + "grad_norm": 0.05445652827620506, + "learning_rate": 6.072204468998056e-06, + "loss": 0.0201, + "step": 164450 + }, + { + "epoch": 35.01570708985539, + "grad_norm": 0.002951786620542407, + "learning_rate": 6.069195447952963e-06, + "loss": 0.0439, + "step": 164460 + }, + { + "epoch": 35.0157612522342, + "grad_norm": 0.004036530386656523, + "learning_rate": 6.06618642690787e-06, + "loss": 0.0212, + "step": 164470 + }, + { + "epoch": 35.01581541461301, + "grad_norm": 0.005460288375616074, + "learning_rate": 
6.0631774058627775e-06, + "loss": 0.0112, + "step": 164480 + }, + { + "epoch": 35.01586957699182, + "grad_norm": 0.022058280184864998, + "learning_rate": 6.060168384817684e-06, + "loss": 0.0357, + "step": 164490 + }, + { + "epoch": 35.015923739370635, + "grad_norm": 0.0022025322541594505, + "learning_rate": 6.05715936377259e-06, + "loss": 0.0426, + "step": 164500 + }, + { + "epoch": 35.015977901749444, + "grad_norm": 2.671269655227661, + "learning_rate": 6.054150342727497e-06, + "loss": 0.0266, + "step": 164510 + }, + { + "epoch": 35.016032064128254, + "grad_norm": 0.0013102812226861715, + "learning_rate": 6.051141321682404e-06, + "loss": 0.008, + "step": 164520 + }, + { + "epoch": 35.01608622650707, + "grad_norm": 0.0010854777647182345, + "learning_rate": 6.0481323006373106e-06, + "loss": 0.0189, + "step": 164530 + }, + { + "epoch": 35.01614038888588, + "grad_norm": 0.08092061430215836, + "learning_rate": 6.045123279592218e-06, + "loss": 0.0451, + "step": 164540 + }, + { + "epoch": 35.01619455126469, + "grad_norm": 0.0011158823035657406, + "learning_rate": 6.042114258547125e-06, + "loss": 0.0654, + "step": 164550 + }, + { + "epoch": 35.016248713643506, + "grad_norm": 0.0011834966717287898, + "learning_rate": 6.039105237502032e-06, + "loss": 0.0037, + "step": 164560 + }, + { + "epoch": 35.016302876022316, + "grad_norm": 0.0008613590616732836, + "learning_rate": 6.036096216456938e-06, + "loss": 0.149, + "step": 164570 + }, + { + "epoch": 35.016357038401125, + "grad_norm": 0.013331834226846695, + "learning_rate": 6.0330871954118445e-06, + "loss": 0.0493, + "step": 164580 + }, + { + "epoch": 35.01641120077994, + "grad_norm": 0.0010580122470855713, + "learning_rate": 6.030078174366752e-06, + "loss": 0.0079, + "step": 164590 + }, + { + "epoch": 35.01646536315875, + "grad_norm": 0.001179876271635294, + "learning_rate": 6.027069153321659e-06, + "loss": 0.0065, + "step": 164600 + }, + { + "epoch": 35.01651952553756, + "grad_norm": 0.027023496106266975, + "learning_rate": 6.024060132276566e-06, + "loss": 0.0175, + "step": 164610 + }, + { + "epoch": 35.01657368791637, + "grad_norm": 0.004135197028517723, + "learning_rate": 6.021051111231472e-06, + "loss": 0.0648, + "step": 164620 + }, + { + "epoch": 35.01662785029519, + "grad_norm": 0.0016804917249828577, + "learning_rate": 6.018042090186379e-06, + "loss": 0.016, + "step": 164630 + }, + { + "epoch": 35.016682012673996, + "grad_norm": 0.0008882734109647572, + "learning_rate": 6.0150330691412855e-06, + "loss": 0.0072, + "step": 164640 + }, + { + "epoch": 35.016736175052806, + "grad_norm": 0.0010727797634899616, + "learning_rate": 6.012024048096193e-06, + "loss": 0.1073, + "step": 164650 + }, + { + "epoch": 35.01679033743162, + "grad_norm": 0.0010935396421700716, + "learning_rate": 6.009015027051099e-06, + "loss": 0.0275, + "step": 164660 + }, + { + "epoch": 35.01684449981043, + "grad_norm": 0.004522240720689297, + "learning_rate": 6.006006006006006e-06, + "loss": 0.0257, + "step": 164670 + }, + { + "epoch": 35.01689866218924, + "grad_norm": 0.0011049800086766481, + "learning_rate": 6.002996984960913e-06, + "loss": 0.0001, + "step": 164680 + }, + { + "epoch": 35.01695282456806, + "grad_norm": 0.0008957021054811776, + "learning_rate": 5.99998796391582e-06, + "loss": 0.02, + "step": 164690 + }, + { + "epoch": 35.01700698694687, + "grad_norm": 0.0028522908687591553, + "learning_rate": 5.9969789428707266e-06, + "loss": 0.0248, + "step": 164700 + }, + { + "epoch": 35.01706114932568, + "grad_norm": 0.049699537456035614, + "learning_rate": 
5.993969921825634e-06, + "loss": 0.0243, + "step": 164710 + }, + { + "epoch": 35.01711531170449, + "grad_norm": 0.0008425737032666802, + "learning_rate": 5.99096090078054e-06, + "loss": 0.024, + "step": 164720 + }, + { + "epoch": 35.0171694740833, + "grad_norm": 0.003784437896683812, + "learning_rate": 5.987951879735447e-06, + "loss": 0.0104, + "step": 164730 + }, + { + "epoch": 35.01722363646211, + "grad_norm": 1.2630462646484375, + "learning_rate": 5.984942858690354e-06, + "loss": 0.0182, + "step": 164740 + }, + { + "epoch": 35.01727779884092, + "grad_norm": 0.9670185446739197, + "learning_rate": 5.9819338376452605e-06, + "loss": 0.0408, + "step": 164750 + }, + { + "epoch": 35.01733196121974, + "grad_norm": 0.000846049515530467, + "learning_rate": 5.978924816600168e-06, + "loss": 0.0087, + "step": 164760 + }, + { + "epoch": 35.01738612359855, + "grad_norm": 1.019283652305603, + "learning_rate": 5.975915795555075e-06, + "loss": 0.009, + "step": 164770 + }, + { + "epoch": 35.01744028597736, + "grad_norm": 0.4557885527610779, + "learning_rate": 5.972906774509982e-06, + "loss": 0.0223, + "step": 164780 + }, + { + "epoch": 35.017494448356175, + "grad_norm": 0.0009093937696889043, + "learning_rate": 5.969897753464888e-06, + "loss": 0.0538, + "step": 164790 + }, + { + "epoch": 35.017548610734984, + "grad_norm": 0.0022654635831713676, + "learning_rate": 5.966888732419794e-06, + "loss": 0.0558, + "step": 164800 + }, + { + "epoch": 35.01760277311379, + "grad_norm": 0.23045608401298523, + "learning_rate": 5.9638797113747015e-06, + "loss": 0.0009, + "step": 164810 + }, + { + "epoch": 35.01765693549261, + "grad_norm": 1.6069413423538208, + "learning_rate": 5.960870690329609e-06, + "loss": 0.1382, + "step": 164820 + }, + { + "epoch": 35.01771109787142, + "grad_norm": 1.8790396451950073, + "learning_rate": 5.957861669284515e-06, + "loss": 0.0633, + "step": 164830 + }, + { + "epoch": 35.01776526025023, + "grad_norm": 0.02962169423699379, + "learning_rate": 5.954852648239422e-06, + "loss": 0.0402, + "step": 164840 + }, + { + "epoch": 35.01781942262904, + "grad_norm": 0.013107817620038986, + "learning_rate": 5.951843627194329e-06, + "loss": 0.0342, + "step": 164850 + }, + { + "epoch": 35.017873585007855, + "grad_norm": 0.0008942993590608239, + "learning_rate": 5.948834606149236e-06, + "loss": 0.0899, + "step": 164860 + }, + { + "epoch": 35.017927747386665, + "grad_norm": 0.011889890767633915, + "learning_rate": 5.9458255851041425e-06, + "loss": 0.0502, + "step": 164870 + }, + { + "epoch": 35.017981909765474, + "grad_norm": 0.12306447327136993, + "learning_rate": 5.942816564059049e-06, + "loss": 0.066, + "step": 164880 + }, + { + "epoch": 35.01803607214429, + "grad_norm": 1.9906655550003052, + "learning_rate": 5.939807543013956e-06, + "loss": 0.0279, + "step": 164890 + }, + { + "epoch": 35.0180902345231, + "grad_norm": 0.0009014632669277489, + "learning_rate": 5.936798521968863e-06, + "loss": 0.0339, + "step": 164900 + }, + { + "epoch": 35.01814439690191, + "grad_norm": 0.8796550631523132, + "learning_rate": 5.93378950092377e-06, + "loss": 0.0827, + "step": 164910 + }, + { + "epoch": 35.01819855928073, + "grad_norm": 0.0012194191804155707, + "learning_rate": 5.9307804798786765e-06, + "loss": 0.019, + "step": 164920 + }, + { + "epoch": 35.018252721659536, + "grad_norm": 0.21319085359573364, + "learning_rate": 5.927771458833584e-06, + "loss": 0.0757, + "step": 164930 + }, + { + "epoch": 35.018306884038346, + "grad_norm": 0.9598028659820557, + "learning_rate": 5.92476243778849e-06, + "loss": 0.1375, + "step": 
164940 + }, + { + "epoch": 35.01836104641716, + "grad_norm": 0.0008572666556574404, + "learning_rate": 5.921753416743397e-06, + "loss": 0.0002, + "step": 164950 + }, + { + "epoch": 35.01841520879597, + "grad_norm": 0.0009264145628549159, + "learning_rate": 5.918744395698303e-06, + "loss": 0.0295, + "step": 164960 + }, + { + "epoch": 35.01846937117478, + "grad_norm": 1.0770790576934814, + "learning_rate": 5.91573537465321e-06, + "loss": 0.0387, + "step": 164970 + }, + { + "epoch": 35.01852353355359, + "grad_norm": 0.0012045156909152865, + "learning_rate": 5.9127263536081175e-06, + "loss": 0.0204, + "step": 164980 + }, + { + "epoch": 35.01857769593241, + "grad_norm": 1.0691940784454346, + "learning_rate": 5.909717332563025e-06, + "loss": 0.022, + "step": 164990 + }, + { + "epoch": 35.01863185831122, + "grad_norm": 0.0010193617781624198, + "learning_rate": 5.906708311517931e-06, + "loss": 0.0084, + "step": 165000 + }, + { + "epoch": 35.018686020690026, + "grad_norm": 0.9703909158706665, + "learning_rate": 5.903699290472838e-06, + "loss": 0.1702, + "step": 165010 + }, + { + "epoch": 35.01874018306884, + "grad_norm": 0.0011772096622735262, + "learning_rate": 5.900690269427744e-06, + "loss": 0.0581, + "step": 165020 + }, + { + "epoch": 35.01879434544765, + "grad_norm": 0.06127498298883438, + "learning_rate": 5.897681248382651e-06, + "loss": 0.0745, + "step": 165030 + }, + { + "epoch": 35.01884850782646, + "grad_norm": 0.0031068797688931227, + "learning_rate": 5.8946722273375585e-06, + "loss": 0.0168, + "step": 165040 + }, + { + "epoch": 35.01890267020528, + "grad_norm": 0.2650218605995178, + "learning_rate": 5.891663206292465e-06, + "loss": 0.0267, + "step": 165050 + }, + { + "epoch": 35.01895683258409, + "grad_norm": 0.002560395048931241, + "learning_rate": 5.888654185247372e-06, + "loss": 0.0001, + "step": 165060 + }, + { + "epoch": 35.0190109949629, + "grad_norm": 1.6165311336517334, + "learning_rate": 5.885645164202279e-06, + "loss": 0.0148, + "step": 165070 + }, + { + "epoch": 35.01906515734171, + "grad_norm": 1.133992075920105, + "learning_rate": 5.882636143157186e-06, + "loss": 0.0392, + "step": 165080 + }, + { + "epoch": 35.019119319720524, + "grad_norm": 3.4908597469329834, + "learning_rate": 5.879627122112092e-06, + "loss": 0.2084, + "step": 165090 + }, + { + "epoch": 35.01917348209933, + "grad_norm": 0.004740417003631592, + "learning_rate": 5.876618101066999e-06, + "loss": 0.0425, + "step": 165100 + }, + { + "epoch": 35.01922764447814, + "grad_norm": 0.0008783622179180384, + "learning_rate": 5.873609080021906e-06, + "loss": 0.0302, + "step": 165110 + }, + { + "epoch": 35.01928180685696, + "grad_norm": 0.0009526798967272043, + "learning_rate": 5.870600058976813e-06, + "loss": 0.055, + "step": 165120 + }, + { + "epoch": 35.01933596923577, + "grad_norm": 0.0012370400363579392, + "learning_rate": 5.867591037931719e-06, + "loss": 0.0103, + "step": 165130 + }, + { + "epoch": 35.01939013161458, + "grad_norm": 0.001534046372398734, + "learning_rate": 5.864582016886626e-06, + "loss": 0.013, + "step": 165140 + }, + { + "epoch": 35.019444293993395, + "grad_norm": 0.0009096178691834211, + "learning_rate": 5.8615729958415335e-06, + "loss": 0.0229, + "step": 165150 + }, + { + "epoch": 35.019498456372204, + "grad_norm": 0.12017222493886948, + "learning_rate": 5.858563974796441e-06, + "loss": 0.0332, + "step": 165160 + }, + { + "epoch": 35.019552618751014, + "grad_norm": 1.077867031097412, + "learning_rate": 5.855554953751347e-06, + "loss": 0.022, + "step": 165170 + }, + { + "epoch": 
35.01960678112983, + "grad_norm": 0.0020074662752449512, + "learning_rate": 5.852545932706253e-06, + "loss": 0.0151, + "step": 165180 + }, + { + "epoch": 35.01966094350864, + "grad_norm": 0.0010210872860625386, + "learning_rate": 5.84953691166116e-06, + "loss": 0.044, + "step": 165190 + }, + { + "epoch": 35.01971510588745, + "grad_norm": 0.05913989245891571, + "learning_rate": 5.846527890616067e-06, + "loss": 0.0469, + "step": 165200 + }, + { + "epoch": 35.01976926826626, + "grad_norm": 4.192391872406006, + "learning_rate": 5.8435188695709745e-06, + "loss": 0.1015, + "step": 165210 + }, + { + "epoch": 35.019823430645076, + "grad_norm": 1.0844063758850098, + "learning_rate": 5.840509848525881e-06, + "loss": 0.0173, + "step": 165220 + }, + { + "epoch": 35.019877593023885, + "grad_norm": 0.0008800249779596925, + "learning_rate": 5.837500827480788e-06, + "loss": 0.0015, + "step": 165230 + }, + { + "epoch": 35.019931755402695, + "grad_norm": 0.11622493714094162, + "learning_rate": 5.834491806435694e-06, + "loss": 0.0073, + "step": 165240 + }, + { + "epoch": 35.01998591778151, + "grad_norm": 0.0034038012381643057, + "learning_rate": 5.831482785390601e-06, + "loss": 0.017, + "step": 165250 + }, + { + "epoch": 35.02004008016032, + "grad_norm": 0.25480225682258606, + "learning_rate": 5.828473764345508e-06, + "loss": 0.0204, + "step": 165260 + }, + { + "epoch": 35.02009424253913, + "grad_norm": 0.0011869880836457014, + "learning_rate": 5.825464743300415e-06, + "loss": 0.0797, + "step": 165270 + }, + { + "epoch": 35.02014840491795, + "grad_norm": 0.000979884178377688, + "learning_rate": 5.822455722255322e-06, + "loss": 0.0688, + "step": 165280 + }, + { + "epoch": 35.02020256729676, + "grad_norm": 0.0011634220136329532, + "learning_rate": 5.819446701210229e-06, + "loss": 0.0071, + "step": 165290 + }, + { + "epoch": 35.020256729675566, + "grad_norm": 1.3838015794754028, + "learning_rate": 5.816437680165135e-06, + "loss": 0.0561, + "step": 165300 + }, + { + "epoch": 35.020310892054376, + "grad_norm": 0.31026360392570496, + "learning_rate": 5.813428659120042e-06, + "loss": 0.0105, + "step": 165310 + }, + { + "epoch": 35.02036505443319, + "grad_norm": 0.0013622931437566876, + "learning_rate": 5.810419638074949e-06, + "loss": 0.0394, + "step": 165320 + }, + { + "epoch": 35.020419216812, + "grad_norm": 0.17699581384658813, + "learning_rate": 5.807410617029856e-06, + "loss": 0.0068, + "step": 165330 + }, + { + "epoch": 35.02047337919081, + "grad_norm": 0.6602747440338135, + "learning_rate": 5.804401595984763e-06, + "loss": 0.0187, + "step": 165340 + }, + { + "epoch": 35.02052754156963, + "grad_norm": 0.0009495517006143928, + "learning_rate": 5.801392574939669e-06, + "loss": 0.12, + "step": 165350 + }, + { + "epoch": 35.02058170394844, + "grad_norm": 0.001226150430738926, + "learning_rate": 5.798383553894576e-06, + "loss": 0.03, + "step": 165360 + }, + { + "epoch": 35.02063586632725, + "grad_norm": 0.11287061125040054, + "learning_rate": 5.795374532849483e-06, + "loss": 0.0569, + "step": 165370 + }, + { + "epoch": 35.02069002870606, + "grad_norm": 0.5774153470993042, + "learning_rate": 5.7923655118043905e-06, + "loss": 0.0701, + "step": 165380 + }, + { + "epoch": 35.02074419108487, + "grad_norm": 3.00010085105896, + "learning_rate": 5.789356490759296e-06, + "loss": 0.0556, + "step": 165390 + }, + { + "epoch": 35.02079835346368, + "grad_norm": 0.0021163190249353647, + "learning_rate": 5.786347469714203e-06, + "loss": 0.0123, + "step": 165400 + }, + { + "epoch": 35.0208525158425, + "grad_norm": 
0.04378902539610863, + "learning_rate": 5.78333844866911e-06, + "loss": 0.0093, + "step": 165410 + }, + { + "epoch": 35.02090667822131, + "grad_norm": 0.3257392346858978, + "learning_rate": 5.780329427624017e-06, + "loss": 0.0037, + "step": 165420 + }, + { + "epoch": 35.02096084060012, + "grad_norm": 3.13234281539917, + "learning_rate": 5.7773204065789236e-06, + "loss": 0.0467, + "step": 165430 + }, + { + "epoch": 35.02101500297893, + "grad_norm": 0.17707793414592743, + "learning_rate": 5.774311385533831e-06, + "loss": 0.0764, + "step": 165440 + }, + { + "epoch": 35.021069165357744, + "grad_norm": 0.45229604840278625, + "learning_rate": 5.771302364488738e-06, + "loss": 0.0037, + "step": 165450 + }, + { + "epoch": 35.021123327736554, + "grad_norm": 0.4686124622821808, + "learning_rate": 5.768293343443645e-06, + "loss": 0.0125, + "step": 165460 + }, + { + "epoch": 35.02117749011536, + "grad_norm": 0.0012712825555354357, + "learning_rate": 5.765284322398551e-06, + "loss": 0.0882, + "step": 165470 + }, + { + "epoch": 35.02123165249418, + "grad_norm": 0.0009797755628824234, + "learning_rate": 5.7622753013534575e-06, + "loss": 0.0003, + "step": 165480 + }, + { + "epoch": 35.02128581487299, + "grad_norm": 0.8442863821983337, + "learning_rate": 5.759266280308365e-06, + "loss": 0.0498, + "step": 165490 + }, + { + "epoch": 35.0213399772518, + "grad_norm": 0.0009185427334159613, + "learning_rate": 5.756257259263272e-06, + "loss": 0.0026, + "step": 165500 + }, + { + "epoch": 35.021394139630615, + "grad_norm": 0.0027460134588181973, + "learning_rate": 5.753248238218179e-06, + "loss": 0.0578, + "step": 165510 + }, + { + "epoch": 35.021448302009425, + "grad_norm": 0.002878656378015876, + "learning_rate": 5.750239217173085e-06, + "loss": 0.0224, + "step": 165520 + }, + { + "epoch": 35.021502464388234, + "grad_norm": 0.003830264089629054, + "learning_rate": 5.747230196127992e-06, + "loss": 0.0343, + "step": 165530 + }, + { + "epoch": 35.02155662676705, + "grad_norm": 0.10358764976263046, + "learning_rate": 5.7442211750828985e-06, + "loss": 0.0434, + "step": 165540 + }, + { + "epoch": 35.02161078914586, + "grad_norm": 0.19573886692523956, + "learning_rate": 5.741212154037806e-06, + "loss": 0.0147, + "step": 165550 + }, + { + "epoch": 35.02166495152467, + "grad_norm": 0.3019467890262604, + "learning_rate": 5.738203132992712e-06, + "loss": 0.0242, + "step": 165560 + }, + { + "epoch": 35.02171911390348, + "grad_norm": 0.022954415529966354, + "learning_rate": 5.735194111947619e-06, + "loss": 0.0323, + "step": 165570 + }, + { + "epoch": 35.021773276282296, + "grad_norm": 1.5582466125488281, + "learning_rate": 5.732185090902526e-06, + "loss": 0.0294, + "step": 165580 + }, + { + "epoch": 35.021827438661106, + "grad_norm": 0.0011055844370275736, + "learning_rate": 5.729176069857433e-06, + "loss": 0.0348, + "step": 165590 + }, + { + "epoch": 35.021881601039915, + "grad_norm": 0.014112969860434532, + "learning_rate": 5.7261670488123396e-06, + "loss": 0.0464, + "step": 165600 + }, + { + "epoch": 35.02193576341873, + "grad_norm": 0.0009015818941406906, + "learning_rate": 5.723158027767246e-06, + "loss": 0.1054, + "step": 165610 + }, + { + "epoch": 35.02198992579754, + "grad_norm": 0.0008505780133418739, + "learning_rate": 5.720149006722153e-06, + "loss": 0.0082, + "step": 165620 + }, + { + "epoch": 35.02204408817635, + "grad_norm": 1.8304646015167236, + "learning_rate": 5.71713998567706e-06, + "loss": 0.0314, + "step": 165630 + }, + { + "epoch": 35.02209825055517, + "grad_norm": 0.0020350629929453135, + 
"learning_rate": 5.714130964631967e-06, + "loss": 0.0389, + "step": 165640 + }, + { + "epoch": 35.02215241293398, + "grad_norm": 0.001122738467529416, + "learning_rate": 5.7111219435868735e-06, + "loss": 0.0425, + "step": 165650 + }, + { + "epoch": 35.02220657531279, + "grad_norm": 0.0008773499284870923, + "learning_rate": 5.708112922541781e-06, + "loss": 0.0205, + "step": 165660 + }, + { + "epoch": 35.022260737691596, + "grad_norm": 0.18567433953285217, + "learning_rate": 5.705103901496688e-06, + "loss": 0.0012, + "step": 165670 + }, + { + "epoch": 35.02231490007041, + "grad_norm": 1.0840462446212769, + "learning_rate": 5.702094880451595e-06, + "loss": 0.0117, + "step": 165680 + }, + { + "epoch": 35.02236906244922, + "grad_norm": 0.5145244002342224, + "learning_rate": 5.6990858594065e-06, + "loss": 0.0625, + "step": 165690 + }, + { + "epoch": 35.02242322482803, + "grad_norm": 1.0744613409042358, + "learning_rate": 5.696076838361407e-06, + "loss": 0.035, + "step": 165700 + }, + { + "epoch": 35.02247738720685, + "grad_norm": 0.008757632225751877, + "learning_rate": 5.6930678173163145e-06, + "loss": 0.0065, + "step": 165710 + }, + { + "epoch": 35.02253154958566, + "grad_norm": 0.0008852204191498458, + "learning_rate": 5.690058796271222e-06, + "loss": 0.0257, + "step": 165720 + }, + { + "epoch": 35.02258571196447, + "grad_norm": 0.0010152928298339248, + "learning_rate": 5.687049775226128e-06, + "loss": 0.109, + "step": 165730 + }, + { + "epoch": 35.022639874343284, + "grad_norm": 0.05858486518263817, + "learning_rate": 5.684040754181035e-06, + "loss": 0.0109, + "step": 165740 + }, + { + "epoch": 35.02269403672209, + "grad_norm": 0.0009312969632446766, + "learning_rate": 5.681031733135942e-06, + "loss": 0.0007, + "step": 165750 + }, + { + "epoch": 35.0227481991009, + "grad_norm": 2.555351972579956, + "learning_rate": 5.678022712090848e-06, + "loss": 0.0493, + "step": 165760 + }, + { + "epoch": 35.02280236147972, + "grad_norm": 0.006306706927716732, + "learning_rate": 5.6750136910457555e-06, + "loss": 0.0285, + "step": 165770 + }, + { + "epoch": 35.02285652385853, + "grad_norm": 0.14150679111480713, + "learning_rate": 5.672004670000662e-06, + "loss": 0.0267, + "step": 165780 + }, + { + "epoch": 35.02291068623734, + "grad_norm": 0.0008928208844736218, + "learning_rate": 5.668995648955569e-06, + "loss": 0.0694, + "step": 165790 + }, + { + "epoch": 35.02296484861615, + "grad_norm": 0.0013062244979664683, + "learning_rate": 5.665986627910476e-06, + "loss": 0.0335, + "step": 165800 + }, + { + "epoch": 35.023019010994965, + "grad_norm": 0.0013802499743178487, + "learning_rate": 5.662977606865383e-06, + "loss": 0.0154, + "step": 165810 + }, + { + "epoch": 35.023073173373774, + "grad_norm": 0.0011230105301365256, + "learning_rate": 5.6599685858202895e-06, + "loss": 0.0357, + "step": 165820 + }, + { + "epoch": 35.023127335752584, + "grad_norm": 0.0009375066147185862, + "learning_rate": 5.656959564775197e-06, + "loss": 0.0253, + "step": 165830 + }, + { + "epoch": 35.0231814981314, + "grad_norm": 0.000903831678442657, + "learning_rate": 5.653950543730103e-06, + "loss": 0.0212, + "step": 165840 + }, + { + "epoch": 35.02323566051021, + "grad_norm": 0.001995806349441409, + "learning_rate": 5.65094152268501e-06, + "loss": 0.0105, + "step": 165850 + }, + { + "epoch": 35.02328982288902, + "grad_norm": 0.0034934922587126493, + "learning_rate": 5.647932501639916e-06, + "loss": 0.0127, + "step": 165860 + }, + { + "epoch": 35.023343985267836, + "grad_norm": 0.043467216193675995, + "learning_rate": 
5.644923480594823e-06, + "loss": 0.0117, + "step": 165870 + }, + { + "epoch": 35.023398147646645, + "grad_norm": 0.016438690945506096, + "learning_rate": 5.6419144595497305e-06, + "loss": 0.005, + "step": 165880 + }, + { + "epoch": 35.023452310025455, + "grad_norm": 1.0639598369598389, + "learning_rate": 5.638905438504638e-06, + "loss": 0.1011, + "step": 165890 + }, + { + "epoch": 35.02350647240427, + "grad_norm": 0.004754016175866127, + "learning_rate": 5.635896417459544e-06, + "loss": 0.006, + "step": 165900 + }, + { + "epoch": 35.02356063478308, + "grad_norm": 0.0008804135140962899, + "learning_rate": 5.63288739641445e-06, + "loss": 0.0008, + "step": 165910 + }, + { + "epoch": 35.02361479716189, + "grad_norm": 0.006684631574898958, + "learning_rate": 5.629878375369357e-06, + "loss": 0.0615, + "step": 165920 + }, + { + "epoch": 35.0236689595407, + "grad_norm": 0.004507068544626236, + "learning_rate": 5.626869354324264e-06, + "loss": 0.0057, + "step": 165930 + }, + { + "epoch": 35.02372312191952, + "grad_norm": 0.0015059357974678278, + "learning_rate": 5.6238603332791715e-06, + "loss": 0.0093, + "step": 165940 + }, + { + "epoch": 35.023777284298326, + "grad_norm": 0.0008406146662309766, + "learning_rate": 5.620851312234078e-06, + "loss": 0.0117, + "step": 165950 + }, + { + "epoch": 35.023831446677136, + "grad_norm": 0.0008782295626588166, + "learning_rate": 5.617842291188985e-06, + "loss": 0.101, + "step": 165960 + }, + { + "epoch": 35.02388560905595, + "grad_norm": 0.0011342441430315375, + "learning_rate": 5.614833270143892e-06, + "loss": 0.0027, + "step": 165970 + }, + { + "epoch": 35.02393977143476, + "grad_norm": 0.38329020142555237, + "learning_rate": 5.611824249098799e-06, + "loss": 0.0469, + "step": 165980 + }, + { + "epoch": 35.02399393381357, + "grad_norm": 0.1455399990081787, + "learning_rate": 5.608815228053705e-06, + "loss": 0.0045, + "step": 165990 + }, + { + "epoch": 35.02404809619239, + "grad_norm": 0.21345464885234833, + "learning_rate": 5.605806207008612e-06, + "loss": 0.0958, + "step": 166000 + }, + { + "epoch": 35.0241022585712, + "grad_norm": 0.001289455802179873, + "learning_rate": 5.602797185963519e-06, + "loss": 0.0354, + "step": 166010 + }, + { + "epoch": 35.02415642095001, + "grad_norm": 0.001148782903328538, + "learning_rate": 5.599788164918426e-06, + "loss": 0.0324, + "step": 166020 + }, + { + "epoch": 35.024210583328816, + "grad_norm": 0.002189560793340206, + "learning_rate": 5.596779143873332e-06, + "loss": 0.0218, + "step": 166030 + }, + { + "epoch": 35.02426474570763, + "grad_norm": 1.0690537691116333, + "learning_rate": 5.593770122828239e-06, + "loss": 0.0247, + "step": 166040 + }, + { + "epoch": 35.02431890808644, + "grad_norm": 0.0016528291162103415, + "learning_rate": 5.5907611017831465e-06, + "loss": 0.0446, + "step": 166050 + }, + { + "epoch": 35.02437307046525, + "grad_norm": 0.0010822732001543045, + "learning_rate": 5.587752080738053e-06, + "loss": 0.0215, + "step": 166060 + }, + { + "epoch": 35.02442723284407, + "grad_norm": 0.32335761189460754, + "learning_rate": 5.58474305969296e-06, + "loss": 0.0181, + "step": 166070 + }, + { + "epoch": 35.02448139522288, + "grad_norm": 0.5167683362960815, + "learning_rate": 5.581734038647866e-06, + "loss": 0.0811, + "step": 166080 + }, + { + "epoch": 35.02453555760169, + "grad_norm": 0.0008397666970267892, + "learning_rate": 5.578725017602773e-06, + "loss": 0.0737, + "step": 166090 + }, + { + "epoch": 35.024589719980504, + "grad_norm": 0.44105851650238037, + "learning_rate": 5.57571599655768e-06, + "loss": 
0.0651, + "step": 166100 + }, + { + "epoch": 35.024643882359314, + "grad_norm": 0.0034381139557808638, + "learning_rate": 5.5727069755125875e-06, + "loss": 0.03, + "step": 166110 + }, + { + "epoch": 35.02469804473812, + "grad_norm": 0.3802882432937622, + "learning_rate": 5.569697954467494e-06, + "loss": 0.0511, + "step": 166120 + }, + { + "epoch": 35.02475220711694, + "grad_norm": 0.0009053401299752295, + "learning_rate": 5.566688933422401e-06, + "loss": 0.0381, + "step": 166130 + }, + { + "epoch": 35.02480636949575, + "grad_norm": 0.001493169809691608, + "learning_rate": 5.563679912377307e-06, + "loss": 0.0906, + "step": 166140 + }, + { + "epoch": 35.02486053187456, + "grad_norm": 0.9471842050552368, + "learning_rate": 5.560670891332214e-06, + "loss": 0.008, + "step": 166150 + }, + { + "epoch": 35.02491469425337, + "grad_norm": 0.06512938439846039, + "learning_rate": 5.557661870287121e-06, + "loss": 0.0294, + "step": 166160 + }, + { + "epoch": 35.024968856632185, + "grad_norm": 1.8288079500198364, + "learning_rate": 5.554652849242028e-06, + "loss": 0.0421, + "step": 166170 + }, + { + "epoch": 35.02500135405947, + "eval_accuracy": 0.8249510124101894, + "eval_loss": 1.0526683330535889, + "eval_runtime": 116.0832, + "eval_samples_per_second": 26.378, + "eval_steps_per_second": 3.299, + "step": 166176 + }, + { + "epoch": 36.000021664951525, + "grad_norm": 1.9787367582321167, + "learning_rate": 5.551643828196935e-06, + "loss": 0.0338, + "step": 166180 + }, + { + "epoch": 36.000075827330335, + "grad_norm": 0.07117442041635513, + "learning_rate": 5.548634807151842e-06, + "loss": 0.0086, + "step": 166190 + }, + { + "epoch": 36.00012998970915, + "grad_norm": 0.0008568482007831335, + "learning_rate": 5.545625786106748e-06, + "loss": 0.0001, + "step": 166200 + }, + { + "epoch": 36.00018415208796, + "grad_norm": 0.0013406906509771943, + "learning_rate": 5.5426167650616545e-06, + "loss": 0.0046, + "step": 166210 + }, + { + "epoch": 36.00023831446677, + "grad_norm": 0.011190798133611679, + "learning_rate": 5.539607744016562e-06, + "loss": 0.0202, + "step": 166220 + }, + { + "epoch": 36.00029247684558, + "grad_norm": 0.0011108339531347156, + "learning_rate": 5.536598722971469e-06, + "loss": 0.0959, + "step": 166230 + }, + { + "epoch": 36.0003466392244, + "grad_norm": 0.000965715735219419, + "learning_rate": 5.533589701926376e-06, + "loss": 0.036, + "step": 166240 + }, + { + "epoch": 36.000400801603206, + "grad_norm": 0.0023709682282060385, + "learning_rate": 5.530580680881282e-06, + "loss": 0.0472, + "step": 166250 + }, + { + "epoch": 36.000454963982015, + "grad_norm": 0.0011077902745455503, + "learning_rate": 5.527571659836189e-06, + "loss": 0.005, + "step": 166260 + }, + { + "epoch": 36.00050912636083, + "grad_norm": 0.0023801391944289207, + "learning_rate": 5.524562638791096e-06, + "loss": 0.0115, + "step": 166270 + }, + { + "epoch": 36.00056328873964, + "grad_norm": 0.0012013759696856141, + "learning_rate": 5.5215536177460035e-06, + "loss": 0.0401, + "step": 166280 + }, + { + "epoch": 36.00061745111845, + "grad_norm": 0.0010844876524060965, + "learning_rate": 5.518544596700909e-06, + "loss": 0.0363, + "step": 166290 + }, + { + "epoch": 36.00067161349727, + "grad_norm": 0.6069196462631226, + "learning_rate": 5.515535575655816e-06, + "loss": 0.0306, + "step": 166300 + }, + { + "epoch": 36.00072577587608, + "grad_norm": 1.1203449964523315, + "learning_rate": 5.512526554610723e-06, + "loss": 0.0192, + "step": 166310 + }, + { + "epoch": 36.00077993825489, + "grad_norm": 0.0008700987091287971, + 
"learning_rate": 5.50951753356563e-06, + "loss": 0.0081, + "step": 166320 + }, + { + "epoch": 36.0008341006337, + "grad_norm": 0.0011219161096960306, + "learning_rate": 5.5065085125205366e-06, + "loss": 0.1045, + "step": 166330 + }, + { + "epoch": 36.00088826301251, + "grad_norm": 0.0011115147499367595, + "learning_rate": 5.503499491475444e-06, + "loss": 0.0172, + "step": 166340 + }, + { + "epoch": 36.00094242539132, + "grad_norm": 1.4538861513137817, + "learning_rate": 5.500490470430351e-06, + "loss": 0.0338, + "step": 166350 + }, + { + "epoch": 36.00099658777013, + "grad_norm": 0.0008442748803645372, + "learning_rate": 5.497481449385257e-06, + "loss": 0.0268, + "step": 166360 + }, + { + "epoch": 36.00105075014895, + "grad_norm": 1.3071833848953247, + "learning_rate": 5.494472428340164e-06, + "loss": 0.0339, + "step": 166370 + }, + { + "epoch": 36.00110491252776, + "grad_norm": 0.12240350246429443, + "learning_rate": 5.4914634072950705e-06, + "loss": 0.0565, + "step": 166380 + }, + { + "epoch": 36.00115907490657, + "grad_norm": 0.0018750398885458708, + "learning_rate": 5.488454386249978e-06, + "loss": 0.0897, + "step": 166390 + }, + { + "epoch": 36.001213237285384, + "grad_norm": 0.0008896905346773565, + "learning_rate": 5.485445365204885e-06, + "loss": 0.0105, + "step": 166400 + }, + { + "epoch": 36.001267399664194, + "grad_norm": 0.9063646197319031, + "learning_rate": 5.482436344159792e-06, + "loss": 0.0636, + "step": 166410 + }, + { + "epoch": 36.001321562043, + "grad_norm": 0.0009631849243305624, + "learning_rate": 5.479427323114698e-06, + "loss": 0.021, + "step": 166420 + }, + { + "epoch": 36.00137572442182, + "grad_norm": 3.314526319503784, + "learning_rate": 5.476418302069605e-06, + "loss": 0.0363, + "step": 166430 + }, + { + "epoch": 36.00142988680063, + "grad_norm": 0.23767635226249695, + "learning_rate": 5.4734092810245115e-06, + "loss": 0.0581, + "step": 166440 + }, + { + "epoch": 36.00148404917944, + "grad_norm": 0.012448127381503582, + "learning_rate": 5.470400259979419e-06, + "loss": 0.0097, + "step": 166450 + }, + { + "epoch": 36.00153821155825, + "grad_norm": 0.0012581952614709735, + "learning_rate": 5.467391238934325e-06, + "loss": 0.0503, + "step": 166460 + }, + { + "epoch": 36.001592373937065, + "grad_norm": 0.05163058638572693, + "learning_rate": 5.464382217889232e-06, + "loss": 0.0243, + "step": 166470 + }, + { + "epoch": 36.001646536315874, + "grad_norm": 0.18866495788097382, + "learning_rate": 5.461373196844139e-06, + "loss": 0.0107, + "step": 166480 + }, + { + "epoch": 36.001700698694684, + "grad_norm": 0.19109739363193512, + "learning_rate": 5.458364175799046e-06, + "loss": 0.0158, + "step": 166490 + }, + { + "epoch": 36.0017548610735, + "grad_norm": 0.0029711045790463686, + "learning_rate": 5.4553551547539526e-06, + "loss": 0.0193, + "step": 166500 + }, + { + "epoch": 36.00180902345231, + "grad_norm": 0.0011429868172854185, + "learning_rate": 5.452346133708859e-06, + "loss": 0.0519, + "step": 166510 + }, + { + "epoch": 36.00186318583112, + "grad_norm": 0.0011036796495318413, + "learning_rate": 5.449337112663766e-06, + "loss": 0.014, + "step": 166520 + }, + { + "epoch": 36.001917348209936, + "grad_norm": 0.0009103558259084821, + "learning_rate": 5.446328091618673e-06, + "loss": 0.0184, + "step": 166530 + }, + { + "epoch": 36.001971510588746, + "grad_norm": 0.005956915207207203, + "learning_rate": 5.44331907057358e-06, + "loss": 0.0069, + "step": 166540 + }, + { + "epoch": 36.002025672967555, + "grad_norm": 0.07527735084295273, + "learning_rate": 
5.4403100495284865e-06, + "loss": 0.0328, + "step": 166550 + }, + { + "epoch": 36.00207983534637, + "grad_norm": 0.0009415032109245658, + "learning_rate": 5.437301028483394e-06, + "loss": 0.0374, + "step": 166560 + }, + { + "epoch": 36.00213399772518, + "grad_norm": 0.01795900985598564, + "learning_rate": 5.434292007438301e-06, + "loss": 0.0261, + "step": 166570 + }, + { + "epoch": 36.00218816010399, + "grad_norm": 0.00090673757949844, + "learning_rate": 5.431282986393208e-06, + "loss": 0.005, + "step": 166580 + }, + { + "epoch": 36.0022423224828, + "grad_norm": 0.0008491093758493662, + "learning_rate": 5.428273965348113e-06, + "loss": 0.0469, + "step": 166590 + }, + { + "epoch": 36.00229648486162, + "grad_norm": 0.0009141509654000401, + "learning_rate": 5.42526494430302e-06, + "loss": 0.0806, + "step": 166600 + }, + { + "epoch": 36.002350647240426, + "grad_norm": 0.001813918468542397, + "learning_rate": 5.4222559232579275e-06, + "loss": 0.0213, + "step": 166610 + }, + { + "epoch": 36.002404809619236, + "grad_norm": 0.0008651208481751382, + "learning_rate": 5.419246902212835e-06, + "loss": 0.0021, + "step": 166620 + }, + { + "epoch": 36.00245897199805, + "grad_norm": 0.0013111530570313334, + "learning_rate": 5.416237881167741e-06, + "loss": 0.0006, + "step": 166630 + }, + { + "epoch": 36.00251313437686, + "grad_norm": 0.37922653555870056, + "learning_rate": 5.413228860122648e-06, + "loss": 0.0134, + "step": 166640 + }, + { + "epoch": 36.00256729675567, + "grad_norm": 0.00131178367882967, + "learning_rate": 5.410219839077555e-06, + "loss": 0.0298, + "step": 166650 + }, + { + "epoch": 36.00262145913449, + "grad_norm": 0.005975400097668171, + "learning_rate": 5.407210818032461e-06, + "loss": 0.0131, + "step": 166660 + }, + { + "epoch": 36.0026756215133, + "grad_norm": 0.3306601047515869, + "learning_rate": 5.4042017969873685e-06, + "loss": 0.0352, + "step": 166670 + }, + { + "epoch": 36.00272978389211, + "grad_norm": 0.0008433139300905168, + "learning_rate": 5.401192775942275e-06, + "loss": 0.0215, + "step": 166680 + }, + { + "epoch": 36.00278394627092, + "grad_norm": 0.0010758272837847471, + "learning_rate": 5.398183754897182e-06, + "loss": 0.0062, + "step": 166690 + }, + { + "epoch": 36.00283810864973, + "grad_norm": 0.0012308171717450023, + "learning_rate": 5.395174733852089e-06, + "loss": 0.0145, + "step": 166700 + }, + { + "epoch": 36.00289227102854, + "grad_norm": 0.2803548276424408, + "learning_rate": 5.392165712806996e-06, + "loss": 0.0213, + "step": 166710 + }, + { + "epoch": 36.00294643340735, + "grad_norm": 0.09697958827018738, + "learning_rate": 5.3891566917619025e-06, + "loss": 0.036, + "step": 166720 + }, + { + "epoch": 36.00300059578617, + "grad_norm": 2.876652717590332, + "learning_rate": 5.3861476707168096e-06, + "loss": 0.0596, + "step": 166730 + }, + { + "epoch": 36.00305475816498, + "grad_norm": 0.002791549311950803, + "learning_rate": 5.383138649671716e-06, + "loss": 0.0141, + "step": 166740 + }, + { + "epoch": 36.00310892054379, + "grad_norm": 0.0008670588722452521, + "learning_rate": 5.380129628626623e-06, + "loss": 0.0058, + "step": 166750 + }, + { + "epoch": 36.003163082922605, + "grad_norm": 0.8868194818496704, + "learning_rate": 5.377120607581529e-06, + "loss": 0.0178, + "step": 166760 + }, + { + "epoch": 36.003217245301414, + "grad_norm": 4.972955703735352, + "learning_rate": 5.374111586536436e-06, + "loss": 0.0532, + "step": 166770 + }, + { + "epoch": 36.003271407680224, + "grad_norm": 0.8565325140953064, + "learning_rate": 5.3711025654913435e-06, + "loss": 
0.0433, + "step": 166780 + }, + { + "epoch": 36.00332557005904, + "grad_norm": 0.0012746495194733143, + "learning_rate": 5.368093544446251e-06, + "loss": 0.0113, + "step": 166790 + }, + { + "epoch": 36.00337973243785, + "grad_norm": 0.00144586234819144, + "learning_rate": 5.365084523401157e-06, + "loss": 0.0016, + "step": 166800 + }, + { + "epoch": 36.00343389481666, + "grad_norm": 0.0011455644853413105, + "learning_rate": 5.362075502356063e-06, + "loss": 0.0009, + "step": 166810 + }, + { + "epoch": 36.00348805719547, + "grad_norm": 0.001184331951662898, + "learning_rate": 5.35906648131097e-06, + "loss": 0.041, + "step": 166820 + }, + { + "epoch": 36.003542219574285, + "grad_norm": 0.6314826011657715, + "learning_rate": 5.356057460265877e-06, + "loss": 0.038, + "step": 166830 + }, + { + "epoch": 36.003596381953095, + "grad_norm": 0.4021388590335846, + "learning_rate": 5.3530484392207845e-06, + "loss": 0.0116, + "step": 166840 + }, + { + "epoch": 36.003650544331904, + "grad_norm": 0.0009942783508449793, + "learning_rate": 5.350039418175691e-06, + "loss": 0.0127, + "step": 166850 + }, + { + "epoch": 36.00370470671072, + "grad_norm": 0.0008249281672760844, + "learning_rate": 5.347030397130598e-06, + "loss": 0.0568, + "step": 166860 + }, + { + "epoch": 36.00375886908953, + "grad_norm": 0.0008476104121655226, + "learning_rate": 5.344021376085505e-06, + "loss": 0.0448, + "step": 166870 + }, + { + "epoch": 36.00381303146834, + "grad_norm": 0.0008365489193238318, + "learning_rate": 5.341012355040412e-06, + "loss": 0.0225, + "step": 166880 + }, + { + "epoch": 36.00386719384716, + "grad_norm": 0.0008325438247993588, + "learning_rate": 5.338003333995318e-06, + "loss": 0.0303, + "step": 166890 + }, + { + "epoch": 36.003921356225966, + "grad_norm": 0.0023512188345193863, + "learning_rate": 5.334994312950225e-06, + "loss": 0.0128, + "step": 166900 + }, + { + "epoch": 36.003975518604776, + "grad_norm": 0.0008458115626126528, + "learning_rate": 5.331985291905132e-06, + "loss": 0.0485, + "step": 166910 + }, + { + "epoch": 36.00402968098359, + "grad_norm": 0.0008329902193509042, + "learning_rate": 5.328976270860039e-06, + "loss": 0.0208, + "step": 166920 + }, + { + "epoch": 36.0040838433624, + "grad_norm": 0.001594260334968567, + "learning_rate": 5.325967249814945e-06, + "loss": 0.0577, + "step": 166930 + }, + { + "epoch": 36.00413800574121, + "grad_norm": 0.0008162225130945444, + "learning_rate": 5.322958228769852e-06, + "loss": 0.0098, + "step": 166940 + }, + { + "epoch": 36.00419216812002, + "grad_norm": 0.0008235282148234546, + "learning_rate": 5.3199492077247595e-06, + "loss": 0.0029, + "step": 166950 + }, + { + "epoch": 36.00424633049884, + "grad_norm": 0.0012974300188943744, + "learning_rate": 5.316940186679666e-06, + "loss": 0.0208, + "step": 166960 + }, + { + "epoch": 36.00430049287765, + "grad_norm": 0.006227768026292324, + "learning_rate": 5.313931165634573e-06, + "loss": 0.0333, + "step": 166970 + }, + { + "epoch": 36.004354655256456, + "grad_norm": 0.001264848979189992, + "learning_rate": 5.310922144589479e-06, + "loss": 0.0032, + "step": 166980 + }, + { + "epoch": 36.00440881763527, + "grad_norm": 0.0013125023106113076, + "learning_rate": 5.307913123544386e-06, + "loss": 0.0037, + "step": 166990 + }, + { + "epoch": 36.00446298001408, + "grad_norm": 0.0008805211982689798, + "learning_rate": 5.304904102499293e-06, + "loss": 0.0007, + "step": 167000 + }, + { + "epoch": 36.00451714239289, + "grad_norm": 1.3715134859085083, + "learning_rate": 5.3018950814542005e-06, + "loss": 0.0577, + "step": 
167010 + }, + { + "epoch": 36.00457130477171, + "grad_norm": 0.0008488203166052699, + "learning_rate": 5.298886060409107e-06, + "loss": 0.0124, + "step": 167020 + }, + { + "epoch": 36.00462546715052, + "grad_norm": 0.0008348521078005433, + "learning_rate": 5.295877039364013e-06, + "loss": 0.0137, + "step": 167030 + }, + { + "epoch": 36.00467962952933, + "grad_norm": 0.0008227340877056122, + "learning_rate": 5.29286801831892e-06, + "loss": 0.0141, + "step": 167040 + }, + { + "epoch": 36.00473379190814, + "grad_norm": 0.001028977450914681, + "learning_rate": 5.289858997273827e-06, + "loss": 0.0445, + "step": 167050 + }, + { + "epoch": 36.004787954286954, + "grad_norm": 3.94103741645813, + "learning_rate": 5.286849976228734e-06, + "loss": 0.0505, + "step": 167060 + }, + { + "epoch": 36.00484211666576, + "grad_norm": 0.0008236247813329101, + "learning_rate": 5.283840955183641e-06, + "loss": 0.0405, + "step": 167070 + }, + { + "epoch": 36.00489627904457, + "grad_norm": 0.0018390112090855837, + "learning_rate": 5.280831934138548e-06, + "loss": 0.0059, + "step": 167080 + }, + { + "epoch": 36.00495044142339, + "grad_norm": 3.323925733566284, + "learning_rate": 5.277822913093455e-06, + "loss": 0.0742, + "step": 167090 + }, + { + "epoch": 36.0050046038022, + "grad_norm": 0.033048056066036224, + "learning_rate": 5.274813892048361e-06, + "loss": 0.0291, + "step": 167100 + }, + { + "epoch": 36.00505876618101, + "grad_norm": 1.135335087776184, + "learning_rate": 5.2718048710032675e-06, + "loss": 0.0718, + "step": 167110 + }, + { + "epoch": 36.005112928559825, + "grad_norm": 0.0744451954960823, + "learning_rate": 5.268795849958175e-06, + "loss": 0.0011, + "step": 167120 + }, + { + "epoch": 36.005167090938635, + "grad_norm": 0.000843142275698483, + "learning_rate": 5.265786828913082e-06, + "loss": 0.0036, + "step": 167130 + }, + { + "epoch": 36.005221253317444, + "grad_norm": 0.004012623801827431, + "learning_rate": 5.262777807867989e-06, + "loss": 0.0111, + "step": 167140 + }, + { + "epoch": 36.00527541569626, + "grad_norm": 0.0012283918913453817, + "learning_rate": 5.259768786822895e-06, + "loss": 0.0028, + "step": 167150 + }, + { + "epoch": 36.00532957807507, + "grad_norm": 0.8679483532905579, + "learning_rate": 5.256759765777802e-06, + "loss": 0.1023, + "step": 167160 + }, + { + "epoch": 36.00538374045388, + "grad_norm": 0.0016243732534348965, + "learning_rate": 5.253750744732709e-06, + "loss": 0.0025, + "step": 167170 + }, + { + "epoch": 36.00543790283269, + "grad_norm": 0.0008212216198444366, + "learning_rate": 5.250741723687616e-06, + "loss": 0.0038, + "step": 167180 + }, + { + "epoch": 36.005492065211506, + "grad_norm": 0.0013829206582158804, + "learning_rate": 5.247732702642522e-06, + "loss": 0.0698, + "step": 167190 + }, + { + "epoch": 36.005546227590315, + "grad_norm": 0.001024133525788784, + "learning_rate": 5.244723681597429e-06, + "loss": 0.0008, + "step": 167200 + }, + { + "epoch": 36.005600389969125, + "grad_norm": 0.4525611698627472, + "learning_rate": 5.241714660552336e-06, + "loss": 0.0027, + "step": 167210 + }, + { + "epoch": 36.00565455234794, + "grad_norm": 0.0008159218123182654, + "learning_rate": 5.238705639507243e-06, + "loss": 0.0472, + "step": 167220 + }, + { + "epoch": 36.00570871472675, + "grad_norm": 0.001307271420955658, + "learning_rate": 5.2356966184621496e-06, + "loss": 0.0053, + "step": 167230 + }, + { + "epoch": 36.00576287710556, + "grad_norm": 0.0009145493386313319, + "learning_rate": 5.232687597417057e-06, + "loss": 0.0015, + "step": 167240 + }, + { + "epoch": 
36.00581703948438, + "grad_norm": 0.6466802954673767, + "learning_rate": 5.229678576371964e-06, + "loss": 0.0758, + "step": 167250 + }, + { + "epoch": 36.00587120186319, + "grad_norm": 0.00917890202254057, + "learning_rate": 5.22666955532687e-06, + "loss": 0.0742, + "step": 167260 + }, + { + "epoch": 36.005925364241996, + "grad_norm": 0.008656882680952549, + "learning_rate": 5.223660534281777e-06, + "loss": 0.0247, + "step": 167270 + }, + { + "epoch": 36.00597952662081, + "grad_norm": 0.001431786804459989, + "learning_rate": 5.2206515132366835e-06, + "loss": 0.0268, + "step": 167280 + }, + { + "epoch": 36.00603368899962, + "grad_norm": 5.037313461303711, + "learning_rate": 5.217642492191591e-06, + "loss": 0.0811, + "step": 167290 + }, + { + "epoch": 36.00608785137843, + "grad_norm": 0.5521026849746704, + "learning_rate": 5.214633471146498e-06, + "loss": 0.0608, + "step": 167300 + }, + { + "epoch": 36.00614201375724, + "grad_norm": 0.0012810799526050687, + "learning_rate": 5.211624450101405e-06, + "loss": 0.0, + "step": 167310 + }, + { + "epoch": 36.00619617613606, + "grad_norm": 0.0008537792018614709, + "learning_rate": 5.208615429056311e-06, + "loss": 0.0108, + "step": 167320 + }, + { + "epoch": 36.00625033851487, + "grad_norm": 0.011322957463562489, + "learning_rate": 5.205606408011217e-06, + "loss": 0.0028, + "step": 167330 + }, + { + "epoch": 36.00630450089368, + "grad_norm": 2.8313546180725098, + "learning_rate": 5.2025973869661245e-06, + "loss": 0.0664, + "step": 167340 + }, + { + "epoch": 36.00635866327249, + "grad_norm": 0.012937672436237335, + "learning_rate": 5.199588365921032e-06, + "loss": 0.0094, + "step": 167350 + }, + { + "epoch": 36.0064128256513, + "grad_norm": 0.0012845146702602506, + "learning_rate": 5.196579344875938e-06, + "loss": 0.0327, + "step": 167360 + }, + { + "epoch": 36.00646698803011, + "grad_norm": 0.0008144848397932947, + "learning_rate": 5.193570323830845e-06, + "loss": 0.0236, + "step": 167370 + }, + { + "epoch": 36.00652115040893, + "grad_norm": 1.7802257537841797, + "learning_rate": 5.190561302785752e-06, + "loss": 0.1078, + "step": 167380 + }, + { + "epoch": 36.00657531278774, + "grad_norm": 0.9529969096183777, + "learning_rate": 5.187552281740659e-06, + "loss": 0.0596, + "step": 167390 + }, + { + "epoch": 36.00662947516655, + "grad_norm": 0.0014428686117753386, + "learning_rate": 5.1845432606955656e-06, + "loss": 0.0131, + "step": 167400 + }, + { + "epoch": 36.00668363754536, + "grad_norm": 1.672018051147461, + "learning_rate": 5.181534239650472e-06, + "loss": 0.1142, + "step": 167410 + }, + { + "epoch": 36.006737799924174, + "grad_norm": 1.0701758861541748, + "learning_rate": 5.178525218605379e-06, + "loss": 0.0532, + "step": 167420 + }, + { + "epoch": 36.006791962302984, + "grad_norm": 0.0010092781158164144, + "learning_rate": 5.175516197560286e-06, + "loss": 0.0108, + "step": 167430 + }, + { + "epoch": 36.00684612468179, + "grad_norm": 0.0013738104607909918, + "learning_rate": 5.172507176515193e-06, + "loss": 0.0157, + "step": 167440 + }, + { + "epoch": 36.00690028706061, + "grad_norm": 0.0008071465417742729, + "learning_rate": 5.1694981554700995e-06, + "loss": 0.0467, + "step": 167450 + }, + { + "epoch": 36.00695444943942, + "grad_norm": 0.001307015074416995, + "learning_rate": 5.166489134425007e-06, + "loss": 0.0239, + "step": 167460 + }, + { + "epoch": 36.00700861181823, + "grad_norm": 0.0321819931268692, + "learning_rate": 5.163480113379914e-06, + "loss": 0.0106, + "step": 167470 + }, + { + "epoch": 36.007062774197045, + "grad_norm": 
0.0017617795383557677, + "learning_rate": 5.16047109233482e-06, + "loss": 0.1192, + "step": 167480 + }, + { + "epoch": 36.007116936575855, + "grad_norm": 0.0008079828112386167, + "learning_rate": 5.157462071289726e-06, + "loss": 0.028, + "step": 167490 + }, + { + "epoch": 36.007171098954665, + "grad_norm": 2.5418994426727295, + "learning_rate": 5.154453050244633e-06, + "loss": 0.0781, + "step": 167500 + }, + { + "epoch": 36.00722526133348, + "grad_norm": 0.0008404169930145144, + "learning_rate": 5.1514440291995405e-06, + "loss": 0.0173, + "step": 167510 + }, + { + "epoch": 36.00727942371229, + "grad_norm": 0.0020606666803359985, + "learning_rate": 5.148435008154448e-06, + "loss": 0.0359, + "step": 167520 + }, + { + "epoch": 36.0073335860911, + "grad_norm": 0.0008195621776394546, + "learning_rate": 5.145425987109354e-06, + "loss": 0.0686, + "step": 167530 + }, + { + "epoch": 36.00738774846991, + "grad_norm": 0.13693667948246002, + "learning_rate": 5.142416966064261e-06, + "loss": 0.0003, + "step": 167540 + }, + { + "epoch": 36.007441910848726, + "grad_norm": 0.040364883840084076, + "learning_rate": 5.139407945019168e-06, + "loss": 0.0039, + "step": 167550 + }, + { + "epoch": 36.007496073227536, + "grad_norm": 0.0020748290698975325, + "learning_rate": 5.136398923974074e-06, + "loss": 0.0298, + "step": 167560 + }, + { + "epoch": 36.007550235606345, + "grad_norm": 0.0008665258646942675, + "learning_rate": 5.1333899029289815e-06, + "loss": 0.0096, + "step": 167570 + }, + { + "epoch": 36.00760439798516, + "grad_norm": 0.0008256255532614887, + "learning_rate": 5.130380881883888e-06, + "loss": 0.0134, + "step": 167580 + }, + { + "epoch": 36.00765856036397, + "grad_norm": 0.0019328838679939508, + "learning_rate": 5.127371860838795e-06, + "loss": 0.0809, + "step": 167590 + }, + { + "epoch": 36.00771272274278, + "grad_norm": 7.894340515136719, + "learning_rate": 5.124362839793702e-06, + "loss": 0.1002, + "step": 167600 + }, + { + "epoch": 36.0077668851216, + "grad_norm": 0.0008665801724418998, + "learning_rate": 5.121353818748609e-06, + "loss": 0.0001, + "step": 167610 + }, + { + "epoch": 36.00782104750041, + "grad_norm": 1.0407345294952393, + "learning_rate": 5.1183447977035155e-06, + "loss": 0.0559, + "step": 167620 + }, + { + "epoch": 36.00787520987922, + "grad_norm": 0.01532695535570383, + "learning_rate": 5.115335776658422e-06, + "loss": 0.0498, + "step": 167630 + }, + { + "epoch": 36.007929372258026, + "grad_norm": 0.006785514764487743, + "learning_rate": 5.112326755613329e-06, + "loss": 0.0, + "step": 167640 + }, + { + "epoch": 36.00798353463684, + "grad_norm": 0.0010297535918653011, + "learning_rate": 5.109317734568236e-06, + "loss": 0.0865, + "step": 167650 + }, + { + "epoch": 36.00803769701565, + "grad_norm": 0.2938828766345978, + "learning_rate": 5.106308713523142e-06, + "loss": 0.0892, + "step": 167660 + }, + { + "epoch": 36.00809185939446, + "grad_norm": 0.0010868320241570473, + "learning_rate": 5.103299692478049e-06, + "loss": 0.1315, + "step": 167670 + }, + { + "epoch": 36.00814602177328, + "grad_norm": 0.004720568656921387, + "learning_rate": 5.1002906714329565e-06, + "loss": 0.0727, + "step": 167680 + }, + { + "epoch": 36.00820018415209, + "grad_norm": 0.0008508030441589653, + "learning_rate": 5.097281650387864e-06, + "loss": 0.0193, + "step": 167690 + }, + { + "epoch": 36.0082543465309, + "grad_norm": 4.061578750610352, + "learning_rate": 5.09427262934277e-06, + "loss": 0.0726, + "step": 167700 + }, + { + "epoch": 36.008308508909714, + "grad_norm": 0.0010145849082618952, + 
"learning_rate": 5.091263608297676e-06, + "loss": 0.0178, + "step": 167710 + }, + { + "epoch": 36.00836267128852, + "grad_norm": 0.25204506516456604, + "learning_rate": 5.088254587252583e-06, + "loss": 0.0224, + "step": 167720 + }, + { + "epoch": 36.00841683366733, + "grad_norm": 0.2198103368282318, + "learning_rate": 5.08524556620749e-06, + "loss": 0.0349, + "step": 167730 + }, + { + "epoch": 36.00847099604615, + "grad_norm": 0.0012279950315132737, + "learning_rate": 5.0822365451623975e-06, + "loss": 0.0172, + "step": 167740 + }, + { + "epoch": 36.00852515842496, + "grad_norm": 0.7768475413322449, + "learning_rate": 5.079227524117304e-06, + "loss": 0.0773, + "step": 167750 + }, + { + "epoch": 36.00857932080377, + "grad_norm": 0.0008415086194872856, + "learning_rate": 5.076218503072211e-06, + "loss": 0.1381, + "step": 167760 + }, + { + "epoch": 36.00863348318258, + "grad_norm": 0.001362983719445765, + "learning_rate": 5.073209482027118e-06, + "loss": 0.0259, + "step": 167770 + }, + { + "epoch": 36.008687645561395, + "grad_norm": 0.0008361747022718191, + "learning_rate": 5.070200460982024e-06, + "loss": 0.0434, + "step": 167780 + }, + { + "epoch": 36.008741807940204, + "grad_norm": 0.019575420767068863, + "learning_rate": 5.067191439936931e-06, + "loss": 0.0576, + "step": 167790 + }, + { + "epoch": 36.008795970319014, + "grad_norm": 0.0008096144301816821, + "learning_rate": 5.064182418891838e-06, + "loss": 0.0304, + "step": 167800 + }, + { + "epoch": 36.00885013269783, + "grad_norm": 0.0050697242841124535, + "learning_rate": 5.061173397846745e-06, + "loss": 0.0006, + "step": 167810 + }, + { + "epoch": 36.00890429507664, + "grad_norm": 0.0012339319800958037, + "learning_rate": 5.058164376801652e-06, + "loss": 0.046, + "step": 167820 + }, + { + "epoch": 36.00895845745545, + "grad_norm": 0.0010734129464253783, + "learning_rate": 5.055155355756558e-06, + "loss": 0.0537, + "step": 167830 + }, + { + "epoch": 36.009012619834266, + "grad_norm": 0.008709345012903214, + "learning_rate": 5.052146334711465e-06, + "loss": 0.001, + "step": 167840 + }, + { + "epoch": 36.009066782213075, + "grad_norm": 0.0008643078035674989, + "learning_rate": 5.0491373136663725e-06, + "loss": 0.015, + "step": 167850 + }, + { + "epoch": 36.009120944591885, + "grad_norm": 0.0008140095160342753, + "learning_rate": 5.046128292621279e-06, + "loss": 0.0034, + "step": 167860 + }, + { + "epoch": 36.0091751069707, + "grad_norm": 0.29709964990615845, + "learning_rate": 5.043119271576186e-06, + "loss": 0.0013, + "step": 167870 + }, + { + "epoch": 36.00922926934951, + "grad_norm": 4.018499851226807, + "learning_rate": 5.040110250531092e-06, + "loss": 0.0415, + "step": 167880 + }, + { + "epoch": 36.00928343172832, + "grad_norm": 0.4011797606945038, + "learning_rate": 5.037101229485999e-06, + "loss": 0.003, + "step": 167890 + }, + { + "epoch": 36.00933759410713, + "grad_norm": 1.5551660060882568, + "learning_rate": 5.034092208440906e-06, + "loss": 0.0861, + "step": 167900 + }, + { + "epoch": 36.00939175648595, + "grad_norm": 0.001446465845219791, + "learning_rate": 5.031083187395813e-06, + "loss": 0.0132, + "step": 167910 + }, + { + "epoch": 36.009445918864756, + "grad_norm": 0.0008402265957556665, + "learning_rate": 5.02807416635072e-06, + "loss": 0.0378, + "step": 167920 + }, + { + "epoch": 36.009500081243566, + "grad_norm": 21.700927734375, + "learning_rate": 5.025065145305626e-06, + "loss": 0.0313, + "step": 167930 + }, + { + "epoch": 36.00955424362238, + "grad_norm": 0.0015370442997664213, + "learning_rate": 5.022056124260533e-06, 
+ "loss": 0.0799, + "step": 167940 + }, + { + "epoch": 36.00960840600119, + "grad_norm": 0.0011534475488588214, + "learning_rate": 5.01904710321544e-06, + "loss": 0.0045, + "step": 167950 + }, + { + "epoch": 36.00966256838, + "grad_norm": 0.006588871590793133, + "learning_rate": 5.016038082170347e-06, + "loss": 0.0175, + "step": 167960 + }, + { + "epoch": 36.00971673075882, + "grad_norm": 0.19316455721855164, + "learning_rate": 5.013029061125254e-06, + "loss": 0.0099, + "step": 167970 + }, + { + "epoch": 36.00977089313763, + "grad_norm": 0.0008489073952659965, + "learning_rate": 5.010020040080161e-06, + "loss": 0.0219, + "step": 167980 + }, + { + "epoch": 36.00982505551644, + "grad_norm": 0.0011444850824773312, + "learning_rate": 5.007011019035068e-06, + "loss": 0.129, + "step": 167990 + }, + { + "epoch": 36.00987921789525, + "grad_norm": 0.0008596025290898979, + "learning_rate": 5.004001997989974e-06, + "loss": 0.0416, + "step": 168000 + }, + { + "epoch": 36.00993338027406, + "grad_norm": 0.001138582592830062, + "learning_rate": 5.0009929769448805e-06, + "loss": 0.0622, + "step": 168010 + }, + { + "epoch": 36.00998754265287, + "grad_norm": 0.0012587092351168394, + "learning_rate": 4.997983955899788e-06, + "loss": 0.016, + "step": 168020 + }, + { + "epoch": 36.01004170503168, + "grad_norm": 0.0008773301960900426, + "learning_rate": 4.994974934854695e-06, + "loss": 0.0025, + "step": 168030 + }, + { + "epoch": 36.0100958674105, + "grad_norm": 0.000850335753057152, + "learning_rate": 4.991965913809602e-06, + "loss": 0.0442, + "step": 168040 + }, + { + "epoch": 36.01015002978931, + "grad_norm": 0.0014503803104162216, + "learning_rate": 4.988956892764508e-06, + "loss": 0.0686, + "step": 168050 + }, + { + "epoch": 36.01020419216812, + "grad_norm": 0.18736408650875092, + "learning_rate": 4.985947871719415e-06, + "loss": 0.0333, + "step": 168060 + }, + { + "epoch": 36.010258354546934, + "grad_norm": 0.0013767648488283157, + "learning_rate": 4.982938850674322e-06, + "loss": 0.0026, + "step": 168070 + }, + { + "epoch": 36.010312516925744, + "grad_norm": 0.6463889479637146, + "learning_rate": 4.979929829629229e-06, + "loss": 0.0997, + "step": 168080 + }, + { + "epoch": 36.01036667930455, + "grad_norm": 0.03763055428862572, + "learning_rate": 4.976920808584135e-06, + "loss": 0.0259, + "step": 168090 + }, + { + "epoch": 36.01042084168337, + "grad_norm": 0.02325069159269333, + "learning_rate": 4.973911787539042e-06, + "loss": 0.0244, + "step": 168100 + }, + { + "epoch": 36.01047500406218, + "grad_norm": 0.43696457147598267, + "learning_rate": 4.970902766493949e-06, + "loss": 0.0213, + "step": 168110 + }, + { + "epoch": 36.01052916644099, + "grad_norm": 0.0009018326527439058, + "learning_rate": 4.967893745448856e-06, + "loss": 0.0473, + "step": 168120 + }, + { + "epoch": 36.0105833288198, + "grad_norm": 0.03871043771505356, + "learning_rate": 4.9648847244037626e-06, + "loss": 0.0094, + "step": 168130 + }, + { + "epoch": 36.010637491198615, + "grad_norm": 0.12557634711265564, + "learning_rate": 4.96187570335867e-06, + "loss": 0.02, + "step": 168140 + }, + { + "epoch": 36.010691653577425, + "grad_norm": 1.1798402070999146, + "learning_rate": 4.958866682313577e-06, + "loss": 0.036, + "step": 168150 + }, + { + "epoch": 36.010745815956234, + "grad_norm": 0.0010644732974469662, + "learning_rate": 4.955857661268483e-06, + "loss": 0.05, + "step": 168160 + }, + { + "epoch": 36.01079997833505, + "grad_norm": 0.0034611087758094072, + "learning_rate": 4.95284864022339e-06, + "loss": 0.0088, + "step": 168170 + }, + { 
+ "epoch": 36.01085414071386, + "grad_norm": 0.0014210897497832775, + "learning_rate": 4.9498396191782965e-06, + "loss": 0.0422, + "step": 168180 + }, + { + "epoch": 36.01090830309267, + "grad_norm": 0.0008644470945000648, + "learning_rate": 4.946830598133204e-06, + "loss": 0.0437, + "step": 168190 + }, + { + "epoch": 36.010962465471486, + "grad_norm": 0.049323685467243195, + "learning_rate": 4.943821577088111e-06, + "loss": 0.0658, + "step": 168200 + }, + { + "epoch": 36.011016627850296, + "grad_norm": 0.8410304188728333, + "learning_rate": 4.940812556043017e-06, + "loss": 0.0072, + "step": 168210 + }, + { + "epoch": 36.011070790229105, + "grad_norm": 0.017907291650772095, + "learning_rate": 4.937803534997924e-06, + "loss": 0.0379, + "step": 168220 + }, + { + "epoch": 36.01112495260792, + "grad_norm": 0.0009652604931034148, + "learning_rate": 4.93479451395283e-06, + "loss": 0.0885, + "step": 168230 + }, + { + "epoch": 36.01117911498673, + "grad_norm": 0.049600254744291306, + "learning_rate": 4.9317854929077375e-06, + "loss": 0.001, + "step": 168240 + }, + { + "epoch": 36.01123327736554, + "grad_norm": 0.0015584960347041488, + "learning_rate": 4.928776471862645e-06, + "loss": 0.0348, + "step": 168250 + }, + { + "epoch": 36.01128743974435, + "grad_norm": 0.0008700168691575527, + "learning_rate": 4.925767450817551e-06, + "loss": 0.031, + "step": 168260 + }, + { + "epoch": 36.01134160212317, + "grad_norm": 0.0008414337644353509, + "learning_rate": 4.922758429772458e-06, + "loss": 0.0722, + "step": 168270 + }, + { + "epoch": 36.01139576450198, + "grad_norm": 1.860191822052002, + "learning_rate": 4.919749408727365e-06, + "loss": 0.031, + "step": 168280 + }, + { + "epoch": 36.011449926880786, + "grad_norm": 0.0009003659943118691, + "learning_rate": 4.916740387682272e-06, + "loss": 0.0505, + "step": 168290 + }, + { + "epoch": 36.0115040892596, + "grad_norm": 0.09722471982240677, + "learning_rate": 4.9137313666371785e-06, + "loss": 0.0362, + "step": 168300 + }, + { + "epoch": 36.01155825163841, + "grad_norm": 0.002193932654336095, + "learning_rate": 4.910722345592085e-06, + "loss": 0.0042, + "step": 168310 + }, + { + "epoch": 36.01161241401722, + "grad_norm": 0.38853946328163147, + "learning_rate": 4.907713324546992e-06, + "loss": 0.059, + "step": 168320 + }, + { + "epoch": 36.01166657639604, + "grad_norm": 0.0010761620942503214, + "learning_rate": 4.904704303501899e-06, + "loss": 0.0852, + "step": 168330 + }, + { + "epoch": 36.01172073877485, + "grad_norm": 0.0012146361405029893, + "learning_rate": 4.901695282456805e-06, + "loss": 0.0178, + "step": 168340 + }, + { + "epoch": 36.01177490115366, + "grad_norm": 3.4704980850219727, + "learning_rate": 4.8986862614117125e-06, + "loss": 0.1, + "step": 168350 + }, + { + "epoch": 36.01182906353247, + "grad_norm": 1.8501291275024414, + "learning_rate": 4.89567724036662e-06, + "loss": 0.0145, + "step": 168360 + }, + { + "epoch": 36.01188322591128, + "grad_norm": 1.1476552486419678, + "learning_rate": 4.892668219321527e-06, + "loss": 0.0242, + "step": 168370 + }, + { + "epoch": 36.01193738829009, + "grad_norm": 0.0008603233145549893, + "learning_rate": 4.889659198276433e-06, + "loss": 0.0521, + "step": 168380 + }, + { + "epoch": 36.0119915506689, + "grad_norm": 0.0013338105054572225, + "learning_rate": 4.886650177231339e-06, + "loss": 0.0087, + "step": 168390 + }, + { + "epoch": 36.01204571304772, + "grad_norm": 0.14460469782352448, + "learning_rate": 4.883641156186246e-06, + "loss": 0.0208, + "step": 168400 + }, + { + "epoch": 36.01209987542653, + 
"grad_norm": 0.014870346523821354, + "learning_rate": 4.8806321351411535e-06, + "loss": 0.0218, + "step": 168410 + }, + { + "epoch": 36.01215403780534, + "grad_norm": 0.004354998003691435, + "learning_rate": 4.877623114096061e-06, + "loss": 0.0417, + "step": 168420 + }, + { + "epoch": 36.012208200184155, + "grad_norm": 0.0008619484724476933, + "learning_rate": 4.874614093050967e-06, + "loss": 0.0584, + "step": 168430 + }, + { + "epoch": 36.012262362562964, + "grad_norm": 0.9515470862388611, + "learning_rate": 4.871605072005874e-06, + "loss": 0.0442, + "step": 168440 + }, + { + "epoch": 36.012316524941774, + "grad_norm": 0.0008413080940954387, + "learning_rate": 4.86859605096078e-06, + "loss": 0.0354, + "step": 168450 + }, + { + "epoch": 36.01237068732059, + "grad_norm": 0.0013630518224090338, + "learning_rate": 4.865587029915687e-06, + "loss": 0.0207, + "step": 168460 + }, + { + "epoch": 36.0124248496994, + "grad_norm": 0.0008479414973407984, + "learning_rate": 4.8625780088705945e-06, + "loss": 0.0172, + "step": 168470 + }, + { + "epoch": 36.01247901207821, + "grad_norm": 1.0640090703964233, + "learning_rate": 4.859568987825501e-06, + "loss": 0.0538, + "step": 168480 + }, + { + "epoch": 36.01253317445702, + "grad_norm": 0.03714077174663544, + "learning_rate": 4.856559966780408e-06, + "loss": 0.0448, + "step": 168490 + }, + { + "epoch": 36.012587336835836, + "grad_norm": 0.0018070172518491745, + "learning_rate": 4.853550945735315e-06, + "loss": 0.0281, + "step": 168500 + }, + { + "epoch": 36.012641499214645, + "grad_norm": 0.6197571754455566, + "learning_rate": 4.850541924690221e-06, + "loss": 0.0139, + "step": 168510 + }, + { + "epoch": 36.012695661593455, + "grad_norm": 1.0389448404312134, + "learning_rate": 4.8475329036451284e-06, + "loss": 0.0342, + "step": 168520 + }, + { + "epoch": 36.01274982397227, + "grad_norm": 0.0008264362695626915, + "learning_rate": 4.844523882600035e-06, + "loss": 0.0062, + "step": 168530 + }, + { + "epoch": 36.01280398635108, + "grad_norm": 0.0008175160037353635, + "learning_rate": 4.841514861554942e-06, + "loss": 0.0077, + "step": 168540 + }, + { + "epoch": 36.01285814872989, + "grad_norm": 0.005888854619115591, + "learning_rate": 4.838505840509849e-06, + "loss": 0.029, + "step": 168550 + }, + { + "epoch": 36.01291231110871, + "grad_norm": 2.80320405960083, + "learning_rate": 4.835496819464755e-06, + "loss": 0.0488, + "step": 168560 + }, + { + "epoch": 36.012966473487516, + "grad_norm": 0.0017118966206908226, + "learning_rate": 4.832487798419662e-06, + "loss": 0.0006, + "step": 168570 + }, + { + "epoch": 36.013020635866326, + "grad_norm": 0.021405121311545372, + "learning_rate": 4.8294787773745695e-06, + "loss": 0.0174, + "step": 168580 + }, + { + "epoch": 36.01307479824514, + "grad_norm": 0.0009078591247089207, + "learning_rate": 4.826469756329477e-06, + "loss": 0.0001, + "step": 168590 + }, + { + "epoch": 36.01312896062395, + "grad_norm": 0.14522314071655273, + "learning_rate": 4.823460735284383e-06, + "loss": 0.0019, + "step": 168600 + }, + { + "epoch": 36.01318312300276, + "grad_norm": 3.7753102779388428, + "learning_rate": 4.820451714239289e-06, + "loss": 0.0782, + "step": 168610 + }, + { + "epoch": 36.01323728538157, + "grad_norm": 1.7196526527404785, + "learning_rate": 4.817442693194196e-06, + "loss": 0.0739, + "step": 168620 + }, + { + "epoch": 36.01329144776039, + "grad_norm": 4.117852210998535, + "learning_rate": 4.814433672149103e-06, + "loss": 0.0103, + "step": 168630 + }, + { + "epoch": 36.0133456101392, + "grad_norm": 0.0008428833680227399, + 
"learning_rate": 4.81142465110401e-06, + "loss": 0.01, + "step": 168640 + }, + { + "epoch": 36.01339977251801, + "grad_norm": 0.23530146479606628, + "learning_rate": 4.808415630058917e-06, + "loss": 0.0114, + "step": 168650 + }, + { + "epoch": 36.01345393489682, + "grad_norm": 4.205746173858643, + "learning_rate": 4.805406609013824e-06, + "loss": 0.0577, + "step": 168660 + }, + { + "epoch": 36.01350809727563, + "grad_norm": 0.000866532966028899, + "learning_rate": 4.802397587968731e-06, + "loss": 0.0031, + "step": 168670 + }, + { + "epoch": 36.01356225965444, + "grad_norm": 0.35261353850364685, + "learning_rate": 4.799388566923637e-06, + "loss": 0.0197, + "step": 168680 + }, + { + "epoch": 36.01361642203326, + "grad_norm": 0.0008530562627129257, + "learning_rate": 4.796379545878544e-06, + "loss": 0.0315, + "step": 168690 + }, + { + "epoch": 36.01367058441207, + "grad_norm": 0.0008224023040384054, + "learning_rate": 4.793370524833451e-06, + "loss": 0.0352, + "step": 168700 + }, + { + "epoch": 36.01372474679088, + "grad_norm": 0.000878546794410795, + "learning_rate": 4.790361503788358e-06, + "loss": 0.0358, + "step": 168710 + }, + { + "epoch": 36.01377890916969, + "grad_norm": 0.0008535638335160911, + "learning_rate": 4.787352482743265e-06, + "loss": 0.0493, + "step": 168720 + }, + { + "epoch": 36.013833071548504, + "grad_norm": 0.0008742573554627597, + "learning_rate": 4.784343461698171e-06, + "loss": 0.007, + "step": 168730 + }, + { + "epoch": 36.01388723392731, + "grad_norm": 13.138195037841797, + "learning_rate": 4.781334440653078e-06, + "loss": 0.0452, + "step": 168740 + }, + { + "epoch": 36.01394139630612, + "grad_norm": 0.0012663620291277766, + "learning_rate": 4.778325419607985e-06, + "loss": 0.034, + "step": 168750 + }, + { + "epoch": 36.01399555868494, + "grad_norm": 0.0008466654107905924, + "learning_rate": 4.775316398562892e-06, + "loss": 0.0123, + "step": 168760 + }, + { + "epoch": 36.01404972106375, + "grad_norm": 0.9837513566017151, + "learning_rate": 4.772307377517798e-06, + "loss": 0.1155, + "step": 168770 + }, + { + "epoch": 36.01410388344256, + "grad_norm": 0.18458420038223267, + "learning_rate": 4.769298356472705e-06, + "loss": 0.0164, + "step": 168780 + }, + { + "epoch": 36.014158045821375, + "grad_norm": 0.0013692110078409314, + "learning_rate": 4.766289335427612e-06, + "loss": 0.0275, + "step": 168790 + }, + { + "epoch": 36.014212208200185, + "grad_norm": 0.00916493684053421, + "learning_rate": 4.763280314382519e-06, + "loss": 0.0546, + "step": 168800 + }, + { + "epoch": 36.014266370578994, + "grad_norm": 0.0011602103477343917, + "learning_rate": 4.760271293337426e-06, + "loss": 0.0003, + "step": 168810 + }, + { + "epoch": 36.01432053295781, + "grad_norm": 0.0016784698236733675, + "learning_rate": 4.757262272292333e-06, + "loss": 0.0117, + "step": 168820 + }, + { + "epoch": 36.01437469533662, + "grad_norm": 0.0011421217350289226, + "learning_rate": 4.754253251247239e-06, + "loss": 0.1008, + "step": 168830 + }, + { + "epoch": 36.01442885771543, + "grad_norm": 0.001498234225437045, + "learning_rate": 4.751244230202146e-06, + "loss": 0.0376, + "step": 168840 + }, + { + "epoch": 36.01448302009424, + "grad_norm": 0.004017556086182594, + "learning_rate": 4.748235209157053e-06, + "loss": 0.07, + "step": 168850 + }, + { + "epoch": 36.014537182473056, + "grad_norm": 0.0008514381479471922, + "learning_rate": 4.7452261881119596e-06, + "loss": 0.0059, + "step": 168860 + }, + { + "epoch": 36.014591344851866, + "grad_norm": 0.05775529518723488, + "learning_rate": 
4.742217167066867e-06, + "loss": 0.0536, + "step": 168870 + }, + { + "epoch": 36.014645507230675, + "grad_norm": 0.18277660012245178, + "learning_rate": 4.739208146021774e-06, + "loss": 0.0344, + "step": 168880 + }, + { + "epoch": 36.01469966960949, + "grad_norm": 0.000951350957620889, + "learning_rate": 4.736199124976681e-06, + "loss": 0.0801, + "step": 168890 + }, + { + "epoch": 36.0147538319883, + "grad_norm": 0.0012127620866522193, + "learning_rate": 4.733190103931587e-06, + "loss": 0.0181, + "step": 168900 + }, + { + "epoch": 36.01480799436711, + "grad_norm": 0.010554162785410881, + "learning_rate": 4.7301810828864935e-06, + "loss": 0.0042, + "step": 168910 + }, + { + "epoch": 36.01486215674593, + "grad_norm": 1.4636934995651245, + "learning_rate": 4.727172061841401e-06, + "loss": 0.0201, + "step": 168920 + }, + { + "epoch": 36.01491631912474, + "grad_norm": 0.4855940639972687, + "learning_rate": 4.724163040796308e-06, + "loss": 0.0069, + "step": 168930 + }, + { + "epoch": 36.014970481503546, + "grad_norm": 0.000986348488368094, + "learning_rate": 4.721154019751214e-06, + "loss": 0.0051, + "step": 168940 + }, + { + "epoch": 36.015024643882356, + "grad_norm": 0.0008978776168078184, + "learning_rate": 4.718144998706121e-06, + "loss": 0.076, + "step": 168950 + }, + { + "epoch": 36.01507880626117, + "grad_norm": 0.140339195728302, + "learning_rate": 4.715135977661028e-06, + "loss": 0.0186, + "step": 168960 + }, + { + "epoch": 36.01513296863998, + "grad_norm": 1.6553655862808228, + "learning_rate": 4.712126956615935e-06, + "loss": 0.0159, + "step": 168970 + }, + { + "epoch": 36.01518713101879, + "grad_norm": 0.0008580401772633195, + "learning_rate": 4.709117935570842e-06, + "loss": 0.0227, + "step": 168980 + }, + { + "epoch": 36.01524129339761, + "grad_norm": 1.2224713563919067, + "learning_rate": 4.706108914525748e-06, + "loss": 0.0094, + "step": 168990 + }, + { + "epoch": 36.01529545577642, + "grad_norm": 4.508615493774414, + "learning_rate": 4.703099893480655e-06, + "loss": 0.0901, + "step": 169000 + }, + { + "epoch": 36.01534961815523, + "grad_norm": 0.016565382480621338, + "learning_rate": 4.700090872435562e-06, + "loss": 0.1177, + "step": 169010 + }, + { + "epoch": 36.015403780534044, + "grad_norm": 0.0008576691034249961, + "learning_rate": 4.697081851390469e-06, + "loss": 0.0309, + "step": 169020 + }, + { + "epoch": 36.01545794291285, + "grad_norm": 0.022861206904053688, + "learning_rate": 4.6940728303453756e-06, + "loss": 0.0239, + "step": 169030 + }, + { + "epoch": 36.01551210529166, + "grad_norm": 0.09156028181314468, + "learning_rate": 4.691063809300283e-06, + "loss": 0.013, + "step": 169040 + }, + { + "epoch": 36.01556626767048, + "grad_norm": 0.02980935201048851, + "learning_rate": 4.688054788255189e-06, + "loss": 0.0068, + "step": 169050 + }, + { + "epoch": 36.01562043004929, + "grad_norm": 0.02569376677274704, + "learning_rate": 4.685045767210096e-06, + "loss": 0.0552, + "step": 169060 + }, + { + "epoch": 36.0156745924281, + "grad_norm": 0.0013435775181278586, + "learning_rate": 4.682036746165002e-06, + "loss": 0.0924, + "step": 169070 + }, + { + "epoch": 36.01572875480691, + "grad_norm": 0.0008805626421235502, + "learning_rate": 4.6790277251199095e-06, + "loss": 0.0473, + "step": 169080 + }, + { + "epoch": 36.015782917185724, + "grad_norm": 1.0046148300170898, + "learning_rate": 4.676018704074817e-06, + "loss": 0.0666, + "step": 169090 + }, + { + "epoch": 36.015837079564534, + "grad_norm": 0.001379591878503561, + "learning_rate": 4.673009683029724e-06, + "loss": 0.0104, + 
"step": 169100 + }, + { + "epoch": 36.01589124194334, + "grad_norm": 0.0022046619560569525, + "learning_rate": 4.67000066198463e-06, + "loss": 0.0106, + "step": 169110 + }, + { + "epoch": 36.01594540432216, + "grad_norm": 5.306776523590088, + "learning_rate": 4.666991640939537e-06, + "loss": 0.048, + "step": 169120 + }, + { + "epoch": 36.01599956670097, + "grad_norm": 0.34218284487724304, + "learning_rate": 4.663982619894443e-06, + "loss": 0.0015, + "step": 169130 + }, + { + "epoch": 36.01605372907978, + "grad_norm": 0.09105543792247772, + "learning_rate": 4.6609735988493505e-06, + "loss": 0.1325, + "step": 169140 + }, + { + "epoch": 36.016107891458596, + "grad_norm": 2.718210458755493, + "learning_rate": 4.657964577804258e-06, + "loss": 0.0413, + "step": 169150 + }, + { + "epoch": 36.016162053837405, + "grad_norm": 0.0009069691295735538, + "learning_rate": 4.654955556759164e-06, + "loss": 0.0207, + "step": 169160 + }, + { + "epoch": 36.016216216216215, + "grad_norm": 0.001165154855698347, + "learning_rate": 4.651946535714071e-06, + "loss": 0.0439, + "step": 169170 + }, + { + "epoch": 36.01627037859503, + "grad_norm": 1.0708943605422974, + "learning_rate": 4.648937514668978e-06, + "loss": 0.0985, + "step": 169180 + }, + { + "epoch": 36.01632454097384, + "grad_norm": 0.07238037884235382, + "learning_rate": 4.645928493623885e-06, + "loss": 0.0259, + "step": 169190 + }, + { + "epoch": 36.01637870335265, + "grad_norm": 3.610935688018799, + "learning_rate": 4.642919472578791e-06, + "loss": 0.0835, + "step": 169200 + }, + { + "epoch": 36.01643286573146, + "grad_norm": 0.0012744518462568521, + "learning_rate": 4.639910451533698e-06, + "loss": 0.0109, + "step": 169210 + }, + { + "epoch": 36.01648702811028, + "grad_norm": 0.0009107431978918612, + "learning_rate": 4.636901430488605e-06, + "loss": 0.0296, + "step": 169220 + }, + { + "epoch": 36.016541190489086, + "grad_norm": 0.4182390570640564, + "learning_rate": 4.633892409443512e-06, + "loss": 0.0549, + "step": 169230 + }, + { + "epoch": 36.016595352867895, + "grad_norm": 0.0009264332475140691, + "learning_rate": 4.630883388398418e-06, + "loss": 0.0358, + "step": 169240 + }, + { + "epoch": 36.01664951524671, + "grad_norm": 1.5814505815505981, + "learning_rate": 4.6278743673533255e-06, + "loss": 0.0219, + "step": 169250 + }, + { + "epoch": 36.01670367762552, + "grad_norm": 0.9450675845146179, + "learning_rate": 4.624865346308233e-06, + "loss": 0.0312, + "step": 169260 + }, + { + "epoch": 36.01675784000433, + "grad_norm": 0.0008881382527761161, + "learning_rate": 4.62185632526314e-06, + "loss": 0.0028, + "step": 169270 + }, + { + "epoch": 36.01681200238315, + "grad_norm": 0.001247295760549605, + "learning_rate": 4.618847304218046e-06, + "loss": 0.0191, + "step": 169280 + }, + { + "epoch": 36.01686616476196, + "grad_norm": 0.0019666936714202166, + "learning_rate": 4.615838283172952e-06, + "loss": 0.0236, + "step": 169290 + }, + { + "epoch": 36.01692032714077, + "grad_norm": 0.05318209528923035, + "learning_rate": 4.612829262127859e-06, + "loss": 0.0024, + "step": 169300 + }, + { + "epoch": 36.016974489519576, + "grad_norm": 0.0012142694322392344, + "learning_rate": 4.6098202410827665e-06, + "loss": 0.0121, + "step": 169310 + }, + { + "epoch": 36.01702865189839, + "grad_norm": 0.6230562329292297, + "learning_rate": 4.606811220037674e-06, + "loss": 0.0075, + "step": 169320 + }, + { + "epoch": 36.0170828142772, + "grad_norm": 0.00090614496730268, + "learning_rate": 4.60380219899258e-06, + "loss": 0.0224, + "step": 169330 + }, + { + "epoch": 
36.01713697665601, + "grad_norm": 1.1467421054840088, + "learning_rate": 4.600793177947487e-06, + "loss": 0.0335, + "step": 169340 + }, + { + "epoch": 36.01719113903483, + "grad_norm": 0.26180678606033325, + "learning_rate": 4.597784156902393e-06, + "loss": 0.0334, + "step": 169350 + }, + { + "epoch": 36.01724530141364, + "grad_norm": 0.0017742374911904335, + "learning_rate": 4.5947751358573e-06, + "loss": 0.053, + "step": 169360 + }, + { + "epoch": 36.01729946379245, + "grad_norm": 0.0009221122018061578, + "learning_rate": 4.591766114812207e-06, + "loss": 0.1211, + "step": 169370 + }, + { + "epoch": 36.017353626171264, + "grad_norm": 0.06463294476270676, + "learning_rate": 4.588757093767114e-06, + "loss": 0.0262, + "step": 169380 + }, + { + "epoch": 36.017407788550074, + "grad_norm": 1.1179076433181763, + "learning_rate": 4.585748072722021e-06, + "loss": 0.059, + "step": 169390 + }, + { + "epoch": 36.01746195092888, + "grad_norm": 0.0012750821188092232, + "learning_rate": 4.582739051676928e-06, + "loss": 0.0398, + "step": 169400 + }, + { + "epoch": 36.0175161133077, + "grad_norm": 0.2448476254940033, + "learning_rate": 4.579730030631834e-06, + "loss": 0.0641, + "step": 169410 + }, + { + "epoch": 36.01757027568651, + "grad_norm": 0.0012610189151018858, + "learning_rate": 4.5767210095867414e-06, + "loss": 0.0355, + "step": 169420 + }, + { + "epoch": 36.01762443806532, + "grad_norm": 1.9277763366699219, + "learning_rate": 4.573711988541648e-06, + "loss": 0.0345, + "step": 169430 + }, + { + "epoch": 36.01767860044413, + "grad_norm": 0.002698772819712758, + "learning_rate": 4.570702967496555e-06, + "loss": 0.0002, + "step": 169440 + }, + { + "epoch": 36.017732762822945, + "grad_norm": 0.7747287750244141, + "learning_rate": 4.567693946451462e-06, + "loss": 0.0508, + "step": 169450 + }, + { + "epoch": 36.017786925201754, + "grad_norm": 0.001027909223921597, + "learning_rate": 4.564684925406368e-06, + "loss": 0.0444, + "step": 169460 + }, + { + "epoch": 36.017841087580564, + "grad_norm": 0.0016038059256970882, + "learning_rate": 4.561675904361275e-06, + "loss": 0.1062, + "step": 169470 + }, + { + "epoch": 36.01789524995938, + "grad_norm": 0.0008799915085546672, + "learning_rate": 4.5586668833161825e-06, + "loss": 0.0383, + "step": 169480 + }, + { + "epoch": 36.01794941233819, + "grad_norm": 0.0013294619275256991, + "learning_rate": 4.55565786227109e-06, + "loss": 0.036, + "step": 169490 + }, + { + "epoch": 36.018003574717, + "grad_norm": 0.008267663419246674, + "learning_rate": 4.552648841225995e-06, + "loss": 0.0405, + "step": 169500 + }, + { + "epoch": 36.018057737095816, + "grad_norm": 0.0008932392229326069, + "learning_rate": 4.549639820180902e-06, + "loss": 0.0212, + "step": 169510 + }, + { + "epoch": 36.018111899474626, + "grad_norm": 0.0010947134578600526, + "learning_rate": 4.546630799135809e-06, + "loss": 0.025, + "step": 169520 + }, + { + "epoch": 36.018166061853435, + "grad_norm": 0.052785348147153854, + "learning_rate": 4.543621778090716e-06, + "loss": 0.0451, + "step": 169530 + }, + { + "epoch": 36.01822022423225, + "grad_norm": 0.0008989209891296923, + "learning_rate": 4.540612757045623e-06, + "loss": 0.06, + "step": 169540 + }, + { + "epoch": 36.01827438661106, + "grad_norm": 0.014421174302697182, + "learning_rate": 4.53760373600053e-06, + "loss": 0.0195, + "step": 169550 + }, + { + "epoch": 36.01832854898987, + "grad_norm": 0.9643068313598633, + "learning_rate": 4.534594714955437e-06, + "loss": 0.0653, + "step": 169560 + }, + { + "epoch": 36.01838271136868, + "grad_norm": 
0.025697365403175354, + "learning_rate": 4.531585693910344e-06, + "loss": 0.0403, + "step": 169570 + }, + { + "epoch": 36.0184368737475, + "grad_norm": 0.002464000601321459, + "learning_rate": 4.52857667286525e-06, + "loss": 0.0096, + "step": 169580 + }, + { + "epoch": 36.018491036126306, + "grad_norm": 0.0021506724879145622, + "learning_rate": 4.525567651820157e-06, + "loss": 0.0087, + "step": 169590 + }, + { + "epoch": 36.018545198505116, + "grad_norm": 0.0012121327454224229, + "learning_rate": 4.522558630775064e-06, + "loss": 0.0372, + "step": 169600 + }, + { + "epoch": 36.01859936088393, + "grad_norm": 0.0019104602979496121, + "learning_rate": 4.519549609729971e-06, + "loss": 0.0104, + "step": 169610 + }, + { + "epoch": 36.01865352326274, + "grad_norm": 2.7851760387420654, + "learning_rate": 4.516540588684878e-06, + "loss": 0.0506, + "step": 169620 + }, + { + "epoch": 36.01870768564155, + "grad_norm": 0.9654416441917419, + "learning_rate": 4.513531567639784e-06, + "loss": 0.0227, + "step": 169630 + }, + { + "epoch": 36.01876184802037, + "grad_norm": 1.5493600368499756, + "learning_rate": 4.510522546594691e-06, + "loss": 0.0152, + "step": 169640 + }, + { + "epoch": 36.01881601039918, + "grad_norm": 2.3571412563323975, + "learning_rate": 4.507513525549598e-06, + "loss": 0.1049, + "step": 169650 + }, + { + "epoch": 36.01887017277799, + "grad_norm": 0.003250266658142209, + "learning_rate": 4.504504504504505e-06, + "loss": 0.0109, + "step": 169660 + }, + { + "epoch": 36.0189243351568, + "grad_norm": 0.0010924061061814427, + "learning_rate": 4.501495483459411e-06, + "loss": 0.0353, + "step": 169670 + }, + { + "epoch": 36.01897849753561, + "grad_norm": 1.71467125415802, + "learning_rate": 4.498486462414318e-06, + "loss": 0.0641, + "step": 169680 + }, + { + "epoch": 36.01903265991442, + "grad_norm": 0.0019471690757200122, + "learning_rate": 4.495477441369225e-06, + "loss": 0.0294, + "step": 169690 + }, + { + "epoch": 36.01908682229323, + "grad_norm": 0.28930434584617615, + "learning_rate": 4.492468420324132e-06, + "loss": 0.0062, + "step": 169700 + }, + { + "epoch": 36.01914098467205, + "grad_norm": 0.0011994875967502594, + "learning_rate": 4.489459399279039e-06, + "loss": 0.0019, + "step": 169710 + }, + { + "epoch": 36.01919514705086, + "grad_norm": 0.0023336417507380247, + "learning_rate": 4.486450378233946e-06, + "loss": 0.0126, + "step": 169720 + }, + { + "epoch": 36.01924930942967, + "grad_norm": 0.0028231169562786818, + "learning_rate": 4.483441357188852e-06, + "loss": 0.0514, + "step": 169730 + }, + { + "epoch": 36.019303471808485, + "grad_norm": 0.0049352338537573814, + "learning_rate": 4.480432336143759e-06, + "loss": 0.0554, + "step": 169740 + }, + { + "epoch": 36.019357634187294, + "grad_norm": 0.000957097508944571, + "learning_rate": 4.477423315098666e-06, + "loss": 0.0178, + "step": 169750 + }, + { + "epoch": 36.019411796566104, + "grad_norm": 0.0013756842818111181, + "learning_rate": 4.4744142940535726e-06, + "loss": 0.0118, + "step": 169760 + }, + { + "epoch": 36.01946595894492, + "grad_norm": 0.966035008430481, + "learning_rate": 4.47140527300848e-06, + "loss": 0.0205, + "step": 169770 + }, + { + "epoch": 36.01952012132373, + "grad_norm": 0.0015954130794852972, + "learning_rate": 4.468396251963387e-06, + "loss": 0.0492, + "step": 169780 + }, + { + "epoch": 36.01957428370254, + "grad_norm": 4.3849945068359375, + "learning_rate": 4.465387230918294e-06, + "loss": 0.0435, + "step": 169790 + }, + { + "epoch": 36.01962844608135, + "grad_norm": 0.012518858537077904, + "learning_rate": 
4.462378209873199e-06, + "loss": 0.0371, + "step": 169800 + }, + { + "epoch": 36.019682608460165, + "grad_norm": 0.0009319184464402497, + "learning_rate": 4.4593691888281065e-06, + "loss": 0.0177, + "step": 169810 + }, + { + "epoch": 36.019736770838975, + "grad_norm": 0.0008449890301562846, + "learning_rate": 4.456360167783014e-06, + "loss": 0.0016, + "step": 169820 + }, + { + "epoch": 36.019790933217784, + "grad_norm": 0.0010863429633900523, + "learning_rate": 4.453351146737921e-06, + "loss": 0.0054, + "step": 169830 + }, + { + "epoch": 36.0198450955966, + "grad_norm": 0.3291650414466858, + "learning_rate": 4.450342125692827e-06, + "loss": 0.0157, + "step": 169840 + }, + { + "epoch": 36.01989925797541, + "grad_norm": 0.19325822591781616, + "learning_rate": 4.447333104647734e-06, + "loss": 0.0364, + "step": 169850 + }, + { + "epoch": 36.01995342035422, + "grad_norm": 0.17775091528892517, + "learning_rate": 4.444324083602641e-06, + "loss": 0.1031, + "step": 169860 + }, + { + "epoch": 36.02000758273304, + "grad_norm": 0.0017895165365189314, + "learning_rate": 4.4413150625575475e-06, + "loss": 0.0211, + "step": 169870 + }, + { + "epoch": 36.020061745111846, + "grad_norm": 1.4969042539596558, + "learning_rate": 4.438306041512455e-06, + "loss": 0.0147, + "step": 169880 + }, + { + "epoch": 36.020115907490656, + "grad_norm": 0.001410101423971355, + "learning_rate": 4.435297020467361e-06, + "loss": 0.0094, + "step": 169890 + }, + { + "epoch": 36.020170069869465, + "grad_norm": 0.0008592611993663013, + "learning_rate": 4.432287999422268e-06, + "loss": 0.0264, + "step": 169900 + }, + { + "epoch": 36.02022423224828, + "grad_norm": 0.024431347846984863, + "learning_rate": 4.429278978377175e-06, + "loss": 0.0203, + "step": 169910 + }, + { + "epoch": 36.02027839462709, + "grad_norm": 0.0014252489199861884, + "learning_rate": 4.426269957332082e-06, + "loss": 0.0417, + "step": 169920 + }, + { + "epoch": 36.0203325570059, + "grad_norm": 0.0018117940053343773, + "learning_rate": 4.4232609362869886e-06, + "loss": 0.0634, + "step": 169930 + }, + { + "epoch": 36.02038671938472, + "grad_norm": 0.0020225283224135637, + "learning_rate": 4.420251915241896e-06, + "loss": 0.0278, + "step": 169940 + }, + { + "epoch": 36.02044088176353, + "grad_norm": 0.11667662858963013, + "learning_rate": 4.417242894196802e-06, + "loss": 0.0029, + "step": 169950 + }, + { + "epoch": 36.020495044142336, + "grad_norm": 0.000833869562484324, + "learning_rate": 4.414233873151709e-06, + "loss": 0.0592, + "step": 169960 + }, + { + "epoch": 36.02054920652115, + "grad_norm": 0.0008598519489169121, + "learning_rate": 4.411224852106615e-06, + "loss": 0.0134, + "step": 169970 + }, + { + "epoch": 36.02060336889996, + "grad_norm": 0.012865537777543068, + "learning_rate": 4.4082158310615225e-06, + "loss": 0.0565, + "step": 169980 + }, + { + "epoch": 36.02065753127877, + "grad_norm": 0.0008403941756114364, + "learning_rate": 4.40520681001643e-06, + "loss": 0.0394, + "step": 169990 + }, + { + "epoch": 36.02071169365759, + "grad_norm": 0.001110933138988912, + "learning_rate": 4.402197788971337e-06, + "loss": 0.0394, + "step": 170000 + }, + { + "epoch": 36.0207658560364, + "grad_norm": 0.02414289489388466, + "learning_rate": 4.399188767926243e-06, + "loss": 0.0062, + "step": 170010 + }, + { + "epoch": 36.02082001841521, + "grad_norm": 0.0010992748429998755, + "learning_rate": 4.396179746881149e-06, + "loss": 0.0122, + "step": 170020 + }, + { + "epoch": 36.02087418079402, + "grad_norm": 0.09975677728652954, + "learning_rate": 4.393170725836056e-06, + 
"loss": 0.0122, + "step": 170030 + }, + { + "epoch": 36.020928343172834, + "grad_norm": 0.0008586359908804297, + "learning_rate": 4.3901617047909635e-06, + "loss": 0.0311, + "step": 170040 + }, + { + "epoch": 36.02098250555164, + "grad_norm": 0.0008432646282017231, + "learning_rate": 4.387152683745871e-06, + "loss": 0.0232, + "step": 170050 + }, + { + "epoch": 36.02103666793045, + "grad_norm": 1.300757646560669, + "learning_rate": 4.384143662700777e-06, + "loss": 0.073, + "step": 170060 + }, + { + "epoch": 36.02109083030927, + "grad_norm": 0.005464351270347834, + "learning_rate": 4.381134641655684e-06, + "loss": 0.0126, + "step": 170070 + }, + { + "epoch": 36.02114499268808, + "grad_norm": 0.05640900135040283, + "learning_rate": 4.378125620610591e-06, + "loss": 0.0247, + "step": 170080 + }, + { + "epoch": 36.02119915506689, + "grad_norm": 0.0012624916853383183, + "learning_rate": 4.375116599565498e-06, + "loss": 0.0093, + "step": 170090 + }, + { + "epoch": 36.021253317445705, + "grad_norm": 0.0008268936071544886, + "learning_rate": 4.372107578520404e-06, + "loss": 0.1222, + "step": 170100 + }, + { + "epoch": 36.021307479824515, + "grad_norm": 0.0010696996469050646, + "learning_rate": 4.369098557475311e-06, + "loss": 0.0083, + "step": 170110 + }, + { + "epoch": 36.021361642203324, + "grad_norm": 0.0018732106545940042, + "learning_rate": 4.366089536430218e-06, + "loss": 0.0679, + "step": 170120 + }, + { + "epoch": 36.02141580458214, + "grad_norm": 0.0011284856591373682, + "learning_rate": 4.363080515385125e-06, + "loss": 0.0098, + "step": 170130 + }, + { + "epoch": 36.02146996696095, + "grad_norm": 0.0012126239016652107, + "learning_rate": 4.360071494340031e-06, + "loss": 0.0391, + "step": 170140 + }, + { + "epoch": 36.02152412933976, + "grad_norm": 4.657915115356445, + "learning_rate": 4.3570624732949385e-06, + "loss": 0.0723, + "step": 170150 + }, + { + "epoch": 36.02157829171857, + "grad_norm": 0.0008413754985667765, + "learning_rate": 4.354053452249846e-06, + "loss": 0.0226, + "step": 170160 + }, + { + "epoch": 36.021632454097386, + "grad_norm": 3.100358724594116, + "learning_rate": 4.351044431204752e-06, + "loss": 0.0428, + "step": 170170 + }, + { + "epoch": 36.021686616476195, + "grad_norm": 0.0014887233264744282, + "learning_rate": 4.348035410159659e-06, + "loss": 0.0136, + "step": 170180 + }, + { + "epoch": 36.021740778855005, + "grad_norm": 0.001210297690704465, + "learning_rate": 4.345026389114565e-06, + "loss": 0.033, + "step": 170190 + }, + { + "epoch": 36.02179494123382, + "grad_norm": 0.1650860607624054, + "learning_rate": 4.342017368069472e-06, + "loss": 0.0632, + "step": 170200 + }, + { + "epoch": 36.02184910361263, + "grad_norm": 0.06551231443881989, + "learning_rate": 4.3390083470243795e-06, + "loss": 0.0113, + "step": 170210 + }, + { + "epoch": 36.02190326599144, + "grad_norm": 0.3651583790779114, + "learning_rate": 4.335999325979287e-06, + "loss": 0.0038, + "step": 170220 + }, + { + "epoch": 36.02195742837026, + "grad_norm": 2.4644668102264404, + "learning_rate": 4.332990304934193e-06, + "loss": 0.026, + "step": 170230 + }, + { + "epoch": 36.02201159074907, + "grad_norm": 0.017854860052466393, + "learning_rate": 4.3299812838891e-06, + "loss": 0.0132, + "step": 170240 + }, + { + "epoch": 36.022065753127876, + "grad_norm": 0.004426563624292612, + "learning_rate": 4.326972262844006e-06, + "loss": 0.0074, + "step": 170250 + }, + { + "epoch": 36.022119915506686, + "grad_norm": 0.005995658226311207, + "learning_rate": 4.323963241798913e-06, + "loss": 0.0085, + "step": 170260 + 
}, + { + "epoch": 36.0221740778855, + "grad_norm": 0.0008283404167741537, + "learning_rate": 4.32095422075382e-06, + "loss": 0.0139, + "step": 170270 + }, + { + "epoch": 36.02222824026431, + "grad_norm": 1.6858097314834595, + "learning_rate": 4.317945199708727e-06, + "loss": 0.0315, + "step": 170280 + }, + { + "epoch": 36.02228240264312, + "grad_norm": 0.0008862217655405402, + "learning_rate": 4.314936178663634e-06, + "loss": 0.0028, + "step": 170290 + }, + { + "epoch": 36.02233656502194, + "grad_norm": 0.05726300925016403, + "learning_rate": 4.311927157618541e-06, + "loss": 0.0131, + "step": 170300 + }, + { + "epoch": 36.02239072740075, + "grad_norm": 0.0008867797441780567, + "learning_rate": 4.308918136573447e-06, + "loss": 0.0202, + "step": 170310 + }, + { + "epoch": 36.02244488977956, + "grad_norm": 1.2609401941299438, + "learning_rate": 4.305909115528354e-06, + "loss": 0.0689, + "step": 170320 + }, + { + "epoch": 36.02249905215837, + "grad_norm": 0.001273761852644384, + "learning_rate": 4.302900094483261e-06, + "loss": 0.0267, + "step": 170330 + }, + { + "epoch": 36.02255321453718, + "grad_norm": 0.0008613480604253709, + "learning_rate": 4.299891073438168e-06, + "loss": 0.0287, + "step": 170340 + }, + { + "epoch": 36.02260737691599, + "grad_norm": 0.7294852137565613, + "learning_rate": 4.296882052393075e-06, + "loss": 0.0499, + "step": 170350 + }, + { + "epoch": 36.02266153929481, + "grad_norm": 0.2117619812488556, + "learning_rate": 4.293873031347981e-06, + "loss": 0.0208, + "step": 170360 + }, + { + "epoch": 36.02271570167362, + "grad_norm": 0.0008021883550100029, + "learning_rate": 4.290864010302888e-06, + "loss": 0.0272, + "step": 170370 + }, + { + "epoch": 36.02276986405243, + "grad_norm": 0.7534086108207703, + "learning_rate": 4.2878549892577955e-06, + "loss": 0.0331, + "step": 170380 + }, + { + "epoch": 36.02282402643124, + "grad_norm": 0.0008051421027630568, + "learning_rate": 4.284845968212703e-06, + "loss": 0.0154, + "step": 170390 + }, + { + "epoch": 36.022878188810054, + "grad_norm": 0.6313069462776184, + "learning_rate": 4.281836947167608e-06, + "loss": 0.0061, + "step": 170400 + }, + { + "epoch": 36.022932351188864, + "grad_norm": 0.0008205220801755786, + "learning_rate": 4.278827926122515e-06, + "loss": 0.0247, + "step": 170410 + }, + { + "epoch": 36.02298651356767, + "grad_norm": 0.008002454414963722, + "learning_rate": 4.275818905077422e-06, + "loss": 0.1012, + "step": 170420 + }, + { + "epoch": 36.02304067594649, + "grad_norm": 0.07145565003156662, + "learning_rate": 4.272809884032329e-06, + "loss": 0.0014, + "step": 170430 + }, + { + "epoch": 36.0230948383253, + "grad_norm": 0.18710757791996002, + "learning_rate": 4.269800862987236e-06, + "loss": 0.0087, + "step": 170440 + }, + { + "epoch": 36.02314900070411, + "grad_norm": 0.0011960675474256277, + "learning_rate": 4.266791841942143e-06, + "loss": 0.0119, + "step": 170450 + }, + { + "epoch": 36.023203163082925, + "grad_norm": 1.9428796768188477, + "learning_rate": 4.26378282089705e-06, + "loss": 0.0804, + "step": 170460 + }, + { + "epoch": 36.023257325461735, + "grad_norm": 1.3675153255462646, + "learning_rate": 4.260773799851956e-06, + "loss": 0.0759, + "step": 170470 + }, + { + "epoch": 36.023311487840544, + "grad_norm": 0.0013272471260279417, + "learning_rate": 4.257764778806863e-06, + "loss": 0.0286, + "step": 170480 + }, + { + "epoch": 36.02336565021936, + "grad_norm": 1.6245195865631104, + "learning_rate": 4.25475575776177e-06, + "loss": 0.0255, + "step": 170490 + }, + { + "epoch": 36.02341981259817, + 
"grad_norm": 0.0010774161200970411, + "learning_rate": 4.251746736716677e-06, + "loss": 0.0235, + "step": 170500 + }, + { + "epoch": 36.02347397497698, + "grad_norm": 0.007352480199187994, + "learning_rate": 4.248737715671584e-06, + "loss": 0.0479, + "step": 170510 + }, + { + "epoch": 36.02352813735579, + "grad_norm": 0.0009073458495549858, + "learning_rate": 4.245728694626491e-06, + "loss": 0.0125, + "step": 170520 + }, + { + "epoch": 36.023582299734606, + "grad_norm": 0.0010155378840863705, + "learning_rate": 4.242719673581397e-06, + "loss": 0.0536, + "step": 170530 + }, + { + "epoch": 36.023636462113416, + "grad_norm": 0.6116577982902527, + "learning_rate": 4.239710652536304e-06, + "loss": 0.0383, + "step": 170540 + }, + { + "epoch": 36.023690624492225, + "grad_norm": 0.0015192585997283459, + "learning_rate": 4.236701631491211e-06, + "loss": 0.0205, + "step": 170550 + }, + { + "epoch": 36.02374478687104, + "grad_norm": 0.5614287257194519, + "learning_rate": 4.233692610446118e-06, + "loss": 0.0555, + "step": 170560 + }, + { + "epoch": 36.02379894924985, + "grad_norm": 1.1511406898498535, + "learning_rate": 4.230683589401024e-06, + "loss": 0.1123, + "step": 170570 + }, + { + "epoch": 36.02385311162866, + "grad_norm": 0.0011205706978216767, + "learning_rate": 4.227674568355931e-06, + "loss": 0.0162, + "step": 170580 + }, + { + "epoch": 36.02390727400748, + "grad_norm": 0.003131320234388113, + "learning_rate": 4.224665547310838e-06, + "loss": 0.0001, + "step": 170590 + }, + { + "epoch": 36.02396143638629, + "grad_norm": 0.0008195065893232822, + "learning_rate": 4.221656526265745e-06, + "loss": 0.0148, + "step": 170600 + }, + { + "epoch": 36.0240155987651, + "grad_norm": 0.6736443638801575, + "learning_rate": 4.218647505220652e-06, + "loss": 0.0525, + "step": 170610 + }, + { + "epoch": 36.024069761143906, + "grad_norm": 0.0008929538307711482, + "learning_rate": 4.215638484175558e-06, + "loss": 0.0036, + "step": 170620 + }, + { + "epoch": 36.02412392352272, + "grad_norm": 0.0008213265100494027, + "learning_rate": 4.212629463130465e-06, + "loss": 0.0124, + "step": 170630 + }, + { + "epoch": 36.02417808590153, + "grad_norm": 0.028663087636232376, + "learning_rate": 4.209620442085372e-06, + "loss": 0.0033, + "step": 170640 + }, + { + "epoch": 36.02423224828034, + "grad_norm": 0.0011625034967437387, + "learning_rate": 4.206611421040279e-06, + "loss": 0.0277, + "step": 170650 + }, + { + "epoch": 36.02428641065916, + "grad_norm": 0.003906958270817995, + "learning_rate": 4.2036023999951856e-06, + "loss": 0.0121, + "step": 170660 + }, + { + "epoch": 36.02434057303797, + "grad_norm": 0.006837969180196524, + "learning_rate": 4.200593378950093e-06, + "loss": 0.0072, + "step": 170670 + }, + { + "epoch": 36.02439473541678, + "grad_norm": 0.001051102764904499, + "learning_rate": 4.197584357905e-06, + "loss": 0.0149, + "step": 170680 + }, + { + "epoch": 36.024448897795594, + "grad_norm": 1.0207747220993042, + "learning_rate": 4.194575336859907e-06, + "loss": 0.0181, + "step": 170690 + }, + { + "epoch": 36.0245030601744, + "grad_norm": 0.999271035194397, + "learning_rate": 4.191566315814812e-06, + "loss": 0.0372, + "step": 170700 + }, + { + "epoch": 36.02455722255321, + "grad_norm": 0.0016232479829341173, + "learning_rate": 4.1885572947697195e-06, + "loss": 0.0316, + "step": 170710 + }, + { + "epoch": 36.02461138493203, + "grad_norm": 0.0010955673642456532, + "learning_rate": 4.185548273724627e-06, + "loss": 0.0036, + "step": 170720 + }, + { + "epoch": 36.02466554731084, + "grad_norm": 0.0010177382500842214, 
+ "learning_rate": 4.182539252679534e-06, + "loss": 0.0313, + "step": 170730 + }, + { + "epoch": 36.02471970968965, + "grad_norm": 0.0008725218940526247, + "learning_rate": 4.17953023163444e-06, + "loss": 0.0142, + "step": 170740 + }, + { + "epoch": 36.02477387206846, + "grad_norm": 0.0024697824846953154, + "learning_rate": 4.176521210589347e-06, + "loss": 0.0257, + "step": 170750 + }, + { + "epoch": 36.024828034447275, + "grad_norm": 0.35335972905158997, + "learning_rate": 4.173512189544254e-06, + "loss": 0.056, + "step": 170760 + }, + { + "epoch": 36.024882196826084, + "grad_norm": 0.264496386051178, + "learning_rate": 4.1705031684991605e-06, + "loss": 0.0501, + "step": 170770 + }, + { + "epoch": 36.024936359204894, + "grad_norm": 0.026379013434052467, + "learning_rate": 4.167494147454068e-06, + "loss": 0.0144, + "step": 170780 + }, + { + "epoch": 36.02499052158371, + "grad_norm": 0.05489543452858925, + "learning_rate": 4.164485126408974e-06, + "loss": 0.0354, + "step": 170790 + }, + { + "epoch": 36.02500135405947, + "eval_accuracy": 0.8442194644023514, + "eval_loss": 1.0300602912902832, + "eval_runtime": 118.7807, + "eval_samples_per_second": 25.779, + "eval_steps_per_second": 3.224, + "step": 170792 + }, + { + "epoch": 37.00004332990305, + "grad_norm": 0.8292244672775269, + "learning_rate": 4.161476105363881e-06, + "loss": 0.0239, + "step": 170800 + }, + { + "epoch": 37.00009749228186, + "grad_norm": 0.002137780888006091, + "learning_rate": 4.158467084318788e-06, + "loss": 0.0189, + "step": 170810 + }, + { + "epoch": 37.00015165466067, + "grad_norm": 0.0008027204312384129, + "learning_rate": 4.155458063273695e-06, + "loss": 0.0747, + "step": 170820 + }, + { + "epoch": 37.000205817039486, + "grad_norm": 0.0015425942838191986, + "learning_rate": 4.1524490422286016e-06, + "loss": 0.0164, + "step": 170830 + }, + { + "epoch": 37.000259979418296, + "grad_norm": 0.865655779838562, + "learning_rate": 4.149440021183509e-06, + "loss": 0.0093, + "step": 170840 + }, + { + "epoch": 37.000314141797105, + "grad_norm": 1.0331097841262817, + "learning_rate": 4.146431000138415e-06, + "loss": 0.0268, + "step": 170850 + }, + { + "epoch": 37.00036830417592, + "grad_norm": 0.15440823137760162, + "learning_rate": 4.143421979093322e-06, + "loss": 0.0253, + "step": 170860 + }, + { + "epoch": 37.00042246655473, + "grad_norm": 0.7050843834877014, + "learning_rate": 4.140412958048228e-06, + "loss": 0.0205, + "step": 170870 + }, + { + "epoch": 37.00047662893354, + "grad_norm": 3.59082293510437, + "learning_rate": 4.1374039370031355e-06, + "loss": 0.0207, + "step": 170880 + }, + { + "epoch": 37.00053079131236, + "grad_norm": 0.0008000549278222024, + "learning_rate": 4.134394915958043e-06, + "loss": 0.0119, + "step": 170890 + }, + { + "epoch": 37.00058495369117, + "grad_norm": 0.0007917347829788923, + "learning_rate": 4.13138589491295e-06, + "loss": 0.0116, + "step": 170900 + }, + { + "epoch": 37.000639116069976, + "grad_norm": 0.0007910252315923572, + "learning_rate": 4.128376873867856e-06, + "loss": 0.0, + "step": 170910 + }, + { + "epoch": 37.00069327844879, + "grad_norm": 0.7528685331344604, + "learning_rate": 4.125367852822762e-06, + "loss": 0.0589, + "step": 170920 + }, + { + "epoch": 37.0007474408276, + "grad_norm": 0.0012784095015376806, + "learning_rate": 4.122358831777669e-06, + "loss": 0.0081, + "step": 170930 + }, + { + "epoch": 37.00080160320641, + "grad_norm": 0.007715463172644377, + "learning_rate": 4.1193498107325765e-06, + "loss": 0.0191, + "step": 170940 + }, + { + "epoch": 37.00085576558522, + 
"grad_norm": 3.2701339721679688, + "learning_rate": 4.116340789687484e-06, + "loss": 0.0499, + "step": 170950 + }, + { + "epoch": 37.00090992796404, + "grad_norm": 0.0007882280624471605, + "learning_rate": 4.11333176864239e-06, + "loss": 0.008, + "step": 170960 + }, + { + "epoch": 37.00096409034285, + "grad_norm": 0.0007737841224297881, + "learning_rate": 4.110322747597297e-06, + "loss": 0.0165, + "step": 170970 + }, + { + "epoch": 37.00101825272166, + "grad_norm": 0.11030612885951996, + "learning_rate": 4.107313726552204e-06, + "loss": 0.0244, + "step": 170980 + }, + { + "epoch": 37.001072415100474, + "grad_norm": 3.172722578048706, + "learning_rate": 4.104304705507111e-06, + "loss": 0.069, + "step": 170990 + }, + { + "epoch": 37.00112657747928, + "grad_norm": 0.0008421677048318088, + "learning_rate": 4.101295684462017e-06, + "loss": 0.0039, + "step": 171000 + }, + { + "epoch": 37.00118073985809, + "grad_norm": 0.0007906786049716175, + "learning_rate": 4.098286663416924e-06, + "loss": 0.028, + "step": 171010 + }, + { + "epoch": 37.00123490223691, + "grad_norm": 0.012367421761155128, + "learning_rate": 4.095277642371831e-06, + "loss": 0.0356, + "step": 171020 + }, + { + "epoch": 37.00128906461572, + "grad_norm": 1.5920480489730835, + "learning_rate": 4.092268621326738e-06, + "loss": 0.0392, + "step": 171030 + }, + { + "epoch": 37.00134322699453, + "grad_norm": 0.0007799710729159415, + "learning_rate": 4.089259600281644e-06, + "loss": 0.0282, + "step": 171040 + }, + { + "epoch": 37.00139738937334, + "grad_norm": 0.0010121142258867621, + "learning_rate": 4.0862505792365515e-06, + "loss": 0.009, + "step": 171050 + }, + { + "epoch": 37.001451551752155, + "grad_norm": 0.022690564393997192, + "learning_rate": 4.083241558191459e-06, + "loss": 0.036, + "step": 171060 + }, + { + "epoch": 37.001505714130964, + "grad_norm": 0.0007692844956181943, + "learning_rate": 4.080232537146365e-06, + "loss": 0.0712, + "step": 171070 + }, + { + "epoch": 37.00155987650977, + "grad_norm": 0.0008948232280090451, + "learning_rate": 4.077223516101272e-06, + "loss": 0.0349, + "step": 171080 + }, + { + "epoch": 37.00161403888859, + "grad_norm": 3.460379123687744, + "learning_rate": 4.074214495056178e-06, + "loss": 0.0644, + "step": 171090 + }, + { + "epoch": 37.0016682012674, + "grad_norm": 0.0007966970442794263, + "learning_rate": 4.071205474011085e-06, + "loss": 0.0311, + "step": 171100 + }, + { + "epoch": 37.00172236364621, + "grad_norm": 0.0008267014636658132, + "learning_rate": 4.0681964529659925e-06, + "loss": 0.0233, + "step": 171110 + }, + { + "epoch": 37.001776526025026, + "grad_norm": 0.0010071768192574382, + "learning_rate": 4.0651874319209e-06, + "loss": 0.0173, + "step": 171120 + }, + { + "epoch": 37.001830688403835, + "grad_norm": 0.4733763039112091, + "learning_rate": 4.062178410875806e-06, + "loss": 0.0159, + "step": 171130 + }, + { + "epoch": 37.001884850782645, + "grad_norm": 0.141288161277771, + "learning_rate": 4.059169389830713e-06, + "loss": 0.0154, + "step": 171140 + }, + { + "epoch": 37.00193901316146, + "grad_norm": 0.0009644300444051623, + "learning_rate": 4.056160368785619e-06, + "loss": 0.031, + "step": 171150 + }, + { + "epoch": 37.00199317554027, + "grad_norm": 0.0007994702900759876, + "learning_rate": 4.053151347740526e-06, + "loss": 0.0866, + "step": 171160 + }, + { + "epoch": 37.00204733791908, + "grad_norm": 0.007531072944402695, + "learning_rate": 4.050142326695433e-06, + "loss": 0.0309, + "step": 171170 + }, + { + "epoch": 37.00210150029789, + "grad_norm": 0.001128084841184318, + 
"learning_rate": 4.04713330565034e-06, + "loss": 0.1058, + "step": 171180 + }, + { + "epoch": 37.00215566267671, + "grad_norm": 0.10106997191905975, + "learning_rate": 4.044124284605247e-06, + "loss": 0.0139, + "step": 171190 + }, + { + "epoch": 37.002209825055516, + "grad_norm": 0.0014667811337858438, + "learning_rate": 4.041115263560154e-06, + "loss": 0.0512, + "step": 171200 + }, + { + "epoch": 37.002263987434326, + "grad_norm": 0.8153089284896851, + "learning_rate": 4.03810624251506e-06, + "loss": 0.0128, + "step": 171210 + }, + { + "epoch": 37.00231814981314, + "grad_norm": 0.6292165517807007, + "learning_rate": 4.035097221469967e-06, + "loss": 0.029, + "step": 171220 + }, + { + "epoch": 37.00237231219195, + "grad_norm": 0.0009834034135565162, + "learning_rate": 4.032088200424874e-06, + "loss": 0.0766, + "step": 171230 + }, + { + "epoch": 37.00242647457076, + "grad_norm": 0.0010674860095605254, + "learning_rate": 4.029079179379781e-06, + "loss": 0.0052, + "step": 171240 + }, + { + "epoch": 37.00248063694958, + "grad_norm": 0.013490041717886925, + "learning_rate": 4.026070158334688e-06, + "loss": 0.0079, + "step": 171250 + }, + { + "epoch": 37.00253479932839, + "grad_norm": 0.04514464735984802, + "learning_rate": 4.023061137289594e-06, + "loss": 0.0001, + "step": 171260 + }, + { + "epoch": 37.0025889617072, + "grad_norm": 0.001310844672843814, + "learning_rate": 4.020052116244501e-06, + "loss": 0.0738, + "step": 171270 + }, + { + "epoch": 37.002643124086006, + "grad_norm": 0.0010693108197301626, + "learning_rate": 4.0170430951994085e-06, + "loss": 0.0293, + "step": 171280 + }, + { + "epoch": 37.00269728646482, + "grad_norm": 0.0017812096048146486, + "learning_rate": 4.014034074154315e-06, + "loss": 0.0144, + "step": 171290 + }, + { + "epoch": 37.00275144884363, + "grad_norm": 0.0012695773039013147, + "learning_rate": 4.011025053109221e-06, + "loss": 0.0643, + "step": 171300 + }, + { + "epoch": 37.00280561122244, + "grad_norm": 2.2491958141326904, + "learning_rate": 4.008016032064128e-06, + "loss": 0.0999, + "step": 171310 + }, + { + "epoch": 37.00285977360126, + "grad_norm": 0.0011670064413920045, + "learning_rate": 4.005007011019035e-06, + "loss": 0.007, + "step": 171320 + }, + { + "epoch": 37.00291393598007, + "grad_norm": 1.0903671979904175, + "learning_rate": 4.001997989973942e-06, + "loss": 0.0443, + "step": 171330 + }, + { + "epoch": 37.00296809835888, + "grad_norm": 0.8095029592514038, + "learning_rate": 3.998988968928849e-06, + "loss": 0.0385, + "step": 171340 + }, + { + "epoch": 37.003022260737694, + "grad_norm": 0.0008491152548231184, + "learning_rate": 3.995979947883756e-06, + "loss": 0.0576, + "step": 171350 + }, + { + "epoch": 37.003076423116504, + "grad_norm": 0.0008401444065384567, + "learning_rate": 3.992970926838663e-06, + "loss": 0.0314, + "step": 171360 + }, + { + "epoch": 37.00313058549531, + "grad_norm": 4.381592273712158, + "learning_rate": 3.989961905793569e-06, + "loss": 0.0467, + "step": 171370 + }, + { + "epoch": 37.00318474787413, + "grad_norm": 0.0022494462318718433, + "learning_rate": 3.986952884748476e-06, + "loss": 0.006, + "step": 171380 + }, + { + "epoch": 37.00323891025294, + "grad_norm": 0.0010603561531752348, + "learning_rate": 3.983943863703383e-06, + "loss": 0.1257, + "step": 171390 + }, + { + "epoch": 37.00329307263175, + "grad_norm": 0.0008222204633057117, + "learning_rate": 3.98093484265829e-06, + "loss": 0.0565, + "step": 171400 + }, + { + "epoch": 37.00334723501056, + "grad_norm": 0.0008761094068177044, + "learning_rate": 
3.977925821613197e-06, + "loss": 0.0084, + "step": 171410 + }, + { + "epoch": 37.003401397389375, + "grad_norm": 3.703249931335449, + "learning_rate": 3.974916800568104e-06, + "loss": 0.0611, + "step": 171420 + }, + { + "epoch": 37.003455559768184, + "grad_norm": 0.0011579229030758142, + "learning_rate": 3.97190777952301e-06, + "loss": 0.032, + "step": 171430 + }, + { + "epoch": 37.003509722146994, + "grad_norm": 0.0011082865530624986, + "learning_rate": 3.9688987584779165e-06, + "loss": 0.0527, + "step": 171440 + }, + { + "epoch": 37.00356388452581, + "grad_norm": 1.8210232257843018, + "learning_rate": 3.965889737432824e-06, + "loss": 0.0338, + "step": 171450 + }, + { + "epoch": 37.00361804690462, + "grad_norm": 0.38272175192832947, + "learning_rate": 3.962880716387731e-06, + "loss": 0.0086, + "step": 171460 + }, + { + "epoch": 37.00367220928343, + "grad_norm": 0.010118577629327774, + "learning_rate": 3.959871695342637e-06, + "loss": 0.068, + "step": 171470 + }, + { + "epoch": 37.003726371662246, + "grad_norm": 0.0011030243476852775, + "learning_rate": 3.956862674297544e-06, + "loss": 0.1002, + "step": 171480 + }, + { + "epoch": 37.003780534041056, + "grad_norm": 0.0008141877478919923, + "learning_rate": 3.953853653252451e-06, + "loss": 0.0062, + "step": 171490 + }, + { + "epoch": 37.003834696419865, + "grad_norm": 0.0008376462501473725, + "learning_rate": 3.950844632207358e-06, + "loss": 0.019, + "step": 171500 + }, + { + "epoch": 37.00388885879868, + "grad_norm": 0.0010715923272073269, + "learning_rate": 3.947835611162265e-06, + "loss": 0.0344, + "step": 171510 + }, + { + "epoch": 37.00394302117749, + "grad_norm": 0.32326024770736694, + "learning_rate": 3.944826590117171e-06, + "loss": 0.0584, + "step": 171520 + }, + { + "epoch": 37.0039971835563, + "grad_norm": 0.0008068440365605056, + "learning_rate": 3.941817569072078e-06, + "loss": 0.0343, + "step": 171530 + }, + { + "epoch": 37.00405134593511, + "grad_norm": 0.0008099614060483873, + "learning_rate": 3.938808548026985e-06, + "loss": 0.0571, + "step": 171540 + }, + { + "epoch": 37.00410550831393, + "grad_norm": 0.0010563648538663983, + "learning_rate": 3.935799526981892e-06, + "loss": 0.032, + "step": 171550 + }, + { + "epoch": 37.00415967069274, + "grad_norm": 0.00652502104640007, + "learning_rate": 3.9327905059367986e-06, + "loss": 0.1014, + "step": 171560 + }, + { + "epoch": 37.004213833071546, + "grad_norm": 0.001089294906705618, + "learning_rate": 3.929781484891706e-06, + "loss": 0.0146, + "step": 171570 + }, + { + "epoch": 37.00426799545036, + "grad_norm": 0.0012675911420956254, + "learning_rate": 3.926772463846613e-06, + "loss": 0.0441, + "step": 171580 + }, + { + "epoch": 37.00432215782917, + "grad_norm": 0.017047978937625885, + "learning_rate": 3.923763442801519e-06, + "loss": 0.0052, + "step": 171590 + }, + { + "epoch": 37.00437632020798, + "grad_norm": 0.0008315364248119295, + "learning_rate": 3.920754421756425e-06, + "loss": 0.0072, + "step": 171600 + }, + { + "epoch": 37.0044304825868, + "grad_norm": 0.0008248278172686696, + "learning_rate": 3.9177454007113325e-06, + "loss": 0.0056, + "step": 171610 + }, + { + "epoch": 37.00448464496561, + "grad_norm": 0.0013343284372240305, + "learning_rate": 3.91473637966624e-06, + "loss": 0.0103, + "step": 171620 + }, + { + "epoch": 37.00453880734442, + "grad_norm": 0.0010127881541848183, + "learning_rate": 3.911727358621147e-06, + "loss": 0.0181, + "step": 171630 + }, + { + "epoch": 37.00459296972323, + "grad_norm": 0.0011952011846005917, + "learning_rate": 3.908718337576053e-06, + 
"loss": 0.0417, + "step": 171640 + }, + { + "epoch": 37.00464713210204, + "grad_norm": 0.0010598081862553954, + "learning_rate": 3.90570931653096e-06, + "loss": 0.0533, + "step": 171650 + }, + { + "epoch": 37.00470129448085, + "grad_norm": 3.6868698596954346, + "learning_rate": 3.902700295485867e-06, + "loss": 0.0532, + "step": 171660 + }, + { + "epoch": 37.00475545685966, + "grad_norm": 0.0032498964574187994, + "learning_rate": 3.8996912744407735e-06, + "loss": 0.0045, + "step": 171670 + }, + { + "epoch": 37.00480961923848, + "grad_norm": 0.9857991337776184, + "learning_rate": 3.896682253395681e-06, + "loss": 0.0124, + "step": 171680 + }, + { + "epoch": 37.00486378161729, + "grad_norm": 1.407527208328247, + "learning_rate": 3.893673232350587e-06, + "loss": 0.0436, + "step": 171690 + }, + { + "epoch": 37.0049179439961, + "grad_norm": 0.6696224808692932, + "learning_rate": 3.890664211305494e-06, + "loss": 0.0455, + "step": 171700 + }, + { + "epoch": 37.004972106374915, + "grad_norm": 0.0013466186355799437, + "learning_rate": 3.887655190260401e-06, + "loss": 0.0842, + "step": 171710 + }, + { + "epoch": 37.005026268753724, + "grad_norm": 0.0016912331338971853, + "learning_rate": 3.884646169215308e-06, + "loss": 0.0172, + "step": 171720 + }, + { + "epoch": 37.005080431132534, + "grad_norm": 0.0018323053373023868, + "learning_rate": 3.8816371481702146e-06, + "loss": 0.0352, + "step": 171730 + }, + { + "epoch": 37.00513459351135, + "grad_norm": 0.001239610486663878, + "learning_rate": 3.878628127125121e-06, + "loss": 0.0108, + "step": 171740 + }, + { + "epoch": 37.00518875589016, + "grad_norm": 0.0008540820563212037, + "learning_rate": 3.875619106080028e-06, + "loss": 0.0365, + "step": 171750 + }, + { + "epoch": 37.00524291826897, + "grad_norm": 0.5280416011810303, + "learning_rate": 3.872610085034935e-06, + "loss": 0.0212, + "step": 171760 + }, + { + "epoch": 37.00529708064778, + "grad_norm": 0.0008284348296001554, + "learning_rate": 3.869601063989841e-06, + "loss": 0.0522, + "step": 171770 + }, + { + "epoch": 37.005351243026595, + "grad_norm": 0.013458071276545525, + "learning_rate": 3.8665920429447485e-06, + "loss": 0.1017, + "step": 171780 + }, + { + "epoch": 37.005405405405405, + "grad_norm": 0.0016797290882095695, + "learning_rate": 3.863583021899656e-06, + "loss": 0.028, + "step": 171790 + }, + { + "epoch": 37.005459567784214, + "grad_norm": 0.002977165160700679, + "learning_rate": 3.860574000854563e-06, + "loss": 0.0023, + "step": 171800 + }, + { + "epoch": 37.00551373016303, + "grad_norm": 0.39941319823265076, + "learning_rate": 3.857564979809469e-06, + "loss": 0.0564, + "step": 171810 + }, + { + "epoch": 37.00556789254184, + "grad_norm": 0.0029520795214921236, + "learning_rate": 3.854555958764375e-06, + "loss": 0.014, + "step": 171820 + }, + { + "epoch": 37.00562205492065, + "grad_norm": 0.10014886409044266, + "learning_rate": 3.851546937719282e-06, + "loss": 0.0371, + "step": 171830 + }, + { + "epoch": 37.00567621729947, + "grad_norm": 0.003913263790309429, + "learning_rate": 3.8485379166741895e-06, + "loss": 0.0354, + "step": 171840 + }, + { + "epoch": 37.005730379678276, + "grad_norm": 0.000998393283225596, + "learning_rate": 3.845528895629097e-06, + "loss": 0.0029, + "step": 171850 + }, + { + "epoch": 37.005784542057086, + "grad_norm": 0.0008429504232481122, + "learning_rate": 3.842519874584003e-06, + "loss": 0.006, + "step": 171860 + }, + { + "epoch": 37.0058387044359, + "grad_norm": 0.00271582487039268, + "learning_rate": 3.83951085353891e-06, + "loss": 0.083, + "step": 171870 + 
}, + { + "epoch": 37.00589286681471, + "grad_norm": 0.0008071401971392334, + "learning_rate": 3.836501832493817e-06, + "loss": 0.0632, + "step": 171880 + }, + { + "epoch": 37.00594702919352, + "grad_norm": 0.001835839357227087, + "learning_rate": 3.833492811448723e-06, + "loss": 0.0154, + "step": 171890 + }, + { + "epoch": 37.00600119157233, + "grad_norm": 1.044127345085144, + "learning_rate": 3.83048379040363e-06, + "loss": 0.0523, + "step": 171900 + }, + { + "epoch": 37.00605535395115, + "grad_norm": 0.0010642155539244413, + "learning_rate": 3.827474769358537e-06, + "loss": 0.0847, + "step": 171910 + }, + { + "epoch": 37.00610951632996, + "grad_norm": 0.0020110984332859516, + "learning_rate": 3.824465748313444e-06, + "loss": 0.0215, + "step": 171920 + }, + { + "epoch": 37.006163678708766, + "grad_norm": 9.318732261657715, + "learning_rate": 3.821456727268351e-06, + "loss": 0.0115, + "step": 171930 + }, + { + "epoch": 37.00621784108758, + "grad_norm": 0.9520605206489563, + "learning_rate": 3.818447706223257e-06, + "loss": 0.0087, + "step": 171940 + }, + { + "epoch": 37.00627200346639, + "grad_norm": 2.4970967769622803, + "learning_rate": 3.8154386851781645e-06, + "loss": 0.0333, + "step": 171950 + }, + { + "epoch": 37.0063261658452, + "grad_norm": 0.1234484314918518, + "learning_rate": 3.8124296641330716e-06, + "loss": 0.0483, + "step": 171960 + }, + { + "epoch": 37.00638032822402, + "grad_norm": 0.001310562714934349, + "learning_rate": 3.809420643087978e-06, + "loss": 0.0275, + "step": 171970 + }, + { + "epoch": 37.00643449060283, + "grad_norm": 0.0015332839684560895, + "learning_rate": 3.8064116220428845e-06, + "loss": 0.1411, + "step": 171980 + }, + { + "epoch": 37.00648865298164, + "grad_norm": 0.0105024054646492, + "learning_rate": 3.8034026009977917e-06, + "loss": 0.0277, + "step": 171990 + }, + { + "epoch": 37.00654281536045, + "grad_norm": 0.3739081919193268, + "learning_rate": 3.8003935799526984e-06, + "loss": 0.0021, + "step": 172000 + }, + { + "epoch": 37.006596977739264, + "grad_norm": 0.002066811081022024, + "learning_rate": 3.7973845589076055e-06, + "loss": 0.0299, + "step": 172010 + }, + { + "epoch": 37.00665114011807, + "grad_norm": 0.09071826189756393, + "learning_rate": 3.794375537862512e-06, + "loss": 0.1356, + "step": 172020 + }, + { + "epoch": 37.00670530249688, + "grad_norm": 0.10380946099758148, + "learning_rate": 3.791366516817419e-06, + "loss": 0.0097, + "step": 172030 + }, + { + "epoch": 37.0067594648757, + "grad_norm": 0.00149316038005054, + "learning_rate": 3.788357495772325e-06, + "loss": 0.0123, + "step": 172040 + }, + { + "epoch": 37.00681362725451, + "grad_norm": 0.0112034622579813, + "learning_rate": 3.7853484747272323e-06, + "loss": 0.0305, + "step": 172050 + }, + { + "epoch": 37.00686778963332, + "grad_norm": 0.8269984126091003, + "learning_rate": 3.782339453682139e-06, + "loss": 0.0174, + "step": 172060 + }, + { + "epoch": 37.006921952012135, + "grad_norm": 0.0010425832588225603, + "learning_rate": 3.779330432637046e-06, + "loss": 0.0552, + "step": 172070 + }, + { + "epoch": 37.006976114390945, + "grad_norm": 0.07634668052196503, + "learning_rate": 3.776321411591953e-06, + "loss": 0.0042, + "step": 172080 + }, + { + "epoch": 37.007030276769754, + "grad_norm": 0.0010690378258004785, + "learning_rate": 3.77331239054686e-06, + "loss": 0.0229, + "step": 172090 + }, + { + "epoch": 37.00708443914857, + "grad_norm": 0.0029575808439403772, + "learning_rate": 3.7703033695017666e-06, + "loss": 0.0048, + "step": 172100 + }, + { + "epoch": 37.00713860152738, + 
"grad_norm": 0.1626429408788681, + "learning_rate": 3.7672943484566737e-06, + "loss": 0.0094, + "step": 172110 + }, + { + "epoch": 37.00719276390619, + "grad_norm": 0.0013363469624891877, + "learning_rate": 3.76428532741158e-06, + "loss": 0.0021, + "step": 172120 + }, + { + "epoch": 37.007246926285, + "grad_norm": 0.0008116490207612514, + "learning_rate": 3.7612763063664867e-06, + "loss": 0.0004, + "step": 172130 + }, + { + "epoch": 37.007301088663816, + "grad_norm": 0.005155552178621292, + "learning_rate": 3.758267285321394e-06, + "loss": 0.0076, + "step": 172140 + }, + { + "epoch": 37.007355251042625, + "grad_norm": 0.0027052154764533043, + "learning_rate": 3.7552582642763005e-06, + "loss": 0.0214, + "step": 172150 + }, + { + "epoch": 37.007409413421435, + "grad_norm": 0.001400019507855177, + "learning_rate": 3.7522492432312072e-06, + "loss": 0.0187, + "step": 172160 + }, + { + "epoch": 37.00746357580025, + "grad_norm": 0.0008436494972556829, + "learning_rate": 3.7492402221861144e-06, + "loss": 0.0, + "step": 172170 + }, + { + "epoch": 37.00751773817906, + "grad_norm": 0.0013486959505826235, + "learning_rate": 3.746231201141021e-06, + "loss": 0.0123, + "step": 172180 + }, + { + "epoch": 37.00757190055787, + "grad_norm": 0.003307839622721076, + "learning_rate": 3.7432221800959273e-06, + "loss": 0.0291, + "step": 172190 + }, + { + "epoch": 37.00762606293669, + "grad_norm": 0.0008322742651216686, + "learning_rate": 3.7402131590508344e-06, + "loss": 0.0165, + "step": 172200 + }, + { + "epoch": 37.0076802253155, + "grad_norm": 0.00133352424018085, + "learning_rate": 3.737204138005741e-06, + "loss": 0.0245, + "step": 172210 + }, + { + "epoch": 37.007734387694306, + "grad_norm": 0.002216955181211233, + "learning_rate": 3.7341951169606483e-06, + "loss": 0.0159, + "step": 172220 + }, + { + "epoch": 37.00778855007312, + "grad_norm": 0.0055941324681043625, + "learning_rate": 3.731186095915555e-06, + "loss": 0.0006, + "step": 172230 + }, + { + "epoch": 37.00784271245193, + "grad_norm": 0.0008066186564974487, + "learning_rate": 3.728177074870462e-06, + "loss": 0.0082, + "step": 172240 + }, + { + "epoch": 37.00789687483074, + "grad_norm": 0.0013830432435497642, + "learning_rate": 3.7251680538253688e-06, + "loss": 0.0433, + "step": 172250 + }, + { + "epoch": 37.00795103720955, + "grad_norm": 0.0010948135750368237, + "learning_rate": 3.722159032780276e-06, + "loss": 0.0313, + "step": 172260 + }, + { + "epoch": 37.00800519958837, + "grad_norm": 0.0015288321301341057, + "learning_rate": 3.719150011735182e-06, + "loss": 0.0339, + "step": 172270 + }, + { + "epoch": 37.00805936196718, + "grad_norm": 0.0008047224255278707, + "learning_rate": 3.716140990690089e-06, + "loss": 0.0131, + "step": 172280 + }, + { + "epoch": 37.00811352434599, + "grad_norm": 0.0009002590086311102, + "learning_rate": 3.713131969644996e-06, + "loss": 0.0051, + "step": 172290 + }, + { + "epoch": 37.0081676867248, + "grad_norm": 0.0008147255284711719, + "learning_rate": 3.7101229485999027e-06, + "loss": 0.0332, + "step": 172300 + }, + { + "epoch": 37.00822184910361, + "grad_norm": 0.4627190828323364, + "learning_rate": 3.7071139275548094e-06, + "loss": 0.0145, + "step": 172310 + }, + { + "epoch": 37.00827601148242, + "grad_norm": 3.999107599258423, + "learning_rate": 3.7041049065097165e-06, + "loss": 0.0776, + "step": 172320 + }, + { + "epoch": 37.00833017386124, + "grad_norm": 0.000817427528090775, + "learning_rate": 3.7010958854646232e-06, + "loss": 0.0308, + "step": 172330 + }, + { + "epoch": 37.00838433624005, + "grad_norm": 
0.001403569127433002, + "learning_rate": 3.6980868644195295e-06, + "loss": 0.0716, + "step": 172340 + }, + { + "epoch": 37.00843849861886, + "grad_norm": 1.1369343996047974, + "learning_rate": 3.6950778433744366e-06, + "loss": 0.0575, + "step": 172350 + }, + { + "epoch": 37.00849266099767, + "grad_norm": 0.0021056472323834896, + "learning_rate": 3.6920688223293433e-06, + "loss": 0.0402, + "step": 172360 + }, + { + "epoch": 37.008546823376484, + "grad_norm": 0.0008079509134404361, + "learning_rate": 3.6890598012842504e-06, + "loss": 0.0908, + "step": 172370 + }, + { + "epoch": 37.008600985755294, + "grad_norm": 0.0008054193458519876, + "learning_rate": 3.686050780239157e-06, + "loss": 0.011, + "step": 172380 + }, + { + "epoch": 37.0086551481341, + "grad_norm": 0.001364331692457199, + "learning_rate": 3.6830417591940643e-06, + "loss": 0.0394, + "step": 172390 + }, + { + "epoch": 37.00870931051292, + "grad_norm": 0.0008030548924580216, + "learning_rate": 3.680032738148971e-06, + "loss": 0.0399, + "step": 172400 + }, + { + "epoch": 37.00876347289173, + "grad_norm": 0.9041386246681213, + "learning_rate": 3.677023717103878e-06, + "loss": 0.0369, + "step": 172410 + }, + { + "epoch": 37.00881763527054, + "grad_norm": 0.0008840254158712924, + "learning_rate": 3.6740146960587843e-06, + "loss": 0.0275, + "step": 172420 + }, + { + "epoch": 37.008871797649356, + "grad_norm": 1.304887056350708, + "learning_rate": 3.671005675013691e-06, + "loss": 0.026, + "step": 172430 + }, + { + "epoch": 37.008925960028165, + "grad_norm": 0.0010367860086262226, + "learning_rate": 3.667996653968598e-06, + "loss": 0.0192, + "step": 172440 + }, + { + "epoch": 37.008980122406975, + "grad_norm": 0.0008215602720156312, + "learning_rate": 3.664987632923505e-06, + "loss": 0.025, + "step": 172450 + }, + { + "epoch": 37.00903428478579, + "grad_norm": 0.001918722060509026, + "learning_rate": 3.6619786118784116e-06, + "loss": 0.0007, + "step": 172460 + }, + { + "epoch": 37.0090884471646, + "grad_norm": 0.006365655921399593, + "learning_rate": 3.6589695908333187e-06, + "loss": 0.0486, + "step": 172470 + }, + { + "epoch": 37.00914260954341, + "grad_norm": 0.0014652099926024675, + "learning_rate": 3.6559605697882254e-06, + "loss": 0.0087, + "step": 172480 + }, + { + "epoch": 37.00919677192222, + "grad_norm": 0.0019172047032043338, + "learning_rate": 3.6529515487431317e-06, + "loss": 0.012, + "step": 172490 + }, + { + "epoch": 37.009250934301036, + "grad_norm": 0.0030651213601231575, + "learning_rate": 3.6499425276980388e-06, + "loss": 0.0002, + "step": 172500 + }, + { + "epoch": 37.009305096679846, + "grad_norm": 0.0010742505546659231, + "learning_rate": 3.6469335066529455e-06, + "loss": 0.0229, + "step": 172510 + }, + { + "epoch": 37.009359259058655, + "grad_norm": 0.0008152774535119534, + "learning_rate": 3.6439244856078526e-06, + "loss": 0.0443, + "step": 172520 + }, + { + "epoch": 37.00941342143747, + "grad_norm": 0.0007919742492958903, + "learning_rate": 3.6409154645627593e-06, + "loss": 0.0312, + "step": 172530 + }, + { + "epoch": 37.00946758381628, + "grad_norm": 0.000980880344286561, + "learning_rate": 3.6379064435176664e-06, + "loss": 0.014, + "step": 172540 + }, + { + "epoch": 37.00952174619509, + "grad_norm": 1.3707735538482666, + "learning_rate": 3.634897422472573e-06, + "loss": 0.0244, + "step": 172550 + }, + { + "epoch": 37.00957590857391, + "grad_norm": 0.03006097488105297, + "learning_rate": 3.6318884014274802e-06, + "loss": 0.0172, + "step": 172560 + }, + { + "epoch": 37.00963007095272, + "grad_norm": 
2.2271740436553955, + "learning_rate": 3.6288793803823865e-06, + "loss": 0.0636, + "step": 172570 + }, + { + "epoch": 37.00968423333153, + "grad_norm": 0.000801712041720748, + "learning_rate": 3.6258703593372932e-06, + "loss": 0.0049, + "step": 172580 + }, + { + "epoch": 37.009738395710336, + "grad_norm": 0.3657362759113312, + "learning_rate": 3.6228613382922e-06, + "loss": 0.0007, + "step": 172590 + }, + { + "epoch": 37.00979255808915, + "grad_norm": 0.003539398545399308, + "learning_rate": 3.619852317247107e-06, + "loss": 0.0148, + "step": 172600 + }, + { + "epoch": 37.00984672046796, + "grad_norm": 0.0007782237371429801, + "learning_rate": 3.6168432962020137e-06, + "loss": 0.0001, + "step": 172610 + }, + { + "epoch": 37.00990088284677, + "grad_norm": 0.001981207402423024, + "learning_rate": 3.613834275156921e-06, + "loss": 0.0, + "step": 172620 + }, + { + "epoch": 37.00995504522559, + "grad_norm": 0.0017840422224253416, + "learning_rate": 3.6108252541118275e-06, + "loss": 0.0116, + "step": 172630 + }, + { + "epoch": 37.0100092076044, + "grad_norm": 0.0012027635239064693, + "learning_rate": 3.607816233066734e-06, + "loss": 0.0112, + "step": 172640 + }, + { + "epoch": 37.01006336998321, + "grad_norm": 0.23208262026309967, + "learning_rate": 3.604807212021641e-06, + "loss": 0.0657, + "step": 172650 + }, + { + "epoch": 37.010117532362024, + "grad_norm": 0.0009864787571132183, + "learning_rate": 3.6017981909765476e-06, + "loss": 0.0369, + "step": 172660 + }, + { + "epoch": 37.01017169474083, + "grad_norm": 0.024833805859088898, + "learning_rate": 3.5987891699314548e-06, + "loss": 0.0257, + "step": 172670 + }, + { + "epoch": 37.01022585711964, + "grad_norm": 0.0009861222933977842, + "learning_rate": 3.5957801488863615e-06, + "loss": 0.0, + "step": 172680 + }, + { + "epoch": 37.01028001949846, + "grad_norm": 0.01818581484258175, + "learning_rate": 3.5927711278412686e-06, + "loss": 0.0221, + "step": 172690 + }, + { + "epoch": 37.01033418187727, + "grad_norm": 0.0014694746350869536, + "learning_rate": 3.5897621067961753e-06, + "loss": 0.0552, + "step": 172700 + }, + { + "epoch": 37.01038834425608, + "grad_norm": 0.00251210224814713, + "learning_rate": 3.5867530857510816e-06, + "loss": 0.0654, + "step": 172710 + }, + { + "epoch": 37.01044250663489, + "grad_norm": 0.09423563629388809, + "learning_rate": 3.5837440647059887e-06, + "loss": 0.0026, + "step": 172720 + }, + { + "epoch": 37.010496669013705, + "grad_norm": 0.7666874527931213, + "learning_rate": 3.5807350436608954e-06, + "loss": 0.0308, + "step": 172730 + }, + { + "epoch": 37.010550831392514, + "grad_norm": 0.0007656699744984508, + "learning_rate": 3.577726022615802e-06, + "loss": 0.0001, + "step": 172740 + }, + { + "epoch": 37.010604993771324, + "grad_norm": 0.0007876867894083261, + "learning_rate": 3.574717001570709e-06, + "loss": 0.027, + "step": 172750 + }, + { + "epoch": 37.01065915615014, + "grad_norm": 0.0460415817797184, + "learning_rate": 3.571707980525616e-06, + "loss": 0.0499, + "step": 172760 + }, + { + "epoch": 37.01071331852895, + "grad_norm": 0.000974314461927861, + "learning_rate": 3.568698959480523e-06, + "loss": 0.0106, + "step": 172770 + }, + { + "epoch": 37.01076748090776, + "grad_norm": 0.0009791998891159892, + "learning_rate": 3.5656899384354297e-06, + "loss": 0.0124, + "step": 172780 + }, + { + "epoch": 37.010821643286576, + "grad_norm": 0.0007978025823831558, + "learning_rate": 3.562680917390336e-06, + "loss": 0.046, + "step": 172790 + }, + { + "epoch": 37.010875805665385, + "grad_norm": 0.001159367267973721, + 
"learning_rate": 3.559671896345243e-06, + "loss": 0.0261, + "step": 172800 + }, + { + "epoch": 37.010929968044195, + "grad_norm": 0.022714901715517044, + "learning_rate": 3.55666287530015e-06, + "loss": 0.056, + "step": 172810 + }, + { + "epoch": 37.01098413042301, + "grad_norm": 0.0009663946111686528, + "learning_rate": 3.553653854255057e-06, + "loss": 0.0048, + "step": 172820 + }, + { + "epoch": 37.01103829280182, + "grad_norm": 1.8604437112808228, + "learning_rate": 3.5506448332099636e-06, + "loss": 0.0344, + "step": 172830 + }, + { + "epoch": 37.01109245518063, + "grad_norm": 0.0008027469739317894, + "learning_rate": 3.5476358121648708e-06, + "loss": 0.0238, + "step": 172840 + }, + { + "epoch": 37.01114661755944, + "grad_norm": 0.020741816610097885, + "learning_rate": 3.5446267911197774e-06, + "loss": 0.0766, + "step": 172850 + }, + { + "epoch": 37.01120077993826, + "grad_norm": 0.003966461401432753, + "learning_rate": 3.5416177700746837e-06, + "loss": 0.0551, + "step": 172860 + }, + { + "epoch": 37.011254942317066, + "grad_norm": 0.008403719402849674, + "learning_rate": 3.538608749029591e-06, + "loss": 0.0245, + "step": 172870 + }, + { + "epoch": 37.011309104695876, + "grad_norm": 0.0010068962583318353, + "learning_rate": 3.5355997279844975e-06, + "loss": 0.0452, + "step": 172880 + }, + { + "epoch": 37.01136326707469, + "grad_norm": 1.0419503450393677, + "learning_rate": 3.5325907069394042e-06, + "loss": 0.0793, + "step": 172890 + }, + { + "epoch": 37.0114174294535, + "grad_norm": 0.010844798758625984, + "learning_rate": 3.5295816858943114e-06, + "loss": 0.0126, + "step": 172900 + }, + { + "epoch": 37.01147159183231, + "grad_norm": 0.0007875915034674108, + "learning_rate": 3.526572664849218e-06, + "loss": 0.0697, + "step": 172910 + }, + { + "epoch": 37.01152575421113, + "grad_norm": 0.0012062144232913852, + "learning_rate": 3.523563643804125e-06, + "loss": 0.0229, + "step": 172920 + }, + { + "epoch": 37.01157991658994, + "grad_norm": 0.43571579456329346, + "learning_rate": 3.520554622759032e-06, + "loss": 0.0885, + "step": 172930 + }, + { + "epoch": 37.01163407896875, + "grad_norm": 0.0012139383470639586, + "learning_rate": 3.517545601713938e-06, + "loss": 0.0064, + "step": 172940 + }, + { + "epoch": 37.01168824134756, + "grad_norm": 6.014416217803955, + "learning_rate": 3.5145365806688453e-06, + "loss": 0.0723, + "step": 172950 + }, + { + "epoch": 37.01174240372637, + "grad_norm": 0.0018927435157820582, + "learning_rate": 3.511527559623752e-06, + "loss": 0.0, + "step": 172960 + }, + { + "epoch": 37.01179656610518, + "grad_norm": 0.3595438301563263, + "learning_rate": 3.508518538578659e-06, + "loss": 0.0063, + "step": 172970 + }, + { + "epoch": 37.01185072848399, + "grad_norm": 0.000872615200933069, + "learning_rate": 3.505509517533566e-06, + "loss": 0.0354, + "step": 172980 + }, + { + "epoch": 37.01190489086281, + "grad_norm": 0.0014717914164066315, + "learning_rate": 3.502500496488473e-06, + "loss": 0.0165, + "step": 172990 + }, + { + "epoch": 37.01195905324162, + "grad_norm": 0.0008188693900592625, + "learning_rate": 3.4994914754433796e-06, + "loss": 0.0446, + "step": 173000 + }, + { + "epoch": 37.01201321562043, + "grad_norm": 0.0009717466309666634, + "learning_rate": 3.496482454398286e-06, + "loss": 0.0058, + "step": 173010 + }, + { + "epoch": 37.012067377999244, + "grad_norm": 3.9715499877929688, + "learning_rate": 3.4934734333531926e-06, + "loss": 0.0352, + "step": 173020 + }, + { + "epoch": 37.012121540378054, + "grad_norm": 0.641735851764679, + "learning_rate": 
3.4904644123080997e-06, + "loss": 0.0786, + "step": 173030 + }, + { + "epoch": 37.01217570275686, + "grad_norm": 0.19199955463409424, + "learning_rate": 3.4874553912630064e-06, + "loss": 0.0479, + "step": 173040 + }, + { + "epoch": 37.01222986513568, + "grad_norm": 0.0007993640028871596, + "learning_rate": 3.4844463702179135e-06, + "loss": 0.0239, + "step": 173050 + }, + { + "epoch": 37.01228402751449, + "grad_norm": 0.0009800513507798314, + "learning_rate": 3.4814373491728202e-06, + "loss": 0.0357, + "step": 173060 + }, + { + "epoch": 37.0123381898933, + "grad_norm": 0.0008131730719469488, + "learning_rate": 3.4784283281277274e-06, + "loss": 0.0065, + "step": 173070 + }, + { + "epoch": 37.01239235227211, + "grad_norm": 3.7443039417266846, + "learning_rate": 3.475419307082634e-06, + "loss": 0.0797, + "step": 173080 + }, + { + "epoch": 37.012446514650925, + "grad_norm": 0.0009878072887659073, + "learning_rate": 3.4724102860375403e-06, + "loss": 0.0085, + "step": 173090 + }, + { + "epoch": 37.012500677029735, + "grad_norm": 0.12547089159488678, + "learning_rate": 3.4694012649924474e-06, + "loss": 0.0099, + "step": 173100 + }, + { + "epoch": 37.012554839408544, + "grad_norm": 0.0008088977192528546, + "learning_rate": 3.466392243947354e-06, + "loss": 0.154, + "step": 173110 + }, + { + "epoch": 37.01260900178736, + "grad_norm": 0.0009531649993732572, + "learning_rate": 3.4633832229022613e-06, + "loss": 0.0037, + "step": 173120 + }, + { + "epoch": 37.01266316416617, + "grad_norm": 0.984161913394928, + "learning_rate": 3.460374201857168e-06, + "loss": 0.0277, + "step": 173130 + }, + { + "epoch": 37.01271732654498, + "grad_norm": 0.0011294211726635695, + "learning_rate": 3.457365180812075e-06, + "loss": 0.0165, + "step": 173140 + }, + { + "epoch": 37.012771488923796, + "grad_norm": 1.797083854675293, + "learning_rate": 3.4543561597669818e-06, + "loss": 0.032, + "step": 173150 + }, + { + "epoch": 37.012825651302606, + "grad_norm": 0.001003924640826881, + "learning_rate": 3.451347138721888e-06, + "loss": 0.085, + "step": 173160 + }, + { + "epoch": 37.012879813681415, + "grad_norm": 2.676452398300171, + "learning_rate": 3.4483381176767948e-06, + "loss": 0.0511, + "step": 173170 + }, + { + "epoch": 37.01293397606023, + "grad_norm": 0.6699644327163696, + "learning_rate": 3.445329096631702e-06, + "loss": 0.0135, + "step": 173180 + }, + { + "epoch": 37.01298813843904, + "grad_norm": 0.047771748155355453, + "learning_rate": 3.4423200755866086e-06, + "loss": 0.0019, + "step": 173190 + }, + { + "epoch": 37.01304230081785, + "grad_norm": 0.11197495460510254, + "learning_rate": 3.4393110545415157e-06, + "loss": 0.0005, + "step": 173200 + }, + { + "epoch": 37.01309646319666, + "grad_norm": 0.0008341282373294234, + "learning_rate": 3.4363020334964224e-06, + "loss": 0.0009, + "step": 173210 + }, + { + "epoch": 37.01315062557548, + "grad_norm": 0.0007914028246887028, + "learning_rate": 3.4332930124513295e-06, + "loss": 0.0222, + "step": 173220 + }, + { + "epoch": 37.01320478795429, + "grad_norm": 0.03622245416045189, + "learning_rate": 3.4302839914062362e-06, + "loss": 0.0082, + "step": 173230 + }, + { + "epoch": 37.013258950333096, + "grad_norm": 0.7290205955505371, + "learning_rate": 3.4272749703611425e-06, + "loss": 0.0166, + "step": 173240 + }, + { + "epoch": 37.01331311271191, + "grad_norm": 0.0008272305130958557, + "learning_rate": 3.4242659493160496e-06, + "loss": 0.0549, + "step": 173250 + }, + { + "epoch": 37.01336727509072, + "grad_norm": 0.001145363668911159, + "learning_rate": 3.4212569282709563e-06, 
+ "loss": 0.0307, + "step": 173260 + }, + { + "epoch": 37.01342143746953, + "grad_norm": 0.0009977082954719663, + "learning_rate": 3.4182479072258634e-06, + "loss": 0.0117, + "step": 173270 + }, + { + "epoch": 37.01347559984835, + "grad_norm": 0.06419260054826736, + "learning_rate": 3.41523888618077e-06, + "loss": 0.0698, + "step": 173280 + }, + { + "epoch": 37.01352976222716, + "grad_norm": 0.0010136171476915479, + "learning_rate": 3.4122298651356773e-06, + "loss": 0.1924, + "step": 173290 + }, + { + "epoch": 37.01358392460597, + "grad_norm": 0.000798017717897892, + "learning_rate": 3.409220844090584e-06, + "loss": 0.0638, + "step": 173300 + }, + { + "epoch": 37.01363808698478, + "grad_norm": 51.718101501464844, + "learning_rate": 3.4062118230454902e-06, + "loss": 0.0187, + "step": 173310 + }, + { + "epoch": 37.013692249363594, + "grad_norm": 0.0007860148325562477, + "learning_rate": 3.403202802000397e-06, + "loss": 0.021, + "step": 173320 + }, + { + "epoch": 37.0137464117424, + "grad_norm": 0.0017459368100389838, + "learning_rate": 3.400193780955304e-06, + "loss": 0.0288, + "step": 173330 + }, + { + "epoch": 37.01380057412121, + "grad_norm": 0.0008745508384890854, + "learning_rate": 3.3971847599102107e-06, + "loss": 0.004, + "step": 173340 + }, + { + "epoch": 37.01385473650003, + "grad_norm": 0.001053200918249786, + "learning_rate": 3.394175738865118e-06, + "loss": 0.0064, + "step": 173350 + }, + { + "epoch": 37.01390889887884, + "grad_norm": 0.42431411147117615, + "learning_rate": 3.3911667178200246e-06, + "loss": 0.0074, + "step": 173360 + }, + { + "epoch": 37.01396306125765, + "grad_norm": 1.6579197645187378, + "learning_rate": 3.3881576967749317e-06, + "loss": 0.0829, + "step": 173370 + }, + { + "epoch": 37.014017223636465, + "grad_norm": 0.0025498438626527786, + "learning_rate": 3.3851486757298384e-06, + "loss": 0.0109, + "step": 173380 + }, + { + "epoch": 37.014071386015274, + "grad_norm": 0.0007798484875820577, + "learning_rate": 3.3821396546847447e-06, + "loss": 0.0217, + "step": 173390 + }, + { + "epoch": 37.014125548394084, + "grad_norm": 2.5608503818511963, + "learning_rate": 3.3791306336396518e-06, + "loss": 0.14, + "step": 173400 + }, + { + "epoch": 37.0141797107729, + "grad_norm": 0.0008004001574590802, + "learning_rate": 3.3761216125945585e-06, + "loss": 0.0025, + "step": 173410 + }, + { + "epoch": 37.01423387315171, + "grad_norm": 0.7652400732040405, + "learning_rate": 3.3731125915494656e-06, + "loss": 0.0055, + "step": 173420 + }, + { + "epoch": 37.01428803553052, + "grad_norm": 0.0036184380296617746, + "learning_rate": 3.3701035705043723e-06, + "loss": 0.0137, + "step": 173430 + }, + { + "epoch": 37.01434219790933, + "grad_norm": 0.0009987939847633243, + "learning_rate": 3.3670945494592794e-06, + "loss": 0.0361, + "step": 173440 + }, + { + "epoch": 37.014396360288146, + "grad_norm": 0.1178588718175888, + "learning_rate": 3.364085528414186e-06, + "loss": 0.0312, + "step": 173450 + }, + { + "epoch": 37.014450522666955, + "grad_norm": 2.0147957801818848, + "learning_rate": 3.3610765073690924e-06, + "loss": 0.0757, + "step": 173460 + }, + { + "epoch": 37.014504685045765, + "grad_norm": 0.8322107195854187, + "learning_rate": 3.358067486323999e-06, + "loss": 0.1332, + "step": 173470 + }, + { + "epoch": 37.01455884742458, + "grad_norm": 0.0008036931976675987, + "learning_rate": 3.355058465278906e-06, + "loss": 0.0, + "step": 173480 + }, + { + "epoch": 37.01461300980339, + "grad_norm": 0.18104244768619537, + "learning_rate": 3.352049444233813e-06, + "loss": 0.0527, + "step": 
173490 + }, + { + "epoch": 37.0146671721822, + "grad_norm": 0.43407732248306274, + "learning_rate": 3.34904042318872e-06, + "loss": 0.0315, + "step": 173500 + }, + { + "epoch": 37.01472133456102, + "grad_norm": 0.0008004895062185824, + "learning_rate": 3.3460314021436267e-06, + "loss": 0.0, + "step": 173510 + }, + { + "epoch": 37.014775496939826, + "grad_norm": 0.2632681727409363, + "learning_rate": 3.343022381098534e-06, + "loss": 0.0007, + "step": 173520 + }, + { + "epoch": 37.014829659318636, + "grad_norm": 0.0008015421335585415, + "learning_rate": 3.3400133600534405e-06, + "loss": 0.0294, + "step": 173530 + }, + { + "epoch": 37.01488382169745, + "grad_norm": 0.027004197239875793, + "learning_rate": 3.337004339008347e-06, + "loss": 0.031, + "step": 173540 + }, + { + "epoch": 37.01493798407626, + "grad_norm": 0.25581467151641846, + "learning_rate": 3.333995317963254e-06, + "loss": 0.0032, + "step": 173550 + }, + { + "epoch": 37.01499214645507, + "grad_norm": 0.008347421884536743, + "learning_rate": 3.3309862969181606e-06, + "loss": 0.0286, + "step": 173560 + }, + { + "epoch": 37.01504630883388, + "grad_norm": 0.11406821757555008, + "learning_rate": 3.3279772758730678e-06, + "loss": 0.0207, + "step": 173570 + }, + { + "epoch": 37.0151004712127, + "grad_norm": 0.007894986309111118, + "learning_rate": 3.3249682548279745e-06, + "loss": 0.0163, + "step": 173580 + }, + { + "epoch": 37.01515463359151, + "grad_norm": 0.12300459295511246, + "learning_rate": 3.3219592337828816e-06, + "loss": 0.0144, + "step": 173590 + }, + { + "epoch": 37.01520879597032, + "grad_norm": 0.8019542098045349, + "learning_rate": 3.3189502127377883e-06, + "loss": 0.1355, + "step": 173600 + }, + { + "epoch": 37.01526295834913, + "grad_norm": 0.003487761365249753, + "learning_rate": 3.3159411916926946e-06, + "loss": 0.0353, + "step": 173610 + }, + { + "epoch": 37.01531712072794, + "grad_norm": 0.009618058800697327, + "learning_rate": 3.3129321706476013e-06, + "loss": 0.0707, + "step": 173620 + }, + { + "epoch": 37.01537128310675, + "grad_norm": 0.6450120806694031, + "learning_rate": 3.3099231496025084e-06, + "loss": 0.0393, + "step": 173630 + }, + { + "epoch": 37.01542544548557, + "grad_norm": 0.003199404338374734, + "learning_rate": 3.306914128557415e-06, + "loss": 0.0397, + "step": 173640 + }, + { + "epoch": 37.01547960786438, + "grad_norm": 0.00974106602370739, + "learning_rate": 3.303905107512322e-06, + "loss": 0.0102, + "step": 173650 + }, + { + "epoch": 37.01553377024319, + "grad_norm": 0.0010841117473319173, + "learning_rate": 3.300896086467229e-06, + "loss": 0.0031, + "step": 173660 + }, + { + "epoch": 37.015587932622, + "grad_norm": 0.00223208824172616, + "learning_rate": 3.297887065422136e-06, + "loss": 0.0159, + "step": 173670 + }, + { + "epoch": 37.015642095000814, + "grad_norm": 0.0010236933594569564, + "learning_rate": 3.2948780443770427e-06, + "loss": 0.0133, + "step": 173680 + }, + { + "epoch": 37.01569625737962, + "grad_norm": 0.07256293296813965, + "learning_rate": 3.291869023331949e-06, + "loss": 0.0096, + "step": 173690 + }, + { + "epoch": 37.01575041975843, + "grad_norm": 7.669879913330078, + "learning_rate": 3.288860002286856e-06, + "loss": 0.0811, + "step": 173700 + }, + { + "epoch": 37.01580458213725, + "grad_norm": 0.06148591637611389, + "learning_rate": 3.285850981241763e-06, + "loss": 0.0443, + "step": 173710 + }, + { + "epoch": 37.01585874451606, + "grad_norm": 0.0010945204412564635, + "learning_rate": 3.28284196019667e-06, + "loss": 0.0365, + "step": 173720 + }, + { + "epoch": 
37.01591290689487, + "grad_norm": 0.14992661774158478, + "learning_rate": 3.2798329391515766e-06, + "loss": 0.0007, + "step": 173730 + }, + { + "epoch": 37.015967069273685, + "grad_norm": 0.0007909851847216487, + "learning_rate": 3.2768239181064837e-06, + "loss": 0.0186, + "step": 173740 + }, + { + "epoch": 37.016021231652495, + "grad_norm": 0.006903079804033041, + "learning_rate": 3.2738148970613904e-06, + "loss": 0.0207, + "step": 173750 + }, + { + "epoch": 37.016075394031304, + "grad_norm": 0.6642659902572632, + "learning_rate": 3.2708058760162967e-06, + "loss": 0.0687, + "step": 173760 + }, + { + "epoch": 37.01612955641012, + "grad_norm": 1.1985019445419312, + "learning_rate": 3.2677968549712034e-06, + "loss": 0.0233, + "step": 173770 + }, + { + "epoch": 37.01618371878893, + "grad_norm": 0.0007955724140629172, + "learning_rate": 3.2647878339261105e-06, + "loss": 0.0203, + "step": 173780 + }, + { + "epoch": 37.01623788116774, + "grad_norm": 0.008441833779215813, + "learning_rate": 3.2617788128810172e-06, + "loss": 0.028, + "step": 173790 + }, + { + "epoch": 37.01629204354655, + "grad_norm": 0.0008494533249177039, + "learning_rate": 3.2587697918359244e-06, + "loss": 0.0654, + "step": 173800 + }, + { + "epoch": 37.016346205925366, + "grad_norm": 0.16885846853256226, + "learning_rate": 3.255760770790831e-06, + "loss": 0.0643, + "step": 173810 + }, + { + "epoch": 37.016400368304176, + "grad_norm": 1.014456868171692, + "learning_rate": 3.252751749745738e-06, + "loss": 0.0387, + "step": 173820 + }, + { + "epoch": 37.016454530682985, + "grad_norm": 0.4583532214164734, + "learning_rate": 3.249742728700645e-06, + "loss": 0.044, + "step": 173830 + }, + { + "epoch": 37.0165086930618, + "grad_norm": 1.1773282289505005, + "learning_rate": 3.246733707655551e-06, + "loss": 0.0122, + "step": 173840 + }, + { + "epoch": 37.01656285544061, + "grad_norm": 1.7233169078826904, + "learning_rate": 3.2437246866104583e-06, + "loss": 0.06, + "step": 173850 + }, + { + "epoch": 37.01661701781942, + "grad_norm": 0.0026838320773094893, + "learning_rate": 3.240715665565365e-06, + "loss": 0.0002, + "step": 173860 + }, + { + "epoch": 37.01667118019824, + "grad_norm": 0.027719080448150635, + "learning_rate": 3.237706644520272e-06, + "loss": 0.0402, + "step": 173870 + }, + { + "epoch": 37.01672534257705, + "grad_norm": 1.8892462253570557, + "learning_rate": 3.234697623475179e-06, + "loss": 0.0345, + "step": 173880 + }, + { + "epoch": 37.016779504955856, + "grad_norm": 0.0008020716486498713, + "learning_rate": 3.231688602430086e-06, + "loss": 0.0155, + "step": 173890 + }, + { + "epoch": 37.016833667334666, + "grad_norm": 1.1904584169387817, + "learning_rate": 3.2286795813849926e-06, + "loss": 0.0066, + "step": 173900 + }, + { + "epoch": 37.01688782971348, + "grad_norm": 0.0008101448765955865, + "learning_rate": 3.225670560339899e-06, + "loss": 0.0131, + "step": 173910 + }, + { + "epoch": 37.01694199209229, + "grad_norm": 1.070783019065857, + "learning_rate": 3.2226615392948056e-06, + "loss": 0.0227, + "step": 173920 + }, + { + "epoch": 37.0169961544711, + "grad_norm": 3.193230152130127, + "learning_rate": 3.2196525182497127e-06, + "loss": 0.0221, + "step": 173930 + }, + { + "epoch": 37.01705031684992, + "grad_norm": 0.0011546919122338295, + "learning_rate": 3.2166434972046194e-06, + "loss": 0.0392, + "step": 173940 + }, + { + "epoch": 37.01710447922873, + "grad_norm": 0.0007943938835524023, + "learning_rate": 3.2136344761595265e-06, + "loss": 0.0023, + "step": 173950 + }, + { + "epoch": 37.01715864160754, + "grad_norm": 
0.0009139537578448653, + "learning_rate": 3.2106254551144332e-06, + "loss": 0.0204, + "step": 173960 + }, + { + "epoch": 37.017212803986354, + "grad_norm": 0.0008019606466405094, + "learning_rate": 3.2076164340693403e-06, + "loss": 0.0259, + "step": 173970 + }, + { + "epoch": 37.01726696636516, + "grad_norm": 0.000790986989159137, + "learning_rate": 3.204607413024247e-06, + "loss": 0.0059, + "step": 173980 + }, + { + "epoch": 37.01732112874397, + "grad_norm": 0.0008098819525912404, + "learning_rate": 3.2015983919791533e-06, + "loss": 0.0361, + "step": 173990 + }, + { + "epoch": 37.01737529112279, + "grad_norm": 0.0012999572791159153, + "learning_rate": 3.1985893709340604e-06, + "loss": 0.0311, + "step": 174000 + }, + { + "epoch": 37.0174294535016, + "grad_norm": 0.000797212531324476, + "learning_rate": 3.195580349888967e-06, + "loss": 0.0394, + "step": 174010 + }, + { + "epoch": 37.01748361588041, + "grad_norm": 0.15202556550502777, + "learning_rate": 3.1925713288438743e-06, + "loss": 0.0158, + "step": 174020 + }, + { + "epoch": 37.01753777825922, + "grad_norm": 0.0011196587001904845, + "learning_rate": 3.189562307798781e-06, + "loss": 0.0267, + "step": 174030 + }, + { + "epoch": 37.017591940638034, + "grad_norm": 0.6654215455055237, + "learning_rate": 3.186553286753688e-06, + "loss": 0.0188, + "step": 174040 + }, + { + "epoch": 37.017646103016844, + "grad_norm": 0.0007809897069819272, + "learning_rate": 3.1835442657085948e-06, + "loss": 0.0191, + "step": 174050 + }, + { + "epoch": 37.01770026539565, + "grad_norm": 0.0016836704453453422, + "learning_rate": 3.180535244663501e-06, + "loss": 0.042, + "step": 174060 + }, + { + "epoch": 37.01775442777447, + "grad_norm": 0.001355640823021531, + "learning_rate": 3.1775262236184078e-06, + "loss": 0.1017, + "step": 174070 + }, + { + "epoch": 37.01780859015328, + "grad_norm": 0.0009676503832451999, + "learning_rate": 3.174517202573315e-06, + "loss": 0.0068, + "step": 174080 + }, + { + "epoch": 37.01786275253209, + "grad_norm": 0.000978532712906599, + "learning_rate": 3.1715081815282216e-06, + "loss": 0.0327, + "step": 174090 + }, + { + "epoch": 37.017916914910906, + "grad_norm": 1.9504518508911133, + "learning_rate": 3.1684991604831287e-06, + "loss": 0.1148, + "step": 174100 + }, + { + "epoch": 37.017971077289715, + "grad_norm": 0.0007951490697450936, + "learning_rate": 3.1654901394380354e-06, + "loss": 0.0441, + "step": 174110 + }, + { + "epoch": 37.018025239668525, + "grad_norm": 0.0015237436164170504, + "learning_rate": 3.1624811183929425e-06, + "loss": 0.0149, + "step": 174120 + }, + { + "epoch": 37.01807940204734, + "grad_norm": 0.4416618347167969, + "learning_rate": 3.159472097347849e-06, + "loss": 0.0384, + "step": 174130 + }, + { + "epoch": 37.01813356442615, + "grad_norm": 0.0010041752830147743, + "learning_rate": 3.1564630763027555e-06, + "loss": 0.0331, + "step": 174140 + }, + { + "epoch": 37.01818772680496, + "grad_norm": 0.005744830705225468, + "learning_rate": 3.1534540552576626e-06, + "loss": 0.1166, + "step": 174150 + }, + { + "epoch": 37.01824188918377, + "grad_norm": 0.002877640537917614, + "learning_rate": 3.1504450342125693e-06, + "loss": 0.0107, + "step": 174160 + }, + { + "epoch": 37.01829605156259, + "grad_norm": 0.9672771692276001, + "learning_rate": 3.1474360131674764e-06, + "loss": 0.0478, + "step": 174170 + }, + { + "epoch": 37.018350213941396, + "grad_norm": 0.002261520829051733, + "learning_rate": 3.144426992122383e-06, + "loss": 0.0002, + "step": 174180 + }, + { + "epoch": 37.018404376320206, + "grad_norm": 
0.0007868159445933998, + "learning_rate": 3.1414179710772902e-06, + "loss": 0.0954, + "step": 174190 + }, + { + "epoch": 37.01845853869902, + "grad_norm": 0.10857972502708435, + "learning_rate": 3.138408950032197e-06, + "loss": 0.0229, + "step": 174200 + }, + { + "epoch": 37.01851270107783, + "grad_norm": 0.0007989879231899977, + "learning_rate": 3.1353999289871032e-06, + "loss": 0.0004, + "step": 174210 + }, + { + "epoch": 37.01856686345664, + "grad_norm": 0.0008188518695533276, + "learning_rate": 3.13239090794201e-06, + "loss": 0.0214, + "step": 174220 + }, + { + "epoch": 37.01862102583546, + "grad_norm": 0.0010310686193406582, + "learning_rate": 3.129381886896917e-06, + "loss": 0.0064, + "step": 174230 + }, + { + "epoch": 37.01867518821427, + "grad_norm": 0.227041095495224, + "learning_rate": 3.1263728658518237e-06, + "loss": 0.0104, + "step": 174240 + }, + { + "epoch": 37.01872935059308, + "grad_norm": 0.0011744614457711577, + "learning_rate": 3.123363844806731e-06, + "loss": 0.0255, + "step": 174250 + }, + { + "epoch": 37.018783512971886, + "grad_norm": 0.3887953758239746, + "learning_rate": 3.1203548237616376e-06, + "loss": 0.0445, + "step": 174260 + }, + { + "epoch": 37.0188376753507, + "grad_norm": 0.0008076582453213632, + "learning_rate": 3.1173458027165443e-06, + "loss": 0.0942, + "step": 174270 + }, + { + "epoch": 37.01889183772951, + "grad_norm": 0.000811546400655061, + "learning_rate": 3.1143367816714514e-06, + "loss": 0.0082, + "step": 174280 + }, + { + "epoch": 37.01894600010832, + "grad_norm": 0.0011025136336684227, + "learning_rate": 3.111327760626358e-06, + "loss": 0.1136, + "step": 174290 + }, + { + "epoch": 37.01900016248714, + "grad_norm": 0.008286464028060436, + "learning_rate": 3.1083187395812648e-06, + "loss": 0.0681, + "step": 174300 + }, + { + "epoch": 37.01905432486595, + "grad_norm": 0.001647386234253645, + "learning_rate": 3.1053097185361715e-06, + "loss": 0.1033, + "step": 174310 + }, + { + "epoch": 37.01910848724476, + "grad_norm": 0.0016092165606096387, + "learning_rate": 3.1023006974910786e-06, + "loss": 0.0844, + "step": 174320 + }, + { + "epoch": 37.019162649623574, + "grad_norm": 0.001295560272410512, + "learning_rate": 3.0992916764459853e-06, + "loss": 0.0011, + "step": 174330 + }, + { + "epoch": 37.019216812002384, + "grad_norm": 0.0008254917920567095, + "learning_rate": 3.096282655400892e-06, + "loss": 0.0188, + "step": 174340 + }, + { + "epoch": 37.01927097438119, + "grad_norm": 0.9644960761070251, + "learning_rate": 3.0932736343557987e-06, + "loss": 0.0261, + "step": 174350 + }, + { + "epoch": 37.01932513676001, + "grad_norm": 0.002110442379489541, + "learning_rate": 3.090264613310706e-06, + "loss": 0.001, + "step": 174360 + }, + { + "epoch": 37.01937929913882, + "grad_norm": 0.0010450744302943349, + "learning_rate": 3.0872555922656125e-06, + "loss": 0.004, + "step": 174370 + }, + { + "epoch": 37.01943346151763, + "grad_norm": 0.0010524718090891838, + "learning_rate": 3.084246571220519e-06, + "loss": 0.0287, + "step": 174380 + }, + { + "epoch": 37.01948762389644, + "grad_norm": 0.0008308691903948784, + "learning_rate": 3.081237550175426e-06, + "loss": 0.0033, + "step": 174390 + }, + { + "epoch": 37.019541786275255, + "grad_norm": 0.0023798546753823757, + "learning_rate": 3.078228529130333e-06, + "loss": 0.0138, + "step": 174400 + }, + { + "epoch": 37.019595948654064, + "grad_norm": 0.011534069664776325, + "learning_rate": 3.0752195080852397e-06, + "loss": 0.0554, + "step": 174410 + }, + { + "epoch": 37.019650111032874, + "grad_norm": 
0.0008002524264156818, + "learning_rate": 3.0722104870401464e-06, + "loss": 0.012, + "step": 174420 + }, + { + "epoch": 37.01970427341169, + "grad_norm": 0.7738936543464661, + "learning_rate": 3.069201465995053e-06, + "loss": 0.0262, + "step": 174430 + }, + { + "epoch": 37.0197584357905, + "grad_norm": 0.9281009435653687, + "learning_rate": 3.0661924449499602e-06, + "loss": 0.0461, + "step": 174440 + }, + { + "epoch": 37.01981259816931, + "grad_norm": 0.14230620861053467, + "learning_rate": 3.063183423904867e-06, + "loss": 0.0045, + "step": 174450 + }, + { + "epoch": 37.019866760548126, + "grad_norm": 0.9205911755561829, + "learning_rate": 3.0601744028597736e-06, + "loss": 0.0366, + "step": 174460 + }, + { + "epoch": 37.019920922926936, + "grad_norm": 0.0015527260256931186, + "learning_rate": 3.0571653818146808e-06, + "loss": 0.005, + "step": 174470 + }, + { + "epoch": 37.019975085305745, + "grad_norm": 0.0019788218196481466, + "learning_rate": 3.0541563607695875e-06, + "loss": 0.0361, + "step": 174480 + }, + { + "epoch": 37.02002924768456, + "grad_norm": 0.3898187279701233, + "learning_rate": 3.051147339724494e-06, + "loss": 0.0572, + "step": 174490 + }, + { + "epoch": 37.02008341006337, + "grad_norm": 0.0009678194182924926, + "learning_rate": 3.048138318679401e-06, + "loss": 0.0281, + "step": 174500 + }, + { + "epoch": 37.02013757244218, + "grad_norm": 0.0007764500332996249, + "learning_rate": 3.045129297634308e-06, + "loss": 0.0005, + "step": 174510 + }, + { + "epoch": 37.02019173482099, + "grad_norm": 0.007058287505060434, + "learning_rate": 3.0421202765892147e-06, + "loss": 0.001, + "step": 174520 + }, + { + "epoch": 37.02024589719981, + "grad_norm": 0.0025309566408395767, + "learning_rate": 3.0391112555441214e-06, + "loss": 0.0236, + "step": 174530 + }, + { + "epoch": 37.02030005957862, + "grad_norm": 0.0007868865504860878, + "learning_rate": 3.036102234499028e-06, + "loss": 0.0, + "step": 174540 + }, + { + "epoch": 37.020354221957426, + "grad_norm": 0.0009179339976981282, + "learning_rate": 3.033093213453935e-06, + "loss": 0.0058, + "step": 174550 + }, + { + "epoch": 37.02040838433624, + "grad_norm": 0.6436759829521179, + "learning_rate": 3.030084192408842e-06, + "loss": 0.0066, + "step": 174560 + }, + { + "epoch": 37.02046254671505, + "grad_norm": 1.5225023031234741, + "learning_rate": 3.0270751713637486e-06, + "loss": 0.0249, + "step": 174570 + }, + { + "epoch": 37.02051670909386, + "grad_norm": 0.0012103549670428038, + "learning_rate": 3.0240661503186553e-06, + "loss": 0.0093, + "step": 174580 + }, + { + "epoch": 37.02057087147268, + "grad_norm": 0.0007777851424179971, + "learning_rate": 3.0210571292735624e-06, + "loss": 0.0141, + "step": 174590 + }, + { + "epoch": 37.02062503385149, + "grad_norm": 1.0381678342819214, + "learning_rate": 3.018048108228469e-06, + "loss": 0.0354, + "step": 174600 + }, + { + "epoch": 37.0206791962303, + "grad_norm": 0.0018702477682381868, + "learning_rate": 3.015039087183376e-06, + "loss": 0.0114, + "step": 174610 + }, + { + "epoch": 37.02073335860911, + "grad_norm": 1.0429871082305908, + "learning_rate": 3.012030066138283e-06, + "loss": 0.0979, + "step": 174620 + }, + { + "epoch": 37.02078752098792, + "grad_norm": 0.0872407779097557, + "learning_rate": 3.0090210450931896e-06, + "loss": 0.0438, + "step": 174630 + }, + { + "epoch": 37.02084168336673, + "grad_norm": 0.12576685845851898, + "learning_rate": 3.0060120240480963e-06, + "loss": 0.0281, + "step": 174640 + }, + { + "epoch": 37.02089584574554, + "grad_norm": 0.0018530018860474229, + 
"learning_rate": 3.003003003003003e-06, + "loss": 0.0007, + "step": 174650 + }, + { + "epoch": 37.02095000812436, + "grad_norm": 0.0010143982945010066, + "learning_rate": 2.99999398195791e-06, + "loss": 0.0488, + "step": 174660 + }, + { + "epoch": 37.02100417050317, + "grad_norm": 0.0010043344227597117, + "learning_rate": 2.996984960912817e-06, + "loss": 0.0732, + "step": 174670 + }, + { + "epoch": 37.02105833288198, + "grad_norm": 0.0025827332865446806, + "learning_rate": 2.9939759398677235e-06, + "loss": 0.0127, + "step": 174680 + }, + { + "epoch": 37.021112495260795, + "grad_norm": 0.010429893620312214, + "learning_rate": 2.9909669188226302e-06, + "loss": 0.0291, + "step": 174690 + }, + { + "epoch": 37.021166657639604, + "grad_norm": 0.0009890939109027386, + "learning_rate": 2.9879578977775374e-06, + "loss": 0.0221, + "step": 174700 + }, + { + "epoch": 37.021220820018414, + "grad_norm": 0.001057092915289104, + "learning_rate": 2.984948876732444e-06, + "loss": 0.0167, + "step": 174710 + }, + { + "epoch": 37.02127498239723, + "grad_norm": 0.007075261790305376, + "learning_rate": 2.9819398556873508e-06, + "loss": 0.0149, + "step": 174720 + }, + { + "epoch": 37.02132914477604, + "grad_norm": 0.04499290883541107, + "learning_rate": 2.9789308346422575e-06, + "loss": 0.0301, + "step": 174730 + }, + { + "epoch": 37.02138330715485, + "grad_norm": 0.0008615755941718817, + "learning_rate": 2.9759218135971646e-06, + "loss": 0.1007, + "step": 174740 + }, + { + "epoch": 37.02143746953366, + "grad_norm": 0.0013132239691913128, + "learning_rate": 2.9729127925520713e-06, + "loss": 0.057, + "step": 174750 + }, + { + "epoch": 37.021491631912475, + "grad_norm": 1.0818828344345093, + "learning_rate": 2.969903771506978e-06, + "loss": 0.0213, + "step": 174760 + }, + { + "epoch": 37.021545794291285, + "grad_norm": 0.0007744014728814363, + "learning_rate": 2.966894750461885e-06, + "loss": 0.0064, + "step": 174770 + }, + { + "epoch": 37.021599956670094, + "grad_norm": 0.001109801814891398, + "learning_rate": 2.963885729416792e-06, + "loss": 0.0626, + "step": 174780 + }, + { + "epoch": 37.02165411904891, + "grad_norm": 0.0009812087519094348, + "learning_rate": 2.9608767083716985e-06, + "loss": 0.0059, + "step": 174790 + }, + { + "epoch": 37.02170828142772, + "grad_norm": 0.0020680446177721024, + "learning_rate": 2.957867687326605e-06, + "loss": 0.0136, + "step": 174800 + }, + { + "epoch": 37.02176244380653, + "grad_norm": 0.0045153312385082245, + "learning_rate": 2.9548586662815123e-06, + "loss": 0.0117, + "step": 174810 + }, + { + "epoch": 37.02181660618535, + "grad_norm": 0.0010170076275244355, + "learning_rate": 2.951849645236419e-06, + "loss": 0.0126, + "step": 174820 + }, + { + "epoch": 37.021870768564156, + "grad_norm": 0.0015171548584476113, + "learning_rate": 2.9488406241913257e-06, + "loss": 0.0098, + "step": 174830 + }, + { + "epoch": 37.021924930942966, + "grad_norm": 0.3846544027328491, + "learning_rate": 2.9458316031462324e-06, + "loss": 0.0502, + "step": 174840 + }, + { + "epoch": 37.021979093321775, + "grad_norm": 0.0010927524417638779, + "learning_rate": 2.9428225821011395e-06, + "loss": 0.0063, + "step": 174850 + }, + { + "epoch": 37.02203325570059, + "grad_norm": 0.000789814570453018, + "learning_rate": 2.939813561056046e-06, + "loss": 0.0179, + "step": 174860 + }, + { + "epoch": 37.0220874180794, + "grad_norm": 0.7308169007301331, + "learning_rate": 2.936804540010953e-06, + "loss": 0.0166, + "step": 174870 + }, + { + "epoch": 37.02214158045821, + "grad_norm": 0.0008624049369245768, + 
"learning_rate": 2.9337955189658596e-06, + "loss": 0.0187, + "step": 174880 + }, + { + "epoch": 37.02219574283703, + "grad_norm": 0.07046352326869965, + "learning_rate": 2.9307864979207667e-06, + "loss": 0.0273, + "step": 174890 + }, + { + "epoch": 37.02224990521584, + "grad_norm": 0.0008787093684077263, + "learning_rate": 2.9277774768756734e-06, + "loss": 0.0001, + "step": 174900 + }, + { + "epoch": 37.022304067594646, + "grad_norm": 0.0007617671508342028, + "learning_rate": 2.92476845583058e-06, + "loss": 0.0003, + "step": 174910 + }, + { + "epoch": 37.02235822997346, + "grad_norm": 0.0007972106104716659, + "learning_rate": 2.9217594347854873e-06, + "loss": 0.0527, + "step": 174920 + }, + { + "epoch": 37.02241239235227, + "grad_norm": 1.4265000820159912, + "learning_rate": 2.918750413740394e-06, + "loss": 0.0271, + "step": 174930 + }, + { + "epoch": 37.02246655473108, + "grad_norm": 0.007458465173840523, + "learning_rate": 2.9157413926953007e-06, + "loss": 0.0866, + "step": 174940 + }, + { + "epoch": 37.0225207171099, + "grad_norm": 0.000830750388558954, + "learning_rate": 2.9127323716502074e-06, + "loss": 0.0273, + "step": 174950 + }, + { + "epoch": 37.02257487948871, + "grad_norm": 0.008290044963359833, + "learning_rate": 2.9097233506051145e-06, + "loss": 0.0626, + "step": 174960 + }, + { + "epoch": 37.02262904186752, + "grad_norm": 0.750956118106842, + "learning_rate": 2.906714329560021e-06, + "loss": 0.009, + "step": 174970 + }, + { + "epoch": 37.02268320424633, + "grad_norm": 0.6467841863632202, + "learning_rate": 2.903705308514928e-06, + "loss": 0.0062, + "step": 174980 + }, + { + "epoch": 37.022737366625144, + "grad_norm": 0.000989577965810895, + "learning_rate": 2.9006962874698346e-06, + "loss": 0.0001, + "step": 174990 + }, + { + "epoch": 37.02279152900395, + "grad_norm": 0.0019461369374766946, + "learning_rate": 2.8976872664247417e-06, + "loss": 0.0055, + "step": 175000 + }, + { + "epoch": 37.02284569138276, + "grad_norm": 0.0007859334000386298, + "learning_rate": 2.894678245379648e-06, + "loss": 0.0138, + "step": 175010 + }, + { + "epoch": 37.02289985376158, + "grad_norm": 0.0013234504731371999, + "learning_rate": 2.891669224334555e-06, + "loss": 0.0149, + "step": 175020 + }, + { + "epoch": 37.02295401614039, + "grad_norm": 0.9733223915100098, + "learning_rate": 2.8886602032894618e-06, + "loss": 0.013, + "step": 175030 + }, + { + "epoch": 37.0230081785192, + "grad_norm": 0.004752746317535639, + "learning_rate": 2.885651182244369e-06, + "loss": 0.0013, + "step": 175040 + }, + { + "epoch": 37.023062340898015, + "grad_norm": 11.966546058654785, + "learning_rate": 2.8826421611992756e-06, + "loss": 0.1374, + "step": 175050 + }, + { + "epoch": 37.023116503276825, + "grad_norm": 0.0007681656279601157, + "learning_rate": 2.8796331401541823e-06, + "loss": 0.0018, + "step": 175060 + }, + { + "epoch": 37.023170665655634, + "grad_norm": 2.216834783554077, + "learning_rate": 2.8766241191090894e-06, + "loss": 0.0492, + "step": 175070 + }, + { + "epoch": 37.02322482803445, + "grad_norm": 1.7434718608856201, + "learning_rate": 2.873615098063996e-06, + "loss": 0.0343, + "step": 175080 + }, + { + "epoch": 37.02327899041326, + "grad_norm": 0.11485763639211655, + "learning_rate": 2.870606077018903e-06, + "loss": 0.0262, + "step": 175090 + }, + { + "epoch": 37.02333315279207, + "grad_norm": 0.0012493554968386889, + "learning_rate": 2.8675970559738095e-06, + "loss": 0.0471, + "step": 175100 + }, + { + "epoch": 37.02338731517088, + "grad_norm": 16.73137664794922, + "learning_rate": 
2.8645880349287166e-06, + "loss": 0.033, + "step": 175110 + }, + { + "epoch": 37.023441477549696, + "grad_norm": 0.000785623793490231, + "learning_rate": 2.861579013883623e-06, + "loss": 0.0074, + "step": 175120 + }, + { + "epoch": 37.023495639928505, + "grad_norm": 1.6296266317367554, + "learning_rate": 2.85856999283853e-06, + "loss": 0.0387, + "step": 175130 + }, + { + "epoch": 37.023549802307315, + "grad_norm": 0.0007695392123423517, + "learning_rate": 2.8555609717934367e-06, + "loss": 0.0244, + "step": 175140 + }, + { + "epoch": 37.02360396468613, + "grad_norm": 0.0007623511482961476, + "learning_rate": 2.852551950748344e-06, + "loss": 0.0485, + "step": 175150 + }, + { + "epoch": 37.02365812706494, + "grad_norm": 0.061076220124959946, + "learning_rate": 2.84954292970325e-06, + "loss": 0.0446, + "step": 175160 + }, + { + "epoch": 37.02371228944375, + "grad_norm": 0.0023232600651681423, + "learning_rate": 2.8465339086581573e-06, + "loss": 0.056, + "step": 175170 + }, + { + "epoch": 37.02376645182257, + "grad_norm": 1.3736720085144043, + "learning_rate": 2.843524887613064e-06, + "loss": 0.0593, + "step": 175180 + }, + { + "epoch": 37.02382061420138, + "grad_norm": 0.003701429581269622, + "learning_rate": 2.840515866567971e-06, + "loss": 0.0503, + "step": 175190 + }, + { + "epoch": 37.023874776580186, + "grad_norm": 0.049868397414684296, + "learning_rate": 2.8375068455228778e-06, + "loss": 0.0278, + "step": 175200 + }, + { + "epoch": 37.023928938958996, + "grad_norm": 0.0015496175037696958, + "learning_rate": 2.8344978244777845e-06, + "loss": 0.0531, + "step": 175210 + }, + { + "epoch": 37.02398310133781, + "grad_norm": 1.0218063592910767, + "learning_rate": 2.8314888034326916e-06, + "loss": 0.0089, + "step": 175220 + }, + { + "epoch": 37.02403726371662, + "grad_norm": 0.0010154821211472154, + "learning_rate": 2.8284797823875983e-06, + "loss": 0.0211, + "step": 175230 + }, + { + "epoch": 37.02409142609543, + "grad_norm": 0.0007811873219907284, + "learning_rate": 2.825470761342505e-06, + "loss": 0.0237, + "step": 175240 + }, + { + "epoch": 37.02414558847425, + "grad_norm": 0.0008002938120625913, + "learning_rate": 2.8224617402974117e-06, + "loss": 0.0011, + "step": 175250 + }, + { + "epoch": 37.02419975085306, + "grad_norm": 0.030518919229507446, + "learning_rate": 2.819452719252319e-06, + "loss": 0.017, + "step": 175260 + }, + { + "epoch": 37.02425391323187, + "grad_norm": 0.0008389558643102646, + "learning_rate": 2.816443698207225e-06, + "loss": 0.0417, + "step": 175270 + }, + { + "epoch": 37.02430807561068, + "grad_norm": 0.0008297215681523085, + "learning_rate": 2.813434677162132e-06, + "loss": 0.0553, + "step": 175280 + }, + { + "epoch": 37.02436223798949, + "grad_norm": 0.0012156720040366054, + "learning_rate": 2.810425656117039e-06, + "loss": 0.0275, + "step": 175290 + }, + { + "epoch": 37.0244164003683, + "grad_norm": 0.7402788400650024, + "learning_rate": 2.807416635071946e-06, + "loss": 0.0265, + "step": 175300 + }, + { + "epoch": 37.02447056274712, + "grad_norm": 6.466289520263672, + "learning_rate": 2.8044076140268523e-06, + "loss": 0.0189, + "step": 175310 + }, + { + "epoch": 37.02452472512593, + "grad_norm": 0.5319099426269531, + "learning_rate": 2.8013985929817594e-06, + "loss": 0.0011, + "step": 175320 + }, + { + "epoch": 37.02457888750474, + "grad_norm": 0.0010471853893250227, + "learning_rate": 2.798389571936666e-06, + "loss": 0.0136, + "step": 175330 + }, + { + "epoch": 37.02463304988355, + "grad_norm": 0.9425756335258484, + "learning_rate": 2.7953805508915732e-06, + 
"loss": 0.0247, + "step": 175340 + }, + { + "epoch": 37.024687212262364, + "grad_norm": 2.2833588123321533, + "learning_rate": 2.79237152984648e-06, + "loss": 0.038, + "step": 175350 + }, + { + "epoch": 37.024741374641174, + "grad_norm": 0.0009742610855028033, + "learning_rate": 2.7893625088013866e-06, + "loss": 0.0375, + "step": 175360 + }, + { + "epoch": 37.02479553701998, + "grad_norm": 0.0014224672922864556, + "learning_rate": 2.7863534877562938e-06, + "loss": 0.077, + "step": 175370 + }, + { + "epoch": 37.0248496993988, + "grad_norm": 0.0010172293987125158, + "learning_rate": 2.7833444667112005e-06, + "loss": 0.0071, + "step": 175380 + }, + { + "epoch": 37.02490386177761, + "grad_norm": 0.9193557500839233, + "learning_rate": 2.780335445666107e-06, + "loss": 0.0089, + "step": 175390 + }, + { + "epoch": 37.02495802415642, + "grad_norm": 0.001308273640461266, + "learning_rate": 2.777326424621014e-06, + "loss": 0.0086, + "step": 175400 + }, + { + "epoch": 37.02500135405947, + "eval_accuracy": 0.8432397126061397, + "eval_loss": 1.1077842712402344, + "eval_runtime": 116.5017, + "eval_samples_per_second": 26.283, + "eval_steps_per_second": 3.288, + "step": 175408 + }, + { + "epoch": 38.00001083247576, + "grad_norm": 0.0007786155329085886, + "learning_rate": 2.774317403575921e-06, + "loss": 0.02, + "step": 175410 + }, + { + "epoch": 38.000064994854576, + "grad_norm": 0.930684506893158, + "learning_rate": 2.7713083825308272e-06, + "loss": 0.0311, + "step": 175420 + }, + { + "epoch": 38.000119157233385, + "grad_norm": 0.0007814519922249019, + "learning_rate": 2.7682993614857344e-06, + "loss": 0.0317, + "step": 175430 + }, + { + "epoch": 38.000173319612195, + "grad_norm": 0.14165621995925903, + "learning_rate": 2.765290340440641e-06, + "loss": 0.0112, + "step": 175440 + }, + { + "epoch": 38.00022748199101, + "grad_norm": 3.1935837268829346, + "learning_rate": 2.762281319395548e-06, + "loss": 0.0816, + "step": 175450 + }, + { + "epoch": 38.00028164436982, + "grad_norm": 0.02208477258682251, + "learning_rate": 2.7592722983504545e-06, + "loss": 0.0392, + "step": 175460 + }, + { + "epoch": 38.00033580674863, + "grad_norm": 0.0007774619152769446, + "learning_rate": 2.7562632773053616e-06, + "loss": 0.02, + "step": 175470 + }, + { + "epoch": 38.00038996912745, + "grad_norm": 0.7211089134216309, + "learning_rate": 2.7532542562602683e-06, + "loss": 0.0961, + "step": 175480 + }, + { + "epoch": 38.000444131506256, + "grad_norm": 0.06387373059988022, + "learning_rate": 2.7502452352151754e-06, + "loss": 0.0005, + "step": 175490 + }, + { + "epoch": 38.000498293885066, + "grad_norm": 0.0009799296967685223, + "learning_rate": 2.747236214170082e-06, + "loss": 0.0464, + "step": 175500 + }, + { + "epoch": 38.00055245626388, + "grad_norm": 0.0007812293479219079, + "learning_rate": 2.744227193124989e-06, + "loss": 0.0291, + "step": 175510 + }, + { + "epoch": 38.00060661864269, + "grad_norm": 0.005914108827710152, + "learning_rate": 2.741218172079896e-06, + "loss": 0.01, + "step": 175520 + }, + { + "epoch": 38.0006607810215, + "grad_norm": 0.02892201952636242, + "learning_rate": 2.7382091510348026e-06, + "loss": 0.0364, + "step": 175530 + }, + { + "epoch": 38.00071494340031, + "grad_norm": 0.0007835695869289339, + "learning_rate": 2.7352001299897093e-06, + "loss": 0.015, + "step": 175540 + }, + { + "epoch": 38.00076910577913, + "grad_norm": 0.0007840902544558048, + "learning_rate": 2.732191108944616e-06, + "loss": 0.0031, + "step": 175550 + }, + { + "epoch": 38.00082326815794, + "grad_norm": 0.14341682195663452, + 
"learning_rate": 2.729182087899523e-06, + "loss": 0.1309, + "step": 175560 + }, + { + "epoch": 38.00087743053675, + "grad_norm": 0.0010052908910438418, + "learning_rate": 2.7261730668544294e-06, + "loss": 0.0244, + "step": 175570 + }, + { + "epoch": 38.00093159291556, + "grad_norm": 1.0153083801269531, + "learning_rate": 2.7231640458093365e-06, + "loss": 0.0093, + "step": 175580 + }, + { + "epoch": 38.00098575529437, + "grad_norm": 13.613250732421875, + "learning_rate": 2.7201550247642432e-06, + "loss": 0.1171, + "step": 175590 + }, + { + "epoch": 38.00103991767318, + "grad_norm": 0.006049848161637783, + "learning_rate": 2.7171460037191504e-06, + "loss": 0.0226, + "step": 175600 + }, + { + "epoch": 38.001094080052, + "grad_norm": 0.0007952926680445671, + "learning_rate": 2.7141369826740566e-06, + "loss": 0.0021, + "step": 175610 + }, + { + "epoch": 38.00114824243081, + "grad_norm": 0.001437653787434101, + "learning_rate": 2.7111279616289638e-06, + "loss": 0.0534, + "step": 175620 + }, + { + "epoch": 38.00120240480962, + "grad_norm": 0.0014258739538490772, + "learning_rate": 2.7081189405838705e-06, + "loss": 0.0008, + "step": 175630 + }, + { + "epoch": 38.00125656718843, + "grad_norm": 0.01581117883324623, + "learning_rate": 2.7051099195387776e-06, + "loss": 0.0841, + "step": 175640 + }, + { + "epoch": 38.001310729567244, + "grad_norm": 0.0010711019858717918, + "learning_rate": 2.7021008984936843e-06, + "loss": 0.0284, + "step": 175650 + }, + { + "epoch": 38.001364891946054, + "grad_norm": 0.7760922312736511, + "learning_rate": 2.699091877448591e-06, + "loss": 0.0245, + "step": 175660 + }, + { + "epoch": 38.00141905432486, + "grad_norm": 0.08258466422557831, + "learning_rate": 2.696082856403498e-06, + "loss": 0.03, + "step": 175670 + }, + { + "epoch": 38.00147321670368, + "grad_norm": 0.000792363309301436, + "learning_rate": 2.6930738353584048e-06, + "loss": 0.0094, + "step": 175680 + }, + { + "epoch": 38.00152737908249, + "grad_norm": 0.003925940487533808, + "learning_rate": 2.6900648143133115e-06, + "loss": 0.0192, + "step": 175690 + }, + { + "epoch": 38.0015815414613, + "grad_norm": 0.4823477864265442, + "learning_rate": 2.687055793268218e-06, + "loss": 0.0233, + "step": 175700 + }, + { + "epoch": 38.001635703840115, + "grad_norm": 0.0010506336111575365, + "learning_rate": 2.6840467722231253e-06, + "loss": 0.0256, + "step": 175710 + }, + { + "epoch": 38.001689866218925, + "grad_norm": 0.0007972577004693449, + "learning_rate": 2.6810377511780316e-06, + "loss": 0.0478, + "step": 175720 + }, + { + "epoch": 38.001744028597734, + "grad_norm": 0.001050417311489582, + "learning_rate": 2.6780287301329387e-06, + "loss": 0.0161, + "step": 175730 + }, + { + "epoch": 38.00179819097655, + "grad_norm": 0.0030792003963142633, + "learning_rate": 2.6750197090878454e-06, + "loss": 0.0164, + "step": 175740 + }, + { + "epoch": 38.00185235335536, + "grad_norm": 0.0007677156245335937, + "learning_rate": 2.6720106880427525e-06, + "loss": 0.0278, + "step": 175750 + }, + { + "epoch": 38.00190651573417, + "grad_norm": 0.0007849053363315761, + "learning_rate": 2.669001666997659e-06, + "loss": 0.045, + "step": 175760 + }, + { + "epoch": 38.00196067811298, + "grad_norm": 0.000971058092545718, + "learning_rate": 2.665992645952566e-06, + "loss": 0.1153, + "step": 175770 + }, + { + "epoch": 38.002014840491796, + "grad_norm": 0.0010143743129447103, + "learning_rate": 2.6629836249074726e-06, + "loss": 0.0308, + "step": 175780 + }, + { + "epoch": 38.002069002870606, + "grad_norm": 0.4982239902019501, + "learning_rate": 
2.6599746038623797e-06, + "loss": 0.0464, + "step": 175790 + }, + { + "epoch": 38.002123165249415, + "grad_norm": 0.0013455476146191359, + "learning_rate": 2.6569655828172864e-06, + "loss": 0.0051, + "step": 175800 + }, + { + "epoch": 38.00217732762823, + "grad_norm": 0.0014633068349212408, + "learning_rate": 2.653956561772193e-06, + "loss": 0.0526, + "step": 175810 + }, + { + "epoch": 38.00223149000704, + "grad_norm": 0.0089344447478652, + "learning_rate": 2.6509475407271003e-06, + "loss": 0.0095, + "step": 175820 + }, + { + "epoch": 38.00228565238585, + "grad_norm": 0.0029734931886196136, + "learning_rate": 2.6479385196820065e-06, + "loss": 0.0047, + "step": 175830 + }, + { + "epoch": 38.00233981476467, + "grad_norm": 0.0011691506952047348, + "learning_rate": 2.6449294986369137e-06, + "loss": 0.0294, + "step": 175840 + }, + { + "epoch": 38.00239397714348, + "grad_norm": 0.0008556023240089417, + "learning_rate": 2.6419204775918204e-06, + "loss": 0.0034, + "step": 175850 + }, + { + "epoch": 38.002448139522286, + "grad_norm": 0.0007633905042894185, + "learning_rate": 2.6389114565467275e-06, + "loss": 0.0388, + "step": 175860 + }, + { + "epoch": 38.0025023019011, + "grad_norm": 0.3960166275501251, + "learning_rate": 2.6359024355016337e-06, + "loss": 0.0218, + "step": 175870 + }, + { + "epoch": 38.00255646427991, + "grad_norm": 0.41605275869369507, + "learning_rate": 2.632893414456541e-06, + "loss": 0.0078, + "step": 175880 + }, + { + "epoch": 38.00261062665872, + "grad_norm": 0.0007680283160880208, + "learning_rate": 2.6298843934114476e-06, + "loss": 0.0096, + "step": 175890 + }, + { + "epoch": 38.00266478903753, + "grad_norm": 0.026717346161603928, + "learning_rate": 2.6268753723663547e-06, + "loss": 0.0227, + "step": 175900 + }, + { + "epoch": 38.00271895141635, + "grad_norm": 0.0007694211089983582, + "learning_rate": 2.623866351321261e-06, + "loss": 0.0306, + "step": 175910 + }, + { + "epoch": 38.00277311379516, + "grad_norm": 0.0009366144076921046, + "learning_rate": 2.620857330276168e-06, + "loss": 0.0288, + "step": 175920 + }, + { + "epoch": 38.00282727617397, + "grad_norm": 3.485586404800415, + "learning_rate": 2.6178483092310748e-06, + "loss": 0.0425, + "step": 175930 + }, + { + "epoch": 38.002881438552784, + "grad_norm": 0.004872885998338461, + "learning_rate": 2.614839288185982e-06, + "loss": 0.0159, + "step": 175940 + }, + { + "epoch": 38.00293560093159, + "grad_norm": 0.000920544087421149, + "learning_rate": 2.6118302671408886e-06, + "loss": 0.0578, + "step": 175950 + }, + { + "epoch": 38.0029897633104, + "grad_norm": 0.3955715596675873, + "learning_rate": 2.6088212460957953e-06, + "loss": 0.0194, + "step": 175960 + }, + { + "epoch": 38.00304392568922, + "grad_norm": 0.027786336839199066, + "learning_rate": 2.6058122250507024e-06, + "loss": 0.0454, + "step": 175970 + }, + { + "epoch": 38.00309808806803, + "grad_norm": 0.0007716934196650982, + "learning_rate": 2.6028032040056087e-06, + "loss": 0.0174, + "step": 175980 + }, + { + "epoch": 38.00315225044684, + "grad_norm": 0.0008288932149298489, + "learning_rate": 2.599794182960516e-06, + "loss": 0.0082, + "step": 175990 + }, + { + "epoch": 38.00320641282565, + "grad_norm": 0.8739233613014221, + "learning_rate": 2.5967851619154225e-06, + "loss": 0.0052, + "step": 176000 + }, + { + "epoch": 38.003260575204465, + "grad_norm": 0.8396881222724915, + "learning_rate": 2.5937761408703296e-06, + "loss": 0.1177, + "step": 176010 + }, + { + "epoch": 38.003314737583274, + "grad_norm": 0.0013929776614531875, + "learning_rate": 
2.590767119825236e-06, + "loss": 0.0595, + "step": 176020 + }, + { + "epoch": 38.003368899962084, + "grad_norm": 0.7422429323196411, + "learning_rate": 2.587758098780143e-06, + "loss": 0.0383, + "step": 176030 + }, + { + "epoch": 38.0034230623409, + "grad_norm": 0.0015079486183822155, + "learning_rate": 2.5847490777350497e-06, + "loss": 0.0024, + "step": 176040 + }, + { + "epoch": 38.00347722471971, + "grad_norm": 0.026216324418783188, + "learning_rate": 2.581740056689957e-06, + "loss": 0.0001, + "step": 176050 + }, + { + "epoch": 38.00353138709852, + "grad_norm": 0.4187496304512024, + "learning_rate": 2.578731035644863e-06, + "loss": 0.0199, + "step": 176060 + }, + { + "epoch": 38.003585549477336, + "grad_norm": 0.0007849890971556306, + "learning_rate": 2.5757220145997703e-06, + "loss": 0.0053, + "step": 176070 + }, + { + "epoch": 38.003639711856145, + "grad_norm": 0.0007623648853041232, + "learning_rate": 2.572712993554677e-06, + "loss": 0.0084, + "step": 176080 + }, + { + "epoch": 38.003693874234955, + "grad_norm": 0.000784233387093991, + "learning_rate": 2.569703972509584e-06, + "loss": 0.0084, + "step": 176090 + }, + { + "epoch": 38.00374803661377, + "grad_norm": 1.1126450300216675, + "learning_rate": 2.5666949514644908e-06, + "loss": 0.0641, + "step": 176100 + }, + { + "epoch": 38.00380219899258, + "grad_norm": 1.7716728448867798, + "learning_rate": 2.5636859304193975e-06, + "loss": 0.021, + "step": 176110 + }, + { + "epoch": 38.00385636137139, + "grad_norm": 0.0010501571232452989, + "learning_rate": 2.5606769093743046e-06, + "loss": 0.0282, + "step": 176120 + }, + { + "epoch": 38.0039105237502, + "grad_norm": 0.0007641773554496467, + "learning_rate": 2.557667888329211e-06, + "loss": 0.02, + "step": 176130 + }, + { + "epoch": 38.00396468612902, + "grad_norm": 0.6362070441246033, + "learning_rate": 2.554658867284118e-06, + "loss": 0.0396, + "step": 176140 + }, + { + "epoch": 38.004018848507826, + "grad_norm": 0.14229823648929596, + "learning_rate": 2.5516498462390247e-06, + "loss": 0.0633, + "step": 176150 + }, + { + "epoch": 38.004073010886636, + "grad_norm": 0.49298807978630066, + "learning_rate": 2.548640825193932e-06, + "loss": 0.073, + "step": 176160 + }, + { + "epoch": 38.00412717326545, + "grad_norm": 0.020700372755527496, + "learning_rate": 2.545631804148838e-06, + "loss": 0.0317, + "step": 176170 + }, + { + "epoch": 38.00418133564426, + "grad_norm": 0.0012556292349472642, + "learning_rate": 2.542622783103745e-06, + "loss": 0.0086, + "step": 176180 + }, + { + "epoch": 38.00423549802307, + "grad_norm": 0.1794896125793457, + "learning_rate": 2.539613762058652e-06, + "loss": 0.0116, + "step": 176190 + }, + { + "epoch": 38.00428966040189, + "grad_norm": 0.0007598724332638085, + "learning_rate": 2.536604741013559e-06, + "loss": 0.0366, + "step": 176200 + }, + { + "epoch": 38.0043438227807, + "grad_norm": 0.0009793781209737062, + "learning_rate": 2.5335957199684653e-06, + "loss": 0.0379, + "step": 176210 + }, + { + "epoch": 38.00439798515951, + "grad_norm": 3.190422296524048, + "learning_rate": 2.5305866989233724e-06, + "loss": 0.0845, + "step": 176220 + }, + { + "epoch": 38.004452147538316, + "grad_norm": 3.3022329807281494, + "learning_rate": 2.527577677878279e-06, + "loss": 0.0587, + "step": 176230 + }, + { + "epoch": 38.00450630991713, + "grad_norm": 0.0012376009253785014, + "learning_rate": 2.5245686568331862e-06, + "loss": 0.0017, + "step": 176240 + }, + { + "epoch": 38.00456047229594, + "grad_norm": 0.014750936068594456, + "learning_rate": 2.521559635788093e-06, + "loss": 
0.0131, + "step": 176250 + }, + { + "epoch": 38.00461463467475, + "grad_norm": 3.3914923667907715, + "learning_rate": 2.5185506147429996e-06, + "loss": 0.1076, + "step": 176260 + }, + { + "epoch": 38.00466879705357, + "grad_norm": 0.0010377531871199608, + "learning_rate": 2.5155415936979063e-06, + "loss": 0.0163, + "step": 176270 + }, + { + "epoch": 38.00472295943238, + "grad_norm": 0.0007644420256838202, + "learning_rate": 2.512532572652813e-06, + "loss": 0.0047, + "step": 176280 + }, + { + "epoch": 38.00477712181119, + "grad_norm": 0.2715223729610443, + "learning_rate": 2.50952355160772e-06, + "loss": 0.0015, + "step": 176290 + }, + { + "epoch": 38.004831284190004, + "grad_norm": 1.6101514101028442, + "learning_rate": 2.506514530562627e-06, + "loss": 0.0551, + "step": 176300 + }, + { + "epoch": 38.004885446568814, + "grad_norm": 0.002458834322169423, + "learning_rate": 2.503505509517534e-06, + "loss": 0.0817, + "step": 176310 + }, + { + "epoch": 38.00493960894762, + "grad_norm": 0.0011953595094382763, + "learning_rate": 2.5004964884724402e-06, + "loss": 0.0133, + "step": 176320 + }, + { + "epoch": 38.00499377132644, + "grad_norm": 0.053438927978277206, + "learning_rate": 2.4974874674273474e-06, + "loss": 0.0258, + "step": 176330 + }, + { + "epoch": 38.00504793370525, + "grad_norm": 0.0008093796786852181, + "learning_rate": 2.494478446382254e-06, + "loss": 0.077, + "step": 176340 + }, + { + "epoch": 38.00510209608406, + "grad_norm": 0.001289462554268539, + "learning_rate": 2.491469425337161e-06, + "loss": 0.0521, + "step": 176350 + }, + { + "epoch": 38.00515625846287, + "grad_norm": 0.000984587473794818, + "learning_rate": 2.4884604042920675e-06, + "loss": 0.024, + "step": 176360 + }, + { + "epoch": 38.005210420841685, + "grad_norm": 0.7874487042427063, + "learning_rate": 2.4854513832469746e-06, + "loss": 0.0285, + "step": 176370 + }, + { + "epoch": 38.005264583220495, + "grad_norm": 0.0018183381762355566, + "learning_rate": 2.4824423622018813e-06, + "loss": 0.0089, + "step": 176380 + }, + { + "epoch": 38.005318745599304, + "grad_norm": 0.0007786600617691875, + "learning_rate": 2.4794333411567884e-06, + "loss": 0.0311, + "step": 176390 + }, + { + "epoch": 38.00537290797812, + "grad_norm": 0.003780268831178546, + "learning_rate": 2.476424320111695e-06, + "loss": 0.0036, + "step": 176400 + }, + { + "epoch": 38.00542707035693, + "grad_norm": 0.000780914444476366, + "learning_rate": 2.473415299066602e-06, + "loss": 0.0115, + "step": 176410 + }, + { + "epoch": 38.00548123273574, + "grad_norm": 0.0010403509950265288, + "learning_rate": 2.4704062780215085e-06, + "loss": 0.0105, + "step": 176420 + }, + { + "epoch": 38.005535395114556, + "grad_norm": 0.0017113867215812206, + "learning_rate": 2.467397256976415e-06, + "loss": 0.014, + "step": 176430 + }, + { + "epoch": 38.005589557493366, + "grad_norm": 0.0007897829636931419, + "learning_rate": 2.4643882359313223e-06, + "loss": 0.0288, + "step": 176440 + }, + { + "epoch": 38.005643719872175, + "grad_norm": 0.0007795915589667857, + "learning_rate": 2.461379214886229e-06, + "loss": 0.0345, + "step": 176450 + }, + { + "epoch": 38.00569788225099, + "grad_norm": 0.0007993610925041139, + "learning_rate": 2.458370193841136e-06, + "loss": 0.0002, + "step": 176460 + }, + { + "epoch": 38.0057520446298, + "grad_norm": 0.007455014623701572, + "learning_rate": 2.4553611727960424e-06, + "loss": 0.0617, + "step": 176470 + }, + { + "epoch": 38.00580620700861, + "grad_norm": 2.806609869003296, + "learning_rate": 2.4523521517509495e-06, + "loss": 0.0532, + "step": 
176480 + }, + { + "epoch": 38.00586036938742, + "grad_norm": 0.0009180449414998293, + "learning_rate": 2.4493431307058562e-06, + "loss": 0.0873, + "step": 176490 + }, + { + "epoch": 38.00591453176624, + "grad_norm": 0.0013171843020245433, + "learning_rate": 2.4463341096607634e-06, + "loss": 0.0162, + "step": 176500 + }, + { + "epoch": 38.00596869414505, + "grad_norm": 0.0007606649887748063, + "learning_rate": 2.4433250886156696e-06, + "loss": 0.0042, + "step": 176510 + }, + { + "epoch": 38.006022856523856, + "grad_norm": 0.0009435324463993311, + "learning_rate": 2.4403160675705767e-06, + "loss": 0.0239, + "step": 176520 + }, + { + "epoch": 38.00607701890267, + "grad_norm": 0.007159783970564604, + "learning_rate": 2.4373070465254834e-06, + "loss": 0.02, + "step": 176530 + }, + { + "epoch": 38.00613118128148, + "grad_norm": 0.0007638251408934593, + "learning_rate": 2.43429802548039e-06, + "loss": 0.0331, + "step": 176540 + }, + { + "epoch": 38.00618534366029, + "grad_norm": 0.42440879344940186, + "learning_rate": 2.4312890044352973e-06, + "loss": 0.0183, + "step": 176550 + }, + { + "epoch": 38.00623950603911, + "grad_norm": 0.000779684167355299, + "learning_rate": 2.428279983390204e-06, + "loss": 0.0138, + "step": 176560 + }, + { + "epoch": 38.00629366841792, + "grad_norm": 0.0007679199916310608, + "learning_rate": 2.4252709623451107e-06, + "loss": 0.0434, + "step": 176570 + }, + { + "epoch": 38.00634783079673, + "grad_norm": 1.2443068027496338, + "learning_rate": 2.4222619413000174e-06, + "loss": 0.0233, + "step": 176580 + }, + { + "epoch": 38.00640199317554, + "grad_norm": 0.8269509077072144, + "learning_rate": 2.4192529202549245e-06, + "loss": 0.0097, + "step": 176590 + }, + { + "epoch": 38.00645615555435, + "grad_norm": 0.012214002199470997, + "learning_rate": 2.416243899209831e-06, + "loss": 0.0211, + "step": 176600 + }, + { + "epoch": 38.00651031793316, + "grad_norm": 0.0007727917400188744, + "learning_rate": 2.4132348781647383e-06, + "loss": 0.002, + "step": 176610 + }, + { + "epoch": 38.00656448031197, + "grad_norm": 0.004404833074659109, + "learning_rate": 2.4102258571196446e-06, + "loss": 0.0001, + "step": 176620 + }, + { + "epoch": 38.00661864269079, + "grad_norm": 0.2531627416610718, + "learning_rate": 2.4072168360745517e-06, + "loss": 0.0262, + "step": 176630 + }, + { + "epoch": 38.0066728050696, + "grad_norm": 0.0007599242380820215, + "learning_rate": 2.4042078150294584e-06, + "loss": 0.032, + "step": 176640 + }, + { + "epoch": 38.00672696744841, + "grad_norm": 0.11699836701154709, + "learning_rate": 2.4011987939843655e-06, + "loss": 0.0103, + "step": 176650 + }, + { + "epoch": 38.006781129827225, + "grad_norm": 0.07297474145889282, + "learning_rate": 2.398189772939272e-06, + "loss": 0.0124, + "step": 176660 + }, + { + "epoch": 38.006835292206034, + "grad_norm": 1.5190798044204712, + "learning_rate": 2.395180751894179e-06, + "loss": 0.0653, + "step": 176670 + }, + { + "epoch": 38.006889454584844, + "grad_norm": 1.7708522081375122, + "learning_rate": 2.3921717308490856e-06, + "loss": 0.0791, + "step": 176680 + }, + { + "epoch": 38.00694361696366, + "grad_norm": 0.0010124689433723688, + "learning_rate": 2.3891627098039923e-06, + "loss": 0.0317, + "step": 176690 + }, + { + "epoch": 38.00699777934247, + "grad_norm": 0.0020127003081142902, + "learning_rate": 2.386153688758899e-06, + "loss": 0.0061, + "step": 176700 + }, + { + "epoch": 38.00705194172128, + "grad_norm": 0.0007896673050709069, + "learning_rate": 2.383144667713806e-06, + "loss": 0.0547, + "step": 176710 + }, + { + 
"epoch": 38.00710610410009, + "grad_norm": 0.0009678371716290712, + "learning_rate": 2.380135646668713e-06, + "loss": 0.0137, + "step": 176720 + }, + { + "epoch": 38.007160266478905, + "grad_norm": 0.0007500541396439075, + "learning_rate": 2.3771266256236195e-06, + "loss": 0.03, + "step": 176730 + }, + { + "epoch": 38.007214428857715, + "grad_norm": 0.001346835633739829, + "learning_rate": 2.3741176045785266e-06, + "loss": 0.02, + "step": 176740 + }, + { + "epoch": 38.007268591236524, + "grad_norm": 0.8002526164054871, + "learning_rate": 2.3711085835334333e-06, + "loss": 0.0057, + "step": 176750 + }, + { + "epoch": 38.00732275361534, + "grad_norm": 0.0009544334607198834, + "learning_rate": 2.3680995624883405e-06, + "loss": 0.0041, + "step": 176760 + }, + { + "epoch": 38.00737691599415, + "grad_norm": 0.0007745184702798724, + "learning_rate": 2.3650905414432467e-06, + "loss": 0.0795, + "step": 176770 + }, + { + "epoch": 38.00743107837296, + "grad_norm": 15.261144638061523, + "learning_rate": 2.362081520398154e-06, + "loss": 0.0277, + "step": 176780 + }, + { + "epoch": 38.00748524075178, + "grad_norm": 0.0010207748273387551, + "learning_rate": 2.3590724993530606e-06, + "loss": 0.0035, + "step": 176790 + }, + { + "epoch": 38.007539403130586, + "grad_norm": 0.0007771679083816707, + "learning_rate": 2.3560634783079677e-06, + "loss": 0.0014, + "step": 176800 + }, + { + "epoch": 38.007593565509396, + "grad_norm": 0.00307985907420516, + "learning_rate": 2.353054457262874e-06, + "loss": 0.0335, + "step": 176810 + }, + { + "epoch": 38.00764772788821, + "grad_norm": 0.0007498282939195633, + "learning_rate": 2.350045436217781e-06, + "loss": 0.0194, + "step": 176820 + }, + { + "epoch": 38.00770189026702, + "grad_norm": 0.000997345894575119, + "learning_rate": 2.3470364151726878e-06, + "loss": 0.0394, + "step": 176830 + }, + { + "epoch": 38.00775605264583, + "grad_norm": 0.0007554988842457533, + "learning_rate": 2.3440273941275945e-06, + "loss": 0.0237, + "step": 176840 + }, + { + "epoch": 38.00781021502464, + "grad_norm": 0.0007497569313272834, + "learning_rate": 2.341018373082501e-06, + "loss": 0.0021, + "step": 176850 + }, + { + "epoch": 38.00786437740346, + "grad_norm": 0.0009784060530364513, + "learning_rate": 2.3380093520374083e-06, + "loss": 0.0013, + "step": 176860 + }, + { + "epoch": 38.00791853978227, + "grad_norm": 0.0009878227720037103, + "learning_rate": 2.335000330992315e-06, + "loss": 0.0617, + "step": 176870 + }, + { + "epoch": 38.00797270216108, + "grad_norm": 0.0007489359704777598, + "learning_rate": 2.3319913099472217e-06, + "loss": 0.0, + "step": 176880 + }, + { + "epoch": 38.00802686453989, + "grad_norm": 0.0007569846347905695, + "learning_rate": 2.328982288902129e-06, + "loss": 0.02, + "step": 176890 + }, + { + "epoch": 38.0080810269187, + "grad_norm": 0.0009372637141495943, + "learning_rate": 2.3259732678570355e-06, + "loss": 0.0, + "step": 176900 + }, + { + "epoch": 38.00813518929751, + "grad_norm": 0.11359896510839462, + "learning_rate": 2.3229642468119426e-06, + "loss": 0.0034, + "step": 176910 + }, + { + "epoch": 38.00818935167633, + "grad_norm": 1.0849826335906982, + "learning_rate": 2.319955225766849e-06, + "loss": 0.107, + "step": 176920 + }, + { + "epoch": 38.00824351405514, + "grad_norm": 0.0007795888814143836, + "learning_rate": 2.316946204721756e-06, + "loss": 0.0238, + "step": 176930 + }, + { + "epoch": 38.00829767643395, + "grad_norm": 0.0007952875457704067, + "learning_rate": 2.3139371836766627e-06, + "loss": 0.0313, + "step": 176940 + }, + { + "epoch": 
38.00835183881276, + "grad_norm": 0.0007494797464460135, + "learning_rate": 2.31092816263157e-06, + "loss": 0.0546, + "step": 176950 + }, + { + "epoch": 38.008406001191574, + "grad_norm": 0.000951880298089236, + "learning_rate": 2.307919141586476e-06, + "loss": 0.0001, + "step": 176960 + }, + { + "epoch": 38.00846016357038, + "grad_norm": 0.09977398812770844, + "learning_rate": 2.3049101205413832e-06, + "loss": 0.0117, + "step": 176970 + }, + { + "epoch": 38.00851432594919, + "grad_norm": 0.0010849705431610346, + "learning_rate": 2.30190109949629e-06, + "loss": 0.0, + "step": 176980 + }, + { + "epoch": 38.00856848832801, + "grad_norm": 0.0007539308280684054, + "learning_rate": 2.2988920784511966e-06, + "loss": 0.0221, + "step": 176990 + }, + { + "epoch": 38.00862265070682, + "grad_norm": 2.5643138885498047, + "learning_rate": 2.2958830574061033e-06, + "loss": 0.022, + "step": 177000 + }, + { + "epoch": 38.00867681308563, + "grad_norm": 0.0013097123010084033, + "learning_rate": 2.2928740363610105e-06, + "loss": 0.0149, + "step": 177010 + }, + { + "epoch": 38.008730975464445, + "grad_norm": 0.0013055811868980527, + "learning_rate": 2.289865015315917e-06, + "loss": 0.0568, + "step": 177020 + }, + { + "epoch": 38.008785137843255, + "grad_norm": 0.05590701103210449, + "learning_rate": 2.286855994270824e-06, + "loss": 0.0595, + "step": 177030 + }, + { + "epoch": 38.008839300222064, + "grad_norm": 0.0029640281572937965, + "learning_rate": 2.283846973225731e-06, + "loss": 0.0274, + "step": 177040 + }, + { + "epoch": 38.00889346260088, + "grad_norm": 0.0010079867206513882, + "learning_rate": 2.2808379521806377e-06, + "loss": 0.0385, + "step": 177050 + }, + { + "epoch": 38.00894762497969, + "grad_norm": 0.002152190776541829, + "learning_rate": 2.277828931135545e-06, + "loss": 0.0135, + "step": 177060 + }, + { + "epoch": 38.0090017873585, + "grad_norm": 0.002661728300154209, + "learning_rate": 2.274819910090451e-06, + "loss": 0.0278, + "step": 177070 + }, + { + "epoch": 38.00905594973731, + "grad_norm": 0.0012403178261592984, + "learning_rate": 2.271810889045358e-06, + "loss": 0.0076, + "step": 177080 + }, + { + "epoch": 38.009110112116126, + "grad_norm": 0.00178140914067626, + "learning_rate": 2.268801868000265e-06, + "loss": 0.0257, + "step": 177090 + }, + { + "epoch": 38.009164274494935, + "grad_norm": 0.12221399694681168, + "learning_rate": 2.265792846955172e-06, + "loss": 0.099, + "step": 177100 + }, + { + "epoch": 38.009218436873745, + "grad_norm": 0.34413889050483704, + "learning_rate": 2.2627838259100783e-06, + "loss": 0.0176, + "step": 177110 + }, + { + "epoch": 38.00927259925256, + "grad_norm": 0.0007590840104967356, + "learning_rate": 2.2597748048649854e-06, + "loss": 0.0562, + "step": 177120 + }, + { + "epoch": 38.00932676163137, + "grad_norm": 0.06764103472232819, + "learning_rate": 2.256765783819892e-06, + "loss": 0.0111, + "step": 177130 + }, + { + "epoch": 38.00938092401018, + "grad_norm": 0.00457989564165473, + "learning_rate": 2.253756762774799e-06, + "loss": 0.0217, + "step": 177140 + }, + { + "epoch": 38.009435086389, + "grad_norm": 0.03822951763868332, + "learning_rate": 2.2507477417297055e-06, + "loss": 0.0614, + "step": 177150 + }, + { + "epoch": 38.00948924876781, + "grad_norm": 1.4084255695343018, + "learning_rate": 2.2477387206846126e-06, + "loss": 0.0321, + "step": 177160 + }, + { + "epoch": 38.009543411146616, + "grad_norm": 3.795746326446533, + "learning_rate": 2.2447296996395193e-06, + "loss": 0.0385, + "step": 177170 + }, + { + "epoch": 38.00959757352543, + "grad_norm": 
0.013659130781888962, + "learning_rate": 2.241720678594426e-06, + "loss": 0.0087, + "step": 177180 + }, + { + "epoch": 38.00965173590424, + "grad_norm": 0.0007499651983380318, + "learning_rate": 2.238711657549333e-06, + "loss": 0.0451, + "step": 177190 + }, + { + "epoch": 38.00970589828305, + "grad_norm": 0.7683113217353821, + "learning_rate": 2.23570263650424e-06, + "loss": 0.0223, + "step": 177200 + }, + { + "epoch": 38.00976006066186, + "grad_norm": 0.0019614852499216795, + "learning_rate": 2.232693615459147e-06, + "loss": 0.0277, + "step": 177210 + }, + { + "epoch": 38.00981422304068, + "grad_norm": 0.00532146031036973, + "learning_rate": 2.2296845944140532e-06, + "loss": 0.0094, + "step": 177220 + }, + { + "epoch": 38.00986838541949, + "grad_norm": 0.0009646416292525828, + "learning_rate": 2.2266755733689604e-06, + "loss": 0.0238, + "step": 177230 + }, + { + "epoch": 38.0099225477983, + "grad_norm": 1.7290579080581665, + "learning_rate": 2.223666552323867e-06, + "loss": 0.0243, + "step": 177240 + }, + { + "epoch": 38.00997671017711, + "grad_norm": 0.001210078364238143, + "learning_rate": 2.2206575312787738e-06, + "loss": 0.0081, + "step": 177250 + }, + { + "epoch": 38.01003087255592, + "grad_norm": 0.0009330784669145942, + "learning_rate": 2.2176485102336805e-06, + "loss": 0.0086, + "step": 177260 + }, + { + "epoch": 38.01008503493473, + "grad_norm": 0.0007429652614519, + "learning_rate": 2.2146394891885876e-06, + "loss": 0.0524, + "step": 177270 + }, + { + "epoch": 38.01013919731355, + "grad_norm": 0.08512066304683685, + "learning_rate": 2.2116304681434943e-06, + "loss": 0.0035, + "step": 177280 + }, + { + "epoch": 38.01019335969236, + "grad_norm": 0.6521084308624268, + "learning_rate": 2.208621447098401e-06, + "loss": 0.0056, + "step": 177290 + }, + { + "epoch": 38.01024752207117, + "grad_norm": 0.0009955092100426555, + "learning_rate": 2.2056124260533077e-06, + "loss": 0.0195, + "step": 177300 + }, + { + "epoch": 38.01030168444998, + "grad_norm": 0.002466962207108736, + "learning_rate": 2.202603405008215e-06, + "loss": 0.0, + "step": 177310 + }, + { + "epoch": 38.010355846828794, + "grad_norm": 0.03732657805085182, + "learning_rate": 2.1995943839631215e-06, + "loss": 0.0497, + "step": 177320 + }, + { + "epoch": 38.010410009207604, + "grad_norm": 0.001284367055632174, + "learning_rate": 2.196585362918028e-06, + "loss": 0.0869, + "step": 177330 + }, + { + "epoch": 38.01046417158641, + "grad_norm": 0.0009841285645961761, + "learning_rate": 2.1935763418729353e-06, + "loss": 0.0221, + "step": 177340 + }, + { + "epoch": 38.01051833396523, + "grad_norm": 2.433738946914673, + "learning_rate": 2.190567320827842e-06, + "loss": 0.0708, + "step": 177350 + }, + { + "epoch": 38.01057249634404, + "grad_norm": 0.0012050432851538062, + "learning_rate": 2.187558299782749e-06, + "loss": 0.0753, + "step": 177360 + }, + { + "epoch": 38.01062665872285, + "grad_norm": 0.0009317342774011195, + "learning_rate": 2.1845492787376554e-06, + "loss": 0.0293, + "step": 177370 + }, + { + "epoch": 38.010680821101666, + "grad_norm": 0.0748678669333458, + "learning_rate": 2.1815402576925625e-06, + "loss": 0.0027, + "step": 177380 + }, + { + "epoch": 38.010734983480475, + "grad_norm": 0.030039804056286812, + "learning_rate": 2.1785312366474692e-06, + "loss": 0.0527, + "step": 177390 + }, + { + "epoch": 38.010789145859285, + "grad_norm": 5.434042930603027, + "learning_rate": 2.175522215602376e-06, + "loss": 0.0758, + "step": 177400 + }, + { + "epoch": 38.0108433082381, + "grad_norm": 0.0007647427846677601, + 
"learning_rate": 2.1725131945572826e-06, + "loss": 0.0386, + "step": 177410 + }, + { + "epoch": 38.01089747061691, + "grad_norm": 0.0007979015354067087, + "learning_rate": 2.1695041735121897e-06, + "loss": 0.0152, + "step": 177420 + }, + { + "epoch": 38.01095163299572, + "grad_norm": 0.009086973965168, + "learning_rate": 2.1664951524670964e-06, + "loss": 0.053, + "step": 177430 + }, + { + "epoch": 38.01100579537453, + "grad_norm": 0.0010295305401086807, + "learning_rate": 2.163486131422003e-06, + "loss": 0.1433, + "step": 177440 + }, + { + "epoch": 38.011059957753346, + "grad_norm": 0.09747742116451263, + "learning_rate": 2.16047711037691e-06, + "loss": 0.0212, + "step": 177450 + }, + { + "epoch": 38.011114120132156, + "grad_norm": 0.000744571560062468, + "learning_rate": 2.157468089331817e-06, + "loss": 0.0492, + "step": 177460 + }, + { + "epoch": 38.011168282510965, + "grad_norm": 0.003464530222117901, + "learning_rate": 2.1544590682867237e-06, + "loss": 0.0078, + "step": 177470 + }, + { + "epoch": 38.01122244488978, + "grad_norm": 0.0007751798839308321, + "learning_rate": 2.1514500472416304e-06, + "loss": 0.074, + "step": 177480 + }, + { + "epoch": 38.01127660726859, + "grad_norm": 0.0012350413016974926, + "learning_rate": 2.1484410261965375e-06, + "loss": 0.0552, + "step": 177490 + }, + { + "epoch": 38.0113307696474, + "grad_norm": 0.0008465074934065342, + "learning_rate": 2.145432005151444e-06, + "loss": 0.0072, + "step": 177500 + }, + { + "epoch": 38.01138493202622, + "grad_norm": 0.09835094213485718, + "learning_rate": 2.1424229841063513e-06, + "loss": 0.0008, + "step": 177510 + }, + { + "epoch": 38.01143909440503, + "grad_norm": 0.0011449934681877494, + "learning_rate": 2.1394139630612576e-06, + "loss": 0.0428, + "step": 177520 + }, + { + "epoch": 38.01149325678384, + "grad_norm": 0.0011084482539445162, + "learning_rate": 2.1364049420161647e-06, + "loss": 0.014, + "step": 177530 + }, + { + "epoch": 38.011547419162646, + "grad_norm": 1.5841177701950073, + "learning_rate": 2.1333959209710714e-06, + "loss": 0.0323, + "step": 177540 + }, + { + "epoch": 38.01160158154146, + "grad_norm": 0.019022617489099503, + "learning_rate": 2.130386899925978e-06, + "loss": 0.0433, + "step": 177550 + }, + { + "epoch": 38.01165574392027, + "grad_norm": 0.0015528921503573656, + "learning_rate": 2.127377878880885e-06, + "loss": 0.0584, + "step": 177560 + }, + { + "epoch": 38.01170990629908, + "grad_norm": 0.0007535583572462201, + "learning_rate": 2.124368857835792e-06, + "loss": 0.0098, + "step": 177570 + }, + { + "epoch": 38.0117640686779, + "grad_norm": 3.371450424194336, + "learning_rate": 2.1213598367906986e-06, + "loss": 0.0771, + "step": 177580 + }, + { + "epoch": 38.01181823105671, + "grad_norm": 0.1113499104976654, + "learning_rate": 2.1183508157456053e-06, + "loss": 0.0079, + "step": 177590 + }, + { + "epoch": 38.01187239343552, + "grad_norm": 0.0021767900325357914, + "learning_rate": 2.115341794700512e-06, + "loss": 0.1046, + "step": 177600 + }, + { + "epoch": 38.011926555814334, + "grad_norm": 0.000759482616558671, + "learning_rate": 2.112332773655419e-06, + "loss": 0.0216, + "step": 177610 + }, + { + "epoch": 38.01198071819314, + "grad_norm": 0.0007595551433041692, + "learning_rate": 2.109323752610326e-06, + "loss": 0.0335, + "step": 177620 + }, + { + "epoch": 38.01203488057195, + "grad_norm": 0.0007580636884085834, + "learning_rate": 2.1063147315652325e-06, + "loss": 0.0544, + "step": 177630 + }, + { + "epoch": 38.01208904295077, + "grad_norm": 0.0007634214707650244, + "learning_rate": 
2.1033057105201396e-06, + "loss": 0.004, + "step": 177640 + }, + { + "epoch": 38.01214320532958, + "grad_norm": 0.9498738646507263, + "learning_rate": 2.1002966894750463e-06, + "loss": 0.0229, + "step": 177650 + }, + { + "epoch": 38.01219736770839, + "grad_norm": 0.0008446866413578391, + "learning_rate": 2.0972876684299535e-06, + "loss": 0.0109, + "step": 177660 + }, + { + "epoch": 38.0122515300872, + "grad_norm": 0.21144075691699982, + "learning_rate": 2.0942786473848597e-06, + "loss": 0.0152, + "step": 177670 + }, + { + "epoch": 38.012305692466015, + "grad_norm": 0.022158553823828697, + "learning_rate": 2.091269626339767e-06, + "loss": 0.027, + "step": 177680 + }, + { + "epoch": 38.012359854844824, + "grad_norm": 0.0009521080646663904, + "learning_rate": 2.0882606052946736e-06, + "loss": 0.0349, + "step": 177690 + }, + { + "epoch": 38.012414017223634, + "grad_norm": 0.000748847087379545, + "learning_rate": 2.0852515842495803e-06, + "loss": 0.0863, + "step": 177700 + }, + { + "epoch": 38.01246817960245, + "grad_norm": 0.0013596079079434276, + "learning_rate": 2.082242563204487e-06, + "loss": 0.0213, + "step": 177710 + }, + { + "epoch": 38.01252234198126, + "grad_norm": 0.0009436464169993997, + "learning_rate": 2.079233542159394e-06, + "loss": 0.0189, + "step": 177720 + }, + { + "epoch": 38.01257650436007, + "grad_norm": 0.002834669779986143, + "learning_rate": 2.0762245211143008e-06, + "loss": 0.0361, + "step": 177730 + }, + { + "epoch": 38.012630666738886, + "grad_norm": 1.3571891784667969, + "learning_rate": 2.0732155000692075e-06, + "loss": 0.0331, + "step": 177740 + }, + { + "epoch": 38.012684829117696, + "grad_norm": 0.6025797128677368, + "learning_rate": 2.070206479024114e-06, + "loss": 0.0414, + "step": 177750 + }, + { + "epoch": 38.012738991496505, + "grad_norm": 0.0008257959852926433, + "learning_rate": 2.0671974579790213e-06, + "loss": 0.0167, + "step": 177760 + }, + { + "epoch": 38.01279315387532, + "grad_norm": 0.0009606071980670094, + "learning_rate": 2.064188436933928e-06, + "loss": 0.0608, + "step": 177770 + }, + { + "epoch": 38.01284731625413, + "grad_norm": 0.0007555423653684556, + "learning_rate": 2.0611794158888347e-06, + "loss": 0.0194, + "step": 177780 + }, + { + "epoch": 38.01290147863294, + "grad_norm": 0.25664737820625305, + "learning_rate": 2.058170394843742e-06, + "loss": 0.0057, + "step": 177790 + }, + { + "epoch": 38.01295564101175, + "grad_norm": 2.443005323410034, + "learning_rate": 2.0551613737986485e-06, + "loss": 0.0259, + "step": 177800 + }, + { + "epoch": 38.01300980339057, + "grad_norm": 1.277863621711731, + "learning_rate": 2.0521523527535556e-06, + "loss": 0.0421, + "step": 177810 + }, + { + "epoch": 38.013063965769376, + "grad_norm": 0.0015592618146911263, + "learning_rate": 2.049143331708462e-06, + "loss": 0.0006, + "step": 177820 + }, + { + "epoch": 38.013118128148186, + "grad_norm": 0.10021936148405075, + "learning_rate": 2.046134310663369e-06, + "loss": 0.0004, + "step": 177830 + }, + { + "epoch": 38.013172290527, + "grad_norm": 0.0012227613478899002, + "learning_rate": 2.0431252896182757e-06, + "loss": 0.0356, + "step": 177840 + }, + { + "epoch": 38.01322645290581, + "grad_norm": 0.005844597239047289, + "learning_rate": 2.0401162685731824e-06, + "loss": 0.0381, + "step": 177850 + }, + { + "epoch": 38.01328061528462, + "grad_norm": 0.4300704896450043, + "learning_rate": 2.037107247528089e-06, + "loss": 0.0275, + "step": 177860 + }, + { + "epoch": 38.01333477766344, + "grad_norm": 0.0007456906605511904, + "learning_rate": 2.0340982264829962e-06, + 
"loss": 0.0939, + "step": 177870 + }, + { + "epoch": 38.01338894004225, + "grad_norm": 0.0008069221512414515, + "learning_rate": 2.031089205437903e-06, + "loss": 0.0248, + "step": 177880 + }, + { + "epoch": 38.01344310242106, + "grad_norm": 0.0007474821177311242, + "learning_rate": 2.0280801843928096e-06, + "loss": 0.0404, + "step": 177890 + }, + { + "epoch": 38.01349726479987, + "grad_norm": 1.638824462890625, + "learning_rate": 2.0250711633477163e-06, + "loss": 0.0309, + "step": 177900 + }, + { + "epoch": 38.01355142717868, + "grad_norm": 0.0009937784634530544, + "learning_rate": 2.0220621423026235e-06, + "loss": 0.0199, + "step": 177910 + }, + { + "epoch": 38.01360558955749, + "grad_norm": 0.25958749651908875, + "learning_rate": 2.01905312125753e-06, + "loss": 0.0114, + "step": 177920 + }, + { + "epoch": 38.0136597519363, + "grad_norm": 0.0007683933945372701, + "learning_rate": 2.016044100212437e-06, + "loss": 0.0308, + "step": 177930 + }, + { + "epoch": 38.01371391431512, + "grad_norm": 0.8679211735725403, + "learning_rate": 2.013035079167344e-06, + "loss": 0.1587, + "step": 177940 + }, + { + "epoch": 38.01376807669393, + "grad_norm": 0.0007489034323953092, + "learning_rate": 2.0100260581222507e-06, + "loss": 0.023, + "step": 177950 + }, + { + "epoch": 38.01382223907274, + "grad_norm": 4.758671283721924, + "learning_rate": 2.0070170370771574e-06, + "loss": 0.0367, + "step": 177960 + }, + { + "epoch": 38.013876401451554, + "grad_norm": 0.023009879514575005, + "learning_rate": 2.004008016032064e-06, + "loss": 0.0478, + "step": 177970 + }, + { + "epoch": 38.013930563830364, + "grad_norm": 0.000756593537516892, + "learning_rate": 2.000998994986971e-06, + "loss": 0.0046, + "step": 177980 + }, + { + "epoch": 38.01398472620917, + "grad_norm": 0.5067854523658752, + "learning_rate": 1.997989973941878e-06, + "loss": 0.0244, + "step": 177990 + }, + { + "epoch": 38.01403888858799, + "grad_norm": 0.014932718127965927, + "learning_rate": 1.9949809528967846e-06, + "loss": 0.0134, + "step": 178000 + }, + { + "epoch": 38.0140930509668, + "grad_norm": 0.0014865061966702342, + "learning_rate": 1.9919719318516913e-06, + "loss": 0.1046, + "step": 178010 + }, + { + "epoch": 38.01414721334561, + "grad_norm": 0.00076503143645823, + "learning_rate": 1.9889629108065984e-06, + "loss": 0.0312, + "step": 178020 + }, + { + "epoch": 38.01420137572442, + "grad_norm": 0.0007935506873764098, + "learning_rate": 1.985953889761505e-06, + "loss": 0.0102, + "step": 178030 + }, + { + "epoch": 38.014255538103235, + "grad_norm": 2.2665483951568604, + "learning_rate": 1.982944868716412e-06, + "loss": 0.0646, + "step": 178040 + }, + { + "epoch": 38.014309700482045, + "grad_norm": 0.10899050533771515, + "learning_rate": 1.9799358476713185e-06, + "loss": 0.0138, + "step": 178050 + }, + { + "epoch": 38.014363862860854, + "grad_norm": 0.0008756914176046848, + "learning_rate": 1.9769268266262256e-06, + "loss": 0.0307, + "step": 178060 + }, + { + "epoch": 38.01441802523967, + "grad_norm": 0.0033771349117159843, + "learning_rate": 1.9739178055811323e-06, + "loss": 0.0648, + "step": 178070 + }, + { + "epoch": 38.01447218761848, + "grad_norm": 0.005322013515979052, + "learning_rate": 1.970908784536039e-06, + "loss": 0.0005, + "step": 178080 + }, + { + "epoch": 38.01452634999729, + "grad_norm": 0.001159437233582139, + "learning_rate": 1.967899763490946e-06, + "loss": 0.0377, + "step": 178090 + }, + { + "epoch": 38.01458051237611, + "grad_norm": 1.137306809425354, + "learning_rate": 1.964890742445853e-06, + "loss": 0.0312, + "step": 178100 
+ }, + { + "epoch": 38.014634674754916, + "grad_norm": 0.0007524071261286736, + "learning_rate": 1.9618817214007595e-06, + "loss": 0.0527, + "step": 178110 + }, + { + "epoch": 38.014688837133725, + "grad_norm": 1.9128077030181885, + "learning_rate": 1.9588727003556662e-06, + "loss": 0.0511, + "step": 178120 + }, + { + "epoch": 38.01474299951254, + "grad_norm": 0.0007501911022700369, + "learning_rate": 1.9558636793105734e-06, + "loss": 0.0043, + "step": 178130 + }, + { + "epoch": 38.01479716189135, + "grad_norm": 0.6591194272041321, + "learning_rate": 1.95285465826548e-06, + "loss": 0.0449, + "step": 178140 + }, + { + "epoch": 38.01485132427016, + "grad_norm": 0.0007826816290616989, + "learning_rate": 1.9498456372203868e-06, + "loss": 0.0008, + "step": 178150 + }, + { + "epoch": 38.01490548664897, + "grad_norm": 0.0007757437997497618, + "learning_rate": 1.9468366161752935e-06, + "loss": 0.0385, + "step": 178160 + }, + { + "epoch": 38.01495964902779, + "grad_norm": 0.005649576894938946, + "learning_rate": 1.9438275951302006e-06, + "loss": 0.0089, + "step": 178170 + }, + { + "epoch": 38.0150138114066, + "grad_norm": 0.0007391981198452413, + "learning_rate": 1.9408185740851073e-06, + "loss": 0.0037, + "step": 178180 + }, + { + "epoch": 38.015067973785406, + "grad_norm": 0.0030167358927428722, + "learning_rate": 1.937809553040014e-06, + "loss": 0.0097, + "step": 178190 + }, + { + "epoch": 38.01512213616422, + "grad_norm": 0.004116617143154144, + "learning_rate": 1.9348005319949207e-06, + "loss": 0.0306, + "step": 178200 + }, + { + "epoch": 38.01517629854303, + "grad_norm": 0.7810890078544617, + "learning_rate": 1.931791510949828e-06, + "loss": 0.0116, + "step": 178210 + }, + { + "epoch": 38.01523046092184, + "grad_norm": 2.493760108947754, + "learning_rate": 1.9287824899047345e-06, + "loss": 0.0906, + "step": 178220 + }, + { + "epoch": 38.01528462330066, + "grad_norm": 0.0009706621058285236, + "learning_rate": 1.925773468859641e-06, + "loss": 0.0432, + "step": 178230 + }, + { + "epoch": 38.01533878567947, + "grad_norm": 0.00652898708358407, + "learning_rate": 1.9227644478145483e-06, + "loss": 0.0345, + "step": 178240 + }, + { + "epoch": 38.01539294805828, + "grad_norm": 0.0007517024641856551, + "learning_rate": 1.919755426769455e-06, + "loss": 0.0151, + "step": 178250 + }, + { + "epoch": 38.01544711043709, + "grad_norm": 0.0009975008433684707, + "learning_rate": 1.9167464057243617e-06, + "loss": 0.0805, + "step": 178260 + }, + { + "epoch": 38.015501272815904, + "grad_norm": 0.9091448187828064, + "learning_rate": 1.9137373846792684e-06, + "loss": 0.0072, + "step": 178270 + }, + { + "epoch": 38.01555543519471, + "grad_norm": 2.489297389984131, + "learning_rate": 1.9107283636341755e-06, + "loss": 0.1133, + "step": 178280 + }, + { + "epoch": 38.01560959757352, + "grad_norm": 0.016307875514030457, + "learning_rate": 1.9077193425890822e-06, + "loss": 0.0092, + "step": 178290 + }, + { + "epoch": 38.01566375995234, + "grad_norm": 0.0007732883095741272, + "learning_rate": 1.904710321543989e-06, + "loss": 0.004, + "step": 178300 + }, + { + "epoch": 38.01571792233115, + "grad_norm": 0.003617464331910014, + "learning_rate": 1.9017013004988958e-06, + "loss": 0.0029, + "step": 178310 + }, + { + "epoch": 38.01577208470996, + "grad_norm": 0.045588891953229904, + "learning_rate": 1.8986922794538027e-06, + "loss": 0.0093, + "step": 178320 + }, + { + "epoch": 38.015826247088775, + "grad_norm": 0.78718101978302, + "learning_rate": 1.8956832584087094e-06, + "loss": 0.0061, + "step": 178330 + }, + { + "epoch": 
38.015880409467584, + "grad_norm": 0.002931960392743349, + "learning_rate": 1.8926742373636161e-06, + "loss": 0.0328, + "step": 178340 + }, + { + "epoch": 38.015934571846394, + "grad_norm": 0.0009945867350324988, + "learning_rate": 1.889665216318523e-06, + "loss": 0.0041, + "step": 178350 + }, + { + "epoch": 38.01598873422521, + "grad_norm": 0.0007510483846999705, + "learning_rate": 1.88665619527343e-06, + "loss": 0.0346, + "step": 178360 + }, + { + "epoch": 38.01604289660402, + "grad_norm": 0.005856838542968035, + "learning_rate": 1.8836471742283369e-06, + "loss": 0.0037, + "step": 178370 + }, + { + "epoch": 38.01609705898283, + "grad_norm": 0.0007461248897016048, + "learning_rate": 1.8806381531832434e-06, + "loss": 0.0034, + "step": 178380 + }, + { + "epoch": 38.01615122136164, + "grad_norm": 0.0007258428377099335, + "learning_rate": 1.8776291321381503e-06, + "loss": 0.0128, + "step": 178390 + }, + { + "epoch": 38.016205383740456, + "grad_norm": 0.0030552793759852648, + "learning_rate": 1.8746201110930572e-06, + "loss": 0.0202, + "step": 178400 + }, + { + "epoch": 38.016259546119265, + "grad_norm": 0.002785465447232127, + "learning_rate": 1.8716110900479637e-06, + "loss": 0.0109, + "step": 178410 + }, + { + "epoch": 38.016313708498075, + "grad_norm": 2.484341859817505, + "learning_rate": 1.8686020690028706e-06, + "loss": 0.0068, + "step": 178420 + }, + { + "epoch": 38.01636787087689, + "grad_norm": 0.13538531959056854, + "learning_rate": 1.8655930479577775e-06, + "loss": 0.091, + "step": 178430 + }, + { + "epoch": 38.0164220332557, + "grad_norm": 0.0007370199891738594, + "learning_rate": 1.8625840269126844e-06, + "loss": 0.0341, + "step": 178440 + }, + { + "epoch": 38.01647619563451, + "grad_norm": 1.4354991912841797, + "learning_rate": 1.859575005867591e-06, + "loss": 0.0408, + "step": 178450 + }, + { + "epoch": 38.01653035801333, + "grad_norm": 0.0007339101284742355, + "learning_rate": 1.856565984822498e-06, + "loss": 0.0843, + "step": 178460 + }, + { + "epoch": 38.016584520392136, + "grad_norm": 0.000772767118178308, + "learning_rate": 1.8535569637774047e-06, + "loss": 0.0306, + "step": 178470 + }, + { + "epoch": 38.016638682770946, + "grad_norm": 0.0012127457885071635, + "learning_rate": 1.8505479427323116e-06, + "loss": 0.0219, + "step": 178480 + }, + { + "epoch": 38.016692845149755, + "grad_norm": 0.31237268447875977, + "learning_rate": 1.8475389216872183e-06, + "loss": 0.019, + "step": 178490 + }, + { + "epoch": 38.01674700752857, + "grad_norm": 4.209659099578857, + "learning_rate": 1.8445299006421252e-06, + "loss": 0.124, + "step": 178500 + }, + { + "epoch": 38.01680116990738, + "grad_norm": 0.000958107877522707, + "learning_rate": 1.8415208795970321e-06, + "loss": 0.0145, + "step": 178510 + }, + { + "epoch": 38.01685533228619, + "grad_norm": 0.0009448272176086903, + "learning_rate": 1.838511858551939e-06, + "loss": 0.1006, + "step": 178520 + }, + { + "epoch": 38.01690949466501, + "grad_norm": 1.1294786930084229, + "learning_rate": 1.8355028375068455e-06, + "loss": 0.0258, + "step": 178530 + }, + { + "epoch": 38.01696365704382, + "grad_norm": 0.001017816481180489, + "learning_rate": 1.8324938164617524e-06, + "loss": 0.0645, + "step": 178540 + }, + { + "epoch": 38.01701781942263, + "grad_norm": 0.0011292829876765609, + "learning_rate": 1.8294847954166593e-06, + "loss": 0.0052, + "step": 178550 + }, + { + "epoch": 38.01707198180144, + "grad_norm": 0.0165595430880785, + "learning_rate": 1.8264757743715658e-06, + "loss": 0.0102, + "step": 178560 + }, + { + "epoch": 38.01712614418025, 
+ "grad_norm": 0.0007442565984092653, + "learning_rate": 1.8234667533264727e-06, + "loss": 0.0288, + "step": 178570 + }, + { + "epoch": 38.01718030655906, + "grad_norm": 0.10671781003475189, + "learning_rate": 1.8204577322813796e-06, + "loss": 0.0042, + "step": 178580 + }, + { + "epoch": 38.01723446893788, + "grad_norm": 0.00725919334217906, + "learning_rate": 1.8174487112362866e-06, + "loss": 0.0009, + "step": 178590 + }, + { + "epoch": 38.01728863131669, + "grad_norm": 0.002487239195033908, + "learning_rate": 1.8144396901911933e-06, + "loss": 0.0097, + "step": 178600 + }, + { + "epoch": 38.0173427936955, + "grad_norm": 0.009803936816751957, + "learning_rate": 1.8114306691461e-06, + "loss": 0.0248, + "step": 178610 + }, + { + "epoch": 38.01739695607431, + "grad_norm": 0.0013692500069737434, + "learning_rate": 1.8084216481010069e-06, + "loss": 0.0078, + "step": 178620 + }, + { + "epoch": 38.017451118453124, + "grad_norm": 0.015772853046655655, + "learning_rate": 1.8054126270559138e-06, + "loss": 0.0091, + "step": 178630 + }, + { + "epoch": 38.017505280831934, + "grad_norm": 0.0010135543998330832, + "learning_rate": 1.8024036060108205e-06, + "loss": 0.0063, + "step": 178640 + }, + { + "epoch": 38.01755944321074, + "grad_norm": 0.0007222300628200173, + "learning_rate": 1.7993945849657274e-06, + "loss": 0.0039, + "step": 178650 + }, + { + "epoch": 38.01761360558956, + "grad_norm": 0.0007327276980504394, + "learning_rate": 1.7963855639206343e-06, + "loss": 0.0291, + "step": 178660 + }, + { + "epoch": 38.01766776796837, + "grad_norm": 0.0007829059613868594, + "learning_rate": 1.7933765428755408e-06, + "loss": 0.0049, + "step": 178670 + }, + { + "epoch": 38.01772193034718, + "grad_norm": 0.0048218658193945885, + "learning_rate": 1.7903675218304477e-06, + "loss": 0.0017, + "step": 178680 + }, + { + "epoch": 38.017776092725995, + "grad_norm": 0.4644496738910675, + "learning_rate": 1.7873585007853546e-06, + "loss": 0.0128, + "step": 178690 + }, + { + "epoch": 38.017830255104805, + "grad_norm": 0.0007312042871490121, + "learning_rate": 1.7843494797402615e-06, + "loss": 0.0213, + "step": 178700 + }, + { + "epoch": 38.017884417483614, + "grad_norm": 4.567173480987549, + "learning_rate": 1.781340458695168e-06, + "loss": 0.0699, + "step": 178710 + }, + { + "epoch": 38.01793857986243, + "grad_norm": 4.51783561706543, + "learning_rate": 1.778331437650075e-06, + "loss": 0.0445, + "step": 178720 + }, + { + "epoch": 38.01799274224124, + "grad_norm": 0.0008068880415521562, + "learning_rate": 1.7753224166049818e-06, + "loss": 0.0328, + "step": 178730 + }, + { + "epoch": 38.01804690462005, + "grad_norm": 0.008878432214260101, + "learning_rate": 1.7723133955598887e-06, + "loss": 0.0007, + "step": 178740 + }, + { + "epoch": 38.01810106699886, + "grad_norm": 0.8280496597290039, + "learning_rate": 1.7693043745147954e-06, + "loss": 0.02, + "step": 178750 + }, + { + "epoch": 38.018155229377676, + "grad_norm": 0.8987212777137756, + "learning_rate": 1.7662953534697021e-06, + "loss": 0.0856, + "step": 178760 + }, + { + "epoch": 38.018209391756486, + "grad_norm": 0.0007536601042374969, + "learning_rate": 1.763286332424609e-06, + "loss": 0.0477, + "step": 178770 + }, + { + "epoch": 38.018263554135295, + "grad_norm": 0.6685301661491394, + "learning_rate": 1.760277311379516e-06, + "loss": 0.0289, + "step": 178780 + }, + { + "epoch": 38.01831771651411, + "grad_norm": 0.4928387999534607, + "learning_rate": 1.7572682903344226e-06, + "loss": 0.0858, + "step": 178790 + }, + { + "epoch": 38.01837187889292, + "grad_norm": 
1.1638906002044678, + "learning_rate": 1.7542592692893295e-06, + "loss": 0.0073, + "step": 178800 + }, + { + "epoch": 38.01842604127173, + "grad_norm": 0.0007508854614570737, + "learning_rate": 1.7512502482442365e-06, + "loss": 0.0444, + "step": 178810 + }, + { + "epoch": 38.01848020365055, + "grad_norm": 0.0007257623947225511, + "learning_rate": 1.748241227199143e-06, + "loss": 0.0063, + "step": 178820 + }, + { + "epoch": 38.01853436602936, + "grad_norm": 0.0009150134865194559, + "learning_rate": 1.7452322061540499e-06, + "loss": 0.0082, + "step": 178830 + }, + { + "epoch": 38.018588528408166, + "grad_norm": 0.0007340179872699082, + "learning_rate": 1.7422231851089568e-06, + "loss": 0.016, + "step": 178840 + }, + { + "epoch": 38.018642690786976, + "grad_norm": 0.0011450984748080373, + "learning_rate": 1.7392141640638637e-06, + "loss": 0.0626, + "step": 178850 + }, + { + "epoch": 38.01869685316579, + "grad_norm": 1.1762553453445435, + "learning_rate": 1.7362051430187702e-06, + "loss": 0.0507, + "step": 178860 + }, + { + "epoch": 38.0187510155446, + "grad_norm": 6.437075138092041, + "learning_rate": 1.733196121973677e-06, + "loss": 0.0806, + "step": 178870 + }, + { + "epoch": 38.01880517792341, + "grad_norm": 0.4993639588356018, + "learning_rate": 1.730187100928584e-06, + "loss": 0.014, + "step": 178880 + }, + { + "epoch": 38.01885934030223, + "grad_norm": 0.002259851200506091, + "learning_rate": 1.7271780798834909e-06, + "loss": 0.0201, + "step": 178890 + }, + { + "epoch": 38.01891350268104, + "grad_norm": 0.000993640162050724, + "learning_rate": 1.7241690588383974e-06, + "loss": 0.0137, + "step": 178900 + }, + { + "epoch": 38.01896766505985, + "grad_norm": 0.0007373287808150053, + "learning_rate": 1.7211600377933043e-06, + "loss": 0.0133, + "step": 178910 + }, + { + "epoch": 38.019021827438664, + "grad_norm": 0.0007529492140747607, + "learning_rate": 1.7181510167482112e-06, + "loss": 0.0001, + "step": 178920 + }, + { + "epoch": 38.01907598981747, + "grad_norm": 0.0009382852003909647, + "learning_rate": 1.7151419957031181e-06, + "loss": 0.0069, + "step": 178930 + }, + { + "epoch": 38.01913015219628, + "grad_norm": 2.279480218887329, + "learning_rate": 1.7121329746580248e-06, + "loss": 0.0385, + "step": 178940 + }, + { + "epoch": 38.0191843145751, + "grad_norm": 0.005399232730269432, + "learning_rate": 1.7091239536129317e-06, + "loss": 0.0052, + "step": 178950 + }, + { + "epoch": 38.01923847695391, + "grad_norm": 1.3231672048568726, + "learning_rate": 1.7061149325678386e-06, + "loss": 0.0129, + "step": 178960 + }, + { + "epoch": 38.01929263933272, + "grad_norm": 0.21576163172721863, + "learning_rate": 1.7031059115227451e-06, + "loss": 0.0008, + "step": 178970 + }, + { + "epoch": 38.01934680171153, + "grad_norm": 0.0010876323794946074, + "learning_rate": 1.700096890477652e-06, + "loss": 0.012, + "step": 178980 + }, + { + "epoch": 38.019400964090345, + "grad_norm": 0.9151847958564758, + "learning_rate": 1.697087869432559e-06, + "loss": 0.0557, + "step": 178990 + }, + { + "epoch": 38.019455126469154, + "grad_norm": 0.0009344681748189032, + "learning_rate": 1.6940788483874658e-06, + "loss": 0.0005, + "step": 179000 + }, + { + "epoch": 38.01950928884796, + "grad_norm": 0.011630150489509106, + "learning_rate": 1.6910698273423723e-06, + "loss": 0.009, + "step": 179010 + }, + { + "epoch": 38.01956345122678, + "grad_norm": 0.6159878373146057, + "learning_rate": 1.6880608062972792e-06, + "loss": 0.0731, + "step": 179020 + }, + { + "epoch": 38.01961761360559, + "grad_norm": 0.0007570093148387969, + 
"learning_rate": 1.6850517852521861e-06, + "loss": 0.0492, + "step": 179030 + }, + { + "epoch": 38.0196717759844, + "grad_norm": 0.0007350902305915952, + "learning_rate": 1.682042764207093e-06, + "loss": 0.0274, + "step": 179040 + }, + { + "epoch": 38.019725938363216, + "grad_norm": 0.0007278241100721061, + "learning_rate": 1.6790337431619995e-06, + "loss": 0.0116, + "step": 179050 + }, + { + "epoch": 38.019780100742025, + "grad_norm": 0.0007378341979347169, + "learning_rate": 1.6760247221169065e-06, + "loss": 0.0403, + "step": 179060 + }, + { + "epoch": 38.019834263120835, + "grad_norm": 0.0028613614849746227, + "learning_rate": 1.6730157010718134e-06, + "loss": 0.0576, + "step": 179070 + }, + { + "epoch": 38.01988842549965, + "grad_norm": 0.0009343307465314865, + "learning_rate": 1.6700066800267203e-06, + "loss": 0.003, + "step": 179080 + }, + { + "epoch": 38.01994258787846, + "grad_norm": 0.0007340359152294695, + "learning_rate": 1.666997658981627e-06, + "loss": 0.0449, + "step": 179090 + }, + { + "epoch": 38.01999675025727, + "grad_norm": 0.0014146220637485385, + "learning_rate": 1.6639886379365339e-06, + "loss": 0.0331, + "step": 179100 + }, + { + "epoch": 38.02005091263608, + "grad_norm": 0.0012837990652769804, + "learning_rate": 1.6609796168914408e-06, + "loss": 0.0228, + "step": 179110 + }, + { + "epoch": 38.0201050750149, + "grad_norm": 1.0355486869812012, + "learning_rate": 1.6579705958463473e-06, + "loss": 0.0124, + "step": 179120 + }, + { + "epoch": 38.020159237393706, + "grad_norm": 0.003792193718254566, + "learning_rate": 1.6549615748012542e-06, + "loss": 0.0259, + "step": 179130 + }, + { + "epoch": 38.020213399772516, + "grad_norm": 0.0007389390375465155, + "learning_rate": 1.651952553756161e-06, + "loss": 0.0117, + "step": 179140 + }, + { + "epoch": 38.02026756215133, + "grad_norm": 0.0009444031165912747, + "learning_rate": 1.648943532711068e-06, + "loss": 0.0001, + "step": 179150 + }, + { + "epoch": 38.02032172453014, + "grad_norm": 0.0009216529433615506, + "learning_rate": 1.6459345116659745e-06, + "loss": 0.039, + "step": 179160 + }, + { + "epoch": 38.02037588690895, + "grad_norm": 1.120517611503601, + "learning_rate": 1.6429254906208814e-06, + "loss": 0.0584, + "step": 179170 + }, + { + "epoch": 38.02043004928777, + "grad_norm": 0.0007386025390587747, + "learning_rate": 1.6399164695757883e-06, + "loss": 0.0127, + "step": 179180 + }, + { + "epoch": 38.02048421166658, + "grad_norm": 0.0011024248087778687, + "learning_rate": 1.6369074485306952e-06, + "loss": 0.0041, + "step": 179190 + }, + { + "epoch": 38.02053837404539, + "grad_norm": 1.3289670944213867, + "learning_rate": 1.6338984274856017e-06, + "loss": 0.0367, + "step": 179200 + }, + { + "epoch": 38.020592536424196, + "grad_norm": 0.0007273496594280005, + "learning_rate": 1.6308894064405086e-06, + "loss": 0.0681, + "step": 179210 + }, + { + "epoch": 38.02064669880301, + "grad_norm": 0.0009555255528539419, + "learning_rate": 1.6278803853954155e-06, + "loss": 0.0768, + "step": 179220 + }, + { + "epoch": 38.02070086118182, + "grad_norm": 0.001526104984804988, + "learning_rate": 1.6248713643503224e-06, + "loss": 0.0009, + "step": 179230 + }, + { + "epoch": 38.02075502356063, + "grad_norm": 0.00446302630007267, + "learning_rate": 1.6218623433052291e-06, + "loss": 0.0025, + "step": 179240 + }, + { + "epoch": 38.02080918593945, + "grad_norm": 0.0008847969584167004, + "learning_rate": 1.618853322260136e-06, + "loss": 0.0001, + "step": 179250 + }, + { + "epoch": 38.02086334831826, + "grad_norm": 0.006641523912549019, + 
"learning_rate": 1.615844301215043e-06, + "loss": 0.0426, + "step": 179260 + }, + { + "epoch": 38.02091751069707, + "grad_norm": 0.39479294419288635, + "learning_rate": 1.6128352801699494e-06, + "loss": 0.0353, + "step": 179270 + }, + { + "epoch": 38.020971673075884, + "grad_norm": 0.001176198129542172, + "learning_rate": 1.6098262591248564e-06, + "loss": 0.0351, + "step": 179280 + }, + { + "epoch": 38.021025835454694, + "grad_norm": 0.002462790347635746, + "learning_rate": 1.6068172380797633e-06, + "loss": 0.038, + "step": 179290 + }, + { + "epoch": 38.0210799978335, + "grad_norm": 0.0009291725000366569, + "learning_rate": 1.6038082170346702e-06, + "loss": 0.0106, + "step": 179300 + }, + { + "epoch": 38.02113416021232, + "grad_norm": 6.892057418823242, + "learning_rate": 1.6007991959895767e-06, + "loss": 0.1015, + "step": 179310 + }, + { + "epoch": 38.02118832259113, + "grad_norm": 0.0007282394799403846, + "learning_rate": 1.5977901749444836e-06, + "loss": 0.0188, + "step": 179320 + }, + { + "epoch": 38.02124248496994, + "grad_norm": 0.6856464147567749, + "learning_rate": 1.5947811538993905e-06, + "loss": 0.0277, + "step": 179330 + }, + { + "epoch": 38.02129664734875, + "grad_norm": 0.2093699872493744, + "learning_rate": 1.5917721328542974e-06, + "loss": 0.0115, + "step": 179340 + }, + { + "epoch": 38.021350809727565, + "grad_norm": 0.0039583840407431126, + "learning_rate": 1.5887631118092039e-06, + "loss": 0.0011, + "step": 179350 + }, + { + "epoch": 38.021404972106374, + "grad_norm": 0.3809273838996887, + "learning_rate": 1.5857540907641108e-06, + "loss": 0.0087, + "step": 179360 + }, + { + "epoch": 38.021459134485184, + "grad_norm": 0.0011923996498808265, + "learning_rate": 1.5827450697190177e-06, + "loss": 0.0131, + "step": 179370 + }, + { + "epoch": 38.021513296864, + "grad_norm": 2.1855597496032715, + "learning_rate": 1.5797360486739244e-06, + "loss": 0.0202, + "step": 179380 + }, + { + "epoch": 38.02156745924281, + "grad_norm": 0.0007540611550211906, + "learning_rate": 1.5767270276288313e-06, + "loss": 0.003, + "step": 179390 + }, + { + "epoch": 38.02162162162162, + "grad_norm": 0.007934748195111752, + "learning_rate": 1.5737180065837382e-06, + "loss": 0.0744, + "step": 179400 + }, + { + "epoch": 38.021675784000436, + "grad_norm": 0.0009732316830195487, + "learning_rate": 1.5707089855386451e-06, + "loss": 0.0025, + "step": 179410 + }, + { + "epoch": 38.021729946379246, + "grad_norm": 0.013847406022250652, + "learning_rate": 1.5676999644935516e-06, + "loss": 0.0415, + "step": 179420 + }, + { + "epoch": 38.021784108758055, + "grad_norm": 0.000760765396989882, + "learning_rate": 1.5646909434484585e-06, + "loss": 0.04, + "step": 179430 + }, + { + "epoch": 38.02183827113687, + "grad_norm": 0.0007420488400384784, + "learning_rate": 1.5616819224033654e-06, + "loss": 0.0057, + "step": 179440 + }, + { + "epoch": 38.02189243351568, + "grad_norm": 7.034397125244141, + "learning_rate": 1.5586729013582721e-06, + "loss": 0.0461, + "step": 179450 + }, + { + "epoch": 38.02194659589449, + "grad_norm": 0.00989916455000639, + "learning_rate": 1.555663880313179e-06, + "loss": 0.0474, + "step": 179460 + }, + { + "epoch": 38.0220007582733, + "grad_norm": 0.0007280641584657133, + "learning_rate": 1.5526548592680857e-06, + "loss": 0.0126, + "step": 179470 + }, + { + "epoch": 38.02205492065212, + "grad_norm": 0.49010202288627625, + "learning_rate": 1.5496458382229926e-06, + "loss": 0.0272, + "step": 179480 + }, + { + "epoch": 38.02210908303093, + "grad_norm": 0.0015987063525244594, + "learning_rate": 
1.5466368171778993e-06, + "loss": 0.0059, + "step": 179490 + }, + { + "epoch": 38.022163245409736, + "grad_norm": 5.470503807067871, + "learning_rate": 1.5436277961328063e-06, + "loss": 0.1272, + "step": 179500 + }, + { + "epoch": 38.02221740778855, + "grad_norm": 0.000737761496566236, + "learning_rate": 1.540618775087713e-06, + "loss": 0.0628, + "step": 179510 + }, + { + "epoch": 38.02227157016736, + "grad_norm": 0.0007460391498170793, + "learning_rate": 1.5376097540426199e-06, + "loss": 0.0048, + "step": 179520 + }, + { + "epoch": 38.02232573254617, + "grad_norm": 0.0010526750702410936, + "learning_rate": 1.5346007329975266e-06, + "loss": 0.0398, + "step": 179530 + }, + { + "epoch": 38.02237989492499, + "grad_norm": 0.0009585223742760718, + "learning_rate": 1.5315917119524335e-06, + "loss": 0.0213, + "step": 179540 + }, + { + "epoch": 38.0224340573038, + "grad_norm": 0.7527326345443726, + "learning_rate": 1.5285826909073404e-06, + "loss": 0.0549, + "step": 179550 + }, + { + "epoch": 38.02248821968261, + "grad_norm": 0.0008031940087676048, + "learning_rate": 1.525573669862247e-06, + "loss": 0.0024, + "step": 179560 + }, + { + "epoch": 38.02254238206142, + "grad_norm": 0.0007254682714119554, + "learning_rate": 1.522564648817154e-06, + "loss": 0.0039, + "step": 179570 + }, + { + "epoch": 38.02259654444023, + "grad_norm": 0.0012790306936949492, + "learning_rate": 1.5195556277720607e-06, + "loss": 0.0018, + "step": 179580 + }, + { + "epoch": 38.02265070681904, + "grad_norm": 0.9154486060142517, + "learning_rate": 1.5165466067269676e-06, + "loss": 0.0307, + "step": 179590 + }, + { + "epoch": 38.02270486919785, + "grad_norm": 0.0011188744101673365, + "learning_rate": 1.5135375856818743e-06, + "loss": 0.0038, + "step": 179600 + }, + { + "epoch": 38.02275903157667, + "grad_norm": 0.0009136902517639101, + "learning_rate": 1.5105285646367812e-06, + "loss": 0.0285, + "step": 179610 + }, + { + "epoch": 38.02281319395548, + "grad_norm": 0.0010885590454563498, + "learning_rate": 1.507519543591688e-06, + "loss": 0.0139, + "step": 179620 + }, + { + "epoch": 38.02286735633429, + "grad_norm": 0.14026080071926117, + "learning_rate": 1.5045105225465948e-06, + "loss": 0.0413, + "step": 179630 + }, + { + "epoch": 38.022921518713105, + "grad_norm": 0.0756896585226059, + "learning_rate": 1.5015015015015015e-06, + "loss": 0.0395, + "step": 179640 + }, + { + "epoch": 38.022975681091914, + "grad_norm": 0.000743543729186058, + "learning_rate": 1.4984924804564084e-06, + "loss": 0.0306, + "step": 179650 + }, + { + "epoch": 38.023029843470724, + "grad_norm": 0.0014003085670992732, + "learning_rate": 1.4954834594113151e-06, + "loss": 0.012, + "step": 179660 + }, + { + "epoch": 38.02308400584954, + "grad_norm": 0.0009816379752010107, + "learning_rate": 1.492474438366222e-06, + "loss": 0.0293, + "step": 179670 + }, + { + "epoch": 38.02313816822835, + "grad_norm": 0.0011845645494759083, + "learning_rate": 1.4894654173211287e-06, + "loss": 0.121, + "step": 179680 + }, + { + "epoch": 38.02319233060716, + "grad_norm": 0.0010165170533582568, + "learning_rate": 1.4864563962760356e-06, + "loss": 0.0096, + "step": 179690 + }, + { + "epoch": 38.02324649298597, + "grad_norm": 1.2935672998428345, + "learning_rate": 1.4834473752309425e-06, + "loss": 0.0268, + "step": 179700 + }, + { + "epoch": 38.023300655364785, + "grad_norm": 0.0007313817040994763, + "learning_rate": 1.4804383541858492e-06, + "loss": 0.0393, + "step": 179710 + }, + { + "epoch": 38.023354817743595, + "grad_norm": 0.0009326525614596903, + "learning_rate": 
1.4774293331407562e-06, + "loss": 0.027, + "step": 179720 + }, + { + "epoch": 38.023408980122404, + "grad_norm": 0.002060367725789547, + "learning_rate": 1.4744203120956629e-06, + "loss": 0.0047, + "step": 179730 + }, + { + "epoch": 38.02346314250122, + "grad_norm": 0.09756675362586975, + "learning_rate": 1.4714112910505698e-06, + "loss": 0.1242, + "step": 179740 + }, + { + "epoch": 38.02351730488003, + "grad_norm": 1.1468524932861328, + "learning_rate": 1.4684022700054765e-06, + "loss": 0.0609, + "step": 179750 + }, + { + "epoch": 38.02357146725884, + "grad_norm": 0.0010336199775338173, + "learning_rate": 1.4653932489603834e-06, + "loss": 0.001, + "step": 179760 + }, + { + "epoch": 38.02362562963766, + "grad_norm": 0.04167688265442848, + "learning_rate": 1.46238422791529e-06, + "loss": 0.0312, + "step": 179770 + }, + { + "epoch": 38.023679792016466, + "grad_norm": 1.8637497425079346, + "learning_rate": 1.459375206870197e-06, + "loss": 0.0298, + "step": 179780 + }, + { + "epoch": 38.023733954395276, + "grad_norm": 0.0007396914879791439, + "learning_rate": 1.4563661858251037e-06, + "loss": 0.1336, + "step": 179790 + }, + { + "epoch": 38.023788116774085, + "grad_norm": 0.7768630981445312, + "learning_rate": 1.4533571647800106e-06, + "loss": 0.0701, + "step": 179800 + }, + { + "epoch": 38.0238422791529, + "grad_norm": 0.0007614995120093226, + "learning_rate": 1.4503481437349173e-06, + "loss": 0.0218, + "step": 179810 + }, + { + "epoch": 38.02389644153171, + "grad_norm": 0.005423928610980511, + "learning_rate": 1.447339122689824e-06, + "loss": 0.014, + "step": 179820 + }, + { + "epoch": 38.02395060391052, + "grad_norm": 0.23643280565738678, + "learning_rate": 1.4443301016447309e-06, + "loss": 0.0141, + "step": 179830 + }, + { + "epoch": 38.02400476628934, + "grad_norm": 0.5255110859870911, + "learning_rate": 1.4413210805996378e-06, + "loss": 0.0097, + "step": 179840 + }, + { + "epoch": 38.02405892866815, + "grad_norm": 1.1561945676803589, + "learning_rate": 1.4383120595545447e-06, + "loss": 0.0277, + "step": 179850 + }, + { + "epoch": 38.02411309104696, + "grad_norm": 0.01826157607138157, + "learning_rate": 1.4353030385094514e-06, + "loss": 0.0005, + "step": 179860 + }, + { + "epoch": 38.02416725342577, + "grad_norm": 0.0009595243609510362, + "learning_rate": 1.4322940174643583e-06, + "loss": 0.059, + "step": 179870 + }, + { + "epoch": 38.02422141580458, + "grad_norm": 0.001229300512932241, + "learning_rate": 1.429284996419265e-06, + "loss": 0.0109, + "step": 179880 + }, + { + "epoch": 38.02427557818339, + "grad_norm": 0.0007584267295897007, + "learning_rate": 1.426275975374172e-06, + "loss": 0.0795, + "step": 179890 + }, + { + "epoch": 38.02432974056221, + "grad_norm": 0.0007360877934843302, + "learning_rate": 1.4232669543290786e-06, + "loss": 0.0569, + "step": 179900 + }, + { + "epoch": 38.02438390294102, + "grad_norm": 0.9331650137901306, + "learning_rate": 1.4202579332839855e-06, + "loss": 0.0772, + "step": 179910 + }, + { + "epoch": 38.02443806531983, + "grad_norm": 0.0009246626286767423, + "learning_rate": 1.4172489122388922e-06, + "loss": 0.0052, + "step": 179920 + }, + { + "epoch": 38.02449222769864, + "grad_norm": 0.0014918247470632195, + "learning_rate": 1.4142398911937991e-06, + "loss": 0.0727, + "step": 179930 + }, + { + "epoch": 38.024546390077454, + "grad_norm": 0.0007485249079763889, + "learning_rate": 1.4112308701487058e-06, + "loss": 0.0122, + "step": 179940 + }, + { + "epoch": 38.02460055245626, + "grad_norm": 0.008424937725067139, + "learning_rate": 1.4082218491036125e-06, + 
"loss": 0.0242, + "step": 179950 + }, + { + "epoch": 38.02465471483507, + "grad_norm": 0.0007466160459443927, + "learning_rate": 1.4052128280585195e-06, + "loss": 0.0596, + "step": 179960 + }, + { + "epoch": 38.02470887721389, + "grad_norm": 0.0009538914891891181, + "learning_rate": 1.4022038070134261e-06, + "loss": 0.0592, + "step": 179970 + }, + { + "epoch": 38.0247630395927, + "grad_norm": 0.0007495923200622201, + "learning_rate": 1.399194785968333e-06, + "loss": 0.009, + "step": 179980 + }, + { + "epoch": 38.02481720197151, + "grad_norm": 0.0007405490032397211, + "learning_rate": 1.39618576492324e-06, + "loss": 0.0148, + "step": 179990 + }, + { + "epoch": 38.024871364350325, + "grad_norm": 0.0007621011463925242, + "learning_rate": 1.3931767438781469e-06, + "loss": 0.072, + "step": 180000 + }, + { + "epoch": 38.024925526729135, + "grad_norm": 0.0007496492471545935, + "learning_rate": 1.3901677228330536e-06, + "loss": 0.0313, + "step": 180010 + }, + { + "epoch": 38.024979689107944, + "grad_norm": 2.657888889312744, + "learning_rate": 1.3871587017879605e-06, + "loss": 0.1433, + "step": 180020 + }, + { + "epoch": 38.02500135405947, + "eval_accuracy": 0.8396472893533639, + "eval_loss": 1.1224924325942993, + "eval_runtime": 118.0632, + "eval_samples_per_second": 25.935, + "eval_steps_per_second": 3.244, + "step": 180024 + }, + { + "epoch": 39.000032497427284, + "grad_norm": 2.832393169403076, + "learning_rate": 1.3841496807428672e-06, + "loss": 0.0968, + "step": 180030 + }, + { + "epoch": 39.0000866598061, + "grad_norm": 0.0009430120699107647, + "learning_rate": 1.381140659697774e-06, + "loss": 0.0047, + "step": 180040 + }, + { + "epoch": 39.00014082218491, + "grad_norm": 0.9922214150428772, + "learning_rate": 1.3781316386526808e-06, + "loss": 0.0272, + "step": 180050 + }, + { + "epoch": 39.00019498456372, + "grad_norm": 0.002038811333477497, + "learning_rate": 1.3751226176075877e-06, + "loss": 0.0562, + "step": 180060 + }, + { + "epoch": 39.00024914694254, + "grad_norm": 1.4931060075759888, + "learning_rate": 1.3721135965624944e-06, + "loss": 0.0195, + "step": 180070 + }, + { + "epoch": 39.000303309321346, + "grad_norm": 0.0007576661300845444, + "learning_rate": 1.3691045755174013e-06, + "loss": 0.0182, + "step": 180080 + }, + { + "epoch": 39.000357471700156, + "grad_norm": 2.9911208152770996, + "learning_rate": 1.366095554472308e-06, + "loss": 0.0371, + "step": 180090 + }, + { + "epoch": 39.00041163407897, + "grad_norm": 0.0007351575186476111, + "learning_rate": 1.3630865334272147e-06, + "loss": 0.0181, + "step": 180100 + }, + { + "epoch": 39.00046579645778, + "grad_norm": 1.2316465377807617, + "learning_rate": 1.3600775123821216e-06, + "loss": 0.0345, + "step": 180110 + }, + { + "epoch": 39.00051995883659, + "grad_norm": 0.0007679270929656923, + "learning_rate": 1.3570684913370283e-06, + "loss": 0.0325, + "step": 180120 + }, + { + "epoch": 39.0005741212154, + "grad_norm": 0.0007399363676086068, + "learning_rate": 1.3540594702919352e-06, + "loss": 0.0338, + "step": 180130 + }, + { + "epoch": 39.00062828359422, + "grad_norm": 0.0008226659847423434, + "learning_rate": 1.3510504492468421e-06, + "loss": 0.0132, + "step": 180140 + }, + { + "epoch": 39.00068244597303, + "grad_norm": 0.2668918967247009, + "learning_rate": 1.348041428201749e-06, + "loss": 0.0362, + "step": 180150 + }, + { + "epoch": 39.000736608351836, + "grad_norm": 0.035790570080280304, + "learning_rate": 1.3450324071566557e-06, + "loss": 0.081, + "step": 180160 + }, + { + "epoch": 39.00079077073065, + "grad_norm": 
0.0010457112221047282, + "learning_rate": 1.3420233861115627e-06, + "loss": 0.0093, + "step": 180170 + }, + { + "epoch": 39.00084493310946, + "grad_norm": 0.0007403132622130215, + "learning_rate": 1.3390143650664694e-06, + "loss": 0.0336, + "step": 180180 + }, + { + "epoch": 39.00089909548827, + "grad_norm": 0.0007499521598219872, + "learning_rate": 1.3360053440213763e-06, + "loss": 0.0064, + "step": 180190 + }, + { + "epoch": 39.00095325786709, + "grad_norm": 1.9699362516403198, + "learning_rate": 1.332996322976283e-06, + "loss": 0.0887, + "step": 180200 + }, + { + "epoch": 39.0010074202459, + "grad_norm": 0.03941214084625244, + "learning_rate": 1.3299873019311899e-06, + "loss": 0.0455, + "step": 180210 + }, + { + "epoch": 39.00106158262471, + "grad_norm": 0.0007520170183852315, + "learning_rate": 1.3269782808860966e-06, + "loss": 0.0372, + "step": 180220 + }, + { + "epoch": 39.00111574500352, + "grad_norm": 38.69363784790039, + "learning_rate": 1.3239692598410033e-06, + "loss": 0.0552, + "step": 180230 + }, + { + "epoch": 39.001169907382334, + "grad_norm": 0.0007854952127672732, + "learning_rate": 1.3209602387959102e-06, + "loss": 0.0051, + "step": 180240 + }, + { + "epoch": 39.00122406976114, + "grad_norm": 0.0007779925363138318, + "learning_rate": 1.3179512177508169e-06, + "loss": 0.0307, + "step": 180250 + }, + { + "epoch": 39.00127823213995, + "grad_norm": 1.2948743104934692, + "learning_rate": 1.3149421967057238e-06, + "loss": 0.1087, + "step": 180260 + }, + { + "epoch": 39.00133239451877, + "grad_norm": 0.0007538851932622492, + "learning_rate": 1.3119331756606305e-06, + "loss": 0.0054, + "step": 180270 + }, + { + "epoch": 39.00138655689758, + "grad_norm": 0.05457738786935806, + "learning_rate": 1.3089241546155374e-06, + "loss": 0.0079, + "step": 180280 + }, + { + "epoch": 39.00144071927639, + "grad_norm": 1.8675646781921387, + "learning_rate": 1.3059151335704443e-06, + "loss": 0.0348, + "step": 180290 + }, + { + "epoch": 39.001494881655205, + "grad_norm": 0.001006747712381184, + "learning_rate": 1.3029061125253512e-06, + "loss": 0.0173, + "step": 180300 + }, + { + "epoch": 39.001549044034014, + "grad_norm": 0.0009955386631190777, + "learning_rate": 1.299897091480258e-06, + "loss": 0.0338, + "step": 180310 + }, + { + "epoch": 39.001603206412824, + "grad_norm": 0.0009452440426684916, + "learning_rate": 1.2968880704351648e-06, + "loss": 0.0008, + "step": 180320 + }, + { + "epoch": 39.00165736879164, + "grad_norm": 0.000761917675845325, + "learning_rate": 1.2938790493900715e-06, + "loss": 0.0119, + "step": 180330 + }, + { + "epoch": 39.00171153117045, + "grad_norm": 1.0108470916748047, + "learning_rate": 1.2908700283449784e-06, + "loss": 0.0104, + "step": 180340 + }, + { + "epoch": 39.00176569354926, + "grad_norm": 0.0007941372459754348, + "learning_rate": 1.2878610072998851e-06, + "loss": 0.0937, + "step": 180350 + }, + { + "epoch": 39.00181985592807, + "grad_norm": 0.02152411639690399, + "learning_rate": 1.284851986254792e-06, + "loss": 0.0807, + "step": 180360 + }, + { + "epoch": 39.001874018306886, + "grad_norm": 0.0007445669616572559, + "learning_rate": 1.2818429652096987e-06, + "loss": 0.0121, + "step": 180370 + }, + { + "epoch": 39.001928180685695, + "grad_norm": 0.000943600432947278, + "learning_rate": 1.2788339441646054e-06, + "loss": 0.0655, + "step": 180380 + }, + { + "epoch": 39.001982343064505, + "grad_norm": 0.0010451213456690311, + "learning_rate": 1.2758249231195123e-06, + "loss": 0.0112, + "step": 180390 + }, + { + "epoch": 39.00203650544332, + "grad_norm": 
0.0007894292939454317, + "learning_rate": 1.272815902074419e-06, + "loss": 0.0086, + "step": 180400 + }, + { + "epoch": 39.00209066782213, + "grad_norm": 0.0007472081342712045, + "learning_rate": 1.269806881029326e-06, + "loss": 0.0491, + "step": 180410 + }, + { + "epoch": 39.00214483020094, + "grad_norm": 0.0007461966597475111, + "learning_rate": 1.2667978599842326e-06, + "loss": 0.0137, + "step": 180420 + }, + { + "epoch": 39.00219899257976, + "grad_norm": 0.0011972583597525954, + "learning_rate": 1.2637888389391396e-06, + "loss": 0.035, + "step": 180430 + }, + { + "epoch": 39.00225315495857, + "grad_norm": 0.0026080808602273464, + "learning_rate": 1.2607798178940465e-06, + "loss": 0.0222, + "step": 180440 + }, + { + "epoch": 39.002307317337376, + "grad_norm": 0.0009698047651909292, + "learning_rate": 1.2577707968489532e-06, + "loss": 0.0006, + "step": 180450 + }, + { + "epoch": 39.00236147971619, + "grad_norm": 0.0022293103393167257, + "learning_rate": 1.25476177580386e-06, + "loss": 0.0565, + "step": 180460 + }, + { + "epoch": 39.002415642095, + "grad_norm": 0.0009869715431705117, + "learning_rate": 1.251752754758767e-06, + "loss": 0.0064, + "step": 180470 + }, + { + "epoch": 39.00246980447381, + "grad_norm": 0.000749244645703584, + "learning_rate": 1.2487437337136737e-06, + "loss": 0.0149, + "step": 180480 + }, + { + "epoch": 39.00252396685262, + "grad_norm": 0.0007483507506549358, + "learning_rate": 1.2457347126685806e-06, + "loss": 0.06, + "step": 180490 + }, + { + "epoch": 39.00257812923144, + "grad_norm": 0.000955216761212796, + "learning_rate": 1.2427256916234873e-06, + "loss": 0.0037, + "step": 180500 + }, + { + "epoch": 39.00263229161025, + "grad_norm": 0.0007329779909923673, + "learning_rate": 1.2397166705783942e-06, + "loss": 0.0109, + "step": 180510 + }, + { + "epoch": 39.00268645398906, + "grad_norm": 0.0008064220892265439, + "learning_rate": 1.236707649533301e-06, + "loss": 0.0204, + "step": 180520 + }, + { + "epoch": 39.00274061636787, + "grad_norm": 5.492773056030273, + "learning_rate": 1.2336986284882076e-06, + "loss": 0.048, + "step": 180530 + }, + { + "epoch": 39.00279477874668, + "grad_norm": 0.0007755148108117282, + "learning_rate": 1.2306896074431145e-06, + "loss": 0.0598, + "step": 180540 + }, + { + "epoch": 39.00284894112549, + "grad_norm": 2.7342686653137207, + "learning_rate": 1.2276805863980212e-06, + "loss": 0.0779, + "step": 180550 + }, + { + "epoch": 39.00290310350431, + "grad_norm": 0.0071947830729186535, + "learning_rate": 1.2246715653529281e-06, + "loss": 0.0407, + "step": 180560 + }, + { + "epoch": 39.00295726588312, + "grad_norm": 0.007929135113954544, + "learning_rate": 1.2216625443078348e-06, + "loss": 0.009, + "step": 180570 + }, + { + "epoch": 39.00301142826193, + "grad_norm": 0.7913433909416199, + "learning_rate": 1.2186535232627417e-06, + "loss": 0.0072, + "step": 180580 + }, + { + "epoch": 39.00306559064074, + "grad_norm": 0.001669116085395217, + "learning_rate": 1.2156445022176486e-06, + "loss": 0.028, + "step": 180590 + }, + { + "epoch": 39.003119753019554, + "grad_norm": 4.5070037841796875, + "learning_rate": 1.2126354811725553e-06, + "loss": 0.0687, + "step": 180600 + }, + { + "epoch": 39.003173915398364, + "grad_norm": 1.652976632118225, + "learning_rate": 1.2096264601274622e-06, + "loss": 0.0182, + "step": 180610 + }, + { + "epoch": 39.00322807777717, + "grad_norm": 0.001035531866364181, + "learning_rate": 1.2066174390823692e-06, + "loss": 0.0029, + "step": 180620 + }, + { + "epoch": 39.00328224015599, + "grad_norm": 0.0008097646059468389, 
+ "learning_rate": 1.2036084180372758e-06, + "loss": 0.0006, + "step": 180630 + }, + { + "epoch": 39.0033364025348, + "grad_norm": 0.0007358632865361869, + "learning_rate": 1.2005993969921828e-06, + "loss": 0.0663, + "step": 180640 + }, + { + "epoch": 39.00339056491361, + "grad_norm": 0.0012871078215539455, + "learning_rate": 1.1975903759470895e-06, + "loss": 0.0042, + "step": 180650 + }, + { + "epoch": 39.003444727292425, + "grad_norm": 0.0009361362317577004, + "learning_rate": 1.1945813549019962e-06, + "loss": 0.0195, + "step": 180660 + }, + { + "epoch": 39.003498889671235, + "grad_norm": 0.0012041091686114669, + "learning_rate": 1.191572333856903e-06, + "loss": 0.0368, + "step": 180670 + }, + { + "epoch": 39.003553052050044, + "grad_norm": 0.004197875503450632, + "learning_rate": 1.1885633128118098e-06, + "loss": 0.0331, + "step": 180680 + }, + { + "epoch": 39.00360721442886, + "grad_norm": 0.0007430299301631749, + "learning_rate": 1.1855542917667167e-06, + "loss": 0.0693, + "step": 180690 + }, + { + "epoch": 39.00366137680767, + "grad_norm": 0.005092596169561148, + "learning_rate": 1.1825452707216234e-06, + "loss": 0.0406, + "step": 180700 + }, + { + "epoch": 39.00371553918648, + "grad_norm": 0.001075602718628943, + "learning_rate": 1.1795362496765303e-06, + "loss": 0.0155, + "step": 180710 + }, + { + "epoch": 39.00376970156529, + "grad_norm": 0.0009951493702828884, + "learning_rate": 1.176527228631437e-06, + "loss": 0.0115, + "step": 180720 + }, + { + "epoch": 39.003823863944106, + "grad_norm": 0.0009957326110452414, + "learning_rate": 1.1735182075863439e-06, + "loss": 0.001, + "step": 180730 + }, + { + "epoch": 39.003878026322916, + "grad_norm": 1.1684154272079468, + "learning_rate": 1.1705091865412506e-06, + "loss": 0.0132, + "step": 180740 + }, + { + "epoch": 39.003932188701725, + "grad_norm": 1.2923260927200317, + "learning_rate": 1.1675001654961575e-06, + "loss": 0.0061, + "step": 180750 + }, + { + "epoch": 39.00398635108054, + "grad_norm": 0.0008165179751813412, + "learning_rate": 1.1644911444510644e-06, + "loss": 0.0514, + "step": 180760 + }, + { + "epoch": 39.00404051345935, + "grad_norm": 0.0009671638836152852, + "learning_rate": 1.1614821234059713e-06, + "loss": 0.0002, + "step": 180770 + }, + { + "epoch": 39.00409467583816, + "grad_norm": 0.0007425337098538876, + "learning_rate": 1.158473102360878e-06, + "loss": 0.0401, + "step": 180780 + }, + { + "epoch": 39.00414883821698, + "grad_norm": 0.0020775787997990847, + "learning_rate": 1.155464081315785e-06, + "loss": 0.0161, + "step": 180790 + }, + { + "epoch": 39.00420300059579, + "grad_norm": 8.296321868896484, + "learning_rate": 1.1524550602706916e-06, + "loss": 0.0799, + "step": 180800 + }, + { + "epoch": 39.004257162974596, + "grad_norm": 0.004624218679964542, + "learning_rate": 1.1494460392255983e-06, + "loss": 0.036, + "step": 180810 + }, + { + "epoch": 39.00431132535341, + "grad_norm": 0.6993679404258728, + "learning_rate": 1.1464370181805052e-06, + "loss": 0.0307, + "step": 180820 + }, + { + "epoch": 39.00436548773222, + "grad_norm": 0.0018242007354274392, + "learning_rate": 1.143427997135412e-06, + "loss": 0.0511, + "step": 180830 + }, + { + "epoch": 39.00441965011103, + "grad_norm": 0.0007457670872099698, + "learning_rate": 1.1404189760903188e-06, + "loss": 0.0131, + "step": 180840 + }, + { + "epoch": 39.00447381248984, + "grad_norm": 0.0007364354096353054, + "learning_rate": 1.1374099550452255e-06, + "loss": 0.0257, + "step": 180850 + }, + { + "epoch": 39.00452797486866, + "grad_norm": 0.0011668774532154202, + 
"learning_rate": 1.1344009340001324e-06, + "loss": 0.0273, + "step": 180860 + }, + { + "epoch": 39.00458213724747, + "grad_norm": 0.839964747428894, + "learning_rate": 1.1313919129550391e-06, + "loss": 0.0486, + "step": 180870 + }, + { + "epoch": 39.00463629962628, + "grad_norm": 0.0009427538025192916, + "learning_rate": 1.128382891909946e-06, + "loss": 0.0002, + "step": 180880 + }, + { + "epoch": 39.004690462005094, + "grad_norm": 0.6697298884391785, + "learning_rate": 1.1253738708648528e-06, + "loss": 0.0148, + "step": 180890 + }, + { + "epoch": 39.0047446243839, + "grad_norm": 0.04894113913178444, + "learning_rate": 1.1223648498197597e-06, + "loss": 0.0375, + "step": 180900 + }, + { + "epoch": 39.00479878676271, + "grad_norm": 0.011492591351270676, + "learning_rate": 1.1193558287746666e-06, + "loss": 0.0052, + "step": 180910 + }, + { + "epoch": 39.00485294914153, + "grad_norm": 0.45549410581588745, + "learning_rate": 1.1163468077295735e-06, + "loss": 0.0688, + "step": 180920 + }, + { + "epoch": 39.00490711152034, + "grad_norm": 0.0019099053461104631, + "learning_rate": 1.1133377866844802e-06, + "loss": 0.0952, + "step": 180930 + }, + { + "epoch": 39.00496127389915, + "grad_norm": 0.008982334285974503, + "learning_rate": 1.1103287656393869e-06, + "loss": 0.0106, + "step": 180940 + }, + { + "epoch": 39.00501543627796, + "grad_norm": 0.3121539354324341, + "learning_rate": 1.1073197445942938e-06, + "loss": 0.0189, + "step": 180950 + }, + { + "epoch": 39.005069598656775, + "grad_norm": 0.0010429442627355456, + "learning_rate": 1.1043107235492005e-06, + "loss": 0.0052, + "step": 180960 + }, + { + "epoch": 39.005123761035584, + "grad_norm": 0.3802167475223541, + "learning_rate": 1.1013017025041074e-06, + "loss": 0.0022, + "step": 180970 + }, + { + "epoch": 39.005177923414394, + "grad_norm": 0.037529800087213516, + "learning_rate": 1.098292681459014e-06, + "loss": 0.016, + "step": 180980 + }, + { + "epoch": 39.00523208579321, + "grad_norm": 0.0428200364112854, + "learning_rate": 1.095283660413921e-06, + "loss": 0.0443, + "step": 180990 + }, + { + "epoch": 39.00528624817202, + "grad_norm": 0.029981806874275208, + "learning_rate": 1.0922746393688277e-06, + "loss": 0.0017, + "step": 181000 + }, + { + "epoch": 39.00534041055083, + "grad_norm": 0.0009319759556092322, + "learning_rate": 1.0892656183237346e-06, + "loss": 0.0281, + "step": 181010 + }, + { + "epoch": 39.005394572929646, + "grad_norm": 0.8919660449028015, + "learning_rate": 1.0862565972786413e-06, + "loss": 0.0611, + "step": 181020 + }, + { + "epoch": 39.005448735308455, + "grad_norm": 3.88128399848938, + "learning_rate": 1.0832475762335482e-06, + "loss": 0.0354, + "step": 181030 + }, + { + "epoch": 39.005502897687265, + "grad_norm": 0.7745836973190308, + "learning_rate": 1.080238555188455e-06, + "loss": 0.0441, + "step": 181040 + }, + { + "epoch": 39.00555706006608, + "grad_norm": 0.09898444265127182, + "learning_rate": 1.0772295341433618e-06, + "loss": 0.0215, + "step": 181050 + }, + { + "epoch": 39.00561122244489, + "grad_norm": 0.0007402059272862971, + "learning_rate": 1.0742205130982687e-06, + "loss": 0.0455, + "step": 181060 + }, + { + "epoch": 39.0056653848237, + "grad_norm": 0.0009901105659082532, + "learning_rate": 1.0712114920531757e-06, + "loss": 0.0001, + "step": 181070 + }, + { + "epoch": 39.00571954720251, + "grad_norm": 4.456518173217773, + "learning_rate": 1.0682024710080823e-06, + "loss": 0.0961, + "step": 181080 + }, + { + "epoch": 39.00577370958133, + "grad_norm": 0.0009510400122962892, + "learning_rate": 
1.065193449962989e-06, + "loss": 0.034, + "step": 181090 + }, + { + "epoch": 39.005827871960136, + "grad_norm": 0.02486482262611389, + "learning_rate": 1.062184428917896e-06, + "loss": 0.0877, + "step": 181100 + }, + { + "epoch": 39.005882034338946, + "grad_norm": 0.0009848425397649407, + "learning_rate": 1.0591754078728027e-06, + "loss": 0.1654, + "step": 181110 + }, + { + "epoch": 39.00593619671776, + "grad_norm": 0.0010481799254193902, + "learning_rate": 1.0561663868277096e-06, + "loss": 0.0038, + "step": 181120 + }, + { + "epoch": 39.00599035909657, + "grad_norm": 0.0007508526323363185, + "learning_rate": 1.0531573657826163e-06, + "loss": 0.0271, + "step": 181130 + }, + { + "epoch": 39.00604452147538, + "grad_norm": 0.15195225179195404, + "learning_rate": 1.0501483447375232e-06, + "loss": 0.0191, + "step": 181140 + }, + { + "epoch": 39.0060986838542, + "grad_norm": 0.0008149508503265679, + "learning_rate": 1.0471393236924299e-06, + "loss": 0.0071, + "step": 181150 + }, + { + "epoch": 39.00615284623301, + "grad_norm": 0.0007470540585927665, + "learning_rate": 1.0441303026473368e-06, + "loss": 0.0005, + "step": 181160 + }, + { + "epoch": 39.00620700861182, + "grad_norm": 0.07296983897686005, + "learning_rate": 1.0411212816022435e-06, + "loss": 0.0203, + "step": 181170 + }, + { + "epoch": 39.006261170990626, + "grad_norm": 0.001325729419477284, + "learning_rate": 1.0381122605571504e-06, + "loss": 0.0279, + "step": 181180 + }, + { + "epoch": 39.00631533336944, + "grad_norm": 0.000990457832813263, + "learning_rate": 1.035103239512057e-06, + "loss": 0.0139, + "step": 181190 + }, + { + "epoch": 39.00636949574825, + "grad_norm": 0.001170012867078185, + "learning_rate": 1.032094218466964e-06, + "loss": 0.0246, + "step": 181200 + }, + { + "epoch": 39.00642365812706, + "grad_norm": 0.004547839984297752, + "learning_rate": 1.029085197421871e-06, + "loss": 0.0002, + "step": 181210 + }, + { + "epoch": 39.00647782050588, + "grad_norm": 0.0007336048292927444, + "learning_rate": 1.0260761763767778e-06, + "loss": 0.0033, + "step": 181220 + }, + { + "epoch": 39.00653198288469, + "grad_norm": 0.0009405285236425698, + "learning_rate": 1.0230671553316845e-06, + "loss": 0.0117, + "step": 181230 + }, + { + "epoch": 39.0065861452635, + "grad_norm": 0.0010130653390660882, + "learning_rate": 1.0200581342865912e-06, + "loss": 0.0148, + "step": 181240 + }, + { + "epoch": 39.006640307642314, + "grad_norm": 0.001245876308530569, + "learning_rate": 1.0170491132414981e-06, + "loss": 0.006, + "step": 181250 + }, + { + "epoch": 39.006694470021124, + "grad_norm": 0.0009568753303028643, + "learning_rate": 1.0140400921964048e-06, + "loss": 0.0004, + "step": 181260 + }, + { + "epoch": 39.00674863239993, + "grad_norm": 0.0007397029548883438, + "learning_rate": 1.0110310711513117e-06, + "loss": 0.0555, + "step": 181270 + }, + { + "epoch": 39.00680279477875, + "grad_norm": 1.003947138786316, + "learning_rate": 1.0080220501062184e-06, + "loss": 0.0195, + "step": 181280 + }, + { + "epoch": 39.00685695715756, + "grad_norm": 0.26080644130706787, + "learning_rate": 1.0050130290611253e-06, + "loss": 0.0126, + "step": 181290 + }, + { + "epoch": 39.00691111953637, + "grad_norm": 0.006591213867068291, + "learning_rate": 1.002004008016032e-06, + "loss": 0.0052, + "step": 181300 + }, + { + "epoch": 39.00696528191518, + "grad_norm": 0.0009500657324679196, + "learning_rate": 9.98994986970939e-07, + "loss": 0.0112, + "step": 181310 + }, + { + "epoch": 39.007019444293995, + "grad_norm": 0.7794575095176697, + "learning_rate": 
9.959859659258456e-07, + "loss": 0.1241, + "step": 181320 + }, + { + "epoch": 39.007073606672805, + "grad_norm": 0.0007375680725090206, + "learning_rate": 9.929769448807526e-07, + "loss": 0.0192, + "step": 181330 + }, + { + "epoch": 39.007127769051614, + "grad_norm": 1.0731185674667358, + "learning_rate": 9.899679238356593e-07, + "loss": 0.0554, + "step": 181340 + }, + { + "epoch": 39.00718193143043, + "grad_norm": 0.41580748558044434, + "learning_rate": 9.869589027905662e-07, + "loss": 0.0248, + "step": 181350 + }, + { + "epoch": 39.00723609380924, + "grad_norm": 0.0007690354832448065, + "learning_rate": 9.83949881745473e-07, + "loss": 0.0084, + "step": 181360 + }, + { + "epoch": 39.00729025618805, + "grad_norm": 0.000955988944042474, + "learning_rate": 9.809408607003798e-07, + "loss": 0.0241, + "step": 181370 + }, + { + "epoch": 39.007344418566866, + "grad_norm": 0.0009407959296368062, + "learning_rate": 9.779318396552867e-07, + "loss": 0.0282, + "step": 181380 + }, + { + "epoch": 39.007398580945676, + "grad_norm": 0.001404927228577435, + "learning_rate": 9.749228186101934e-07, + "loss": 0.0092, + "step": 181390 + }, + { + "epoch": 39.007452743324485, + "grad_norm": 1.4260616302490234, + "learning_rate": 9.719137975651003e-07, + "loss": 0.0512, + "step": 181400 + }, + { + "epoch": 39.0075069057033, + "grad_norm": 0.011831793002784252, + "learning_rate": 9.68904776520007e-07, + "loss": 0.0194, + "step": 181410 + }, + { + "epoch": 39.00756106808211, + "grad_norm": 2.2646279335021973, + "learning_rate": 9.65895755474914e-07, + "loss": 0.0474, + "step": 181420 + }, + { + "epoch": 39.00761523046092, + "grad_norm": 0.00363049004226923, + "learning_rate": 9.628867344298206e-07, + "loss": 0.0195, + "step": 181430 + }, + { + "epoch": 39.00766939283973, + "grad_norm": 0.6286881566047668, + "learning_rate": 9.598777133847275e-07, + "loss": 0.0227, + "step": 181440 + }, + { + "epoch": 39.00772355521855, + "grad_norm": 0.0007442540372721851, + "learning_rate": 9.568686923396342e-07, + "loss": 0.0315, + "step": 181450 + }, + { + "epoch": 39.00777771759736, + "grad_norm": 0.0027382434345781803, + "learning_rate": 9.538596712945411e-07, + "loss": 0.0001, + "step": 181460 + }, + { + "epoch": 39.007831879976166, + "grad_norm": 0.0007446819799952209, + "learning_rate": 9.508506502494479e-07, + "loss": 0.0087, + "step": 181470 + }, + { + "epoch": 39.00788604235498, + "grad_norm": 0.8095842599868774, + "learning_rate": 9.478416292043547e-07, + "loss": 0.0276, + "step": 181480 + }, + { + "epoch": 39.00794020473379, + "grad_norm": 1.018316388130188, + "learning_rate": 9.448326081592615e-07, + "loss": 0.024, + "step": 181490 + }, + { + "epoch": 39.0079943671126, + "grad_norm": 5.9106011390686035, + "learning_rate": 9.418235871141684e-07, + "loss": 0.0102, + "step": 181500 + }, + { + "epoch": 39.00804852949142, + "grad_norm": 1.0909910202026367, + "learning_rate": 9.388145660690751e-07, + "loss": 0.0732, + "step": 181510 + }, + { + "epoch": 39.00810269187023, + "grad_norm": 0.8703783750534058, + "learning_rate": 9.358055450239818e-07, + "loss": 0.0223, + "step": 181520 + }, + { + "epoch": 39.00815685424904, + "grad_norm": 1.5223400592803955, + "learning_rate": 9.327965239788887e-07, + "loss": 0.0498, + "step": 181530 + }, + { + "epoch": 39.00821101662785, + "grad_norm": 0.0007342213648371398, + "learning_rate": 9.297875029337955e-07, + "loss": 0.0407, + "step": 181540 + }, + { + "epoch": 39.00826517900666, + "grad_norm": 0.0007366276695393026, + "learning_rate": 9.267784818887023e-07, + "loss": 0.0146, + "step": 
181550 + }, + { + "epoch": 39.00831934138547, + "grad_norm": 1.0986229181289673, + "learning_rate": 9.237694608436092e-07, + "loss": 0.0244, + "step": 181560 + }, + { + "epoch": 39.00837350376428, + "grad_norm": 0.0010628016898408532, + "learning_rate": 9.207604397985161e-07, + "loss": 0.0158, + "step": 181570 + }, + { + "epoch": 39.0084276661431, + "grad_norm": 0.007801339961588383, + "learning_rate": 9.177514187534228e-07, + "loss": 0.0144, + "step": 181580 + }, + { + "epoch": 39.00848182852191, + "grad_norm": 0.6843697428703308, + "learning_rate": 9.147423977083297e-07, + "loss": 0.0375, + "step": 181590 + }, + { + "epoch": 39.00853599090072, + "grad_norm": 1.1354650259017944, + "learning_rate": 9.117333766632364e-07, + "loss": 0.0661, + "step": 181600 + }, + { + "epoch": 39.008590153279535, + "grad_norm": 0.0007364426855929196, + "learning_rate": 9.087243556181433e-07, + "loss": 0.075, + "step": 181610 + }, + { + "epoch": 39.008644315658344, + "grad_norm": 0.0011667407816275954, + "learning_rate": 9.0571533457305e-07, + "loss": 0.0365, + "step": 181620 + }, + { + "epoch": 39.008698478037154, + "grad_norm": 0.1404525637626648, + "learning_rate": 9.027063135279569e-07, + "loss": 0.0775, + "step": 181630 + }, + { + "epoch": 39.00875264041597, + "grad_norm": 0.0020205029286444187, + "learning_rate": 8.996972924828637e-07, + "loss": 0.0182, + "step": 181640 + }, + { + "epoch": 39.00880680279478, + "grad_norm": 0.0009400185663253069, + "learning_rate": 8.966882714377704e-07, + "loss": 0.019, + "step": 181650 + }, + { + "epoch": 39.00886096517359, + "grad_norm": 3.0316152572631836, + "learning_rate": 8.936792503926773e-07, + "loss": 0.0433, + "step": 181660 + }, + { + "epoch": 39.0089151275524, + "grad_norm": 0.0009554787538945675, + "learning_rate": 8.90670229347584e-07, + "loss": 0.0851, + "step": 181670 + }, + { + "epoch": 39.008969289931215, + "grad_norm": 0.003209690097719431, + "learning_rate": 8.876612083024909e-07, + "loss": 0.0186, + "step": 181680 + }, + { + "epoch": 39.009023452310025, + "grad_norm": 0.011153544299304485, + "learning_rate": 8.846521872573977e-07, + "loss": 0.0171, + "step": 181690 + }, + { + "epoch": 39.009077614688835, + "grad_norm": 0.0007337084389291704, + "learning_rate": 8.816431662123045e-07, + "loss": 0.0125, + "step": 181700 + }, + { + "epoch": 39.00913177706765, + "grad_norm": 0.0007811323157511652, + "learning_rate": 8.786341451672113e-07, + "loss": 0.0756, + "step": 181710 + }, + { + "epoch": 39.00918593944646, + "grad_norm": 1.4742411375045776, + "learning_rate": 8.756251241221182e-07, + "loss": 0.035, + "step": 181720 + }, + { + "epoch": 39.00924010182527, + "grad_norm": 2.169248580932617, + "learning_rate": 8.726161030770249e-07, + "loss": 0.0108, + "step": 181730 + }, + { + "epoch": 39.00929426420409, + "grad_norm": 0.122444286942482, + "learning_rate": 8.696070820319318e-07, + "loss": 0.0916, + "step": 181740 + }, + { + "epoch": 39.009348426582896, + "grad_norm": 0.35807129740715027, + "learning_rate": 8.665980609868385e-07, + "loss": 0.0791, + "step": 181750 + }, + { + "epoch": 39.009402588961706, + "grad_norm": 0.0009787800954654813, + "learning_rate": 8.635890399417454e-07, + "loss": 0.0138, + "step": 181760 + }, + { + "epoch": 39.00945675134052, + "grad_norm": 0.0007442581700161099, + "learning_rate": 8.605800188966521e-07, + "loss": 0.0241, + "step": 181770 + }, + { + "epoch": 39.00951091371933, + "grad_norm": 0.0038262836169451475, + "learning_rate": 8.575709978515591e-07, + "loss": 0.0108, + "step": 181780 + }, + { + "epoch": 
39.00956507609814, + "grad_norm": 0.001431515789590776, + "learning_rate": 8.545619768064659e-07, + "loss": 0.076, + "step": 181790 + }, + { + "epoch": 39.00961923847695, + "grad_norm": 0.0018611748237162828, + "learning_rate": 8.515529557613726e-07, + "loss": 0.0099, + "step": 181800 + }, + { + "epoch": 39.00967340085577, + "grad_norm": 0.0009983425261452794, + "learning_rate": 8.485439347162795e-07, + "loss": 0.0569, + "step": 181810 + }, + { + "epoch": 39.00972756323458, + "grad_norm": 0.4016736149787903, + "learning_rate": 8.455349136711862e-07, + "loss": 0.0039, + "step": 181820 + }, + { + "epoch": 39.00978172561339, + "grad_norm": 0.0014323507202789187, + "learning_rate": 8.425258926260931e-07, + "loss": 0.0427, + "step": 181830 + }, + { + "epoch": 39.0098358879922, + "grad_norm": 0.0012375052319839597, + "learning_rate": 8.395168715809998e-07, + "loss": 0.0062, + "step": 181840 + }, + { + "epoch": 39.00989005037101, + "grad_norm": 0.14044079184532166, + "learning_rate": 8.365078505359067e-07, + "loss": 0.0051, + "step": 181850 + }, + { + "epoch": 39.00994421274982, + "grad_norm": 0.009306766092777252, + "learning_rate": 8.334988294908135e-07, + "loss": 0.0059, + "step": 181860 + }, + { + "epoch": 39.00999837512864, + "grad_norm": 0.0007471213466487825, + "learning_rate": 8.304898084457204e-07, + "loss": 0.0078, + "step": 181870 + }, + { + "epoch": 39.01005253750745, + "grad_norm": 0.0009477491257712245, + "learning_rate": 8.274807874006271e-07, + "loss": 0.0245, + "step": 181880 + }, + { + "epoch": 39.01010669988626, + "grad_norm": 0.0007680452545173466, + "learning_rate": 8.24471766355534e-07, + "loss": 0.0001, + "step": 181890 + }, + { + "epoch": 39.01016086226507, + "grad_norm": 3.193352460861206, + "learning_rate": 8.214627453104407e-07, + "loss": 0.0343, + "step": 181900 + }, + { + "epoch": 39.010215024643884, + "grad_norm": 1.1112713813781738, + "learning_rate": 8.184537242653476e-07, + "loss": 0.0383, + "step": 181910 + }, + { + "epoch": 39.01026918702269, + "grad_norm": 0.0011550469789654016, + "learning_rate": 8.154447032202543e-07, + "loss": 0.0725, + "step": 181920 + }, + { + "epoch": 39.0103233494015, + "grad_norm": 0.002058476908132434, + "learning_rate": 8.124356821751612e-07, + "loss": 0.0328, + "step": 181930 + }, + { + "epoch": 39.01037751178032, + "grad_norm": 0.0007575304480269551, + "learning_rate": 8.09426661130068e-07, + "loss": 0.0153, + "step": 181940 + }, + { + "epoch": 39.01043167415913, + "grad_norm": 0.0017009705770760775, + "learning_rate": 8.064176400849747e-07, + "loss": 0.0261, + "step": 181950 + }, + { + "epoch": 39.01048583653794, + "grad_norm": 0.0007510128780268133, + "learning_rate": 8.034086190398816e-07, + "loss": 0.0256, + "step": 181960 + }, + { + "epoch": 39.010539998916755, + "grad_norm": 0.39548546075820923, + "learning_rate": 8.003995979947883e-07, + "loss": 0.028, + "step": 181970 + }, + { + "epoch": 39.010594161295565, + "grad_norm": 0.0012771760812029243, + "learning_rate": 7.973905769496952e-07, + "loss": 0.0183, + "step": 181980 + }, + { + "epoch": 39.010648323674374, + "grad_norm": 0.0012886157492175698, + "learning_rate": 7.943815559046019e-07, + "loss": 0.0288, + "step": 181990 + }, + { + "epoch": 39.01070248605319, + "grad_norm": 0.0010019204346463084, + "learning_rate": 7.913725348595088e-07, + "loss": 0.0207, + "step": 182000 + }, + { + "epoch": 39.010756648432, + "grad_norm": 0.0013622618280351162, + "learning_rate": 7.883635138144157e-07, + "loss": 0.0458, + "step": 182010 + }, + { + "epoch": 39.01081081081081, + "grad_norm": 
0.0008949125185608864, + "learning_rate": 7.853544927693226e-07, + "loss": 0.0228, + "step": 182020 + }, + { + "epoch": 39.01086497318962, + "grad_norm": 0.011933318339288235, + "learning_rate": 7.823454717242293e-07, + "loss": 0.0193, + "step": 182030 + }, + { + "epoch": 39.010919135568436, + "grad_norm": 1.6696914434432983, + "learning_rate": 7.793364506791361e-07, + "loss": 0.0466, + "step": 182040 + }, + { + "epoch": 39.010973297947245, + "grad_norm": 0.000921955390367657, + "learning_rate": 7.763274296340429e-07, + "loss": 0.0142, + "step": 182050 + }, + { + "epoch": 39.011027460326055, + "grad_norm": 0.0015708903083577752, + "learning_rate": 7.733184085889497e-07, + "loss": 0.0257, + "step": 182060 + }, + { + "epoch": 39.01108162270487, + "grad_norm": 0.4390842914581299, + "learning_rate": 7.703093875438565e-07, + "loss": 0.0318, + "step": 182070 + }, + { + "epoch": 39.01113578508368, + "grad_norm": 0.016590530052781105, + "learning_rate": 7.673003664987633e-07, + "loss": 0.0173, + "step": 182080 + }, + { + "epoch": 39.01118994746249, + "grad_norm": 2.9802403450012207, + "learning_rate": 7.642913454536702e-07, + "loss": 0.1377, + "step": 182090 + }, + { + "epoch": 39.01124410984131, + "grad_norm": 2.9687488079071045, + "learning_rate": 7.61282324408577e-07, + "loss": 0.0484, + "step": 182100 + }, + { + "epoch": 39.01129827222012, + "grad_norm": 0.0007437170716002584, + "learning_rate": 7.582733033634838e-07, + "loss": 0.0229, + "step": 182110 + }, + { + "epoch": 39.011352434598926, + "grad_norm": 0.0018337317742407322, + "learning_rate": 7.552642823183906e-07, + "loss": 0.0138, + "step": 182120 + }, + { + "epoch": 39.011406596977736, + "grad_norm": 0.0007311734952963889, + "learning_rate": 7.522552612732974e-07, + "loss": 0.0392, + "step": 182130 + }, + { + "epoch": 39.01146075935655, + "grad_norm": 0.0007514613098464906, + "learning_rate": 7.492462402282042e-07, + "loss": 0.0685, + "step": 182140 + }, + { + "epoch": 39.01151492173536, + "grad_norm": 0.0009996547596529126, + "learning_rate": 7.46237219183111e-07, + "loss": 0.0088, + "step": 182150 + }, + { + "epoch": 39.01156908411417, + "grad_norm": 0.0011406998382881284, + "learning_rate": 7.432281981380178e-07, + "loss": 0.0165, + "step": 182160 + }, + { + "epoch": 39.01162324649299, + "grad_norm": 4.560863494873047, + "learning_rate": 7.402191770929246e-07, + "loss": 0.1005, + "step": 182170 + }, + { + "epoch": 39.0116774088718, + "grad_norm": 0.0007595621864311397, + "learning_rate": 7.372101560478314e-07, + "loss": 0.0, + "step": 182180 + }, + { + "epoch": 39.01173157125061, + "grad_norm": 0.01367371715605259, + "learning_rate": 7.342011350027382e-07, + "loss": 0.0087, + "step": 182190 + }, + { + "epoch": 39.011785733629424, + "grad_norm": 0.000733995228074491, + "learning_rate": 7.31192113957645e-07, + "loss": 0.0049, + "step": 182200 + }, + { + "epoch": 39.01183989600823, + "grad_norm": 0.0007557041826657951, + "learning_rate": 7.281830929125518e-07, + "loss": 0.0406, + "step": 182210 + }, + { + "epoch": 39.01189405838704, + "grad_norm": 0.006184575147926807, + "learning_rate": 7.251740718674586e-07, + "loss": 0.0298, + "step": 182220 + }, + { + "epoch": 39.01194822076586, + "grad_norm": 0.0007361685275100172, + "learning_rate": 7.221650508223654e-07, + "loss": 0.0129, + "step": 182230 + }, + { + "epoch": 39.01200238314467, + "grad_norm": 0.001169301918707788, + "learning_rate": 7.191560297772724e-07, + "loss": 0.0483, + "step": 182240 + }, + { + "epoch": 39.01205654552348, + "grad_norm": 0.007794172968715429, + 
"learning_rate": 7.161470087321792e-07, + "loss": 0.0154, + "step": 182250 + }, + { + "epoch": 39.01211070790229, + "grad_norm": 0.0009816226083785295, + "learning_rate": 7.13137987687086e-07, + "loss": 0.0015, + "step": 182260 + }, + { + "epoch": 39.012164870281104, + "grad_norm": 0.04409128054976463, + "learning_rate": 7.101289666419928e-07, + "loss": 0.0208, + "step": 182270 + }, + { + "epoch": 39.012219032659914, + "grad_norm": 0.11827891319990158, + "learning_rate": 7.071199455968996e-07, + "loss": 0.0578, + "step": 182280 + }, + { + "epoch": 39.01227319503872, + "grad_norm": 0.8660979866981506, + "learning_rate": 7.041109245518063e-07, + "loss": 0.0649, + "step": 182290 + }, + { + "epoch": 39.01232735741754, + "grad_norm": 0.0008353855810128152, + "learning_rate": 7.011019035067131e-07, + "loss": 0.0021, + "step": 182300 + }, + { + "epoch": 39.01238151979635, + "grad_norm": 0.580409586429596, + "learning_rate": 6.9809288246162e-07, + "loss": 0.0333, + "step": 182310 + }, + { + "epoch": 39.01243568217516, + "grad_norm": 1.3818058967590332, + "learning_rate": 6.950838614165268e-07, + "loss": 0.0125, + "step": 182320 + }, + { + "epoch": 39.012489844553976, + "grad_norm": 0.8835129141807556, + "learning_rate": 6.920748403714336e-07, + "loss": 0.0179, + "step": 182330 + }, + { + "epoch": 39.012544006932785, + "grad_norm": 0.05930306389927864, + "learning_rate": 6.890658193263404e-07, + "loss": 0.0459, + "step": 182340 + }, + { + "epoch": 39.012598169311595, + "grad_norm": 0.3209381103515625, + "learning_rate": 6.860567982812472e-07, + "loss": 0.0084, + "step": 182350 + }, + { + "epoch": 39.01265233169041, + "grad_norm": 0.00074180489173159, + "learning_rate": 6.83047777236154e-07, + "loss": 0.0005, + "step": 182360 + }, + { + "epoch": 39.01270649406922, + "grad_norm": 0.000745896715670824, + "learning_rate": 6.800387561910608e-07, + "loss": 0.0583, + "step": 182370 + }, + { + "epoch": 39.01276065644803, + "grad_norm": 0.0007491673459298909, + "learning_rate": 6.770297351459676e-07, + "loss": 0.0355, + "step": 182380 + }, + { + "epoch": 39.01281481882684, + "grad_norm": 0.0007486623944714665, + "learning_rate": 6.740207141008745e-07, + "loss": 0.0072, + "step": 182390 + }, + { + "epoch": 39.012868981205656, + "grad_norm": 0.0007519989740103483, + "learning_rate": 6.710116930557813e-07, + "loss": 0.0085, + "step": 182400 + }, + { + "epoch": 39.012923143584466, + "grad_norm": 1.2593815326690674, + "learning_rate": 6.680026720106881e-07, + "loss": 0.0377, + "step": 182410 + }, + { + "epoch": 39.012977305963275, + "grad_norm": 0.0007404410862363875, + "learning_rate": 6.649936509655949e-07, + "loss": 0.0001, + "step": 182420 + }, + { + "epoch": 39.01303146834209, + "grad_norm": 0.0008781190845184028, + "learning_rate": 6.619846299205016e-07, + "loss": 0.0407, + "step": 182430 + }, + { + "epoch": 39.0130856307209, + "grad_norm": 0.0007535560871474445, + "learning_rate": 6.589756088754084e-07, + "loss": 0.0281, + "step": 182440 + }, + { + "epoch": 39.01313979309971, + "grad_norm": 0.004785594996064901, + "learning_rate": 6.559665878303152e-07, + "loss": 0.0412, + "step": 182450 + }, + { + "epoch": 39.01319395547853, + "grad_norm": 1.3173469305038452, + "learning_rate": 6.529575667852222e-07, + "loss": 0.0211, + "step": 182460 + }, + { + "epoch": 39.01324811785734, + "grad_norm": 0.0007859129691496491, + "learning_rate": 6.49948545740129e-07, + "loss": 0.0108, + "step": 182470 + }, + { + "epoch": 39.01330228023615, + "grad_norm": 0.0010813198750838637, + "learning_rate": 6.469395246950358e-07, + 
"loss": 0.0215, + "step": 182480 + }, + { + "epoch": 39.013356442614956, + "grad_norm": 3.824429750442505, + "learning_rate": 6.439305036499426e-07, + "loss": 0.0245, + "step": 182490 + }, + { + "epoch": 39.01341060499377, + "grad_norm": 0.006243702955543995, + "learning_rate": 6.409214826048494e-07, + "loss": 0.0355, + "step": 182500 + }, + { + "epoch": 39.01346476737258, + "grad_norm": 4.17315149307251, + "learning_rate": 6.379124615597562e-07, + "loss": 0.0417, + "step": 182510 + }, + { + "epoch": 39.01351892975139, + "grad_norm": 0.0009844766464084387, + "learning_rate": 6.34903440514663e-07, + "loss": 0.0143, + "step": 182520 + }, + { + "epoch": 39.01357309213021, + "grad_norm": 1.1825971603393555, + "learning_rate": 6.318944194695698e-07, + "loss": 0.0925, + "step": 182530 + }, + { + "epoch": 39.01362725450902, + "grad_norm": 0.9902319312095642, + "learning_rate": 6.288853984244766e-07, + "loss": 0.0048, + "step": 182540 + }, + { + "epoch": 39.01368141688783, + "grad_norm": 0.0010355408303439617, + "learning_rate": 6.258763773793835e-07, + "loss": 0.0458, + "step": 182550 + }, + { + "epoch": 39.013735579266644, + "grad_norm": 0.11683783680200577, + "learning_rate": 6.228673563342903e-07, + "loss": 0.0206, + "step": 182560 + }, + { + "epoch": 39.01378974164545, + "grad_norm": 0.557627260684967, + "learning_rate": 6.198583352891971e-07, + "loss": 0.0271, + "step": 182570 + }, + { + "epoch": 39.01384390402426, + "grad_norm": 0.0007489745621569455, + "learning_rate": 6.168493142441038e-07, + "loss": 0.0341, + "step": 182580 + }, + { + "epoch": 39.01389806640308, + "grad_norm": 0.0007549148285761476, + "learning_rate": 6.138402931990106e-07, + "loss": 0.0224, + "step": 182590 + }, + { + "epoch": 39.01395222878189, + "grad_norm": 0.0009646957041695714, + "learning_rate": 6.108312721539174e-07, + "loss": 0.016, + "step": 182600 + }, + { + "epoch": 39.0140063911607, + "grad_norm": 0.0007487506372854114, + "learning_rate": 6.078222511088243e-07, + "loss": 0.066, + "step": 182610 + }, + { + "epoch": 39.01406055353951, + "grad_norm": 0.21362410485744476, + "learning_rate": 6.048132300637311e-07, + "loss": 0.0011, + "step": 182620 + }, + { + "epoch": 39.014114715918325, + "grad_norm": 0.0016373989637941122, + "learning_rate": 6.018042090186379e-07, + "loss": 0.0147, + "step": 182630 + }, + { + "epoch": 39.014168878297134, + "grad_norm": 0.0007387863588519394, + "learning_rate": 5.987951879735447e-07, + "loss": 0.0116, + "step": 182640 + }, + { + "epoch": 39.014223040675944, + "grad_norm": 0.001140178763307631, + "learning_rate": 5.957861669284515e-07, + "loss": 0.0152, + "step": 182650 + }, + { + "epoch": 39.01427720305476, + "grad_norm": 0.00245620752684772, + "learning_rate": 5.927771458833583e-07, + "loss": 0.0162, + "step": 182660 + }, + { + "epoch": 39.01433136543357, + "grad_norm": 0.0366162545979023, + "learning_rate": 5.897681248382651e-07, + "loss": 0.0082, + "step": 182670 + }, + { + "epoch": 39.01438552781238, + "grad_norm": 0.0009686044650152326, + "learning_rate": 5.867591037931719e-07, + "loss": 0.0161, + "step": 182680 + }, + { + "epoch": 39.014439690191196, + "grad_norm": 0.4820201098918915, + "learning_rate": 5.837500827480787e-07, + "loss": 0.0436, + "step": 182690 + }, + { + "epoch": 39.014493852570006, + "grad_norm": 0.6982743740081787, + "learning_rate": 5.807410617029857e-07, + "loss": 0.0691, + "step": 182700 + }, + { + "epoch": 39.014548014948815, + "grad_norm": 0.0007344472105614841, + "learning_rate": 5.777320406578925e-07, + "loss": 0.0172, + "step": 182710 + }, + { + 
"epoch": 39.01460217732763, + "grad_norm": 0.0007407709490507841, + "learning_rate": 5.747230196127992e-07, + "loss": 0.0545, + "step": 182720 + }, + { + "epoch": 39.01465633970644, + "grad_norm": 0.0007427984965033829, + "learning_rate": 5.71713998567706e-07, + "loss": 0.0009, + "step": 182730 + }, + { + "epoch": 39.01471050208525, + "grad_norm": 0.0009572673006914556, + "learning_rate": 5.687049775226128e-07, + "loss": 0.0228, + "step": 182740 + }, + { + "epoch": 39.01476466446406, + "grad_norm": 1.0083858966827393, + "learning_rate": 5.656959564775196e-07, + "loss": 0.0274, + "step": 182750 + }, + { + "epoch": 39.01481882684288, + "grad_norm": 0.0013700125273317099, + "learning_rate": 5.626869354324264e-07, + "loss": 0.01, + "step": 182760 + }, + { + "epoch": 39.014872989221686, + "grad_norm": 0.0011813852470368147, + "learning_rate": 5.596779143873333e-07, + "loss": 0.0099, + "step": 182770 + }, + { + "epoch": 39.014927151600496, + "grad_norm": 0.9180886149406433, + "learning_rate": 5.566688933422401e-07, + "loss": 0.0119, + "step": 182780 + }, + { + "epoch": 39.01498131397931, + "grad_norm": 0.25836098194122314, + "learning_rate": 5.536598722971469e-07, + "loss": 0.032, + "step": 182790 + }, + { + "epoch": 39.01503547635812, + "grad_norm": 0.11241437494754791, + "learning_rate": 5.506508512520537e-07, + "loss": 0.0086, + "step": 182800 + }, + { + "epoch": 39.01508963873693, + "grad_norm": 3.4762372970581055, + "learning_rate": 5.476418302069605e-07, + "loss": 0.0479, + "step": 182810 + }, + { + "epoch": 39.01514380111575, + "grad_norm": 0.2176937609910965, + "learning_rate": 5.446328091618673e-07, + "loss": 0.012, + "step": 182820 + }, + { + "epoch": 39.01519796349456, + "grad_norm": 0.0020684003829956055, + "learning_rate": 5.416237881167741e-07, + "loss": 0.0148, + "step": 182830 + }, + { + "epoch": 39.01525212587337, + "grad_norm": 0.3733072876930237, + "learning_rate": 5.386147670716809e-07, + "loss": 0.0286, + "step": 182840 + }, + { + "epoch": 39.01530628825218, + "grad_norm": 0.0053901649080216885, + "learning_rate": 5.356057460265878e-07, + "loss": 0.0464, + "step": 182850 + }, + { + "epoch": 39.01536045063099, + "grad_norm": 1.3770297765731812, + "learning_rate": 5.325967249814945e-07, + "loss": 0.0309, + "step": 182860 + }, + { + "epoch": 39.0154146130098, + "grad_norm": 0.0007824733038432896, + "learning_rate": 5.295877039364013e-07, + "loss": 0.0562, + "step": 182870 + }, + { + "epoch": 39.01546877538861, + "grad_norm": 0.0007376361172646284, + "learning_rate": 5.265786828913081e-07, + "loss": 0.013, + "step": 182880 + }, + { + "epoch": 39.01552293776743, + "grad_norm": 0.02202259562909603, + "learning_rate": 5.235696618462149e-07, + "loss": 0.0619, + "step": 182890 + }, + { + "epoch": 39.01557710014624, + "grad_norm": 0.009379063732922077, + "learning_rate": 5.205606408011217e-07, + "loss": 0.0005, + "step": 182900 + }, + { + "epoch": 39.01563126252505, + "grad_norm": 0.0009696563938632607, + "learning_rate": 5.175516197560285e-07, + "loss": 0.0203, + "step": 182910 + }, + { + "epoch": 39.015685424903864, + "grad_norm": 0.0013664441648870707, + "learning_rate": 5.145425987109355e-07, + "loss": 0.0009, + "step": 182920 + }, + { + "epoch": 39.015739587282674, + "grad_norm": 4.642022609710693, + "learning_rate": 5.115335776658423e-07, + "loss": 0.0877, + "step": 182930 + }, + { + "epoch": 39.01579374966148, + "grad_norm": 0.0016805485356599092, + "learning_rate": 5.085245566207491e-07, + "loss": 0.0434, + "step": 182940 + }, + { + "epoch": 39.0158479120403, + "grad_norm": 
0.0007473534788005054, + "learning_rate": 5.055155355756559e-07, + "loss": 0.0378, + "step": 182950 + }, + { + "epoch": 39.01590207441911, + "grad_norm": 0.0009551823604851961, + "learning_rate": 5.025065145305627e-07, + "loss": 0.066, + "step": 182960 + }, + { + "epoch": 39.01595623679792, + "grad_norm": 0.925957441329956, + "learning_rate": 4.994974934854695e-07, + "loss": 0.0102, + "step": 182970 + }, + { + "epoch": 39.01601039917673, + "grad_norm": 0.0007449653348885477, + "learning_rate": 4.964884724403763e-07, + "loss": 0.0252, + "step": 182980 + }, + { + "epoch": 39.016064561555545, + "grad_norm": 0.0010952678276225924, + "learning_rate": 4.934794513952831e-07, + "loss": 0.043, + "step": 182990 + }, + { + "epoch": 39.016118723934355, + "grad_norm": 1.0558408498764038, + "learning_rate": 4.904704303501899e-07, + "loss": 0.0038, + "step": 183000 + }, + { + "epoch": 39.016172886313164, + "grad_norm": 0.0009284614934585989, + "learning_rate": 4.874614093050967e-07, + "loss": 0.0732, + "step": 183010 + }, + { + "epoch": 39.01622704869198, + "grad_norm": 0.0007574326591566205, + "learning_rate": 4.844523882600035e-07, + "loss": 0.0736, + "step": 183020 + }, + { + "epoch": 39.01628121107079, + "grad_norm": 0.001149323652498424, + "learning_rate": 4.814433672149103e-07, + "loss": 0.0346, + "step": 183030 + }, + { + "epoch": 39.0163353734496, + "grad_norm": 0.0007368244114331901, + "learning_rate": 4.784343461698171e-07, + "loss": 0.0013, + "step": 183040 + }, + { + "epoch": 39.01638953582842, + "grad_norm": 0.6254088282585144, + "learning_rate": 4.7542532512472396e-07, + "loss": 0.0416, + "step": 183050 + }, + { + "epoch": 39.016443698207226, + "grad_norm": 0.0021849244367331266, + "learning_rate": 4.7241630407963076e-07, + "loss": 0.047, + "step": 183060 + }, + { + "epoch": 39.016497860586036, + "grad_norm": 0.2800041735172272, + "learning_rate": 4.6940728303453757e-07, + "loss": 0.0233, + "step": 183070 + }, + { + "epoch": 39.01655202296485, + "grad_norm": 0.0007428014650940895, + "learning_rate": 4.6639826198944437e-07, + "loss": 0.0169, + "step": 183080 + }, + { + "epoch": 39.01660618534366, + "grad_norm": 4.050756931304932, + "learning_rate": 4.633892409443512e-07, + "loss": 0.0802, + "step": 183090 + }, + { + "epoch": 39.01666034772247, + "grad_norm": 1.2406554222106934, + "learning_rate": 4.6038021989925803e-07, + "loss": 0.0148, + "step": 183100 + }, + { + "epoch": 39.01671451010128, + "grad_norm": 0.0007621179101988673, + "learning_rate": 4.5737119885416484e-07, + "loss": 0.0344, + "step": 183110 + }, + { + "epoch": 39.0167686724801, + "grad_norm": 0.0009816151577979326, + "learning_rate": 4.5436217780907164e-07, + "loss": 0.0001, + "step": 183120 + }, + { + "epoch": 39.01682283485891, + "grad_norm": 0.0007248587207868695, + "learning_rate": 4.5135315676397844e-07, + "loss": 0.0044, + "step": 183130 + }, + { + "epoch": 39.016876997237716, + "grad_norm": 0.0016310986829921603, + "learning_rate": 4.483441357188852e-07, + "loss": 0.0094, + "step": 183140 + }, + { + "epoch": 39.01693115961653, + "grad_norm": 0.004796566907316446, + "learning_rate": 4.45335114673792e-07, + "loss": 0.0014, + "step": 183150 + }, + { + "epoch": 39.01698532199534, + "grad_norm": 0.000730058120097965, + "learning_rate": 4.4232609362869886e-07, + "loss": 0.0027, + "step": 183160 + }, + { + "epoch": 39.01703948437415, + "grad_norm": 0.0009346532169729471, + "learning_rate": 4.3931707258360566e-07, + "loss": 0.0047, + "step": 183170 + }, + { + "epoch": 39.01709364675297, + "grad_norm": 4.789468288421631, + 
"learning_rate": 4.3630805153851246e-07, + "loss": 0.1237, + "step": 183180 + }, + { + "epoch": 39.01714780913178, + "grad_norm": 3.215933084487915, + "learning_rate": 4.3329903049341927e-07, + "loss": 0.0313, + "step": 183190 + }, + { + "epoch": 39.01720197151059, + "grad_norm": 0.0007316338014788926, + "learning_rate": 4.3029000944832607e-07, + "loss": 0.0046, + "step": 183200 + }, + { + "epoch": 39.0172561338894, + "grad_norm": 0.40809762477874756, + "learning_rate": 4.2728098840323293e-07, + "loss": 0.025, + "step": 183210 + }, + { + "epoch": 39.017310296268214, + "grad_norm": 0.13514584302902222, + "learning_rate": 4.2427196735813973e-07, + "loss": 0.0596, + "step": 183220 + }, + { + "epoch": 39.01736445864702, + "grad_norm": 0.01715896651148796, + "learning_rate": 4.2126294631304654e-07, + "loss": 0.0486, + "step": 183230 + }, + { + "epoch": 39.01741862102583, + "grad_norm": 1.7685332298278809, + "learning_rate": 4.1825392526795334e-07, + "loss": 0.0064, + "step": 183240 + }, + { + "epoch": 39.01747278340465, + "grad_norm": 0.0009797700913622975, + "learning_rate": 4.152449042228602e-07, + "loss": 0.0094, + "step": 183250 + }, + { + "epoch": 39.01752694578346, + "grad_norm": 0.006118379067629576, + "learning_rate": 4.12235883177767e-07, + "loss": 0.0256, + "step": 183260 + }, + { + "epoch": 39.01758110816227, + "grad_norm": 0.0009279194055125117, + "learning_rate": 4.092268621326738e-07, + "loss": 0.0549, + "step": 183270 + }, + { + "epoch": 39.017635270541085, + "grad_norm": 0.6017332673072815, + "learning_rate": 4.062178410875806e-07, + "loss": 0.013, + "step": 183280 + }, + { + "epoch": 39.017689432919894, + "grad_norm": 0.0009687431156635284, + "learning_rate": 4.0320882004248736e-07, + "loss": 0.0602, + "step": 183290 + }, + { + "epoch": 39.017743595298704, + "grad_norm": 0.0007417803863063455, + "learning_rate": 4.0019979899739417e-07, + "loss": 0.0199, + "step": 183300 + }, + { + "epoch": 39.01779775767752, + "grad_norm": 0.004787389654666185, + "learning_rate": 3.9719077795230097e-07, + "loss": 0.0558, + "step": 183310 + }, + { + "epoch": 39.01785192005633, + "grad_norm": 2.3752152919769287, + "learning_rate": 3.941817569072078e-07, + "loss": 0.0453, + "step": 183320 + }, + { + "epoch": 39.01790608243514, + "grad_norm": 0.0007359642768278718, + "learning_rate": 3.9117273586211463e-07, + "loss": 0.0127, + "step": 183330 + }, + { + "epoch": 39.01796024481395, + "grad_norm": 0.9538353681564331, + "learning_rate": 3.8816371481702143e-07, + "loss": 0.0556, + "step": 183340 + }, + { + "epoch": 39.018014407192766, + "grad_norm": 0.11475551128387451, + "learning_rate": 3.8515469377192824e-07, + "loss": 0.0058, + "step": 183350 + }, + { + "epoch": 39.018068569571575, + "grad_norm": 1.8266857862472534, + "learning_rate": 3.821456727268351e-07, + "loss": 0.0337, + "step": 183360 + }, + { + "epoch": 39.018122731950385, + "grad_norm": 0.005567215848714113, + "learning_rate": 3.791366516817419e-07, + "loss": 0.0576, + "step": 183370 + }, + { + "epoch": 39.0181768943292, + "grad_norm": 0.0007344994810409844, + "learning_rate": 3.761276306366487e-07, + "loss": 0.0183, + "step": 183380 + }, + { + "epoch": 39.01823105670801, + "grad_norm": 0.0010435794247314334, + "learning_rate": 3.731186095915555e-07, + "loss": 0.0076, + "step": 183390 + }, + { + "epoch": 39.01828521908682, + "grad_norm": 0.0010461201891303062, + "learning_rate": 3.701095885464623e-07, + "loss": 0.072, + "step": 183400 + }, + { + "epoch": 39.01833938146564, + "grad_norm": 0.0007772684912197292, + "learning_rate": 
3.671005675013691e-07, + "loss": 0.0006, + "step": 183410 + }, + { + "epoch": 39.01839354384445, + "grad_norm": 0.0007395752472802997, + "learning_rate": 3.640915464562759e-07, + "loss": 0.0104, + "step": 183420 + }, + { + "epoch": 39.018447706223256, + "grad_norm": 0.02133157290518284, + "learning_rate": 3.610825254111827e-07, + "loss": 0.0162, + "step": 183430 + }, + { + "epoch": 39.018501868602065, + "grad_norm": 0.7438943982124329, + "learning_rate": 3.580735043660896e-07, + "loss": 0.0109, + "step": 183440 + }, + { + "epoch": 39.01855603098088, + "grad_norm": 0.756751537322998, + "learning_rate": 3.550644833209964e-07, + "loss": 0.0422, + "step": 183450 + }, + { + "epoch": 39.01861019335969, + "grad_norm": 0.03537726774811745, + "learning_rate": 3.5205546227590314e-07, + "loss": 0.0094, + "step": 183460 + }, + { + "epoch": 39.0186643557385, + "grad_norm": 0.0007295958930626512, + "learning_rate": 3.4904644123081e-07, + "loss": 0.0006, + "step": 183470 + }, + { + "epoch": 39.01871851811732, + "grad_norm": 0.0021349703893065453, + "learning_rate": 3.460374201857168e-07, + "loss": 0.0403, + "step": 183480 + }, + { + "epoch": 39.01877268049613, + "grad_norm": 0.002203306183218956, + "learning_rate": 3.430283991406236e-07, + "loss": 0.0492, + "step": 183490 + }, + { + "epoch": 39.01882684287494, + "grad_norm": 0.0013049226254224777, + "learning_rate": 3.400193780955304e-07, + "loss": 0.0457, + "step": 183500 + }, + { + "epoch": 39.01888100525375, + "grad_norm": 3.1927073001861572, + "learning_rate": 3.3701035705043726e-07, + "loss": 0.0453, + "step": 183510 + }, + { + "epoch": 39.01893516763256, + "grad_norm": 0.04235835745930672, + "learning_rate": 3.3400133600534407e-07, + "loss": 0.0067, + "step": 183520 + }, + { + "epoch": 39.01898933001137, + "grad_norm": 0.0007423833012580872, + "learning_rate": 3.309923149602508e-07, + "loss": 0.0213, + "step": 183530 + }, + { + "epoch": 39.01904349239019, + "grad_norm": 0.0007408801466226578, + "learning_rate": 3.279832939151576e-07, + "loss": 0.0715, + "step": 183540 + }, + { + "epoch": 39.019097654769, + "grad_norm": 0.0007588813314214349, + "learning_rate": 3.249742728700645e-07, + "loss": 0.0795, + "step": 183550 + }, + { + "epoch": 39.01915181714781, + "grad_norm": 0.0010555306216701865, + "learning_rate": 3.219652518249713e-07, + "loss": 0.0071, + "step": 183560 + }, + { + "epoch": 39.01920597952662, + "grad_norm": 0.05050152540206909, + "learning_rate": 3.189562307798781e-07, + "loss": 0.0106, + "step": 183570 + }, + { + "epoch": 39.019260141905434, + "grad_norm": 0.01205587387084961, + "learning_rate": 3.159472097347849e-07, + "loss": 0.0291, + "step": 183580 + }, + { + "epoch": 39.019314304284244, + "grad_norm": 0.0007308484055101871, + "learning_rate": 3.1293818868969175e-07, + "loss": 0.0085, + "step": 183590 + }, + { + "epoch": 39.01936846666305, + "grad_norm": 0.2671773135662079, + "learning_rate": 3.0992916764459855e-07, + "loss": 0.0388, + "step": 183600 + }, + { + "epoch": 39.01942262904187, + "grad_norm": 0.000943589024245739, + "learning_rate": 3.069201465995053e-07, + "loss": 0.0458, + "step": 183610 + }, + { + "epoch": 39.01947679142068, + "grad_norm": 0.4236644506454468, + "learning_rate": 3.0391112555441216e-07, + "loss": 0.0038, + "step": 183620 + }, + { + "epoch": 39.01953095379949, + "grad_norm": 0.0011536093661561608, + "learning_rate": 3.0090210450931896e-07, + "loss": 0.063, + "step": 183630 + }, + { + "epoch": 39.019585116178305, + "grad_norm": 0.0007464765221811831, + "learning_rate": 2.9789308346422577e-07, + "loss": 
0.0087, + "step": 183640 + }, + { + "epoch": 39.019639278557115, + "grad_norm": 3.2039403915405273, + "learning_rate": 2.9488406241913257e-07, + "loss": 0.0474, + "step": 183650 + }, + { + "epoch": 39.019693440935924, + "grad_norm": 0.012449132278561592, + "learning_rate": 2.918750413740394e-07, + "loss": 0.0005, + "step": 183660 + }, + { + "epoch": 39.01974760331474, + "grad_norm": 0.0010069095296785235, + "learning_rate": 2.8886602032894623e-07, + "loss": 0.0784, + "step": 183670 + }, + { + "epoch": 39.01980176569355, + "grad_norm": 2.9469292163848877, + "learning_rate": 2.85856999283853e-07, + "loss": 0.0503, + "step": 183680 + }, + { + "epoch": 39.01985592807236, + "grad_norm": 0.0007312145316973329, + "learning_rate": 2.828479782387598e-07, + "loss": 0.1228, + "step": 183690 + }, + { + "epoch": 39.01991009045117, + "grad_norm": 0.000733281543944031, + "learning_rate": 2.7983895719366664e-07, + "loss": 0.0067, + "step": 183700 + }, + { + "epoch": 39.019964252829986, + "grad_norm": 0.45948994159698486, + "learning_rate": 2.7682993614857345e-07, + "loss": 0.0584, + "step": 183710 + }, + { + "epoch": 39.020018415208796, + "grad_norm": 0.011150095611810684, + "learning_rate": 2.7382091510348025e-07, + "loss": 0.0721, + "step": 183720 + }, + { + "epoch": 39.020072577587605, + "grad_norm": 0.001128360047005117, + "learning_rate": 2.7081189405838706e-07, + "loss": 0.0119, + "step": 183730 + }, + { + "epoch": 39.02012673996642, + "grad_norm": 0.0009609913104213774, + "learning_rate": 2.678028730132939e-07, + "loss": 0.0051, + "step": 183740 + }, + { + "epoch": 39.02018090234523, + "grad_norm": 0.6786898374557495, + "learning_rate": 2.6479385196820066e-07, + "loss": 0.0255, + "step": 183750 + }, + { + "epoch": 39.02023506472404, + "grad_norm": 0.29240527749061584, + "learning_rate": 2.6178483092310747e-07, + "loss": 0.0159, + "step": 183760 + }, + { + "epoch": 39.02028922710286, + "grad_norm": 0.002634732285514474, + "learning_rate": 2.5877580987801427e-07, + "loss": 0.0598, + "step": 183770 + }, + { + "epoch": 39.02034338948167, + "grad_norm": 0.003970491699874401, + "learning_rate": 2.5576678883292113e-07, + "loss": 0.019, + "step": 183780 + }, + { + "epoch": 39.020397551860476, + "grad_norm": 0.2000720351934433, + "learning_rate": 2.5275776778782793e-07, + "loss": 0.0876, + "step": 183790 + }, + { + "epoch": 39.020451714239286, + "grad_norm": 17.883960723876953, + "learning_rate": 2.4974874674273474e-07, + "loss": 0.0643, + "step": 183800 + }, + { + "epoch": 39.0205058766181, + "grad_norm": 0.08109995722770691, + "learning_rate": 2.4673972569764154e-07, + "loss": 0.0012, + "step": 183810 + }, + { + "epoch": 39.02056003899691, + "grad_norm": 0.0007311087101697922, + "learning_rate": 2.4373070465254834e-07, + "loss": 0.0473, + "step": 183820 + }, + { + "epoch": 39.02061420137572, + "grad_norm": 0.006329360418021679, + "learning_rate": 2.4072168360745515e-07, + "loss": 0.0068, + "step": 183830 + }, + { + "epoch": 39.02066836375454, + "grad_norm": 0.2930441200733185, + "learning_rate": 2.3771266256236198e-07, + "loss": 0.0293, + "step": 183840 + }, + { + "epoch": 39.02072252613335, + "grad_norm": 2.274744987487793, + "learning_rate": 2.3470364151726878e-07, + "loss": 0.0284, + "step": 183850 + }, + { + "epoch": 39.02077668851216, + "grad_norm": 0.0007334710098803043, + "learning_rate": 2.316946204721756e-07, + "loss": 0.0396, + "step": 183860 + }, + { + "epoch": 39.020830850890974, + "grad_norm": 0.017186250537633896, + "learning_rate": 2.2868559942708242e-07, + "loss": 0.0532, + "step": 183870 
+ }, + { + "epoch": 39.02088501326978, + "grad_norm": 0.0007321900920942426, + "learning_rate": 2.2567657838198922e-07, + "loss": 0.0247, + "step": 183880 + }, + { + "epoch": 39.02093917564859, + "grad_norm": 0.009271914139389992, + "learning_rate": 2.22667557336896e-07, + "loss": 0.0779, + "step": 183890 + }, + { + "epoch": 39.02099333802741, + "grad_norm": 1.1705230474472046, + "learning_rate": 2.1965853629180283e-07, + "loss": 0.0628, + "step": 183900 + }, + { + "epoch": 39.02104750040622, + "grad_norm": 0.000740440038498491, + "learning_rate": 2.1664951524670963e-07, + "loss": 0.0176, + "step": 183910 + }, + { + "epoch": 39.02110166278503, + "grad_norm": 9.115439414978027, + "learning_rate": 2.1364049420161646e-07, + "loss": 0.2261, + "step": 183920 + }, + { + "epoch": 39.02115582516384, + "grad_norm": 0.22621431946754456, + "learning_rate": 2.1063147315652327e-07, + "loss": 0.0034, + "step": 183930 + }, + { + "epoch": 39.021209987542655, + "grad_norm": 1.5284873247146606, + "learning_rate": 2.076224521114301e-07, + "loss": 0.0187, + "step": 183940 + }, + { + "epoch": 39.021264149921464, + "grad_norm": 0.0009754534112289548, + "learning_rate": 2.046134310663369e-07, + "loss": 0.0146, + "step": 183950 + }, + { + "epoch": 39.021318312300274, + "grad_norm": 0.0011400195071473718, + "learning_rate": 2.0160441002124368e-07, + "loss": 0.0113, + "step": 183960 + }, + { + "epoch": 39.02137247467909, + "grad_norm": 1.0309350490570068, + "learning_rate": 1.9859538897615048e-07, + "loss": 0.0284, + "step": 183970 + }, + { + "epoch": 39.0214266370579, + "grad_norm": 0.0007441609632223845, + "learning_rate": 1.9558636793105732e-07, + "loss": 0.0381, + "step": 183980 + }, + { + "epoch": 39.02148079943671, + "grad_norm": 1.7799865007400513, + "learning_rate": 1.9257734688596412e-07, + "loss": 0.0307, + "step": 183990 + }, + { + "epoch": 39.021534961815526, + "grad_norm": 0.43230900168418884, + "learning_rate": 1.8956832584087095e-07, + "loss": 0.0111, + "step": 184000 + }, + { + "epoch": 39.021589124194335, + "grad_norm": 0.000745126511901617, + "learning_rate": 1.8655930479577775e-07, + "loss": 0.036, + "step": 184010 + }, + { + "epoch": 39.021643286573145, + "grad_norm": 0.005313860718160868, + "learning_rate": 1.8355028375068456e-07, + "loss": 0.0395, + "step": 184020 + }, + { + "epoch": 39.02169744895196, + "grad_norm": 0.0010609703604131937, + "learning_rate": 1.8054126270559136e-07, + "loss": 0.0377, + "step": 184030 + }, + { + "epoch": 39.02175161133077, + "grad_norm": 0.0007359395385719836, + "learning_rate": 1.775322416604982e-07, + "loss": 0.001, + "step": 184040 + }, + { + "epoch": 39.02180577370958, + "grad_norm": 0.001186197972856462, + "learning_rate": 1.74523220615405e-07, + "loss": 0.0017, + "step": 184050 + }, + { + "epoch": 39.02185993608839, + "grad_norm": 0.0015801242552697659, + "learning_rate": 1.715141995703118e-07, + "loss": 0.0013, + "step": 184060 + }, + { + "epoch": 39.02191409846721, + "grad_norm": 2.0352139472961426, + "learning_rate": 1.6850517852521863e-07, + "loss": 0.0709, + "step": 184070 + }, + { + "epoch": 39.021968260846016, + "grad_norm": 0.0007469548145309091, + "learning_rate": 1.654961574801254e-07, + "loss": 0.0322, + "step": 184080 + }, + { + "epoch": 39.022022423224826, + "grad_norm": 0.9568939208984375, + "learning_rate": 1.6248713643503224e-07, + "loss": 0.0468, + "step": 184090 + }, + { + "epoch": 39.02207658560364, + "grad_norm": 0.9453892707824707, + "learning_rate": 1.5947811538993904e-07, + "loss": 0.0344, + "step": 184100 + }, + { + "epoch": 
39.02213074798245, + "grad_norm": 0.0010042639914900064, + "learning_rate": 1.5646909434484587e-07, + "loss": 0.0164, + "step": 184110 + }, + { + "epoch": 39.02218491036126, + "grad_norm": 0.0007461266941390932, + "learning_rate": 1.5346007329975265e-07, + "loss": 0.0799, + "step": 184120 + }, + { + "epoch": 39.02223907274008, + "grad_norm": 0.002126015955582261, + "learning_rate": 1.5045105225465948e-07, + "loss": 0.049, + "step": 184130 + }, + { + "epoch": 39.02229323511889, + "grad_norm": 0.4808402955532074, + "learning_rate": 1.4744203120956629e-07, + "loss": 0.0029, + "step": 184140 + }, + { + "epoch": 39.0223473974977, + "grad_norm": 2.5561599731445312, + "learning_rate": 1.4443301016447312e-07, + "loss": 0.0428, + "step": 184150 + }, + { + "epoch": 39.022401559876506, + "grad_norm": 0.0014508265303447843, + "learning_rate": 1.414239891193799e-07, + "loss": 0.0252, + "step": 184160 + }, + { + "epoch": 39.02245572225532, + "grad_norm": 2.126464366912842, + "learning_rate": 1.3841496807428672e-07, + "loss": 0.0514, + "step": 184170 + }, + { + "epoch": 39.02250988463413, + "grad_norm": 0.0007379441522061825, + "learning_rate": 1.3540594702919353e-07, + "loss": 0.0599, + "step": 184180 + }, + { + "epoch": 39.02256404701294, + "grad_norm": 0.36268115043640137, + "learning_rate": 1.3239692598410033e-07, + "loss": 0.0284, + "step": 184190 + }, + { + "epoch": 39.02261820939176, + "grad_norm": 1.0728974342346191, + "learning_rate": 1.2938790493900714e-07, + "loss": 0.0292, + "step": 184200 + }, + { + "epoch": 39.02267237177057, + "grad_norm": 0.003149620955809951, + "learning_rate": 1.2637888389391397e-07, + "loss": 0.0417, + "step": 184210 + }, + { + "epoch": 39.02272653414938, + "grad_norm": 0.0007378180744126439, + "learning_rate": 1.2336986284882077e-07, + "loss": 0.0031, + "step": 184220 + }, + { + "epoch": 39.022780696528194, + "grad_norm": 0.0007554627954959869, + "learning_rate": 1.2036084180372757e-07, + "loss": 0.0267, + "step": 184230 + }, + { + "epoch": 39.022834858907004, + "grad_norm": 0.00073809182504192, + "learning_rate": 1.1735182075863439e-07, + "loss": 0.0222, + "step": 184240 + }, + { + "epoch": 39.02288902128581, + "grad_norm": 2.217698812484741, + "learning_rate": 1.1434279971354121e-07, + "loss": 0.0692, + "step": 184250 + }, + { + "epoch": 39.02294318366463, + "grad_norm": 0.003872060216963291, + "learning_rate": 1.11333778668448e-07, + "loss": 0.0314, + "step": 184260 + }, + { + "epoch": 39.02299734604344, + "grad_norm": 0.0007410382968373597, + "learning_rate": 1.0832475762335482e-07, + "loss": 0.0077, + "step": 184270 + }, + { + "epoch": 39.02305150842225, + "grad_norm": 0.000975951727014035, + "learning_rate": 1.0531573657826163e-07, + "loss": 0.0002, + "step": 184280 + }, + { + "epoch": 39.02310567080106, + "grad_norm": 0.0013191872276365757, + "learning_rate": 1.0230671553316845e-07, + "loss": 0.0452, + "step": 184290 + }, + { + "epoch": 39.023159833179875, + "grad_norm": 0.0007272767252288759, + "learning_rate": 9.929769448807524e-08, + "loss": 0.0216, + "step": 184300 + }, + { + "epoch": 39.023213995558685, + "grad_norm": 5.436712741851807, + "learning_rate": 9.628867344298206e-08, + "loss": 0.0189, + "step": 184310 + }, + { + "epoch": 39.023268157937494, + "grad_norm": 0.0007378793088719249, + "learning_rate": 9.327965239788888e-08, + "loss": 0.0134, + "step": 184320 + }, + { + "epoch": 39.02332232031631, + "grad_norm": 0.07643593102693558, + "learning_rate": 9.027063135279568e-08, + "loss": 0.0403, + "step": 184330 + }, + { + "epoch": 39.02337648269512, + 
"grad_norm": 0.0013395232381299138, + "learning_rate": 8.72616103077025e-08, + "loss": 0.0083, + "step": 184340 + }, + { + "epoch": 39.02343064507393, + "grad_norm": 0.9879648685455322, + "learning_rate": 8.425258926260932e-08, + "loss": 0.0474, + "step": 184350 + }, + { + "epoch": 39.023484807452746, + "grad_norm": 0.0010532272281125188, + "learning_rate": 8.124356821751612e-08, + "loss": 0.0521, + "step": 184360 + }, + { + "epoch": 39.023538969831556, + "grad_norm": 0.001296673552133143, + "learning_rate": 7.823454717242294e-08, + "loss": 0.0009, + "step": 184370 + }, + { + "epoch": 39.023593132210365, + "grad_norm": 0.0009769233874976635, + "learning_rate": 7.522552612732974e-08, + "loss": 0.0316, + "step": 184380 + }, + { + "epoch": 39.02364729458918, + "grad_norm": 0.1393493115901947, + "learning_rate": 7.221650508223656e-08, + "loss": 0.0083, + "step": 184390 + }, + { + "epoch": 39.02370145696799, + "grad_norm": 0.000956449774093926, + "learning_rate": 6.920748403714336e-08, + "loss": 0.0469, + "step": 184400 + }, + { + "epoch": 39.0237556193468, + "grad_norm": 0.0012012700317427516, + "learning_rate": 6.619846299205017e-08, + "loss": 0.003, + "step": 184410 + }, + { + "epoch": 39.02380978172561, + "grad_norm": 0.000753920350689441, + "learning_rate": 6.318944194695698e-08, + "loss": 0.0141, + "step": 184420 + }, + { + "epoch": 39.02386394410443, + "grad_norm": 0.40911397337913513, + "learning_rate": 6.018042090186379e-08, + "loss": 0.0429, + "step": 184430 + }, + { + "epoch": 39.02391810648324, + "grad_norm": 13.392030715942383, + "learning_rate": 5.7171399856770604e-08, + "loss": 0.1775, + "step": 184440 + }, + { + "epoch": 39.023972268862046, + "grad_norm": 0.0016492705326527357, + "learning_rate": 5.416237881167741e-08, + "loss": 0.0112, + "step": 184450 + }, + { + "epoch": 39.02402643124086, + "grad_norm": 0.0013278612168505788, + "learning_rate": 5.1153357766584226e-08, + "loss": 0.0159, + "step": 184460 + }, + { + "epoch": 39.02408059361967, + "grad_norm": 0.0009835415985435247, + "learning_rate": 4.814433672149103e-08, + "loss": 0.0186, + "step": 184470 + }, + { + "epoch": 39.02413475599848, + "grad_norm": 0.02664458192884922, + "learning_rate": 4.513531567639784e-08, + "loss": 0.0212, + "step": 184480 + }, + { + "epoch": 39.0241889183773, + "grad_norm": 0.726003885269165, + "learning_rate": 4.212629463130466e-08, + "loss": 0.0247, + "step": 184490 + }, + { + "epoch": 39.02424308075611, + "grad_norm": 0.0007885917439125478, + "learning_rate": 3.911727358621147e-08, + "loss": 0.0335, + "step": 184500 + }, + { + "epoch": 39.02429724313492, + "grad_norm": 0.6175790429115295, + "learning_rate": 3.610825254111828e-08, + "loss": 0.0323, + "step": 184510 + }, + { + "epoch": 39.02435140551373, + "grad_norm": 1.5303760766983032, + "learning_rate": 3.309923149602508e-08, + "loss": 0.0498, + "step": 184520 + }, + { + "epoch": 39.02440556789254, + "grad_norm": 0.3650299906730652, + "learning_rate": 3.0090210450931894e-08, + "loss": 0.0157, + "step": 184530 + }, + { + "epoch": 39.02445973027135, + "grad_norm": 0.0007342526805587113, + "learning_rate": 2.7081189405838704e-08, + "loss": 0.0254, + "step": 184540 + }, + { + "epoch": 39.02451389265016, + "grad_norm": 0.0009459190187044442, + "learning_rate": 2.4072168360745515e-08, + "loss": 0.0076, + "step": 184550 + }, + { + "epoch": 39.02456805502898, + "grad_norm": 0.039848845452070236, + "learning_rate": 2.106314731565233e-08, + "loss": 0.0636, + "step": 184560 + }, + { + "epoch": 39.02462221740779, + "grad_norm": 0.39908185601234436, + 
"learning_rate": 1.805412627055914e-08, + "loss": 0.0025, + "step": 184570 + }, + { + "epoch": 39.0246763797866, + "grad_norm": 0.005328903440386057, + "learning_rate": 1.5045105225465947e-08, + "loss": 0.0594, + "step": 184580 + }, + { + "epoch": 39.024730542165415, + "grad_norm": 0.41888168454170227, + "learning_rate": 1.2036084180372757e-08, + "loss": 0.0498, + "step": 184590 + }, + { + "epoch": 39.024784704544224, + "grad_norm": 0.0008082143613137305, + "learning_rate": 9.02706313527957e-09, + "loss": 0.0528, + "step": 184600 + }, + { + "epoch": 39.024838866923034, + "grad_norm": 0.5473966002464294, + "learning_rate": 6.018042090186379e-09, + "loss": 0.0206, + "step": 184610 + }, + { + "epoch": 39.02489302930185, + "grad_norm": 1.1408103704452515, + "learning_rate": 3.0090210450931894e-09, + "loss": 0.0211, + "step": 184620 + }, + { + "epoch": 39.02494719168066, + "grad_norm": 0.0007415027939714491, + "learning_rate": 0.0, + "loss": 0.0164, + "step": 184630 + }, + { + "epoch": 39.02494719168066, + "eval_accuracy": 0.8396472893533639, + "eval_loss": 1.1232649087905884, + "eval_runtime": 115.3409, + "eval_samples_per_second": 26.547, + "eval_steps_per_second": 3.321, + "step": 184630 + }, + { + "epoch": 39.02494719168066, + "step": 184630, + "total_flos": 1.2940362814847428e+21, + "train_loss": 0.06548064198671348, + "train_runtime": 87612.5477, + "train_samples_per_second": 16.859, + "train_steps_per_second": 2.107 + }, + { + "epoch": 39.02494719168066, + "eval_accuracy": 0.775413176364048, + "eval_loss": 0.7684532999992371, + "eval_runtime": 167.5463, + "eval_samples_per_second": 26.363, + "eval_steps_per_second": 3.301, + "step": 184630 + } + ], + "logging_steps": 10, + "max_steps": 184630, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2940362814847428e+21, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}